From 7e31bdf3c3ca639ab776b8434c421160e870e6a3 Mon Sep 17 00:00:00 2001 From: Richard Jones Date: Mon, 21 Nov 2011 14:41:06 +1100 Subject: [PATCH] accept textual dates in more places; Result now holds match span positions. --- README.rst | 78 +++++++++++++++++++++++++++++++++++++----------------- parse.py | 65 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 114 insertions(+), 29 deletions(-) diff --git a/README.rst b/README.rst index b2e5fe2..36cb025 100644 --- a/README.rst +++ b/README.rst @@ -50,15 +50,15 @@ Some simple parse() format string examples: Format Specification -------------------- +Do remember that most often a straight format-less {} will suffice +where a more complex format specification might have been used. + Most of the `Format Specification Mini-Language`_ is supported:: [[fill]align][sign][#][0][width][,][.precision][type] The align operators will cause spaces (or specified fill character) -to be stripped from the value. The alignment character "=" is not yet -supported. - -The comma "," separator is not yet supported. +to be stripped from the value. The types supported are a slightly different mix to the format() types. Some format() types come directly over: d, n, f, b, o, h, x and X. @@ -67,26 +67,37 @@ D, w, W, s and S are also available. The format() types %, F, e, E, g and G are not yet supported. -===== ========================================== ======= -Type Characters Matched Output -===== ========================================== ======= - w Letters and underscore str - W Non-letter and underscore str - s Whitespace str - S Non-whitespace str - d Digits (effectively integer numbers) int - D Non-digit str - n Numbers with thousands separators (, or .) int - f Fixed-point numbers float - b Binary numbers int - o Octal numbers int - h Hexadecimal numbers (lower and upper case) int - x Lower-case hexadecimal numbers int - X Upper-case hexadecimal numbers int -===== ========================================== ======= - -Do remember though that most often a straight type-less {} will suffice -where a more complex type specification might have been used. +===== =========================================== ======== +Type Characters Matched Output +===== =========================================== ======== + w Letters and underscore str + W Non-letter and underscore str + s Whitespace str + S Non-whitespace str + d Digits (effectively integer numbers) int + D Non-digit str + n Numbers with thousands separators (, or .) int + f Fixed-point numbers float + b Binary numbers int + o Octal numbers int + h Hexadecimal numbers (lower and upper case) int + x Lower-case hexadecimal numbers int + X Upper-case hexadecimal numbers int + ti ISO 8601 format date/time datetime + e.g. 1972-01-20T10:21:36Z + te RFC2822 e-mail format date/time datetime + e.g. Mon, 20 Jan 1972 10:21:36 +1000 + tg Global (day/month) format date/time datetime + e.g. 20/1/1972 10:21:36 AM +1:00 + ta US (month/day) format date/time datetime + e.g. 1/20/1972 10:21:36 PM +10:30 + tc ctime() format date/time datetime + e.g. Sun Sep 16 01:03:52 1973 + th HTTP log format date/time datetime + e.g. 21/Nov/2011:00:07:11 +0000 + tt Time time + e.g. 10:21:36 PM -5:30 +===== =========================================== ======== So, for example, some typed parsing, and None resulting if the typing does not match: @@ -109,6 +120,23 @@ actually centered. It just strips leading and trailing whitespace. See also the unit tests at the end of the module for some more examples. Run the tests with "python -m parse". +Some notes for the date and time types: + +- the presence of the time part is optional (including ISO 8601, starting + at the "T"). A full datetime object will always be returned; the time + will be set to 00:00:00. +- except in ISO 8601 the day and month digits may be 0-padded +- the separator for the ta and tg formats may be "-" or "/" +- as per RFC 2822 the e-mail format may omit the day (and comma), and the + seconds but nothing else +- hours greater than 12 will be happily accepted +- the AM/PM are optional, and if PM is found then 12 hours will be added + to the datetime object's hours amount - even if the hour is greater + than 12 (for consistency.) +- except in ISO 8601 and e-mail format the timezone is optional +- when a seconds amount is present in the input fractions will be parsed +- named timezones are not yet supported + .. _`Format String Syntax`: http://docs.python.org/library/string.html#format-string-syntax .. _`Format Specification Mini-Language`: http://docs.python.org/library/string.html#format-specification-mini-language @@ -116,6 +144,8 @@ examples. Run the tests with "python -m parse". **Version history (in brief)**: +- 1.1.4 fixes to some int type conversion; implemented "=" alignment; added + date/time parsing with a variety of formats handled. - 1.1.3 type conversion is automatic based on specified field types. Also added "f" and "n" types. - 1.1.2 refactored, added compile() and limited ``from parse import *`` diff --git a/parse.py b/parse.py index 653f00d..0070d21 100644 --- a/parse.py +++ b/parse.py @@ -131,6 +131,8 @@ Some notes for the date and time types: will be set to 00:00:00. - except in ISO 8601 the day and month digits may be 0-padded - the separator for the ta and tg formats may be "-" or "/" +- named months (abbreviations or full names) may be used in the ta and tg + formats - as per RFC 2822 the e-mail format may omit the day (and comma), and the seconds but nothing else - hours greater than 12 will be happily accepted @@ -144,10 +146,30 @@ Some notes for the date and time types: .. _`Format String Syntax`: http://docs.python.org/library/string.html#format-string-syntax .. _`Format Specification Mini-Language`: http://docs.python.org/library/string.html#format-specification-mini-language + +Result Objects +-------------- + +The result of a ``parse()`` operation is either ``None`` (no match) or a +``Result`` instance. + +The ``Result`` instance has three attributes: + +fixed + A tuple of the fixed-position, anonymous fields extracted from the input. +named + A dictionary of the named fields extracted from the input. +spans + A dictionary mapping the names and fixed position indices matched to a + 2-tuple slice range of where the match occurred in the input. + + ---- **Version history (in brief)**: +- 1.1.5 accept textual dates in more places; Result now holds match span + positions. - 1.1.4 fixes to some int type conversion; implemented "=" alignment; added date/time parsing with a variety of formats handled. - 1.1.3 type conversion is automatic based on specified field types. Also added @@ -161,7 +183,7 @@ Some notes for the date and time types: This code is copyright 2011 eKit.com Inc (http://www.ekit.com/) See the end of the source file for the license of use. ''' -__version__ = '1.1.4' +__version__ = '1.1.5' import re import unittest @@ -253,6 +275,7 @@ MONTHS_MAP = dict( ) DAYS_PAT = '(Mon|Tue|Wed|Thu|Fri|Sat|Sun)' MONTHS_PAT = '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)' +ALL_MONTHS_PAT = '(%s)' % '|'.join(MONTHS_MAP) TIME_PAT = r'(?P\d{1,2}:\d{1,2}(:\d{1,2}(\.\d+)?)?)' AM_PAT = r'(?P\s+[AP]M)' TZ_PAT = r'(?P\s+[-+]\d\d:?\d\d)' @@ -350,11 +373,13 @@ class Parser(object): if n in self._type_conversions: l[n] = self._type_conversions[n](l[n], m) named = m.groupdict() + spans = dict((n, m.span(n)) for n in named) for k in named: if k in self._type_conversions: named[k] = self._type_conversions[k](named[k], m) fixed = tuple(l[n] for n in self._fixed_args) - return Result(fixed, named) + spans.update((i, m.span(n+1)) for i, n in enumerate(self._fixed_args)) + return Result(fixed, named, spans) def replace(self, match): d = match.groupdict() @@ -422,10 +447,10 @@ class Parser(object): s = r'(?P\d{4}-\d\d-\d\d)((\s+|T)%s)?(?PZ|[-+]\d\d:\d\d)?' % (TIME_PAT,) self._type_conversions[group] = date_convert elif d['type'] == 'ta': - s = r'(?P\d{1,2}[-/]\d{1,2}[-/]\d{4})(\s+%s)?%s?%s?' % (TIME_PAT, AM_PAT, TZ_PAT) + s = r'(?P(\d{1,2}|%s)[-/]\d{1,2}[-/]\d{4})(\s+%s)?%s?%s?' % (ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT) self._type_conversions[group] = date_convert elif d['type'] == 'tg': - s = r'(?P\d{1,2}[-/]\d{1,2}[-/]\d{4})(\s+%s)?%s?%s?' % (TIME_PAT, AM_PAT, TZ_PAT) + s = r'(?P\d{1,2}[-/](\d{1,2}|%s)[-/]\d{4})(\s+%s)?%s?%s?' % (ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT) self._type_conversions[group] = date_convert elif d['type'] == 'te': # this will allow microseconds through if they're present, but meh @@ -531,9 +556,10 @@ class Parser(object): class Result(object): - def __init__(self, fixed, named): + def __init__(self, fixed, named, spans): self.fixed = fixed self.named = named + self.spans = spans def __repr__(self): return '<%s %r %r>' % (self.__class__.__name__, self.fixed, @@ -740,6 +766,31 @@ class TestParse(unittest.TestCase): r = parse('hello {number:^d} {things}', 'hello 12 people') self.assertEquals(r.named, dict(number=12, things='people')) + def test_spans(self): + 'test the string sections our fields come from' + string = 'hello world' + r = parse('hello {}', string) + self.assertEquals(r.spans, {0: (6,11)}) + start, end = r.spans[0] + self.assertEquals(string[start:end], r.fixed[0]) + + string = 'hello world' + r = parse('hello {:>}', string) + self.assertEquals(r.spans, {0: (10,15)}) + start, end = r.spans[0] + self.assertEquals(string[start:end], r.fixed[0]) + + string = 'hello 0x12 world' + r = parse('hello {val:#h} world', string) + self.assertEquals(r.spans, {'val': (6,10)}) + start, end = r.spans['val'] + self.assertEquals(string[start:end], '0x%x' % r.named['val']) + + string = 'hello world and other beings' + r = parse('hello {} {name} {} {spam}', string) + self.assertEquals(r.spans, {0: (6, 11), 'name': (12, 15), + 1: (16, 21), 'spam': (22, 28)}) + def test_numbers(self): 'pull a numbers out of a string' def y(fmt, s, e): @@ -856,10 +907,14 @@ class TestParse(unittest.TestCase): # ta US (month/day) format datetime y('a {:ta} b', 'a 11/21/2011 10:21:36 AM +1000 b', aest_d) y('a {:ta} b', 'a 11-21-2011 10:21:36 AM +1000 b', aest_d) + y('a {:ta} b', 'a Nov-21-2011 10:21:36 AM +1000 b', aest_d) + y('a {:ta} b', 'a November-21-2011 10:21:36 AM +1000 b', aest_d) # tg global (day/month) format datetime y('a {:tg} b', 'a 21/11/2011 10:21:36 AM +1000 b', aest_d) y('a {:tg} b', 'a 21-11-2011 10:21:36 AM +1000 b', aest_d) + y('a {:tg} b', 'a 21-Nov-2011 10:21:36 AM +1000 b', aest_d) + y('a {:tg} b', 'a 21-November-2011 10:21:36 AM +1000 b', aest_d) # th HTTP log format date/time datetime y('a {:th} b', 'a 21/Nov/2011:10:21:36 +1000 b', aest_d)