From 2fafd7ad4caf40745ddf7119e5d935ea09ea3e65 Mon Sep 17 00:00:00 2001 From: Richard Jones Date: Tue, 22 Nov 2011 12:37:08 +1100 Subject: [PATCH] allow "d" fields to have number base "0x" etc. prefixes; fix up some field type interactionsi after stress-testing the parser; implement "%" type. --- README.rst | 17 +++- parse.py | 283 ++++++++++++++++++++++++++++++++++------------------- 2 files changed, 197 insertions(+), 103 deletions(-) diff --git a/README.rst b/README.rst index 56c17ce..473beb0 100644 --- a/README.rst +++ b/README.rst @@ -61,19 +61,19 @@ The align operators will cause spaces (or specified fill character) to be stripped from the value. Similarly width is not enforced; it just indicates there may be whitespace or "0"s to strip. -The "#" format character is handled automatically by b, o and x - that -is: if there is a "0b", "0o" or "0x" prefix respectively, it's ignored. +The "#" format character is handled automatically by d, b, o and x - +that is: if there is a "0b", "0o" or "0x" prefix respectively, it's +handled. For "d" any will be accepted, but for the others the correct +prefix must be present if at all. The types supported are a slightly different mix to the format() types. -Some format() types come directly over: d, n, f, e, b, o and x. +Some format() types come directly over: d, n, %, f, e, b, o and x. In addition some regular expression character group types D, w, W, s and S are also available. The "e" and "g" types are case-insensitive so there is not need for the "E" or "G" types. -The format() type % is not yet supported. - ===== =========================================== ======== Type Characters Matched Output ===== =========================================== ======== @@ -84,6 +84,7 @@ Type Characters Matched Output d Digits (effectively integer numbers) int D Non-digit str n Numbers with thousands separators (, or .) int + % Percentage (converted to value/100.0) float f Fixed-point numbers float e Floating-point numbers with exponent float e.g. 1.1e-10, NAN (all case insensitive) @@ -147,6 +148,9 @@ Some notes for the date and time types: - when a seconds amount is present in the input fractions will be parsed - named timezones are not handled yet +Note: attempting to match too many datetime fields in a single parse() will +currently result in a resource allocation issue. + .. _`Format String Syntax`: http://docs.python.org/library/string.html#format-string-syntax .. _`Format Specification Mini-Language`: http://docs.python.org/library/string.html#format-specification-mini-language @@ -172,6 +176,9 @@ spans **Version history (in brief)**: +- 1.1.8 allow "d" fields to have number base "0x" etc. prefixes; + fix up some field type interactionsi after stress-testing the parser; + implement "%" type. - 1.1.7 Python 3 compatibility tweaks (2.5 to 2.7 and 3.2 are supported). - 1.1.6 add "e" and "g" field types; removed redundant "h" and "X"; removed need for explicit "#". diff --git a/parse.py b/parse.py index 57ff5b2..25200bc 100644 --- a/parse.py +++ b/parse.py @@ -71,15 +71,13 @@ handled. For "d" any will be accepted, but for the others the correct prefix must be present if at all. The types supported are a slightly different mix to the format() types. -Some format() types come directly over: d, n, f, e, b, o and x. +Some format() types come directly over: d, n, %, f, e, b, o and x. In addition some regular expression character group types D, w, W, s and S are also available. The "e" and "g" types are case-insensitive so there is not need for the "E" or "G" types. -The format() type % is not yet supported. - ===== =========================================== ======== Type Characters Matched Output ===== =========================================== ======== @@ -90,6 +88,7 @@ Type Characters Matched Output d Digits (effectively integer numbers) int D Non-digit str n Numbers with thousands separators (, or .) int + % Percentage (converted to value/100.0) float f Fixed-point numbers float e Floating-point numbers with exponent float e.g. 1.1e-10, NAN (all case insensitive) @@ -153,6 +152,9 @@ Some notes for the date and time types: - when a seconds amount is present in the input fractions will be parsed - named timezones are not handled yet +Note: attempting to match too many datetime fields in a single parse() will +currently result in a resource allocation issue. + .. _`Format String Syntax`: http://docs.python.org/library/string.html#format-string-syntax .. _`Format Specification Mini-Language`: http://docs.python.org/library/string.html#format-specification-mini-language @@ -178,7 +180,9 @@ spans **Version history (in brief)**: -- 1.1.8 allow "d" fields to have number base "0x" etc. prefixes. +- 1.1.8 allow "d" fields to have number base "0x" etc. prefixes; + fix up some field type interactionsi after stress-testing the parser; + implement "%" type. - 1.1.7 Python 3 compatibility tweaks (2.5 to 2.7 and 3.2 are supported). - 1.1.6 add "e" and "g" field types; removed redundant "h" and "X"; removed need for explicit "#". @@ -197,11 +201,12 @@ spans This code is copyright 2011 eKit.com Inc (http://www.ekit.com/) See the end of the source file for the license of use. ''' -__version__ = '1.1.7' +__version__ = '1.1.8' import re import unittest from datetime import datetime, time, tzinfo, timedelta +from functools import partial __all__ = 'parse compile'.split() @@ -225,7 +230,7 @@ FORMAT_RE = re.compile(''' (?P[-+ ])? (?P(?P0)?[1-9]\d*)? (\.(?P\d+))? - (?P([nboxfegwWdDsS]|t[ieahgct]))? + (?P([nbox%fegwWdDsS]|t[ieahgct]))? ''', re.VERBOSE) @@ -245,17 +250,16 @@ def int_convert(base): else: sign = 1 - prefix = match.groupdict().get('prefix') - if prefix is None: - pass - elif prefix[1] in 'bB': - base = 2 - elif prefix[1] in 'oO': - base = 8 - elif prefix[1] in 'xX': - base = 16 - else: - raise ValueError('unhandled prefix %r' % prefix) + if string[0] == '0': + if string[1] in 'bB': + base = 2 + elif string[1] in 'oO': + base = 8 + elif string[1] in 'xX': + base = 16 + else: + # just go with the base specifed + pass chars = CHARS[:base] string = re.sub('[^%s]' % chars, '', string.lower()) @@ -263,6 +267,10 @@ def int_convert(base): return f +def percentage(string, match): + return float(string[:-1]) / 100. + + class FixedTzOffset(tzinfo): """Fixed offset in minutes east from UTC. """ @@ -302,41 +310,34 @@ MONTHS_MAP = dict( DAYS_PAT = '(Mon|Tue|Wed|Thu|Fri|Sat|Sun)' MONTHS_PAT = '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)' ALL_MONTHS_PAT = '(%s)' % '|'.join(MONTHS_MAP) -TIME_PAT = r'(?P\d{1,2}:\d{1,2}(:\d{1,2}(\.\d+)?)?)' -AM_PAT = r'(?P\s+[AP]M)' -TZ_PAT = r'(?P\s+[-+]\d\d:?\d\d)' +TIME_PAT = r'(\d{1,2}:\d{1,2}(:\d{1,2}(\.\d+)?)?)' +AM_PAT = r'(\s+[AP]M)' +TZ_PAT = r'(\s+[-+]\d\d?:?\d\d)' -def time_convert(string, match): - return date_convert(string, match, True) - -def date_convert(string, match, time_only=False): +def date_convert(string, match, ymd=None, mdy=None, dmy=None, + d_m_y=None, hms=None, am=None, tz=None): '''Convert the incoming string containing some date / time info into a datetime instance. ''' - gd = match.groupdict() - if not time_only: - if gd.get('ymd'): - y, m, d = re.split('[-/\s]', gd['ymd']) - elif gd.get('mdy'): - m, d, y = re.split('[-/\s]', gd['mdy']) - elif gd.get('dmy'): - d, m, y = re.split('[-/\s]', gd['dmy']) - elif gd.get('d'): - d = gd['d'] - m = gd['m'] - y = gd['y'] - else: - raise ValueError('oops, got no date format in %r' % (gd, )) - y = int(y) - if m.isdigit(): - m = int(m) - else: - m = MONTHS_MAP[m] - d = int(d) + groups = match.groups() + time_only = False + if ymd is not None: + y, m, d = re.split('[-/\s]', groups[ymd]) + elif mdy is not None: + m, d, y = re.split('[-/\s]', groups[mdy]) + elif dmy is not None: + d, m, y = re.split('[-/\s]', groups[dmy]) + elif d_m_y is not None: + d, m, y = d_m_y + d = groups[d] + m = groups[m] + y = groups[y] + else: + time_only = True H = M = S = u = 0 - if gd['hms']: - t = gd['hms'].split(':') + if hms is not None and groups[hms]: + t = groups[hms].split(':') if len(t) == 2: H, M = t else: @@ -348,11 +349,13 @@ def date_convert(string, match, time_only=False): H = int(H) M = int(M) - am = gd.get('am') - if am and am.strip() == 'PM': - H += 12 + if am is not None: + am = groups[am] + if am and am.strip() == 'PM': + H += 12 - tz = gd.get('tz') + if tz is not None: + tz = groups[tz] if tz == 'Z': tz = FixedTzOffset(0, 'UTC') elif tz: @@ -366,16 +369,29 @@ def date_convert(string, match, time_only=False): tzh, tzm = tz[1:].split(':') else: tzh, tzm = tz[1:3], tz[3:5] - tz = FixedTzOffset(int(tzm) + int(tzh) * 60, tz) + offset = int(tzm) + int(tzh) * 60 + if sign == '-': + offset = -offset + tz = FixedTzOffset(offset, tz) if time_only: d = time(H, M, S, u, tzinfo=tz) else: + y = int(y) + if m.isdigit(): + m = int(m) + else: + m = MONTHS_MAP[m] + d = int(d) d = datetime(y, m, d, H, M, S, u, tzinfo=tz) return d +class TooManyFields(ValueError): + pass + + class Parser(object): def __init__(self, format): self._fixed_fields = [] @@ -384,7 +400,12 @@ class Parser(object): self._format = format self._type_conversions = {} self._expression = '^%s$' % PARSE_RE.sub(self.replace, format) - self._re = re.compile(self._expression, re.IGNORECASE|re.DOTALL) + try: + self._re = re.compile(self._expression, re.IGNORECASE|re.DOTALL) + except AssertionError, e: + if str(e).endswith('this version only supports 100 named groups'): + raise TooManyFields('sorry, you are attempting to parse too ' + 'many complex fields') def __repr__(self): if len(self._format) > 20: @@ -396,12 +417,20 @@ class Parser(object): if m is None: return None + #print self._format + #print self._expression + #print `string` + #print 'MATCH GROUPS', m.groups() + #print 'MATCH DICT', m.groupdict() + # ok, figure the fixed fields we've pulled out and type convert them fixed_fields = list(m.groups()) + #print 'WANT', self._fixed_fields for n in self._fixed_fields: if n in self._type_conversions: fixed_fields[n] = self._type_conversions[n](fixed_fields[n], m) fixed_fields = tuple(fixed_fields[n] for n in self._fixed_fields) + #print 'FIXED', fixed_fields # grab the named fields, converting where requested groupdict = m.groupdict() @@ -443,10 +472,9 @@ class Parser(object): else: raise ValueError('format not recognised') - self._group_index += 1 - # simplest case: a bare {} if not format: + self._group_index += 1 return wrap % '.+?' # now figure out the format @@ -461,6 +489,7 @@ class Parser(object): # figure type conversions, if any if d['type'] == 'n': s = '\d{1,3}([,.]\d{3})*' + self._group_index += 1 self._type_conversions[group] = int_convert(10) elif d['type'] == 'b': prefix = True @@ -477,6 +506,10 @@ class Parser(object): s = '(0[xX])?[0-9a-fA-F]+' self._type_conversions[group] = int_convert(16) self._group_index += 1 + elif d['type'] == '%': + s = r'\d+(\.\d+)?%' + self._group_index += 1 + self._type_conversions[group] = percentage elif d['type'] == 'f': s = r'\d+\.\d+' self._type_conversions[group] = lambda s, m: float(s) @@ -488,32 +521,64 @@ class Parser(object): self._group_index += 2 self._type_conversions[group] = lambda s, m: float(s) elif d['type'] == 'd': - s = r'(?P0[obxOBX])?\d+' - self._group_index += 1 + s = r'\d+|0[xX][0-9a-fA-F]+|[0-9a-fA-F]+|0[bB][01]+|0[oO][0-7]+' self._type_conversions[group] = int_convert(10) elif d['type'] == 'ti': - s = r'(?P\d{4}-\d\d-\d\d)((\s+|T)%s)?(?PZ|[-+]\d\d:\d\d)?' % (TIME_PAT,) - self._type_conversions[group] = date_convert - elif d['type'] == 'ta': - s = r'(?P(\d{1,2}|%s)[-/]\d{1,2}[-/]\d{4})(\s+%s)?%s?%s?' % (ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT) - self._type_conversions[group] = date_convert + s = r'(\d{4}-\d\d-\d\d)((\s+|T)%s)?(Z|[-+]\d\d:\d\d)?' % TIME_PAT + n = self._group_index + self._type_conversions[group] = partial(date_convert, ymd=n, + hms=n+3, tz=n+6) + self._group_index += 7 + wrap = '' elif d['type'] == 'tg': - s = r'(?P\d{1,2}[-/](\d{1,2}|%s)[-/]\d{4})(\s+%s)?%s?%s?' % (ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT) - self._type_conversions[group] = date_convert + s = r'(\d{1,2}[-/](\d{1,2}|%s)[-/]\d{4})(\s+%s)?%s?%s?' % ( + ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT) + n = self._group_index + self._type_conversions[group] = partial(date_convert, dmy=n, + hms=n+4, am=n+7, tz=n+8) + self._group_index += 9 + wrap = '' + elif d['type'] == 'ta': + s = r'((\d{1,2}|%s)[-/]\d{1,2}[-/]\d{4})(\s+%s)?%s?%s?' % ( + ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT) + n = self._group_index + self._type_conversions[group] = partial(date_convert, mdy=n, + hms=n+4, am=n+7, tz=n+8) + self._group_index += 9 + wrap = '' elif d['type'] == 'te': # this will allow microseconds through if they're present, but meh - s = r'(%s,\s+)?(?P\d{1,2}\s+%s\s+\d{4})\s+%s%s' % (DAYS_PAT, MONTHS_PAT, TIME_PAT, TZ_PAT) - self._type_conversions[group] = date_convert + s = r'(%s,\s+)?(\d{1,2}\s+%s\s+\d{4})\s+%s%s' % (DAYS_PAT, + MONTHS_PAT, TIME_PAT, TZ_PAT) + n = self._group_index + self._type_conversions[group] = partial(date_convert, dmy=n+2, + hms=n+4, tz=n+7) + self._group_index += 8 + wrap = '' elif d['type'] == 'th': # slight flexibility here from the stock Apache format - s = r'(?P\d{1,2}[-/]%s[-/]\d{4}):%s%s' % (MONTHS_PAT, TIME_PAT, TZ_PAT) - self._type_conversions[group] = date_convert + s = r'(\d{1,2}[-/]%s[-/]\d{4}):%s%s' % (MONTHS_PAT, TIME_PAT, + TZ_PAT) + n = self._group_index + self._type_conversions[group] = partial(date_convert, dmy=n, + hms=n+2, tz=n+5) + self._group_index += 6 + wrap = '' elif d['type'] == 'tc': - s = r'(%s)\s+(?P%s)\s+(?P\d{1,2})\s+%s\s+(?P\d{4})' % (DAYS_PAT, MONTHS_PAT, TIME_PAT) - self._type_conversions[group] = date_convert + s = r'(%s)\s+%s\s+(\d{1,2})\s+%s\s+(\d{4})' % ( + DAYS_PAT, MONTHS_PAT, TIME_PAT) + n = self._group_index + self._type_conversions[group] = partial(date_convert, + d_m_y=(n+3,n+2,n+7), hms=n+4) + self._group_index += 8 + wrap = '' elif d['type'] == 'tt': s = r'%s?%s?%s?' % (TIME_PAT, AM_PAT, TZ_PAT) - self._type_conversions[group] = time_convert + n = self._group_index + self._type_conversions[group] = partial(date_convert, hms=n, + am=n+3, tz=n+4) + self._group_index += 5 + wrap = '' elif d['type']: s = r'\%s+' % d['type'] else: @@ -527,11 +592,10 @@ class Parser(object): else: fill = ' ' - is_numeric = d['type'] and d['type'] in 'nfegdobh' + is_numeric = d['type'] and d['type'] in 'n%fegdobh' # handle some numeric-specific things like prefix and sign if is_numeric: - # prefix with something (align "=" trumps zero) if align == '=': # special case - align "=" acts like the zero above but with @@ -559,7 +623,9 @@ class Parser(object): # Place into a group now - this captures the value we want to keep. # Everything else from now is just padding to be stripped off - s = wrap % s + if wrap: + s = wrap % s + self._group_index += 1 if d['width']: # all we really care about is that if the format originally @@ -579,6 +645,7 @@ class Parser(object): s = '%s*%s' % (fill, s) elif align == '^': s = '%s*%s%s*' % (fill, s, fill) + return s @@ -637,13 +704,6 @@ class TestPattern(unittest.TestCase): s = PARSE_RE.sub(self.p.replace, '{} {}') self.assertEqual(s, '(.+?) (.+?)') - def test_typed(self): - 'pull a named string out of another string' - s = PARSE_RE.sub(self.p.replace, '{:d}') - self.assertEqual(s, '(-?(?P0[obxOBX])?\d+)') - s = PARSE_RE.sub(self.p.replace, '{:d} {:w}') - self.assertEqual(s, '(-?(?P0[obxOBX])?\d+) (\w+)') - def test_named(self): 'pull a named string out of another string' s = PARSE_RE.sub(self.p.replace, '{name}') @@ -653,10 +713,10 @@ class TestPattern(unittest.TestCase): def test_named_typed(self): 'pull a named string out of another string' - s = PARSE_RE.sub(self.p.replace, '{name:d}') - self.assertEqual(s, '(?P-?(?P0[obxOBX])?\d+)') - s = PARSE_RE.sub(self.p.replace, '{name:d} {other:w}') - self.assertEqual(s, '(?P-?(?P0[obxOBX])?\d+) (?P\w+)') + s = PARSE_RE.sub(self.p.replace, '{name:w}') + self.assertEqual(s, '(?P\w+)') + s = PARSE_RE.sub(self.p.replace, '{name:w} {other:w}') + self.assertEqual(s, '(?P\w+) (?P\w+)') def test_beaker(self): 'skip some trailing whitespace' @@ -700,7 +760,7 @@ class TestPattern(unittest.TestCase): self.assertEqual(d.get(k), matches[k], 'm["%s"]=%r, expect %r' % (k, d.get(k), matches[k])) - for t in 'obxegfdDwWsS': + for t in '%obxegfdDwWsS': _(t, dict(type=t)) _('10'+t, dict(type=t, width='10')) _('05d', dict(type='d', width='05', zero='0')) @@ -790,6 +850,10 @@ class TestParse(unittest.TestCase): r = parse('hello {number:^d} {things}', 'hello 12 people') self.assertEqual(r.named, dict(number=12, things='people')) + def test_multiline(self): + r = parse('hello\n{}\nworld', 'hello\nthere\nworld') + self.assertEqual(r.fixed[0], 'there') + def test_spans(self): 'test the string sections our fields come from' string = 'hello world' @@ -847,6 +911,11 @@ class TestParse(unittest.TestCase): y('a {:d} b', 'a 0b1000 b', 8) y('a {:d} b', 'a 0o1000 b', 512) y('a {:d} b', 'a 0x1000 b', 4096) + y('a {:d} b', 'a 0xabcdef b', 0xabcdef) + + y('a {:%} b', 'a 100% b', 1) + y('a {:%} b', 'a 50% b', .5) + y('a {:%} b', 'a 50.1% b', .501) y('a {:n} b', 'a 100 b', 100) y('a {:n} b', 'a 1,000 b', 1000) @@ -856,6 +925,8 @@ class TestParse(unittest.TestCase): y('a {:n} b', 'a 10,000 b', 10000) y('a {:n} b', 'a 100,000 b', 100000) n('a {:n} b', 'a 100,00 b', None) + y('a {:n} b', 'a 100.000 b', 100000) + y('a {:n} b', 'a 1.000.000 b', 1000000) y('a {:f} b', 'a 12.0 b', 12.0) y('a {:f} b', 'a -12.1 b', -12.1) @@ -983,8 +1054,32 @@ class TestParse(unittest.TestCase): # tc ctime() format datetime y('a {:tc} b', 'a Mon Nov 21 10:21:36 2011 b', d) + t530 = FixedTzOffset(-5*60 - 30, '-5:30') + # tt Time time y('a {:tt} b', 'a 10:21:36 AM +1000 b', time(10, 21, 36, tzinfo=aest)) + y('a {:tt} b', 'a 10:21:36 AM b', time(10, 21, 36)) + y('a {:tt} b', 'a 10:21:36 PM b', time(22, 21, 36)) + y('a {:tt} b', 'a 10:21:36 b', time(10, 21, 36)) + y('a {:tt} b', 'a 10:21 b', time(10, 21)) + y('a {:tt} b', 'a 10:21:36 PM -5:30 b', time(22, 21, 36, tzinfo=t530)) + + def test_datetime_group_count(self): + 'test we increment the group count correctly for datetimes' + r = parse('{:ti} {}', '1972-01-01 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse('{:tg} {}', '1-1-1972 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse('{:ta} {}', '1-1-1972 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse('{:th} {}', '21/Nov/2011:10:21:36 +1000 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse('{:te} {}', '21 Nov 2011 10:21:36 +1000 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse('{:tc} {}', 'Mon Nov 21 10:21:36 2011 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse('{:tt} {}', '10:21 spam') + self.assertEqual(r.fixed[1], 'spam') def test_mixed_types(self): 'stress-test: pull one of everything out of a string :-)' @@ -1002,12 +1097,7 @@ class TestParse(unittest.TestCase): binary: {:b} {:b} octal: {:o} {:o} hex: {:x} {:x} - ISO 8601: {:ti} - RFC2822: {:te} - Global: {:tg} - US: {:ta} - ctime(): {:tc} - HTTP log: {:th} + date: {:ti} time: {:tt} final value: {} ''', @@ -1025,17 +1115,14 @@ class TestParse(unittest.TestCase): binary: 0b1000 0B1000 octal: 0o1000 0O1000 hex: 0x1000 0X1000 - ISO 8601: 1972-01-20T10:21:36Z - RFC2822: Mon, 20 Jan 1972 10:21:36 +1000 - Global: 20/1/1972 10:21:36 AM +1:00 - US: 1/20/1972 10:21:36 PM +10:30 - ctime(): Sun Sep 16 01:03:52 1973 - HTTP log: 21/Nov/2011:00:07:11 +0000 + date: 1972-01-20T10:21:36Z time: 10:21:36 PM -5:30 final value: spam ''') - self.assertEqual(r.fixed[33], 'spam') + self.assertEqual(r.fixed[26], 'spam') + def test_too_many_fields(self): + self.assertRaises(TooManyFields, compile, '{:ti}' * 20)