cleanup of Parser.parse() and attribute naming; cleanup of number prefix and allow prefix for "d". test_mixed_types fails.

2026-06-05 23:40:17 +00:00 · 2011-11-22 10:45:24 +11:00
parent 5b602066fb
commit 7e0a9d0d9f
2 changed files with 122 additions and 39 deletions
@@ -59,7 +59,7 @@ Most of the `Format Specification Mini-Language`_ is supported::

 The align operators will cause spaces (or specified fill character)
 to be stripped from the value. Similarly width is not enforced; it
-just indicates there may be whitespace to strip.
+just indicates there may be whitespace or "0"s to strip.

 The "#" format character is handled automatically by b, o and x - that
 is: if there is a "0b", "0o" or "0x" prefix respectively, it's ignored.
@@ -145,7 +145,7 @@ Some notes for the date and time types:
  than 12 (for consistency.)
 - except in ISO 8601 and e-mail format the timezone is optional
 - when a seconds amount is present in the input fractions will be parsed
- named timezones are not yet supported
+- named timezones are not handled yet

 .. _`Format String Syntax`: http://docs.python.org/library/string.html#format-string-syntax
 .. _`Format Specification Mini-Language`: http://docs.python.org/library/string.html#format-specification-mini-language
@@ -166,12 +166,13 @@ named
 spans
   A dictionary mapping the names and fixed position indices matched to a
   2-tuple slice range of where the match occurred in the input.
-
+   The span does not include any stripped padding (alignment or width).

 ----

 **Version history (in brief)**:

+- 1.1.7 Python 3 compatibility tweaks (2.5 to 2.7 and 3.2 are supported).
 - 1.1.6 add "e" and "g" field types; removed redundant "h" and "X";
  removed need for explicit "#".
 - 1.1.5 accept textual dates in more places; Result now holds match span
@@ -65,8 +65,10 @@ The align operators will cause spaces (or specified fill character)
 to be stripped from the value. Similarly width is not enforced; it
 just indicates there may be whitespace or "0"s to strip.

-The "#" format character is handled automatically by b, o and x - that
-is: if there is a "0b", "0o" or "0x" prefix respectively, it's ignored.
+The "#" format character is handled automatically by d, b, o and x -
+that is: if there is a "0b", "0o" or "0x" prefix respectively, it's
+handled. For "d" any will be accepted, but for the others the correct
+prefix must be present if at all.

 The types supported are a slightly different mix to the format() types.
 Some format() types come directly over: d, n, f, e, b, o and x.
@@ -176,6 +178,7 @@ spans

 **Version history (in brief)**:

+- 1.1.8 allow "d" fields to have number base "0x" etc. prefixes.
 - 1.1.7 Python 3 compatibility tweaks (2.5 to 2.7 and 3.2 are supported).
 - 1.1.6 add "e" and "g" field types; removed redundant "h" and "X";
  removed need for explicit "#".
@@ -225,6 +228,7 @@ FORMAT_RE = re.compile('''
    (?P<type>([nboxfegwWdDsS]|t[ieahgct]))?
 ''', re.VERBOSE)

+
 def int_convert(base):
    '''Convert a string to an integer.

@@ -241,6 +245,18 @@ def int_convert(base):
        else:
            sign = 1

+        prefix = match.groupdict().get('prefix')
+        if prefix is None:
+            pass
+        elif prefix[1] in 'bB':
+            base = 2
+        elif prefix[1] in 'oO':
+            base = 8
+        elif prefix[1] in 'xX':
+            base = 16
+        else:
+            raise ValueError('unhandled prefix %r' % prefix)
+
        chars = CHARS[:base]
        string = re.sub('[^%s]' % chars, '', string.lower())
        return sign * int(string, base)
@@ -362,8 +378,9 @@ def date_convert(string, match, time_only=False):

 class Parser(object):
    def __init__(self, format):
-        self._fixed_args = []
-        self._groups = 0
+        self._fixed_fields = []
+        self._named_fields = []
+        self._group_index = 0
        self._format = format
        self._type_conversions = {}
        self._expression = '^%s$' % PARSE_RE.sub(self.replace, format)
@@ -378,18 +395,29 @@ class Parser(object):
        m = self._re.match(string)
        if m is None:
            return None
-        l = list(m.groups())
-        for n in self._fixed_args:
+
+        # ok, figure the fixed fields we've pulled out and type convert them
+        fixed_fields = list(m.groups())
+        for n in self._fixed_fields:
            if n in self._type_conversions:
-                l[n] = self._type_conversions[n](l[n], m)
-        named = m.groupdict()
-        spans = dict((n, m.span(n)) for n in named)
-        for k in named:
+                fixed_fields[n] = self._type_conversions[n](fixed_fields[n], m)
+        fixed_fields = tuple(fixed_fields[n] for n in self._fixed_fields)
+
+        # grab the named fields, converting where requested
+        groupdict = m.groupdict()
+        named_fields = {}
+        for k in self._named_fields:
            if k in self._type_conversions:
-                named[k] = self._type_conversions[k](named[k], m)
-        fixed = tuple(l[n] for n in self._fixed_args)
-        spans.update((i, m.span(n+1)) for i, n in enumerate(self._fixed_args))
-        return Result(fixed, named, spans)
+                named_fields[k] = self._type_conversions[k](groupdict[k], m)
+            else:
+                named_fields[k] = groupdict[k]
+
+        # now figure the match spans
+        spans = dict((n, m.span(n)) for n in named_fields)
+        spans.update((i, m.span(n+1)) for i, n in enumerate(self._fixed_fields))
+
+        # and that's our result
+        return Result(fixed_fields, named_fields, spans)

    def replace(self, match):
        d = match.groupdict()
@@ -399,22 +427,23 @@ class Parser(object):
        format = ''

        if d['fixed']:
-            self._fixed_args.append(self._groups)
+            self._fixed_fields.append(self._group_index)
            wrap = '(%s)'
            if ':' in d['fixed']:
                format = d['fixed'][2:-1]
-            group = self._groups
+            group = self._group_index
        elif d['named']:
            if ':' in d['named']:
                name, format = d['named'].split(':')
            else:
                name = d['named']
+            self._named_fields.append(name)
            group = name
            wrap = '(?P<%s>%%s)' % name
        else:
            raise ValueError('format not recognised')

-        self._groups += 1
+        self._group_index += 1

        # simplest case: a bare {}
        if not format:
@@ -433,18 +462,21 @@ class Parser(object):
        if d['type'] == 'n':
            s = '\d{1,3}([,.]\d{3})*'
            self._type_conversions[group] = int_convert(10)
-        elif d['type'] == 'o':
-            prefix = True
-            s = '[0-7]+'
-            self._type_conversions[group] = int_convert(8)
        elif d['type'] == 'b':
            prefix = True
-            s = '[01]+'
+            s = '(0[bB])?[01]+'
            self._type_conversions[group] = int_convert(2)
+            self._group_index += 1
+        elif d['type'] == 'o':
+            prefix = True
+            s = '(0[oO])?[0-7]+'
+            self._type_conversions[group] = int_convert(8)
+            self._group_index += 1
        elif d['type'] == 'x':
            prefix = True
-            s = '[0-9a-fA-F]+'
+            s = '(0[xX])?[0-9a-fA-F]+'
            self._type_conversions[group] = int_convert(16)
+            self._group_index += 1
        elif d['type'] == 'f':
            s = r'\d+\.\d+'
            self._type_conversions[group] = lambda s, m: float(s)
@@ -453,9 +485,11 @@ class Parser(object):
            self._type_conversions[group] = lambda s, m: float(s)
        elif d['type'] == 'g':
            s = r'\d+(\.\d+)?([eE][-+]?\d+)?|nan|NAN|[-+]?inf|[-+]?INF'
+            self._group_index += 2
            self._type_conversions[group] = lambda s, m: float(s)
        elif d['type'] == 'd':
-            s = r'\d+'
+            s = r'(?P<prefix>0[obxOBX])?\d+'
+            self._group_index += 1
            self._type_conversions[group] = int_convert(10)
        elif d['type'] == 'ti':
            s = r'(?P<ymd>\d{4}-\d\d-\d\d)((\s+|T)%s)?(?P<tz>Z|[-+]\d\d:\d\d)?' % (TIME_PAT,)
@@ -493,17 +527,10 @@ class Parser(object):
        else:
            fill = ' '

-        is_numeric = d['type'] and d['type'] in 'nfdobhxX'
+        is_numeric = d['type'] and d['type'] in 'nfegdobh'

        # handle some numeric-specific things like prefix and sign
        if is_numeric:
-            if prefix:
-                if d['type'] == 'b':
-                    s = '(0b)?' + s
-                elif d['type'] == 'o':
-                    s = '(0o)?' + s
-                elif d['type'] in 'hxX':
-                    s = '(0x)?' + s

            # prefix with something (align "=" trumps zero)
            if align == '=':
@@ -613,9 +640,9 @@ class TestPattern(unittest.TestCase):
    def test_typed(self):
        'pull a named string out of another string'
        s = PARSE_RE.sub(self.p.replace, '{:d}')
-        self.assertEqual(s, '(-?\d+)')
+        self.assertEqual(s, '(-?(?P<prefix>0[obxOBX])?\d+)')
        s = PARSE_RE.sub(self.p.replace, '{:d} {:w}')
-        self.assertEqual(s, '(-?\d+) (\w+)')
+        self.assertEqual(s, '(-?(?P<prefix>0[obxOBX])?\d+) (\w+)')

    def test_named(self):
        'pull a named string out of another string'
@@ -627,9 +654,9 @@ class TestPattern(unittest.TestCase):
    def test_named_typed(self):
        'pull a named string out of another string'
        s = PARSE_RE.sub(self.p.replace, '{name:d}')
-        self.assertEqual(s, '(?P<name>-?\d+)')
+        self.assertEqual(s, '(?P<name>-?(?P<prefix>0[obxOBX])?\d+)')
        s = PARSE_RE.sub(self.p.replace, '{name:d} {other:w}')
-        self.assertEqual(s, '(?P<name>-?\d+) (?P<other>\w+)')
+        self.assertEqual(s, '(?P<name>-?(?P<prefix>0[obxOBX])?\d+) (?P<other>\w+)')

    def test_beaker(self):
        'skip some trailing whitespace'
@@ -817,6 +844,9 @@ class TestParse(unittest.TestCase):
        y('a {: d} b', 'a -12 b', -12)
        y('a {: d} b', 'a  12 b', 12)
        n('a {: d} b', 'a +12 b', None)
+        y('a {:d} b', 'a 0b1000 b', 8)
+        y('a {:d} b', 'a 0o1000 b', 512)
+        y('a {:d} b', 'a 0x1000 b', 4096)

        y('a {:n} b', 'a 100 b', 100)
        y('a {:n} b', 'a 1,000 b', 1000)
@@ -956,6 +986,58 @@ class TestParse(unittest.TestCase):
        # tt   Time                                        time
        y('a {:tt} b', 'a 10:21:36 AM +1000 b', time(10, 21, 36, tzinfo=aest))

+    def test_mixed_types(self):
+        'stress-test: pull one of everything out of a string :-)'
+        r = parse('''
+            letters: {:w}
+            non-letters: {:W}
+            whitespace: "{:s}"
+            non-whitespace: \t{:S}\n
+            digits: {:d} {:d} {:d}
+            non-digits: {:D}
+            numbers with thousands: {:n} {:n}
+            fixed-point: {:f} {:f}
+            floating-point: {:e} {:e}
+            general numbers: {:g} {:g} {:g} {:g}
+            binary: {:b} {:b}
+            octal: {:o} {:o}
+            hex: {:x} {:x}
+            ISO 8601: {:ti}
+            RFC2822: {:te}
+            Global: {:tg}
+            US: {:ta}
+            ctime(): {:tc}
+            HTTP log: {:th}
+            time: {:tt}
+            final value: {}
+        ''',
+        '''
+            letters: abcdef_GHIJLK
+            non-letters: !@#%$ *^%
+            whitespace: "   \t\n"
+            non-whitespace: \tabc\n
+            digits: 12345 0b1011011 0xabcdef
+            non-digits: abcdef
+            numbers with thousands: 1,000 1.000.000
+            fixed-point: 100.2345 0.00001
+            floating-point: 1.1e-10 NAN
+            general numbers: 1 1.1 1.1e10 nan
+            binary: 0b1000 0B1000
+            octal: 0o1000 0O1000
+            hex: 0x1000 0X1000
+            ISO 8601: 1972-01-20T10:21:36Z
+            RFC2822: Mon, 20 Jan 1972 10:21:36 +1000
+            Global: 20/1/1972 10:21:36 AM +1:00
+            US: 1/20/1972 10:21:36 PM +10:30
+            ctime(): Sun Sep 16 01:03:52 1973
+            HTTP log: 21/Nov/2011:00:07:11 +0000
+            time: 10:21:36 PM -5:30
+            final value: spam
+        ''')
+        self.assertEqual(r.fixed[33], 'spam')
+
+
+

 if __name__ == '__main__':
    unittest.main()