diff --git a/parse.py b/parse.py index acbe184..7748fa5 100644 --- a/parse.py +++ b/parse.py @@ -1,7 +1,3 @@ -# -# $Id$ -# $HeadURL$ -# '''Parse strings using a specification based on the Python format() syntax. ``parse()`` is the opposite of ``format()`` @@ -16,7 +12,8 @@ Basic usage: >>> p.parse("It's spam, I love it!") - +>>> ''.join(findall(">{}<", "

<p>some <b>bold</b> text</p>

")) +"some bold text" Format Syntax ------------- @@ -205,6 +202,7 @@ with the same identifier. **Version history (in brief)**: +- 1.3 added search() and findall() - 1.2 added ability for custom and override type conversions to be provided; some cleanup - 1.1.9 to keep things simpler number sign is handled automatically; @@ -232,8 +230,8 @@ See the end of the source file for the license of use. ''' __version__ = '1.2' +# yes, I now have two problems import re -import unittest from datetime import datetime, time, tzinfo, timedelta from functools import partial @@ -440,6 +438,24 @@ def extract_format(format, extra_types): PARSE_RE = re.compile('({{|}}|{}|{:[^}]+?}|{\w+?}|{\w+?:[^}]+?})') +class ResultIterator(object): + def __init__(self, parser, string, pos, endpos): + self.parser = parser + self.string = string + self.pos = pos + self.endpos = endpos + + def __iter__(self): + return self + + def next(self): + m = self.parser._search_re.search(self.string, self.pos, self.endpos) + if m is None: + raise StopIteration() + self.pos = m.end() + return self.parser._generate_result(m) + + class Parser(object): def __init__(self, format, extra_types={}): self._format = format @@ -449,13 +465,8 @@ class Parser(object): self._group_index = 0 self._type_conversions = {} self._expression = self.generate_expression() - try: - # yes, I now have two problems - self._re = re.compile(self._expression, re.IGNORECASE|re.DOTALL) - except AssertionError, e: - if str(e).endswith('this version only supports 100 named groups'): - raise TooManyFields('sorry, you are attempting to parse too ' - 'many complex fields') + self._search_re = None + self._match_re = None def __repr__(self): if len(self._format) > 20: @@ -463,10 +474,70 @@ class Parser(object): return '<%s %r>' % (self.__class__.__name__, self._format) def parse(self, string): - m = self._re.match(string) + '''Match my format to the string exactly. + + Return either a Result instance or None if there's no match. 
+ ''' + if self._match_re is None: + expression = '^%s$' % self._expression + try: + self._match_re = re.compile(expression, re.IGNORECASE|re.DOTALL) + except AssertionError, e: + if str(e).endswith('this version only supports 100 named groups'): + raise TooManyFields('sorry, you are attempting to parse too ' + 'many complex fields') + m = self._match_re.match(string) if m is None: return None + return self._generate_result(m) + + def search(self, string, pos=0, endpos=None): + '''Search the string for my format. + + Optionally start the search at "pos" character index and limit the + search to a maximum index of endpos - equivalent to + search(string[:endpos]). + + Return either a Result instance or None if there's no match. + ''' + if self._search_re is None: + try: + self._search_re = re.compile(self._expression, re.IGNORECASE|re.DOTALL) + except AssertionError, e: + if str(e).endswith('this version only supports 100 named groups'): + raise TooManyFields('sorry, you are attempting to parse too ' + 'many complex fields') + if endpos is None: + endpos = len(string) + m = self._search_re.search(string, pos, endpos) + if m is None: + return None + + return self._generate_result(m) + + def findall(self, string, pos=0, endpos=None, extra_types={}): + '''Search "string" for the all occurrances of "format". + + Optionally start the search at "pos" character index and limit the + search to a maximum index of endpos - equivalent to + search(string[:endpos]). + + Returns an iterator that holds Result instances for each format match + found. 
+ ''' + if self._search_re is None: + try: + self._search_re = re.compile(self._expression, re.IGNORECASE|re.DOTALL) + except AssertionError, e: + if str(e).endswith('this version only supports 100 named groups'): + raise TooManyFields('sorry, you are attempting to parse too ' + 'many complex fields') + if endpos is None: + endpos = len(string) + return ResultIterator(self, string, pos, endpos) + + def _generate_result(self, m): # ok, figure the fixed fields we've pulled out and type convert them fixed_fields = list(m.groups()) for n in self._fixed_fields: @@ -509,7 +580,7 @@ class Parser(object): else: # just some text to match e.append(REGEX_SAFETY.sub(self.re_replace, part)) - return '^%s$' % ''.join(e) + return ''.join(e) def handle_field(self, field): # first: lose the braces @@ -706,8 +777,11 @@ class Result(object): def parse(format, string, extra_types={}): '''Using "format" attempt to pull values from "string". + The format must match the string contents exactly. If the value + you're looking for is instead just a part of the string use + search(). - The return value will be an object with two attributes: + The return value will be an Result instance with two attributes: .fixed - tuple of fixed-position values from the string .named - dict of named values from the string @@ -721,6 +795,51 @@ def parse(format, string, extra_types={}): return Parser(format, extra_types=extra_types).parse(string) +def search(format, string, pos=0, endpos=None, extra_types={}): + '''Search "string" for the first occurance of "format". + + The format may occur anywhere within the string. If + instead you wish for the format to exactly match the string + use parse(). + + Optionally start the search at "pos" character index and limit the search to + a maximum index of endpos - equivalent to search(string[:endpos]). 
+ + The return value will be an Result instance with two attributes: + + .fixed - tuple of fixed-position values from the string + .named - dict of named values from the string + + If the format is invalid a ValueError will be raised. + + See the module documentation for the use of "extra_types". + + In the case there is no match parse() will return None. + ''' + return Parser(format, extra_types=extra_types).search(string, pos, endpos) + + +def findall(format, string, pos=0, endpos=None, extra_types={}): + '''Search "string" for the all occurrances of "format". + + You will be returned an iterator that holds Result instances + for each format match found. + + Optionally start the search at "pos" character index and limit the search to + a maximum index of endpos - equivalent to search(string[:endpos]). + + Each Result instance has two attributes: + + .fixed - tuple of fixed-position values from the string + .named - dict of named values from the string + + If the format is invalid a ValueError will be raised. + + See the module documentation for the use of "extra_types". + ''' + return Parser(format, extra_types=extra_types).findall(string, pos, endpos) + + def compile(format, extra_types={}): '''Create a Parser instance to parse "format". @@ -737,443 +856,6 @@ def compile(format, extra_types={}): return Parser(format, extra_types=extra_types) -# yes, I now unit test both of the problems -class TestPattern(unittest.TestCase): - def _test_expression(self, format, expression): - self.assertEqual(Parser(format)._expression, expression) - - def test_braces(self): - 'pull a simple string out of another string' - self._test_expression('{{ }}', '^\{ \}$') - - def test_fixed(self): - 'pull a simple string out of another string' - self._test_expression('{}', '^(.+?)$') - self._test_expression('{} {}', '^(.+?) 
(.+?)$') - - def test_named(self): - 'pull a named string out of another string' - self._test_expression('{name}', '^(?P.+?)$') - self._test_expression('{name} {other}', - '^(?P.+?) (?P.+?)$') - - def test_named_typed(self): - 'pull a named string out of another string' - self._test_expression('{name:w}', '^(?P\w+)$') - self._test_expression('{name:w} {other:w}', - '^(?P\w+) (?P\w+)$') - - def test_beaker(self): - 'skip some trailing whitespace' - self._test_expression('{:<}', '^(.+?) *$') - - def test_left_fill(self): - 'skip some trailing periods' - self._test_expression('{:.<}', '^(.+?)\.*$') - - def test_bird(self): - 'skip some trailing whitespace' - self._test_expression('{:>}', '^ *(.+?)$') - - def test_center(self): - 'skip some surrounding whitespace' - self._test_expression('{:^}', '^ *(.+?) *$') - - def test_format(self): - def _(fmt, matches): - d = extract_format(fmt, {'spam':'spam'}) - for k in matches: - self.assertEqual(d.get(k), matches[k], - 'm["%s"]=%r, expect %r' % (k, d.get(k), matches[k])) - - for t in '%obxegfdDwWsS': - _(t, dict(type=t)) - _('10'+t, dict(type=t, width='10')) - _('05d', dict(type='d', width='5', zero=True)) - _('<', dict(align='<')) - _('.<', dict(align='<', fill='.')) - _('>', dict(align='>')) - _('.>', dict(align='>', fill='.')) - _('^', dict(align='^')) - _('.^', dict(align='^', fill='.')) - _('x=d', dict(type='d', align='=', fill='x')) - _('d', dict(type='d')) - _('ti', dict(type='ti')) - _('spam', dict(type='spam')) - - _('.^010d', dict(type='d', width='10', align='^', fill='.', - zero=True)) - - -class TestParse(unittest.TestCase): - def test_no_match(self): - 'string does not match format' - self.assertEqual(parse('{{hello}}', 'hello'), None) - - def test_nothing(self): - 'do no actual parsing' - r = parse('{{hello}}', '{hello}') - self.assertEqual(r.fixed, ()) - self.assertEqual(r.named, {}) - - def test_regular_expression(self): - 'match an actual regular expression' - s = r'^(hello\s[wW]{}!+.*)$' - e = 
s.replace('{}', 'orld') - r = parse(s, e) - self.assertEqual(r.fixed, ('orld',)) - e = s.replace('{}', '.*?') - r = parse(s, e) - self.assertEqual(r.fixed, ('.*?',)) - - def test_fixed(self): - 'pull a fixed value out of string' - r = parse('hello {}', 'hello world') - self.assertEqual(r.fixed, ('world', )) - - def test_left(self): - 'pull left-aligned text out of string' - r = parse('{:<} world', 'hello world') - self.assertEqual(r.fixed, ('hello', )) - - def test_right(self): - 'pull right-aligned text out of string' - r = parse('hello {:>}', 'hello world') - self.assertEqual(r.fixed, ('world', )) - - def test_center(self): - 'pull center-aligned text out of string' - r = parse('hello {:^} world', 'hello there world') - self.assertEqual(r.fixed, ('there', )) - - def test_typed(self): - 'pull a named, typed values out of string' - r = parse('hello {:d} {:w}', 'hello 12 people') - self.assertEqual(r.fixed, (12, 'people')) - r = parse('hello {:w} {:w}', 'hello 12 people') - self.assertEqual(r.fixed, ('12', 'people')) - - def test_custom_type(self): - 'use a custom type' - r = parse('{:shouty} {:spam}', 'hello world', - dict(shouty=lambda s:s.upper(), spam=lambda s:''.join(reversed(s)))) - self.assertEqual(r.fixed, ('HELLO', 'dlrow')) - r = parse('{:d}', '12', dict(d=lambda s: int(s) * 2)) - self.assertEqual(r.fixed, (24,)) - r = parse('{:d}', '12') - self.assertEqual(r.fixed, (12,)) - - def test_typed_fail(self): - 'pull a named, typed values out of string' - self.assertEqual(parse('hello {:d} {:w}', 'hello people 12'), None) - - def test_named(self): - 'pull a named value out of string' - r = parse('hello {name}', 'hello world') - self.assertEqual(r.named, {'name': 'world'}) - - def test_mixed(self): - 'pull a fixed and named values out of string' - r = parse('hello {} {name} {} {spam}', 'hello world and other beings') - self.assertEqual(r.fixed, ('world', 'other')) - self.assertEqual(r.named, dict(name='and', spam='beings')) - - def test_named_typed(self): - 'pull 
a named, typed values out of string' - r = parse('hello {number:d} {things}', 'hello 12 people') - self.assertEqual(r.named, dict(number=12, things='people')) - r = parse('hello {number:w} {things}', 'hello 12 people') - self.assertEqual(r.named, dict(number='12', things='people')) - - def test_named_aligned_typed(self): - 'pull a named, typed values out of string' - r = parse('hello {number:d} {things}', 'hello 12 people') - self.assertEqual(r.named, dict(number=12, things='people')) - r = parse('hello {number:^d} {things}', 'hello 12 people') - self.assertEqual(r.named, dict(number=12, things='people')) - - def test_multiline(self): - r = parse('hello\n{}\nworld', 'hello\nthere\nworld') - self.assertEqual(r.fixed[0], 'there') - - def test_spans(self): - 'test the string sections our fields come from' - string = 'hello world' - r = parse('hello {}', string) - self.assertEqual(r.spans, {0: (6,11)}) - start, end = r.spans[0] - self.assertEqual(string[start:end], r.fixed[0]) - - string = 'hello world' - r = parse('hello {:>}', string) - self.assertEqual(r.spans, {0: (10,15)}) - start, end = r.spans[0] - self.assertEqual(string[start:end], r.fixed[0]) - - string = 'hello 0x12 world' - r = parse('hello {val:x} world', string) - self.assertEqual(r.spans, {'val': (6,10)}) - start, end = r.spans['val'] - self.assertEqual(string[start:end], '0x%x' % r.named['val']) - - string = 'hello world and other beings' - r = parse('hello {} {name} {} {spam}', string) - self.assertEqual(r.spans, {0: (6, 11), 'name': (12, 15), - 1: (16, 21), 'spam': (22, 28)}) - - def test_numbers(self): - 'pull a numbers out of a string' - def y(fmt, s, e, str_equals=False): - p = compile(fmt) - r = p.parse(s) - if r is None: - self.fail('%r (%r) did not match %r' % (fmt, p._expression, s)) - r = r.fixed[0] - if str_equals: - self.assertEqual(str(r), str(e), - '%r found %r in %r, not %r' % (fmt, r, s, e)) - else: - self.assertEqual(r, e, - '%r found %r in %r, not %r' % (fmt, r, s, e)) - def n(fmt, s, 
e): - if parse(fmt, s) is not None: - self.fail('%r matched %r' % (fmt, s)) - y('a {:d} b', 'a 12 b', 12) - y('a {:5d} b', 'a 12 b', 12) - y('a {:5d} b', 'a -12 b', -12) - y('a {:d} b', 'a -12 b', -12) - y('a {:d} b', 'a +12 b', 12) - y('a {:d} b', 'a 12 b', 12) - y('a {:d} b', 'a 0b1000 b', 8) - y('a {:d} b', 'a 0o1000 b', 512) - y('a {:d} b', 'a 0x1000 b', 4096) - y('a {:d} b', 'a 0xabcdef b', 0xabcdef) - - y('a {:%} b', 'a 100% b', 1) - y('a {:%} b', 'a 50% b', .5) - y('a {:%} b', 'a 50.1% b', .501) - - y('a {:n} b', 'a 100 b', 100) - y('a {:n} b', 'a 1,000 b', 1000) - y('a {:n} b', 'a 1.000 b', 1000) - y('a {:n} b', 'a -1,000 b', -1000) - y('a {:n} b', 'a 10,000 b', 10000) - y('a {:n} b', 'a 100,000 b', 100000) - n('a {:n} b', 'a 100,00 b', None) - y('a {:n} b', 'a 100.000 b', 100000) - y('a {:n} b', 'a 1.000.000 b', 1000000) - - y('a {:f} b', 'a 12.0 b', 12.0) - y('a {:f} b', 'a -12.1 b', -12.1) - y('a {:f} b', 'a +12.1 b', 12.1) - n('a {:f} b', 'a 12 b', None) - - y('a {:e} b', 'a 1.0e10 b', 1.0e10) - y('a {:e} b', 'a 1.0E10 b', 1.0e10) - y('a {:e} b', 'a 1.10000e10 b', 1.1e10) - y('a {:e} b', 'a 1.0e-10 b', 1.0e-10) - y('a {:e} b', 'a 1.0e+10 b', 1.0e10) - # can't actually test this one on values 'cos nan != nan - y('a {:e} b', 'a nan b', float('nan'), str_equals=True) - y('a {:e} b', 'a NAN b', float('nan'), str_equals=True) - y('a {:e} b', 'a inf b', float('inf')) - y('a {:e} b', 'a +inf b', float('inf')) - y('a {:e} b', 'a -inf b', float('-inf')) - y('a {:e} b', 'a INF b', float('inf')) - y('a {:e} b', 'a +INF b', float('inf')) - y('a {:e} b', 'a -INF b', float('-inf')) - - y('a {:g} b', 'a 1 b', 1) - y('a {:g} b', 'a 1e10 b', 1e10) - y('a {:g} b', 'a 1.0e10 b', 1.0e10) - y('a {:g} b', 'a 1.0E10 b', 1.0e10) - - y('a {:b} b', 'a 1000 b', 8) - y('a {:b} b', 'a 0b1000 b', 8) - y('a {:o} b', 'a 12345670 b', int('12345670', 8)) - y('a {:o} b', 'a 0o12345670 b', int('12345670', 8)) - y('a {:x} b', 'a 1234567890abcdef b', 0x1234567890abcdef) - y('a {:x} b', 'a 
1234567890ABCDEF b', 0x1234567890ABCDEF) - y('a {:x} b', 'a 0x1234567890abcdef b', 0x1234567890abcdef) - y('a {:x} b', 'a 0x1234567890ABCDEF b', 0x1234567890ABCDEF) - - y('a {:05d} b', 'a 00001 b', 1) - y('a {:05d} b', 'a -00001 b', -1) - y('a {:05d} b', 'a +00001 b', 1) - - y('a {:=d} b', 'a 000012 b', 12) - y('a {:x=5d} b', 'a xxx12 b', 12) - y('a {:x=5d} b', 'a -xxx12 b', -12) - - def test_datetimes(self): - def y(fmt, s, e, tz=None): - p = compile(fmt) - r = p.parse(s) - if r is None: - self.fail('%r (%r) did not match %r' % (fmt, p._expression, s)) - r = r.fixed[0] - self.assertEqual(r, e, - '%r found %r in %r, not %r' % (fmt, r, s, e)) - if tz is not None: - self.assertEqual(r.tzinfo, tz, - '%r found TZ %r in %r, not %r' % (fmt, r.tzinfo, s, e)) - def n(fmt, s, e): - if parse(fmt, s) is not None: - self.fail('%r matched %r' % (fmt, s)) - - utc = FixedTzOffset(0, 'UTC') - aest = FixedTzOffset(10*60, '+1000') - - # ISO 8660 variants - # YYYY-MM-DD (eg 1997-07-16) - y('a {:ti} b', 'a 1997-07-16 b', datetime(1997, 7, 16)) - - # YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00) - y('a {:ti} b', 'a 1997-07-16T19:20 b', datetime(1997, 7, 16, 19, 20, 0)) - y('a {:ti} b', 'a 1997-07-16T19:20Z b', - datetime(1997, 7, 16, 19, 20, tzinfo=utc)) - y('a {:ti} b', 'a 1997-07-16T19:20+01:00 b', - datetime(1997, 7, 16, 19, 20, tzinfo=FixedTzOffset(60, '+01:00'))) - - # YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00) - y('a {:ti} b', 'a 1997-07-16T19:20:30 b', datetime(1997, 7, 16, 19, 20, 30)) - y('a {:ti} b', 'a 1997-07-16T19:20:30Z b', - datetime(1997, 7, 16, 19, 20, 30, tzinfo=utc)) - y('a {:ti} b', 'a 1997-07-16T19:20:30+01:00 b', - datetime(1997, 7, 16, 19, 20, 30, tzinfo= FixedTzOffset(60, '+01:00'))) - - # YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00) - y('a {:ti} b', 'a 1997-07-16T19:20:30.500000 b', datetime(1997, 7, 16, 19, 20, 30, 500000)) - y('a {:ti} b', 'a 1997-07-16T19:20:30.5Z b', - datetime(1997, 7, 16, 19, 20, 30, 500000, tzinfo=utc)) - y('a 
{:ti} b', 'a 1997-07-16T19:20:30.5+01:00 b', - datetime(1997, 7, 16, 19, 20, 30, 500000, tzinfo=FixedTzOffset(60, '+01:00'))) - - aest_d = datetime(2011, 11, 21, 10, 21, 36, tzinfo=aest) - dt = datetime(2011, 11, 21, 10, 21, 36) - dt00 = datetime(2011, 11, 21, 10, 21) - d = datetime(2011, 11, 21) - - # te RFC2822 e-mail format datetime - y('a {:te} b', 'a Mon, 21 Nov 2011 10:21:36 +1000 b', aest_d) - y('a {:te} b', 'a 21 Nov 2011 10:21:36 +1000 b', aest_d) - - # tg global (day/month) format datetime - y('a {:tg} b', 'a 21/11/2011 10:21:36 AM +1000 b', aest_d) - y('a {:tg} b', 'a 21-11-2011 10:21:36 AM +1000 b', aest_d) - y('a {:tg} b', 'a 21/11/2011 10:21:36 +1000 b', aest_d) - y('a {:tg} b', 'a 21/11/2011 10:21:36 b', dt) - y('a {:tg} b', 'a 21/11/2011 10:21 b', dt00) - y('a {:tg} b', 'a 21-11-2011 b', d) - y('a {:tg} b', 'a 21-Nov-2011 10:21:36 AM +1000 b', aest_d) - y('a {:tg} b', 'a 21-November-2011 10:21:36 AM +1000 b', aest_d) - - # ta US (month/day) format datetime - y('a {:ta} b', 'a 11/21/2011 10:21:36 AM +1000 b', aest_d) - y('a {:ta} b', 'a 11-21-2011 10:21:36 AM +1000 b', aest_d) - y('a {:ta} b', 'a 11/21/2011 10:21:36 +1000 b', aest_d) - y('a {:ta} b', 'a 11/21/2011 10:21:36 b', dt) - y('a {:ta} b', 'a 11/21/2011 10:21 b', dt00) - y('a {:ta} b', 'a 11-21-2011 b', d) - y('a {:ta} b', 'a Nov-21-2011 10:21:36 AM +1000 b', aest_d) - y('a {:ta} b', 'a November-21-2011 10:21:36 AM +1000 b', aest_d) - y('a {:ta} b', 'a November-21-2011 b', d) - - # th HTTP log format date/time datetime - y('a {:th} b', 'a 21/Nov/2011:10:21:36 +1000 b', aest_d) - - d = datetime(2011, 11, 21, 10, 21, 36) - - # tc ctime() format datetime - y('a {:tc} b', 'a Mon Nov 21 10:21:36 2011 b', d) - - t530 = FixedTzOffset(-5*60 - 30, '-5:30') - - # tt Time time - y('a {:tt} b', 'a 10:21:36 AM +1000 b', time(10, 21, 36, tzinfo=aest)) - y('a {:tt} b', 'a 10:21:36 AM b', time(10, 21, 36)) - y('a {:tt} b', 'a 10:21:36 PM b', time(22, 21, 36)) - y('a {:tt} b', 'a 10:21:36 b', time(10, 21, 
36)) - y('a {:tt} b', 'a 10:21 b', time(10, 21)) - y('a {:tt} b', 'a 10:21:36 PM -5:30 b', time(22, 21, 36, tzinfo=t530)) - - def test_datetime_group_count(self): - 'test we increment the group count correctly for datetimes' - r = parse('{:ti} {}', '1972-01-01 spam') - self.assertEqual(r.fixed[1], 'spam') - r = parse('{:tg} {}', '1-1-1972 spam') - self.assertEqual(r.fixed[1], 'spam') - r = parse('{:ta} {}', '1-1-1972 spam') - self.assertEqual(r.fixed[1], 'spam') - r = parse('{:th} {}', '21/Nov/2011:10:21:36 +1000 spam') - self.assertEqual(r.fixed[1], 'spam') - r = parse('{:te} {}', '21 Nov 2011 10:21:36 +1000 spam') - self.assertEqual(r.fixed[1], 'spam') - r = parse('{:tc} {}', 'Mon Nov 21 10:21:36 2011 spam') - self.assertEqual(r.fixed[1], 'spam') - r = parse('{:tt} {}', '10:21 spam') - self.assertEqual(r.fixed[1], 'spam') - - def test_mixed_types(self): - 'stress-test: pull one of everything out of a string' - r = parse(''' - letters: {:w} - non-letters: {:W} - whitespace: "{:s}" - non-whitespace: \t{:S}\n - digits: {:d} {:d} {:d} - non-digits: {:D} - numbers with thousands: {:n} {:n} - fixed-point: {:f} {:f} - floating-point: {:e} {:e} - general numbers: {:g} {:g} {:g} {:g} - binary: {:b} {:b} - octal: {:o} {:o} - hex: {:x} {:x} - ISO 8601 e.g. {:ti} - RFC2822 e.g. {:te} - Global e.g. {:tg} - US e.g. {:ta} - ctime() e.g. {:tc} - HTTP e.g. {:th} - time: {:tt} - final value: {} - ''', - ''' - letters: abcdef_GHIJLK - non-letters: !@#%$ *^% - whitespace: " \t\n" - non-whitespace: \tabc\n - digits: 12345 0b1011011 0xabcdef - non-digits: abcdef - numbers with thousands: 1,000 1.000.000 - fixed-point: 100.2345 0.00001 - floating-point: 1.1e-10 NAN - general numbers: 1 1.1 1.1e10 nan - binary: 0b1000 0B1000 - octal: 0o1000 0O1000 - hex: 0x1000 0X1000 - ISO 8601 e.g. 1972-01-20T10:21:36Z - RFC2822 e.g. Mon, 20 Jan 1972 10:21:36 +1000 - Global e.g. 20/1/1972 10:21:36 AM +1:00 - US e.g. 1/20/1972 10:21:36 PM +10:30 - ctime() e.g. Sun Sep 16 01:03:52 1973 - HTTP e.g. 
21/Nov/2011:00:07:11 +0000 - time: 10:21:36 PM -5:30 - final value: spam - ''') - self.assertEqual(r.fixed[31], 'spam') - - def test_too_many_fields(self): - self.assertRaises(TooManyFields, compile, '{:ti}' * 15) - - -if __name__ == '__main__': - unittest.main() - - # Copyright (c) 2011 eKit.com Inc (http://www.ekit.com/) # # Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/test_parse.py b/test_parse.py new file mode 100644 index 0000000..c1d9665 --- /dev/null +++ b/test_parse.py @@ -0,0 +1,493 @@ +'''Test suite for parse.py + +This code is copyright 2011 eKit.com Inc (http://www.ekit.com/) +See the end of the source file for the license of use. +''' + +import unittest +from datetime import datetime, time + +import parse + +class TestPattern(unittest.TestCase): + def _test_expression(self, format, expression): + self.assertEqual(parse.Parser(format)._expression, expression) + + def test_braces(self): + 'pull a simple string out of another string' + self._test_expression('{{ }}', '\{ \}') + + def test_fixed(self): + 'pull a simple string out of another string' + self._test_expression('{}', '(.+?)') + self._test_expression('{} {}', '(.+?) (.+?)') + + def test_named(self): + 'pull a named string out of another string' + self._test_expression('{name}', '(?P.+?)') + self._test_expression('{name} {other}', + '(?P.+?) (?P.+?)') + + def test_named_typed(self): + 'pull a named string out of another string' + self._test_expression('{name:w}', '(?P\w+)') + self._test_expression('{name:w} {other:w}', + '(?P\w+) (?P\w+)') + + def test_beaker(self): + 'skip some trailing whitespace' + self._test_expression('{:<}', '(.+?) *') + + def test_left_fill(self): + 'skip some trailing periods' + self._test_expression('{:.<}', '(.+?)\.*') + + def test_bird(self): + 'skip some trailing whitespace' + self._test_expression('{:>}', ' *(.+?)') + + def test_center(self): + 'skip some surrounding whitespace' + self._test_expression('{:^}', ' *(.+?) 
*') + + def test_format(self): + def _(fmt, matches): + d = parse.extract_format(fmt, {'spam':'spam'}) + for k in matches: + self.assertEqual(d.get(k), matches[k], + 'm["%s"]=%r, expect %r' % (k, d.get(k), matches[k])) + + for t in '%obxegfdDwWsS': + _(t, dict(type=t)) + _('10'+t, dict(type=t, width='10')) + _('05d', dict(type='d', width='5', zero=True)) + _('<', dict(align='<')) + _('.<', dict(align='<', fill='.')) + _('>', dict(align='>')) + _('.>', dict(align='>', fill='.')) + _('^', dict(align='^')) + _('.^', dict(align='^', fill='.')) + _('x=d', dict(type='d', align='=', fill='x')) + _('d', dict(type='d')) + _('ti', dict(type='ti')) + _('spam', dict(type='spam')) + + _('.^010d', dict(type='d', width='10', align='^', fill='.', + zero=True)) + + +class TestParse(unittest.TestCase): + def test_no_match(self): + 'string does not match format' + self.assertEqual(parse.parse('{{hello}}', 'hello'), None) + + def test_nothing(self): + 'do no actual parsing' + r = parse.parse('{{hello}}', '{hello}') + self.assertEqual(r.fixed, ()) + self.assertEqual(r.named, {}) + + def test_regular_expression(self): + 'match an actual regular expression' + s = r'^(hello\s[wW]{}!+.*)$' + e = s.replace('{}', 'orld') + r = parse.parse(s, e) + self.assertEqual(r.fixed, ('orld',)) + e = s.replace('{}', '.*?') + r = parse.parse(s, e) + self.assertEqual(r.fixed, ('.*?',)) + + def test_fixed(self): + 'pull a fixed value out of string' + r = parse.parse('hello {}', 'hello world') + self.assertEqual(r.fixed, ('world', )) + + def test_left(self): + 'pull left-aligned text out of string' + r = parse.parse('{:<} world', 'hello world') + self.assertEqual(r.fixed, ('hello', )) + + def test_right(self): + 'pull right-aligned text out of string' + r = parse.parse('hello {:>}', 'hello world') + self.assertEqual(r.fixed, ('world', )) + + def test_center(self): + 'pull center-aligned text out of string' + r = parse.parse('hello {:^} world', 'hello there world') + self.assertEqual(r.fixed, ('there', )) + 
+ def test_typed(self): + 'pull a named, typed values out of string' + r = parse.parse('hello {:d} {:w}', 'hello 12 people') + self.assertEqual(r.fixed, (12, 'people')) + r = parse.parse('hello {:w} {:w}', 'hello 12 people') + self.assertEqual(r.fixed, ('12', 'people')) + + def test_custom_type(self): + 'use a custom type' + r = parse.parse('{:shouty} {:spam}', 'hello world', + dict(shouty=lambda s:s.upper(), spam=lambda s:''.join(reversed(s)))) + self.assertEqual(r.fixed, ('HELLO', 'dlrow')) + r = parse.parse('{:d}', '12', dict(d=lambda s: int(s) * 2)) + self.assertEqual(r.fixed, (24,)) + r = parse.parse('{:d}', '12') + self.assertEqual(r.fixed, (12,)) + + def test_typed_fail(self): + 'pull a named, typed values out of string' + self.assertEqual(parse.parse('hello {:d} {:w}', 'hello people 12'), None) + + def test_named(self): + 'pull a named value out of string' + r = parse.parse('hello {name}', 'hello world') + self.assertEqual(r.named, {'name': 'world'}) + + def test_mixed(self): + 'pull a fixed and named values out of string' + r = parse.parse('hello {} {name} {} {spam}', 'hello world and other beings') + self.assertEqual(r.fixed, ('world', 'other')) + self.assertEqual(r.named, dict(name='and', spam='beings')) + + def test_named_typed(self): + 'pull a named, typed values out of string' + r = parse.parse('hello {number:d} {things}', 'hello 12 people') + self.assertEqual(r.named, dict(number=12, things='people')) + r = parse.parse('hello {number:w} {things}', 'hello 12 people') + self.assertEqual(r.named, dict(number='12', things='people')) + + def test_named_aligned_typed(self): + 'pull a named, typed values out of string' + r = parse.parse('hello {number:d} {things}', 'hello 12 people') + self.assertEqual(r.named, dict(number=12, things='people')) + r = parse.parse('hello {number:^d} {things}', 'hello 12 people') + self.assertEqual(r.named, dict(number=12, things='people')) + + def test_multiline(self): + r = parse.parse('hello\n{}\nworld', 
'hello\nthere\nworld') + self.assertEqual(r.fixed[0], 'there') + + def test_spans(self): + 'test the string sections our fields come from' + string = 'hello world' + r = parse.parse('hello {}', string) + self.assertEqual(r.spans, {0: (6,11)}) + start, end = r.spans[0] + self.assertEqual(string[start:end], r.fixed[0]) + + string = 'hello world' + r = parse.parse('hello {:>}', string) + self.assertEqual(r.spans, {0: (10,15)}) + start, end = r.spans[0] + self.assertEqual(string[start:end], r.fixed[0]) + + string = 'hello 0x12 world' + r = parse.parse('hello {val:x} world', string) + self.assertEqual(r.spans, {'val': (6,10)}) + start, end = r.spans['val'] + self.assertEqual(string[start:end], '0x%x' % r.named['val']) + + string = 'hello world and other beings' + r = parse.parse('hello {} {name} {} {spam}', string) + self.assertEqual(r.spans, {0: (6, 11), 'name': (12, 15), + 1: (16, 21), 'spam': (22, 28)}) + + def test_numbers(self): + 'pull a numbers out of a string' + def y(fmt, s, e, str_equals=False): + p = parse.compile(fmt) + r = p.parse(s) + if r is None: + self.fail('%r (%r) did not match %r' % (fmt, p._expression, s)) + r = r.fixed[0] + if str_equals: + self.assertEqual(str(r), str(e), + '%r found %r in %r, not %r' % (fmt, r, s, e)) + else: + self.assertEqual(r, e, + '%r found %r in %r, not %r' % (fmt, r, s, e)) + def n(fmt, s, e): + if parse.parse(fmt, s) is not None: + self.fail('%r matched %r' % (fmt, s)) + y('a {:d} b', 'a 12 b', 12) + y('a {:5d} b', 'a 12 b', 12) + y('a {:5d} b', 'a -12 b', -12) + y('a {:d} b', 'a -12 b', -12) + y('a {:d} b', 'a +12 b', 12) + y('a {:d} b', 'a 12 b', 12) + y('a {:d} b', 'a 0b1000 b', 8) + y('a {:d} b', 'a 0o1000 b', 512) + y('a {:d} b', 'a 0x1000 b', 4096) + y('a {:d} b', 'a 0xabcdef b', 0xabcdef) + + y('a {:%} b', 'a 100% b', 1) + y('a {:%} b', 'a 50% b', .5) + y('a {:%} b', 'a 50.1% b', .501) + + y('a {:n} b', 'a 100 b', 100) + y('a {:n} b', 'a 1,000 b', 1000) + y('a {:n} b', 'a 1.000 b', 1000) + y('a {:n} b', 'a -1,000 
b', -1000) + y('a {:n} b', 'a 10,000 b', 10000) + y('a {:n} b', 'a 100,000 b', 100000) + n('a {:n} b', 'a 100,00 b', None) + y('a {:n} b', 'a 100.000 b', 100000) + y('a {:n} b', 'a 1.000.000 b', 1000000) + + y('a {:f} b', 'a 12.0 b', 12.0) + y('a {:f} b', 'a -12.1 b', -12.1) + y('a {:f} b', 'a +12.1 b', 12.1) + n('a {:f} b', 'a 12 b', None) + + y('a {:e} b', 'a 1.0e10 b', 1.0e10) + y('a {:e} b', 'a 1.0E10 b', 1.0e10) + y('a {:e} b', 'a 1.10000e10 b', 1.1e10) + y('a {:e} b', 'a 1.0e-10 b', 1.0e-10) + y('a {:e} b', 'a 1.0e+10 b', 1.0e10) + # can't actually test this one on values 'cos nan != nan + y('a {:e} b', 'a nan b', float('nan'), str_equals=True) + y('a {:e} b', 'a NAN b', float('nan'), str_equals=True) + y('a {:e} b', 'a inf b', float('inf')) + y('a {:e} b', 'a +inf b', float('inf')) + y('a {:e} b', 'a -inf b', float('-inf')) + y('a {:e} b', 'a INF b', float('inf')) + y('a {:e} b', 'a +INF b', float('inf')) + y('a {:e} b', 'a -INF b', float('-inf')) + + y('a {:g} b', 'a 1 b', 1) + y('a {:g} b', 'a 1e10 b', 1e10) + y('a {:g} b', 'a 1.0e10 b', 1.0e10) + y('a {:g} b', 'a 1.0E10 b', 1.0e10) + + y('a {:b} b', 'a 1000 b', 8) + y('a {:b} b', 'a 0b1000 b', 8) + y('a {:o} b', 'a 12345670 b', int('12345670', 8)) + y('a {:o} b', 'a 0o12345670 b', int('12345670', 8)) + y('a {:x} b', 'a 1234567890abcdef b', 0x1234567890abcdef) + y('a {:x} b', 'a 1234567890ABCDEF b', 0x1234567890ABCDEF) + y('a {:x} b', 'a 0x1234567890abcdef b', 0x1234567890abcdef) + y('a {:x} b', 'a 0x1234567890ABCDEF b', 0x1234567890ABCDEF) + + y('a {:05d} b', 'a 00001 b', 1) + y('a {:05d} b', 'a -00001 b', -1) + y('a {:05d} b', 'a +00001 b', 1) + + y('a {:=d} b', 'a 000012 b', 12) + y('a {:x=5d} b', 'a xxx12 b', 12) + y('a {:x=5d} b', 'a -xxx12 b', -12) + + def test_datetimes(self): + def y(fmt, s, e, tz=None): + p = parse.compile(fmt) + r = p.parse(s) + if r is None: + self.fail('%r (%r) did not match %r' % (fmt, p._expression, s)) + r = r.fixed[0] + self.assertEqual(r, e, + '%r found %r in %r, not %r' % 
(fmt, r, s, e)) + if tz is not None: + self.assertEqual(r.tzinfo, tz, + '%r found TZ %r in %r, not %r' % (fmt, r.tzinfo, s, e)) + def n(fmt, s, e): + if parse.parse(fmt, s) is not None: + self.fail('%r matched %r' % (fmt, s)) + + utc = parse.FixedTzOffset(0, 'UTC') + aest = parse.FixedTzOffset(10*60, '+1000') + tz60 = parse.FixedTzOffset(60, '+01:00') + + # ISO 8660 variants + # YYYY-MM-DD (eg 1997-07-16) + y('a {:ti} b', 'a 1997-07-16 b', datetime(1997, 7, 16)) + + # YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00) + y('a {:ti} b', 'a 1997-07-16T19:20 b', datetime(1997, 7, 16, 19, 20, 0)) + y('a {:ti} b', 'a 1997-07-16T19:20Z b', + datetime(1997, 7, 16, 19, 20, tzinfo=utc)) + y('a {:ti} b', 'a 1997-07-16T19:20+01:00 b', + datetime(1997, 7, 16, 19, 20, tzinfo=tz60)) + + # YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00) + y('a {:ti} b', 'a 1997-07-16T19:20:30 b', datetime(1997, 7, 16, 19, 20, 30)) + y('a {:ti} b', 'a 1997-07-16T19:20:30Z b', + datetime(1997, 7, 16, 19, 20, 30, tzinfo=utc)) + y('a {:ti} b', 'a 1997-07-16T19:20:30+01:00 b', + datetime(1997, 7, 16, 19, 20, 30, tzinfo=tz60)) + + # YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00) + y('a {:ti} b', 'a 1997-07-16T19:20:30.500000 b', datetime(1997, 7, 16, 19, 20, 30, 500000)) + y('a {:ti} b', 'a 1997-07-16T19:20:30.5Z b', + datetime(1997, 7, 16, 19, 20, 30, 500000, tzinfo=utc)) + y('a {:ti} b', 'a 1997-07-16T19:20:30.5+01:00 b', + datetime(1997, 7, 16, 19, 20, 30, 500000, tzinfo=tz60)) + + aest_d = datetime(2011, 11, 21, 10, 21, 36, tzinfo=aest) + dt = datetime(2011, 11, 21, 10, 21, 36) + dt00 = datetime(2011, 11, 21, 10, 21) + d = datetime(2011, 11, 21) + + # te RFC2822 e-mail format datetime + y('a {:te} b', 'a Mon, 21 Nov 2011 10:21:36 +1000 b', aest_d) + y('a {:te} b', 'a 21 Nov 2011 10:21:36 +1000 b', aest_d) + + # tg global (day/month) format datetime + y('a {:tg} b', 'a 21/11/2011 10:21:36 AM +1000 b', aest_d) + y('a {:tg} b', 'a 21-11-2011 10:21:36 AM +1000 b', aest_d) + y('a {:tg} b', 
'a 21/11/2011 10:21:36 +1000 b', aest_d) + y('a {:tg} b', 'a 21/11/2011 10:21:36 b', dt) + y('a {:tg} b', 'a 21/11/2011 10:21 b', dt00) + y('a {:tg} b', 'a 21-11-2011 b', d) + y('a {:tg} b', 'a 21-Nov-2011 10:21:36 AM +1000 b', aest_d) + y('a {:tg} b', 'a 21-November-2011 10:21:36 AM +1000 b', aest_d) + + # ta US (month/day) format datetime + y('a {:ta} b', 'a 11/21/2011 10:21:36 AM +1000 b', aest_d) + y('a {:ta} b', 'a 11-21-2011 10:21:36 AM +1000 b', aest_d) + y('a {:ta} b', 'a 11/21/2011 10:21:36 +1000 b', aest_d) + y('a {:ta} b', 'a 11/21/2011 10:21:36 b', dt) + y('a {:ta} b', 'a 11/21/2011 10:21 b', dt00) + y('a {:ta} b', 'a 11-21-2011 b', d) + y('a {:ta} b', 'a Nov-21-2011 10:21:36 AM +1000 b', aest_d) + y('a {:ta} b', 'a November-21-2011 10:21:36 AM +1000 b', aest_d) + y('a {:ta} b', 'a November-21-2011 b', d) + + # th HTTP log format date/time datetime + y('a {:th} b', 'a 21/Nov/2011:10:21:36 +1000 b', aest_d) + + d = datetime(2011, 11, 21, 10, 21, 36) + + # tc ctime() format datetime + y('a {:tc} b', 'a Mon Nov 21 10:21:36 2011 b', d) + + t530 = parse.FixedTzOffset(-5*60 - 30, '-5:30') + + # tt Time time + y('a {:tt} b', 'a 10:21:36 AM +1000 b', time(10, 21, 36, tzinfo=aest)) + y('a {:tt} b', 'a 10:21:36 AM b', time(10, 21, 36)) + y('a {:tt} b', 'a 10:21:36 PM b', time(22, 21, 36)) + y('a {:tt} b', 'a 10:21:36 b', time(10, 21, 36)) + y('a {:tt} b', 'a 10:21 b', time(10, 21)) + y('a {:tt} b', 'a 10:21:36 PM -5:30 b', time(22, 21, 36, tzinfo=t530)) + + def test_datetime_group_count(self): + 'test we increment the group count correctly for datetimes' + r = parse.parse('{:ti} {}', '1972-01-01 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse.parse('{:tg} {}', '1-1-1972 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse.parse('{:ta} {}', '1-1-1972 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse.parse('{:th} {}', '21/Nov/2011:10:21:36 +1000 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse.parse('{:te} {}', '21 Nov 2011 10:21:36 
+1000 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse.parse('{:tc} {}', 'Mon Nov 21 10:21:36 2011 spam') + self.assertEqual(r.fixed[1], 'spam') + r = parse.parse('{:tt} {}', '10:21 spam') + self.assertEqual(r.fixed[1], 'spam') + + def test_mixed_types(self): + 'stress-test: pull one of everything out of a string' + r = parse.parse(''' + letters: {:w} + non-letters: {:W} + whitespace: "{:s}" + non-whitespace: \t{:S}\n + digits: {:d} {:d} {:d} + non-digits: {:D} + numbers with thousands: {:n} {:n} + fixed-point: {:f} {:f} + floating-point: {:e} {:e} + general numbers: {:g} {:g} {:g} {:g} + binary: {:b} {:b} + octal: {:o} {:o} + hex: {:x} {:x} + ISO 8601 e.g. {:ti} + RFC2822 e.g. {:te} + Global e.g. {:tg} + US e.g. {:ta} + ctime() e.g. {:tc} + HTTP e.g. {:th} + time: {:tt} + final value: {} + ''', + ''' + letters: abcdef_GHIJLK + non-letters: !@#%$ *^% + whitespace: " \t\n" + non-whitespace: \tabc\n + digits: 12345 0b1011011 0xabcdef + non-digits: abcdef + numbers with thousands: 1,000 1.000.000 + fixed-point: 100.2345 0.00001 + floating-point: 1.1e-10 NAN + general numbers: 1 1.1 1.1e10 nan + binary: 0b1000 0B1000 + octal: 0o1000 0O1000 + hex: 0x1000 0X1000 + ISO 8601 e.g. 1972-01-20T10:21:36Z + RFC2822 e.g. Mon, 20 Jan 1972 10:21:36 +1000 + Global e.g. 20/1/1972 10:21:36 AM +1:00 + US e.g. 1/20/1972 10:21:36 PM +10:30 + ctime() e.g. Sun Sep 16 01:03:52 1973 + HTTP e.g. 
21/Nov/2011:00:07:11 +0000 + time: 10:21:36 PM -5:30 + final value: spam + ''') + self.assertEqual(r.fixed[31], 'spam') + + def test_too_many_fields(self): + p = parse.compile('{:ti}' * 15) + self.assertRaises(parse.TooManyFields, p.parse, '') + + +class TestSearch(unittest.TestCase): + def test_basic(self): + 'basic search() test' + r = parse.search('a {} c', ' a b c ') + self.assertEqual(r.fixed, ('b',)) + + def test_multiline(self): + 'multiline search() test' + r = parse.search('age: {:d}\n', 'name: Rufus\nage: 42\ncolor: red\n') + self.assertEqual(r.fixed, (42,)) + + def test_pos(self): + 'basic search() test' + r = parse.search('a {} c', ' a b c ', 2) + self.assertEqual(r, None) + +class TestFindall(unittest.TestCase): + def test_findall(self): + 'basic findall() test' + s = ''.join(r.fixed[0] for r in parse.findall(">{}<", "

<p>some <b>bold</b> text</p>

")) + self.assertEqual(s, "some bold text") + + +if __name__ == '__main__': + unittest.main() + + +# Copyright (c) 2011 eKit.com Inc (http://www.ekit.com/) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# vim: set filetype=python ts=4 sw=4 et si tw=75