From 67bbe947f1ae92e8c64496b07f6df11b0d076d29 Mon Sep 17 00:00:00 2001 From: Mark Pilgrim Date: Fri, 25 Sep 2009 22:51:40 -0400 Subject: [PATCH] markup fiddling, fixed SRE_Match repr --- regular-expressions.html | 54 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/regular-expressions.html b/regular-expressions.html index 01ab4fc..549544b 100755 --- a/regular-expressions.html +++ b/regular-expressions.html @@ -96,14 +96,14 @@ body{counter-reset:h1 5} >>> import re >>> pattern = '^M?M?M?$' >>> re.search(pattern, 'M') -<SRE_Match object at 0106FB58> +<_sre.SRE_Match object at 0106FB58> >>> re.search(pattern, 'MM') -<SRE_Match object at 0106C290> +<_sre.SRE_Match object at 0106C290> >>> re.search(pattern, 'MMM') -<SRE_Match object at 0106AA38> +<_sre.SRE_Match object at 0106AA38> >>> re.search(pattern, 'MMMM') >>> re.search(pattern, '') -<SRE_Match object at 0106F4A8> +<_sre.SRE_Match object at 0106F4A8>
  1. This pattern has three parts. ^ matches what follows only at the beginning of the string. If this were not specified, the pattern would match no matter where the M characters were, which is not what you want. You want to make sure that the M characters, if they’re there, are at the beginning of the string. M? optionally matches a single M character. Since this is repeated three times, you’re matching anywhere from zero to three M characters in a row. And $ matches the end of the string. When combined with the ^ character at the beginning, this means that the pattern must match the entire string, with no other characters before or after the M characters.
  2. The essence of the re module is the search() function, that takes a regular expression (pattern) and a string ('M') to try to match against the regular expression. If a match is found, search() returns an object which has various methods to describe the match; if no match is found, search() returns None, the Python null value. All you care about at the moment is whether the pattern matches, which you can tell by just looking at the return value of search(). 'M' matches this regular expression, because the first optional M matches and the second and third optional M characters are ignored. @@ -142,14 +142,14 @@ body{counter-reset:h1 5} >>> import re >>> pattern = '^M?M?M?(CM|CD|D?C?C?C?)$' >>> re.search(pattern, 'MCM') -<SRE_Match object at 01070390> +<_sre.SRE_Match object at 01070390> >>> re.search(pattern, 'MD') -<SRE_Match object at 01073A50> +<_sre.SRE_Match object at 01073A50> >>> re.search(pattern, 'MMMCCC') -<SRE_Match object at 010748A8> +<_sre.SRE_Match object at 010748A8> >>> re.search(pattern, 'MCMC') >>> re.search(pattern, '') -<SRE_Match object at 01071D98> +<_sre.SRE_Match object at 01071D98>
    1. This pattern starts out the same as the previous one, checking for the beginning of the string (^), then the thousands place (M?M?M?). Then it has the new part, in parentheses, which defines a set of three mutually exclusive patterns, separated by vertical bars: CM, CD, and D?C?C?C? (which is an optional D followed by zero to three optional C characters). The regular expression parser checks for each of these patterns in order (from left to right), takes the first one that matches, and ignores the rest.
    2. 'MCM' matches because the first M matches, the second and third M characters are ignored, and the CM matches (so the CD and D?C?C?C? patterns are never even considered). MCM is the Roman numeral representation of 1900. @@ -168,14 +168,14 @@ body{counter-reset:h1 5} >>> import re >>> pattern = '^M?M?M?$' >>> re.search(pattern, 'M') -<_sre.SRE_Match object at 0x008EE090> +<_sre.SRE_Match object at 0x008EE090> >>> pattern = '^M?M?M?$' >>> re.search(pattern, 'MM') -<_sre.SRE_Match object at 0x008EEB48> +<_sre.SRE_Match object at 0x008EEB48> >>> pattern = '^M?M?M?$' ->>> re.search(pattern, 'MMM') +>>> re.search(pattern, 'MMM') <_sre.SRE_Match object at 0x008EE090> ->>> re.search(pattern, 'MMMM') +>>> re.search(pattern, 'MMMM') >>>
      1. This matches the start of the string, and then the first optional M, but not the second and third M (but that’s okay because they’re optional), and then the end of the string. @@ -186,13 +186,13 @@ body{counter-reset:h1 5}
         >>> pattern = '^M{0,3}$'        
         >>> re.search(pattern, 'M')     
        -<_sre.SRE_Match object at 0x008EEB48>
        +<_sre.SRE_Match object at 0x008EEB48>
         >>> re.search(pattern, 'MM')    
        -<_sre.SRE_Match object at 0x008EE090>
        +<_sre.SRE_Match object at 0x008EE090>
         >>> re.search(pattern, 'MMM')   
        -<_sre.SRE_Match object at 0x008EEDA8>
        +<_sre.SRE_Match object at 0x008EEDA8>
         >>> re.search(pattern, 'MMMM')  
        ->>> 
        +>>>
        1. This pattern says: “Match the start of the string, then anywhere from zero to three M characters, then the end of the string.” The 0 and 3 can be any numbers; if you want to match at least one but no more than three M characters, you could say M{1,3}.
        2. This matches the start of the string, then one M out of a possible three, then the end of the string. @@ -205,13 +205,13 @@ body{counter-reset:h1 5}
           >>> pattern = '^M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)$'
           >>> re.search(pattern, 'MCMXL')     
          -<_sre.SRE_Match object at 0x008EEB48>
          +<_sre.SRE_Match object at 0x008EEB48>
           >>> re.search(pattern, 'MCML')      
          -<_sre.SRE_Match object at 0x008EEB48>
          +<_sre.SRE_Match object at 0x008EEB48>
           >>> re.search(pattern, 'MCMLX')     
          -<_sre.SRE_Match object at 0x008EEB48>
          +<_sre.SRE_Match object at 0x008EEB48>
           >>> re.search(pattern, 'MCMLXXX')   
          -<_sre.SRE_Match object at 0x008EEB48>
          +<_sre.SRE_Match object at 0x008EEB48>
           >>> re.search(pattern, 'MCMLXXXX')  
           >>> 
            @@ -229,13 +229,13 @@ body{counter-reset:h1 5}
             >>> pattern = '^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
             >>> re.search(pattern, 'MDLV')              
            -<_sre.SRE_Match object at 0x008EEB48>
            +<_sre.SRE_Match object at 0x008EEB48>
             >>> re.search(pattern, 'MMDCLXVI')          
            -<_sre.SRE_Match object at 0x008EEB48>
            +<_sre.SRE_Match object at 0x008EEB48>
             >>> re.search(pattern, 'MMMDCCCLXXXVIII')   
            -<_sre.SRE_Match object at 0x008EEB48>
            +<_sre.SRE_Match object at 0x008EEB48>
             >>> re.search(pattern, 'I')                 
            -<_sre.SRE_Match object at 0x008EEB48>
            +<_sre.SRE_Match object at 0x008EEB48>
            1. This matches the start of the string, then one of a possible three M characters, then D?C{0,3}. Of that, it matches the optional D and zero of three possible C characters. Moving on, it matches L?X{0,3} by matching the optional L and zero of three possible X characters. Then it matches V?I{0,3} by matching the optional V and zero of three possible I characters, and finally the end of the string. MDLV is the Roman numeral representation of 1555.
            2. This matches the start of the string, then two of a possible three M characters, then the D?C{0,3} with a D and one of three possible C characters; then L?X{0,3} with an L and one of three possible X characters; then V?I{0,3} with a V and one of three possible I characters; then the end of the string. MMDCLXVI is the Roman numeral representation of 2666. @@ -267,11 +267,11 @@ body{counter-reset:h1 5} $ # end of string ''' >>> re.search(pattern, 'M', re.VERBOSE) -<_sre.SRE_Match object at 0x008EEB48> +<_sre.SRE_Match object at 0x008EEB48> >>> re.search(pattern, 'MCMLXXXIX', re.VERBOSE) -<_sre.SRE_Match object at 0x008EEB48> +<_sre.SRE_Match object at 0x008EEB48> >>> re.search(pattern, 'MMMDCCCLXXXVIII', re.VERBOSE) -<_sre.SRE_Match object at 0x008EEB48> +<_sre.SRE_Match object at 0x008EEB48> >>> re.search(pattern, 'M')
              1. The most important thing to remember when using verbose regular expressions is that you need to pass an extra argument when working with them: re.VERBOSE is a constant defined in the re module that signals that the pattern should be treated as a verbose regular expression. As you can see, this pattern has quite a bit of whitespace (all of which is ignored), and several comments (all of which are ignored). Once you ignore the whitespace and the comments, this is exactly the same regular expression as you saw in the previous section, but it’s a lot more readable.