dive-into-python3/case-study-porting-chardet-to-python-3.txt

C:\home\chardet>python c:\Python30\Tools\Scripts\2to3.py -w chardet\
RefactoringTool: Skipping implicit fixer: buffer
RefactoringTool: Skipping implicit fixer: idioms
RefactoringTool: Skipping implicit fixer: set_literal
RefactoringTool: Skipping implicit fixer: ws_comma
--- chardet\__init__.py (original)
+++ chardet\__init__.py (refactored)
@@ -18,7 +18,7 @@
 __version__ = "1.0.1"

 def detect(aBuf):
-    import universaldetector
+    from . import universaldetector
     u = universaldetector.UniversalDetector()
     u.reset()
     u.feed(aBuf)
--- chardet\big5prober.py (original)
+++ chardet\big5prober.py (refactored)
@@ -25,10 +25,10 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import Big5DistributionAnalysis
-from mbcssm import Big5SMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import Big5DistributionAnalysis
+from .mbcssm import Big5SMModel

 class Big5Prober(MultiByteCharSetProber):
     def __init__(self):
--- chardet\chardistribution.py (original)
+++ chardet\chardistribution.py (refactored)
@@ -25,12 +25,12 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
-from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
-from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
-from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
-from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
-from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
+from . import constants
+from .euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
+from .euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
+from .gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
+from .big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
+from .jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO

 ENOUGH_DATA_THRESHOLD = 1024
 SURE_YES = 0.99
--- chardet\charsetgroupprober.py (original)
+++ chardet\charsetgroupprober.py (refactored)
@@ -26,7 +26,7 @@
 ######################### END LICENSE BLOCK #########################

 import constants, sys
-from charsetprober import CharSetProber
+from .charsetprober import CharSetProber

 class CharSetGroupProber(CharSetProber):
     def __init__(self):
--- chardet\codingstatemachine.py (original)
+++ chardet\codingstatemachine.py (refactored)
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from constants import eStart, eError, eItsMe
+from .constants import eStart, eError, eItsMe

 class CodingStateMachine:
     def __init__(self, sm):
--- chardet\constants.py (original)
+++ chardet\constants.py (refactored)
@@ -38,10 +38,10 @@

 SHORTCUT_THRESHOLD = 0.95

-import __builtin__
+import builtins
 if not hasattr(__builtin__, 'False'):
     False = 0
     True = 1
 else:
-    False = __builtin__.False
-    True = __builtin__.True
+    False = builtins.False
+    True = builtins.True
--- chardet\escprober.py (original)
+++ chardet\escprober.py (refactored)
@@ -26,9 +26,9 @@
 ######################### END LICENSE BLOCK #########################

 import constants, sys
-from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
-from charsetprober import CharSetProber
-from codingstatemachine import CodingStateMachine
+from .escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
+from .charsetprober import CharSetProber
+from .codingstatemachine import CodingStateMachine

 class EscCharSetProber(CharSetProber):
     def __init__(self):
--- chardet\escsm.py (original)
+++ chardet\escsm.py (refactored)
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from constants import eStart, eError, eItsMe
+from .constants import eStart, eError, eItsMe

 HZ_cls = ( \
 1,0,0,0,0,0,0,0,  # 00 - 07
--- chardet\eucjpprober.py (original)
+++ chardet\eucjpprober.py (refactored)
@@ -26,12 +26,12 @@
 ######################### END LICENSE BLOCK #########################

 import constants, sys
-from constants import eStart, eError, eItsMe
-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import EUCJPDistributionAnalysis
-from jpcntx import EUCJPContextAnalysis
-from mbcssm import EUCJPSMModel
+from .constants import eStart, eError, eItsMe
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import EUCJPDistributionAnalysis
+from .jpcntx import EUCJPContextAnalysis
+from .mbcssm import EUCJPSMModel

 class EUCJPProber(MultiByteCharSetProber):
     def __init__(self):
--- chardet\euckrprober.py (original)
+++ chardet\euckrprober.py (refactored)
@@ -25,10 +25,10 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import EUCKRDistributionAnalysis
-from mbcssm import EUCKRSMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import EUCKRDistributionAnalysis
+from .mbcssm import EUCKRSMModel

 class EUCKRProber(MultiByteCharSetProber):
     def __init__(self):
--- chardet\euctwprober.py (original)
+++ chardet\euctwprober.py (refactored)
@@ -25,10 +25,10 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import EUCTWDistributionAnalysis
-from mbcssm import EUCTWSMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import EUCTWDistributionAnalysis
+from .mbcssm import EUCTWSMModel

 class EUCTWProber(MultiByteCharSetProber):
     def __init__(self):
--- chardet\gb2312prober.py (original)
+++ chardet\gb2312prober.py (refactored)
@@ -25,10 +25,10 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import GB2312DistributionAnalysis
-from mbcssm import GB2312SMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import GB2312DistributionAnalysis
+from .mbcssm import GB2312SMModel

 class GB2312Prober(MultiByteCharSetProber):
     def __init__(self):
--- chardet\hebrewprober.py (original)
+++ chardet\hebrewprober.py (refactored)
@@ -25,8 +25,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from charsetprober import CharSetProber
-import constants
+from .charsetprober import CharSetProber
+from . import constants

 # This prober doesn't actually recognize a language or a charset.
 # It is a helper prober for the use of the Hebrew model probers
--- chardet\jpcntx.py (original)
+++ chardet\jpcntx.py (refactored)
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+from . import constants

 NUM_OF_CATEGORY = 6
 DONT_KNOW = -1
--- chardet\langbulgarianmodel.py (original)
+++ chardet\langbulgarianmodel.py (refactored)
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+from . import constants

 # 255: Control characters that usually does not exist in any text
 # 254: Carriage/Return
--- chardet\langcyrillicmodel.py (original)
+++ chardet\langcyrillicmodel.py (refactored)
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+from . import constants

 # KOI8-R language model
 # Character Mapping Table:
--- chardet\langgreekmodel.py (original)
+++ chardet\langgreekmodel.py (refactored)
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+from . import constants

 # 255: Control characters that usually does not exist in any text
 # 254: Carriage/Return
--- chardet\langhebrewmodel.py (original)
+++ chardet\langhebrewmodel.py (refactored)
@@ -27,7 +27,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+from . import constants

 # 255: Control characters that usually does not exist in any text
 # 254: Carriage/Return
--- chardet\langhungarianmodel.py (original)
+++ chardet\langhungarianmodel.py (refactored)
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+from . import constants

 # 255: Control characters that usually does not exist in any text
 # 254: Carriage/Return
--- chardet\langthaimodel.py (original)
+++ chardet\langthaimodel.py (refactored)
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+from . import constants

 # 255: Control characters that usually does not exist in any text
 # 254: Carriage/Return
--- chardet\latin1prober.py (original)
+++ chardet\latin1prober.py (refactored)
@@ -26,8 +26,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from charsetprober import CharSetProber
-import constants
+from .charsetprober import CharSetProber
+from . import constants
 import operator

 FREQ_CAT_NUM = 4
--- chardet\mbcharsetprober.py (original)
+++ chardet\mbcharsetprober.py (refactored)
@@ -28,8 +28,8 @@
 ######################### END LICENSE BLOCK #########################

 import constants, sys
-from constants import eStart, eError, eItsMe
-from charsetprober import CharSetProber
+from .constants import eStart, eError, eItsMe
+from .charsetprober import CharSetProber

 class MultiByteCharSetProber(CharSetProber):
     def __init__(self):
--- chardet\mbcsgroupprober.py (original)
+++ chardet\mbcsgroupprober.py (refactored)
@@ -27,14 +27,14 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from charsetgroupprober import CharSetGroupProber
-from utf8prober import UTF8Prober
-from sjisprober import SJISProber
-from eucjpprober import EUCJPProber
-from gb2312prober import GB2312Prober
-from euckrprober import EUCKRProber
-from big5prober import Big5Prober
-from euctwprober import EUCTWProber
+from .charsetgroupprober import CharSetGroupProber
+from .utf8prober import UTF8Prober
+from .sjisprober import SJISProber
+from .eucjpprober import EUCJPProber
+from .gb2312prober import GB2312Prober
+from .euckrprober import EUCKRProber
+from .big5prober import Big5Prober
+from .euctwprober import EUCTWProber

 class MBCSGroupProber(CharSetGroupProber):
     def __init__(self):
--- chardet\mbcssm.py (original)
+++ chardet\mbcssm.py (refactored)
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from constants import eStart, eError, eItsMe
+from .constants import eStart, eError, eItsMe

 # BIG5

--- chardet\sbcharsetprober.py (original)
+++ chardet\sbcharsetprober.py (refactored)
@@ -27,7 +27,7 @@
 ######################### END LICENSE BLOCK #########################

 import constants, sys
-from charsetprober import CharSetProber
+from .charsetprober import CharSetProber

 SAMPLE_SIZE = 64
 SB_ENOUGH_REL_THRESHOLD = 1024
--- chardet\sbcsgroupprober.py (original)
+++ chardet\sbcsgroupprober.py (refactored)
@@ -27,15 +27,15 @@
 ######################### END LICENSE BLOCK #########################

 import constants, sys
-from charsetgroupprober import CharSetGroupProber
-from sbcharsetprober import SingleByteCharSetProber
-from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
-from langgreekmodel import Latin7GreekModel, Win1253GreekModel
-from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
-from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
-from langthaimodel import TIS620ThaiModel
-from langhebrewmodel import Win1255HebrewModel
-from hebrewprober import HebrewProber
+from .charsetgroupprober import CharSetGroupProber
+from .sbcharsetprober import SingleByteCharSetProber
+from .langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
+from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
+from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
+from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
+from .langthaimodel import TIS620ThaiModel
+from .langhebrewmodel import Win1255HebrewModel
+from .hebrewprober import HebrewProber

 class SBCSGroupProber(CharSetGroupProber):
     def __init__(self):
--- chardet\sjisprober.py (original)
+++ chardet\sjisprober.py (refactored)
@@ -25,13 +25,13 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import SJISDistributionAnalysis
-from jpcntx import SJISContextAnalysis
-from mbcssm import SJISSMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import SJISDistributionAnalysis
+from .jpcntx import SJISContextAnalysis
+from .mbcssm import SJISSMModel
 import constants, sys
-from constants import eStart, eError, eItsMe
+from .constants import eStart, eError, eItsMe

 class SJISProber(MultiByteCharSetProber):
     def __init__(self):
--- chardet\universaldetector.py (original)
+++ chardet\universaldetector.py (refactored)
@@ -27,10 +27,10 @@
 ######################### END LICENSE BLOCK #########################

 import constants, sys
-from latin1prober import Latin1Prober # windows-1252
-from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
-from sbcsgroupprober import SBCSGroupProber # single-byte character sets
-from escprober import EscCharSetProber # ISO-2122, etc.
+from .latin1prober import Latin1Prober # windows-1252
+from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
+from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
+from .escprober import EscCharSetProber # ISO-2122, etc.
 import re

 MINIMUM_THRESHOLD = 0.20
--- chardet\utf8prober.py (original)
+++ chardet\utf8prober.py (refactored)
@@ -26,10 +26,10 @@
 ######################### END LICENSE BLOCK #########################

 import constants, sys
-from constants import eStart, eError, eItsMe
-from charsetprober import CharSetProber
-from codingstatemachine import CodingStateMachine
-from mbcssm import UTF8SMModel
+from .constants import eStart, eError, eItsMe
+from .charsetprober import CharSetProber
+from .codingstatemachine import CodingStateMachine
+from .mbcssm import UTF8SMModel

 ONE_CHAR_PROB = 0.5

RefactoringTool: Files that were modified:
RefactoringTool: chardet\__init__.py
RefactoringTool: chardet\big5prober.py
RefactoringTool: chardet\chardistribution.py
RefactoringTool: chardet\charsetgroupprober.py
RefactoringTool: chardet\codingstatemachine.py
RefactoringTool: chardet\constants.py
RefactoringTool: chardet\escprober.py
RefactoringTool: chardet\escsm.py
RefactoringTool: chardet\eucjpprober.py
RefactoringTool: chardet\euckrprober.py
RefactoringTool: chardet\euctwprober.py
RefactoringTool: chardet\gb2312prober.py
RefactoringTool: chardet\hebrewprober.py
RefactoringTool: chardet\jpcntx.py
RefactoringTool: chardet\langbulgarianmodel.py
RefactoringTool: chardet\langcyrillicmodel.py
RefactoringTool: chardet\langgreekmodel.py
RefactoringTool: chardet\langhebrewmodel.py
RefactoringTool: chardet\langhungarianmodel.py
RefactoringTool: chardet\langthaimodel.py
RefactoringTool: chardet\latin1prober.py
RefactoringTool: chardet\mbcharsetprober.py
RefactoringTool: chardet\mbcsgroupprober.py
RefactoringTool: chardet\mbcssm.py
RefactoringTool: chardet\sbcharsetprober.py
RefactoringTool: chardet\sbcsgroupprober.py
RefactoringTool: chardet\sjisprober.py
RefactoringTool: chardet\universaldetector.py
RefactoringTool: chardet\utf8prober.py


Now run the 2to3 script on the testing harness, test.py.

C:\home\chardet>python c:\Python30\Tools\Scripts\2to3.py -w test.py
RefactoringTool: Skipping implicit fixer: buffer
RefactoringTool: Skipping implicit fixer: idioms
RefactoringTool: Skipping implicit fixer: set_literal
RefactoringTool: Skipping implicit fixer: ws_comma
--- test.py (original)
+++ test.py (refactored)
@@ -4,7 +4,7 @@
 count = 0
 u = UniversalDetector()
 for f in glob.glob(sys.argv[1]):
-    print f.ljust(60),
+    print(f.ljust(60), end=' ')
     u.reset()
     for line in file(f, 'rb'):
         u.feed(line)
@@ -12,8 +12,8 @@
     u.close()
     result = u.result
     if result['encoding']:
-        print result['encoding'], 'with confidence', result['confidence']
+        print(result['encoding'], 'with confidence', result['confidence'])
     else:
-        print '******** no result'
+        print('******** no result')
     count += 1
-print count, 'tests'
+print(count, 'tests')
RefactoringTool: Files that were modified:
RefactoringTool: test.py


Well, that wasn't so hard!  Just a few imports and print statements to convert.  Time to run the new version.

C:\home\chardet>python test.py tests\*\*
Traceback (most recent call last):
  File "test.py", line 1, in <module>
    from chardet.universaldetector import UniversalDetector
  File "C:\home\chardet\chardet\universaldetector.py", line 51
    self.done = constants.False
                              ^
SyntaxError: invalid syntax


Hmm, a small snag.  In Python 3, False is a reserved word, so you can't use it as a variable name.  Let's look at constants.py to see where it's defined.  Here's the original version from constants.py, before the 2to3 script changed it:

import __builtin__
if not hasattr(__builtin__, 'False'):
    False = 0
    True = 1
else:
    False = __builtin__.False
    True = __builtin__.True

This piece of code is designed to allow this library to run under older versions of Python 2.  Prior to Python 2.3 [FIXME-LINK], Python had no built-in Boolean type.  This code detects the absence of the built-in constants True and False, and defines them if necessary.

However, Python 3 will always have a Boolean type, so this entire code snippet is unnecessary.  The simplest solution is to replace all instances of "constants.True" and "constants.False" with "True" and "False", respectively, then delete this dead code from constants.py.

So this line in universaldetector.py:

    self.done = constants.False

Becomes

    self.done = False

Ah, wasn't that satisfying?  The code is shorter and more readable already.

OK, what's next?  Time to run test.py again and see how far it gets.

C:\home\chardet>python test.py tests\*\*
Traceback (most recent call last):
  File "test.py", line 1, in <module>
    from chardet.universaldetector import UniversalDetector
  File "C:\home\chardet\chardet\universaldetector.py", line 29, in <module>
    import constants, sys
ImportError: No module named constants

What's that you say?  No module named constants?  Of course there's a module named constants. ... Oh wait, no there isn't.  Remember when the 2to3 script fixed up all those import statements?  This library has a lot of relative imports -- that is, modules that import other modules within the library.  In Python 3, all import statements are absolute by default [FIXME-LINK PEP 0328].  To do relative imports, you must do something like this:

    from . import constants

But wait.  Wasn't the 2to3 script supposed to take care of these for you?  Well, it did, but this particular import statement combined two different types of imports into one line: a relative import of the constants module within the library, and an absolute import of the sys module that is pre-installed in the Python standard library.  In Python 2, you could combine these into one import statement.  In Python 3, you can't, and the 2to3 script is not smart enough to split the import statement into two.

The solution is to split the import statement manually.  So this two-in-one import:

    import constants, sys

Needs to become two separate imports:

    from . import constants
    import sys

There are variations of this problem scattered throughout the chardet library.  In some places it's "import constants, sys"; in other places, it's "import constants, re".  The fix is the same: manually split the import statement into two lines, one for the relative import, the other for the absolute import.

Onward!

C:\home\chardet>python test.py tests\*\*
tests\ascii\howto.diveintomark.org.xml
Traceback (most recent call last):
  File "test.py", line 9, in <module>
    for line in file(f, 'rb'):
NameError: name 'file' is not defined

This one surprised me, because I've been using this idiom as long as I can remember.  In Python 2, the global file() function was an alias for open(), which was the standard way of opening files for reading.  In Python 3, the entire system for reading and writing files has been refactored into the io module. [FIXME-LINK PEP 3116]  I'll cover the new I/O module in more detail in Chapter FIXME, but for now, the important bit is that the global file() function no longer exists.  However, the open() function does still exist.  (Technically, it's an alias for io.open(), but never mind that right now.)

Thus, the simple solution to the problem of the missing file() is to call open() instead:

    for line in open(f, 'rb'):

And that's all I have to say about that.

C:\home\chardet>python test.py tests\*\*
tests\ascii\howto.diveintomark.org.xml
Traceback (most recent call last):
  File "test.py", line 10, in <module>
    u.feed(line)
  File "C:\home\chardet\chardet\universaldetector.py", line 98, in feed
    if self._highBitDetector.search(aBuf):
TypeError: can't use a string pattern on a bytes-like object

Now things are starting to get interesting.  And by "interesting," I mean "confusing as all hell."

First, let's see what self._highBitDetector is.  It's defined in the __init__ method of the UniversalDetector class:

class UniversalDetector:
    def __init__(self):
        self._highBitDetector = re.compile(r'[\x80-\xFF]')

This pre-compiles a regular expression designed to find non-ASCII characters in the range 128-255 (0x80-0xFF).  Wait, that's not quite right; I need to be more precise with my terminology.  This pattern is designed to find non-ASCII <em>bytes</em> in the range 128-255.  And therein lies the problem.

In Python 2, a string was an array of bytes whose character encoding was tracked separately.  If you wanted Python 2 to keep track of the character encoding, you had to use a Unicode string (u'') instead.  But in Python 3, a string is always what Python 2 called a Unicode string -- that is, an array of Unicode characters (of possibly varying byte lengths).  Since this regular expression is defined by a string pattern, it can only be used to search a string -- again, an array of characters.  But what we're searching is not a string, it's a byte array.  Looking at the traceback, this error occurred in universaldetector.py:

def feed(self, aBuf):
    .
    .
    .
    if self._mInputState == ePureAscii:
        if self._highBitDetector.search(aBuf):

And what is aBuf?  Let's backtrack further to a place that calls UniversalDetector.feed().  One place that calls it is the test harness, test.py.

u = UniversalDetector()
.
.
.
for line in open(f, 'rb'):
    u.feed(line)

And we have our answer: in the UniversalDetector.feed() method, aBuf is a line read from a file on disk.  Look carefully at the parameters used to open the file: 'rb'.  'r' is for read; OK, big deal, we're reading the file.  Ah, but 'b' is for 'bytes'.  Without the 'b' flag, this for loop would read the file, line by line, and convert each line into a string -- an array of Unicode characters -- according to the system default character encoding.  (You could override the system encoding with another parameter to open(), but never mind that for now.)  But with the 'b' flag, this for loop reads the file, line by line, and stores each line exactly as it appears in the file, as an array of bytes.  That byte array gets passed to UniversalDetector.feed(), and eventually gets passed to the pre-compiled regular expression, self._highBitDetector, to search for high-bit... characters.  But we don't have characters; we have bytes.

What we need this regular expression to search is not an array of characters, but an array of bytes.

Once you realize that, the solution is not difficult.  Regular expressions defined with strings can search strings.  Regular expressions defined with byte arrays can search byte arrays.  To define a byte array pattern, we simply change the type of the argument we use to define the regular expression to a byte array.  So instead of this:

    self._highBitDetector = re.compile(r'[\x80-\xFF]')

We now have this:

    self._highBitDetector = re.compile(b'[\x80-\xFF]')

There is one other case of this same problem, on the very next line:

    self._escDetector = re.compile(r'(\033|~{)')

Again, this is going to be used to search a byte array (the same aBuf variable, in fact), so the regular expression pattern needs to be defined as a byte array:

    self._escDetector = re.compile(b'(\033|~{)')

Curiouser and curiouser...

C:\home\chardet>python test.py tests\*\*
tests\ascii\howto.diveintomark.org.xml
Traceback (most recent call last):
  File "test.py", line 10, in <module>
    u.feed(line)
  File "C:\home\chardet\chardet\universaldetector.py", line 100, in feed
    elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
TypeError: Can't convert 'bytes' object to str implicitly