Files
dive-into-python3/case-study-porting-chardet-to-python-3.txt
T

658 lines
26 KiB
Plaintext

C:\home\chardet>python c:\Python30\Tools\Scripts\2to3.py -w chardet\
RefactoringTool: Skipping implicit fixer: buffer
RefactoringTool: Skipping implicit fixer: idioms
RefactoringTool: Skipping implicit fixer: set_literal
RefactoringTool: Skipping implicit fixer: ws_comma
--- chardet\__init__.py (original)
+++ chardet\__init__.py (refactored)
@@ -18,7 +18,7 @@
__version__ = "1.0.1"
def detect(aBuf):
- import universaldetector
+ from . import universaldetector
u = universaldetector.UniversalDetector()
u.reset()
u.feed(aBuf)
--- chardet\big5prober.py (original)
+++ chardet\big5prober.py (refactored)
@@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import Big5DistributionAnalysis
-from mbcssm import Big5SMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import Big5DistributionAnalysis
+from .mbcssm import Big5SMModel
class Big5Prober(MultiByteCharSetProber):
def __init__(self):
--- chardet\chardistribution.py (original)
+++ chardet\chardistribution.py (refactored)
@@ -25,12 +25,12 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-import constants
-from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
-from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
-from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
-from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
-from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
+from . import constants
+from .euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
+from .euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
+from .gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
+from .big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
+from .jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99
--- chardet\charsetgroupprober.py (original)
+++ chardet\charsetgroupprober.py (refactored)
@@ -26,7 +26,7 @@
######################### END LICENSE BLOCK #########################
import constants, sys
-from charsetprober import CharSetProber
+from .charsetprober import CharSetProber
class CharSetGroupProber(CharSetProber):
def __init__(self):
--- chardet\codingstatemachine.py (original)
+++ chardet\codingstatemachine.py (refactored)
@@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from constants import eStart, eError, eItsMe
+from .constants import eStart, eError, eItsMe
class CodingStateMachine:
def __init__(self, sm):
--- chardet\constants.py (original)
+++ chardet\constants.py (refactored)
@@ -38,10 +38,10 @@
SHORTCUT_THRESHOLD = 0.95
-import __builtin__
+import builtins
if not hasattr(__builtin__, 'False'):
False = 0
True = 1
else:
- False = __builtin__.False
- True = __builtin__.True
+ False = builtins.False
+ True = builtins.True
--- chardet\escprober.py (original)
+++ chardet\escprober.py (refactored)
@@ -26,9 +26,9 @@
######################### END LICENSE BLOCK #########################
import constants, sys
-from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
-from charsetprober import CharSetProber
-from codingstatemachine import CodingStateMachine
+from .escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
+from .charsetprober import CharSetProber
+from .codingstatemachine import CodingStateMachine
class EscCharSetProber(CharSetProber):
def __init__(self):
--- chardet\escsm.py (original)
+++ chardet\escsm.py (refactored)
@@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from constants import eStart, eError, eItsMe
+from .constants import eStart, eError, eItsMe
HZ_cls = ( \
1,0,0,0,0,0,0,0, # 00 - 07
--- chardet\eucjpprober.py (original)
+++ chardet\eucjpprober.py (refactored)
@@ -26,12 +26,12 @@
######################### END LICENSE BLOCK #########################
import constants, sys
-from constants import eStart, eError, eItsMe
-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import EUCJPDistributionAnalysis
-from jpcntx import EUCJPContextAnalysis
-from mbcssm import EUCJPSMModel
+from .constants import eStart, eError, eItsMe
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import EUCJPDistributionAnalysis
+from .jpcntx import EUCJPContextAnalysis
+from .mbcssm import EUCJPSMModel
class EUCJPProber(MultiByteCharSetProber):
def __init__(self):
--- chardet\euckrprober.py (original)
+++ chardet\euckrprober.py (refactored)
@@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import EUCKRDistributionAnalysis
-from mbcssm import EUCKRSMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import EUCKRDistributionAnalysis
+from .mbcssm import EUCKRSMModel
class EUCKRProber(MultiByteCharSetProber):
def __init__(self):
--- chardet\euctwprober.py (original)
+++ chardet\euctwprober.py (refactored)
@@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import EUCTWDistributionAnalysis
-from mbcssm import EUCTWSMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import EUCTWDistributionAnalysis
+from .mbcssm import EUCTWSMModel
class EUCTWProber(MultiByteCharSetProber):
def __init__(self):
--- chardet\gb2312prober.py (original)
+++ chardet\gb2312prober.py (refactored)
@@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import GB2312DistributionAnalysis
-from mbcssm import GB2312SMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import GB2312DistributionAnalysis
+from .mbcssm import GB2312SMModel
class GB2312Prober(MultiByteCharSetProber):
def __init__(self):
--- chardet\hebrewprober.py (original)
+++ chardet\hebrewprober.py (refactored)
@@ -25,8 +25,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from charsetprober import CharSetProber
-import constants
+from .charsetprober import CharSetProber
+from . import constants
# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
--- chardet\jpcntx.py (original)
+++ chardet\jpcntx.py (refactored)
@@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-import constants
+from . import constants
NUM_OF_CATEGORY = 6
DONT_KNOW = -1
--- chardet\langbulgarianmodel.py (original)
+++ chardet\langbulgarianmodel.py (refactored)
@@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-import constants
+from . import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
--- chardet\langcyrillicmodel.py (original)
+++ chardet\langcyrillicmodel.py (refactored)
@@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-import constants
+from . import constants
# KOI8-R language model
# Character Mapping Table:
--- chardet\langgreekmodel.py (original)
+++ chardet\langgreekmodel.py (refactored)
@@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-import constants
+from . import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
--- chardet\langhebrewmodel.py (original)
+++ chardet\langhebrewmodel.py (refactored)
@@ -27,7 +27,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-import constants
+from . import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
--- chardet\langhungarianmodel.py (original)
+++ chardet\langhungarianmodel.py (refactored)
@@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-import constants
+from . import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
--- chardet\langthaimodel.py (original)
+++ chardet\langthaimodel.py (refactored)
@@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-import constants
+from . import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
--- chardet\latin1prober.py (original)
+++ chardet\latin1prober.py (refactored)
@@ -26,8 +26,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from charsetprober import CharSetProber
-import constants
+from .charsetprober import CharSetProber
+from . import constants
import operator
FREQ_CAT_NUM = 4
--- chardet\mbcharsetprober.py (original)
+++ chardet\mbcharsetprober.py (refactored)
@@ -28,8 +28,8 @@
######################### END LICENSE BLOCK #########################
import constants, sys
-from constants import eStart, eError, eItsMe
-from charsetprober import CharSetProber
+from .constants import eStart, eError, eItsMe
+from .charsetprober import CharSetProber
class MultiByteCharSetProber(CharSetProber):
def __init__(self):
--- chardet\mbcsgroupprober.py (original)
+++ chardet\mbcsgroupprober.py (refactored)
@@ -27,14 +27,14 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from charsetgroupprober import CharSetGroupProber
-from utf8prober import UTF8Prober
-from sjisprober import SJISProber
-from eucjpprober import EUCJPProber
-from gb2312prober import GB2312Prober
-from euckrprober import EUCKRProber
-from big5prober import Big5Prober
-from euctwprober import EUCTWProber
+from .charsetgroupprober import CharSetGroupProber
+from .utf8prober import UTF8Prober
+from .sjisprober import SJISProber
+from .eucjpprober import EUCJPProber
+from .gb2312prober import GB2312Prober
+from .euckrprober import EUCKRProber
+from .big5prober import Big5Prober
+from .euctwprober import EUCTWProber
class MBCSGroupProber(CharSetGroupProber):
def __init__(self):
--- chardet\mbcssm.py (original)
+++ chardet\mbcssm.py (refactored)
@@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from constants import eStart, eError, eItsMe
+from .constants import eStart, eError, eItsMe
# BIG5
--- chardet\sbcharsetprober.py (original)
+++ chardet\sbcharsetprober.py (refactored)
@@ -27,7 +27,7 @@
######################### END LICENSE BLOCK #########################
import constants, sys
-from charsetprober import CharSetProber
+from .charsetprober import CharSetProber
SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024
--- chardet\sbcsgroupprober.py (original)
+++ chardet\sbcsgroupprober.py (refactored)
@@ -27,15 +27,15 @@
######################### END LICENSE BLOCK #########################
import constants, sys
-from charsetgroupprober import CharSetGroupProber
-from sbcharsetprober import SingleByteCharSetProber
-from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
-from langgreekmodel import Latin7GreekModel, Win1253GreekModel
-from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
-from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
-from langthaimodel import TIS620ThaiModel
-from langhebrewmodel import Win1255HebrewModel
-from hebrewprober import HebrewProber
+from .charsetgroupprober import CharSetGroupProber
+from .sbcharsetprober import SingleByteCharSetProber
+from .langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
+from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
+from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
+from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
+from .langthaimodel import TIS620ThaiModel
+from .langhebrewmodel import Win1255HebrewModel
+from .hebrewprober import HebrewProber
class SBCSGroupProber(CharSetGroupProber):
def __init__(self):
--- chardet\sjisprober.py (original)
+++ chardet\sjisprober.py (refactored)
@@ -25,13 +25,13 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import SJISDistributionAnalysis
-from jpcntx import SJISContextAnalysis
-from mbcssm import SJISSMModel
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import SJISDistributionAnalysis
+from .jpcntx import SJISContextAnalysis
+from .mbcssm import SJISSMModel
import constants, sys
-from constants import eStart, eError, eItsMe
+from .constants import eStart, eError, eItsMe
class SJISProber(MultiByteCharSetProber):
def __init__(self):
--- chardet\universaldetector.py (original)
+++ chardet\universaldetector.py (refactored)
@@ -27,10 +27,10 @@
######################### END LICENSE BLOCK #########################
import constants, sys
-from latin1prober import Latin1Prober # windows-1252
-from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
-from sbcsgroupprober import SBCSGroupProber # single-byte character sets
-from escprober import EscCharSetProber # ISO-2122, etc.
+from .latin1prober import Latin1Prober # windows-1252
+from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
+from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
+from .escprober import EscCharSetProber # ISO-2122, etc.
import re
MINIMUM_THRESHOLD = 0.20
--- chardet\utf8prober.py (original)
+++ chardet\utf8prober.py (refactored)
@@ -26,10 +26,10 @@
######################### END LICENSE BLOCK #########################
import constants, sys
-from constants import eStart, eError, eItsMe
-from charsetprober import CharSetProber
-from codingstatemachine import CodingStateMachine
-from mbcssm import UTF8SMModel
+from .constants import eStart, eError, eItsMe
+from .charsetprober import CharSetProber
+from .codingstatemachine import CodingStateMachine
+from .mbcssm import UTF8SMModel
ONE_CHAR_PROB = 0.5
RefactoringTool: Files that were modified:
RefactoringTool: chardet\__init__.py
RefactoringTool: chardet\big5prober.py
RefactoringTool: chardet\chardistribution.py
RefactoringTool: chardet\charsetgroupprober.py
RefactoringTool: chardet\codingstatemachine.py
RefactoringTool: chardet\constants.py
RefactoringTool: chardet\escprober.py
RefactoringTool: chardet\escsm.py
RefactoringTool: chardet\eucjpprober.py
RefactoringTool: chardet\euckrprober.py
RefactoringTool: chardet\euctwprober.py
RefactoringTool: chardet\gb2312prober.py
RefactoringTool: chardet\hebrewprober.py
RefactoringTool: chardet\jpcntx.py
RefactoringTool: chardet\langbulgarianmodel.py
RefactoringTool: chardet\langcyrillicmodel.py
RefactoringTool: chardet\langgreekmodel.py
RefactoringTool: chardet\langhebrewmodel.py
RefactoringTool: chardet\langhungarianmodel.py
RefactoringTool: chardet\langthaimodel.py
RefactoringTool: chardet\latin1prober.py
RefactoringTool: chardet\mbcharsetprober.py
RefactoringTool: chardet\mbcsgroupprober.py
RefactoringTool: chardet\mbcssm.py
RefactoringTool: chardet\sbcharsetprober.py
RefactoringTool: chardet\sbcsgroupprober.py
RefactoringTool: chardet\sjisprober.py
RefactoringTool: chardet\universaldetector.py
RefactoringTool: chardet\utf8prober.py
Now run the 2to3 script on the testing harness, test.py.
C:\home\chardet>python c:\Python30\Tools\Scripts\2to3.py -w test.py
RefactoringTool: Skipping implicit fixer: buffer
RefactoringTool: Skipping implicit fixer: idioms
RefactoringTool: Skipping implicit fixer: set_literal
RefactoringTool: Skipping implicit fixer: ws_comma
--- test.py (original)
+++ test.py (refactored)
@@ -4,7 +4,7 @@
count = 0
u = UniversalDetector()
for f in glob.glob(sys.argv[1]):
- print f.ljust(60),
+ print(f.ljust(60), end=' ')
u.reset()
for line in file(f, 'rb'):
u.feed(line)
@@ -12,8 +12,8 @@
u.close()
result = u.result
if result['encoding']:
- print result['encoding'], 'with confidence', result['confidence']
+ print(result['encoding'], 'with confidence', result['confidence'])
else:
- print '******** no result'
+ print('******** no result')
count += 1
-print count, 'tests'
+print(count, 'tests')
RefactoringTool: Files that were modified:
RefactoringTool: test.py
Well, that wasn't so hard! Just a few imports and print statements to convert. Time to run the new version.
C:\home\chardet>python test.py tests\*\*
Traceback (most recent call last):
File "test.py", line 1, in <module>
from chardet.universaldetector import UniversalDetector
File "C:\home\chardet\chardet\universaldetector.py", line 51
self.done = constants.False
^
SyntaxError: invalid syntax
Hmm, a small snag. In Python 3, False is a reserved word, so you can't use it as a variable name. Let's look at constants.py to see where it's defined. Here's the original version from constants.py, before the 2to3 script changed it:
import __builtin__
if not hasattr(__builtin__, 'False'):
False = 0
True = 1
else:
False = __builtin__.False
True = __builtin__.True
This piece of code is designed to allow this library to run under older versions of Python 2. Prior to Python 2.3 [FIXME-LINK], Python had no built-in Boolean type. This code detects the absence of the built-in constants True and False, and defines them if necessary.
However, Python 3 will always have a Boolean type, so this entire code snippet is unnecessary. The simplest solution is to replace all instances of "constants.True" and "constants.False" with "True" and "False", respectively, then delete this dead code from constants.py.
So this line in universaldetector.py:
self.done = constants.False
Becomes:
self.done = False
Ah, wasn't that satisfying? The code is shorter and more readable already.
OK, what's next? Time to run test.py again and see how far it gets.
C:\home\chardet>python test.py tests\*\*
Traceback (most recent call last):
File "test.py", line 1, in <module>
from chardet.universaldetector import UniversalDetector
File "C:\home\chardet\chardet\universaldetector.py", line 29, in <module>
import constants, sys
ImportError: No module named constants
What's that you say? No module named constants? Of course there's a module named constants. ... Oh wait, no there isn't. Remember when the 2to3 script fixed up all those import statements? This library has a lot of relative imports -- that is, modules that import other modules within the library. In Python 3, all import statements are absolute by default [FIXME-LINK PEP 0328]. To do relative imports, you must do something like this:
from . import constants
But wait. Wasn't the 2to3 script supposed to take care of these for you? Well, it did, but this particular import statement combined two different types of imports into one line: a relative import of the constants module within the library, and an absolute import of the sys module that is pre-installed in the Python standard library. In Python 2, you could combine these into one import statement. In Python 3, you can't, and the 2to3 script is not smart enough to split the import statement into two.
The solution is to split the import statement manually. So this two-in-one import:
import constants, sys
Needs to become two separate imports:
from . import constants
import sys
There are variations of this problem scattered throughout the chardet library. In some places it's "import constants, sys"; in other places, it's "import constants, re". The fix is the same: manually split the import statement into two lines, one for the relative import, the other for the absolute import.
Onward!
C:\home\chardet>python test.py tests\*\*
tests\ascii\howto.diveintomark.org.xml
Traceback (most recent call last):
File "test.py", line 9, in <module>
for line in file(f, 'rb'):
NameError: name 'file' is not defined
This one surprised me, because I've been using this idiom as long as I can remember. In Python 2, the global file() function was an alias for open(), which was the standard way of opening files for reading. In Python 3, the entire system for reading and writing files has been refactored into the io module. [FIXME-LINK PEP 3116] I'll cover the new I/O module in more detail in Chapter FIXME, but for now, the important bit is that the global file() function no longer exists. However, the open() function does still exist. (Technically, it's an alias for io.open(), but never mind that right now.)
Thus, the simple solution to the problem of the missing file() is to call open() instead:
for line in open(f, 'rb'):
And that's all I have to say about that.
C:\home\chardet>python test.py tests\*\*
tests\ascii\howto.diveintomark.org.xml
Traceback (most recent call last):
File "test.py", line 10, in <module>
u.feed(line)
File "C:\home\chardet\chardet\universaldetector.py", line 98, in feed
if self._highBitDetector.search(aBuf):
TypeError: can't use a string pattern on a bytes-like object
Now things are starting to get interesting. And by "interesting," I mean "confusing as all hell."
First, let's see what self._highBitDetector is. It's defined in the __init__ method of the UniversalDetector class:
class UniversalDetector:
def __init__(self):
self._highBitDetector = re.compile(r'[\x80-\xFF]')
This pre-compiles a regular expression designed to find non-ASCII characters in the range 128-255 (0x80-0xFF). Wait, that's not quite right; I need to be more precise with my terminology. This pattern is designed to find non-ASCII *bytes* in the range 128-255. And therein lies the problem.
In Python 2, a string was an array of bytes whose character encoding was tracked separately. If you wanted Python 2 to keep track of the character encoding, you had to use a Unicode string (u'') instead. But in Python 3, a string is always what Python 2 called a Unicode string -- that is, an array of Unicode characters (of possibly varying byte lengths). Since this regular expression is defined by a string pattern, it can only be used to search a string -- again, an array of characters. But what we're searching is not a string, it's a byte array. Looking at the traceback, this error occurred in universaldetector.py:
def feed(self, aBuf):
.
.
.
if self._mInputState == ePureAscii:
if self._highBitDetector.search(aBuf):
And what is aBuf? Let's backtrack further to a place that calls UniversalDetector.feed(). One place that calls it is the test harness, test.py.
u = UniversalDetector()
.
.
.
for line in open(f, 'rb'):
u.feed(line)
And we have our answer: in the UniversalDetector.feed() method, aBuf is a line read from a file on disk. Look carefully at the parameters used to open the file: 'rb'. 'r' is for read; OK, big deal, we're reading the file. Ah, but 'b' is for 'binary'. Without the 'b' flag, this for loop would read the file, line by line, and convert each line into a string -- an array of Unicode characters -- according to the system default character encoding. (You could override the system encoding with another parameter to open(), but never mind that for now.) But with the 'b' flag, this for loop reads the file, line by line, and stores each line exactly as it appears in the file, as an array of bytes. That byte array gets passed to UniversalDetector.feed(), and eventually gets passed to the pre-compiled regular expression, self._highBitDetector, to search for high-bit... characters. But we don't have characters; we have bytes.
What we need this regular expression to search is not an array of characters, but an array of bytes.
Once you realize that, the solution is not difficult. Regular expressions defined with strings can search strings. Regular expressions defined with byte arrays can search byte arrays. To define a byte array pattern, we simply change the type of the argument we use to define the regular expression to a byte array. So instead of this:
self._highBitDetector = re.compile(r'[\x80-\xFF]')
We now have this:
self._highBitDetector = re.compile(b'[\x80-\xFF]')
There is one other case of this same problem, on the very next line:
self._escDetector = re.compile(r'(\033|~{)')
Again, this is going to be used to search a byte array (the same aBuf variable, in fact), so the regular expression pattern needs to be defined as a byte array:
self._escDetector = re.compile(b'(\033|~{)')
Curiouser and curiouser...
C:\home\chardet>python test.py tests\*\*
tests\ascii\howto.diveintomark.org.xml
Traceback (most recent call last):
File "test.py", line 10, in <module>
u.feed(line)
File "C:\home\chardet\chardet\universaldetector.py", line 100, in feed
elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
TypeError: Can't convert 'bytes' object to str implicitly