diff --git a/case-study-porting-chardet-to-python-3.html b/case-study-porting-chardet-to-python-3.html
index 77016ea..47d9f93 100755
--- a/case-study-porting-chardet-to-python-3.html
+++ b/case-study-porting-chardet-to-python-3.html
@@ -374,7 +374,7 @@ TypeError: Can't convert 'bytes' object to str implicitly
.
- self._mLastChar = ''
+ self._mLastChar = b''
-
Searching the entire codebase for “mLastChar” turns up a similar problem in mbcharsetprober.py, but instead of tracking the last character, it tracks the last two characters. The MultiByteCharSetProber class uses a list of 1-character strings to track the last two characters; in Python 3, it needs to use a list of integers.
+
Searching the entire codebase for “mLastChar” turns up a similar problem in mbcharsetprober.py, but instead of tracking the last character, it tracks the last two characters. The MultiByteCharSetProber class uses a list of 1-character strings to track the last two characters. In Python 3, it needs to use a list of integers, because it’s not really tracking characters; it’s tracking bytes. (Bytes are just integers from 0 to 255.)
class MultiByteCharSetProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
@@ -535,7 +535,6 @@ tests\Big5\0804.blogspot.com.xml
File "C:\home\chardet\chardet\jpcntx.py", line 176, in get_order
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
TypeError: unorderable types: int() >= str()
-Did you notice? This time around, the code passed the first test case (tests\ascii\howto.diveintomark.org.xml). You’re making real progress here.
So what’s this all about? “Unorderable types”? Once again, the difference between byte arrays and strings is rearing its ugly head. Take a look at the code:
class SJISContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr):
@@ -556,15 +555,16 @@ TypeError: unorderable types: int() >= str()
order, charLen = self.get_order(aBuf[i:i+2])
Oh look, it’s our old friend, aBuf. As you might have guessed from every other issue we’ve encountered in this chapter, aBuf is a byte array. Here, the feed() method isn’t just passing it on wholesale; it’s slicing it. But as you saw earlier in this chapter, slicing a byte array returns a byte array, so the aStr parameter that gets passed to the get_order() method is still a byte array.
And what is this code trying to do with aStr? It’s taking the first element of the byte array and comparing it to a string of length 1. In Python 2, that worked, because aStr and aBuf were strings, and aStr[0] would be a string, and you can compare strings for inequality. But in Python 3, aStr and aBuf are byte arrays, aStr[0] is an integer, and you can’t compare integers and strings for inequality without explicitly coercing one of them. -
In this case, there’s no need to make the code more complicated by adding an explicit coercion. aStr[0] yields an integer; the things you’re comparing to are all constants. Let’s change them from 1-character strings to integers. +
In this case, there’s no need to make the code more complicated by adding an explicit coercion. aStr[0] yields an integer; the things you’re comparing to are all constants. Let’s change them from 1-character strings to integers. And while we’re at it, let’s change aStr to aBuf, since it’s not actually a string.
class SJISContextAnalysis(JapaneseContextAnalysis):
- def get_order(self, aStr):
+- def get_order(self, aStr):
++ def get_order(self, aBuf):
if not aStr: return -1, 1
# find out current char's byte length
- if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
-- ((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')):
+- ((aBuf[0] >= '\xE0') and (aBuf[0] <= '\xFC')):
+ if ((aStr[0] >= 0x81) and (aStr[0] <= 0x9F)) or \
-+ ((aStr[0] >= 0xE0) and (aStr[0] <= 0xFC)):
++ ((aBuf[0] >= 0xE0) and (aBuf[0] <= 0xFC)):
charLen = 2
else:
charLen = 1
@@ -575,24 +575,25 @@ TypeError: unorderable types: int() >= str()
Searching the entire codebase for occurrences of the ord() function uncovers the same problem in chardistribution.py:
@@ -635,11 +636,12 @@ TypeError: unorderable types: int() >= str()
self._mTableSize = EUCTW_TABLE_SIZE
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
- def get_order(self, aStr):
+- def get_order(self, aStr):
- if aStr[0] >= '\xC4':
- return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1
-+ if aStr[0] >= 0xC4:
-+ return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
++ def get_order(self, aBuf):
++ if aBuf[0] >= 0xC4:
++ return 94 * (aBuf[0] - 0xC4) + aBuf[1] - 0xA1
else:
return -1
@@ -650,11 +652,12 @@ TypeError: unorderable types: int() >= str()
self._mTableSize = EUCKR_TABLE_SIZE
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
- def get_order(self, aStr):
+- def get_order(self, aStr):
- if aStr[0] >= '\xB0':
- return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
-+ if aStr[0] >= '\xB0':
-+ return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
++ def get_order(self, aBuf):
++ if aBuf[0] >= 0xB0:
++ return 94 * (aBuf[0] - 0xB0) + aBuf[1] - 0xA1
else:
return -1;
@@ -665,11 +668,12 @@ TypeError: unorderable types: int() >= str()
self._mTableSize = GB2312_TABLE_SIZE
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
- def get_order(self, aStr):
+- def get_order(self, aStr):
- if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'):
- return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
-+ if (aStr[0] >= 0xB0) and (aStr[1] >= 0xA1):
-+ return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
++ def get_order(self, aBuf):
++ if (aBuf[0] >= 0xB0) and (aBuf[1] >= 0xA1):
++ return 94 * (aBuf[0] - 0xB0) + aBuf[1] - 0xA1
else:
return -1;
@@ -680,16 +684,17 @@ TypeError: unorderable types: int() >= str()
self._mTableSize = BIG5_TABLE_SIZE
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
- def get_order(self, aStr):
+- def get_order(self, aStr):
- if aStr[0] >= '\xA4':
- if aStr[1] >= '\xA1':
- return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63
-+ if aStr[0] >= 0xA4:
-+ if aStr[1] >= 0xA1:
-+ return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
++ def get_order(self, aBuf):
++ if aBuf[0] >= 0xA4:
++ if aBuf[1] >= 0xA1:
++ return 157 * (aBuf[0] - 0xA4) + aBuf[1] - 0xA1 + 63
else:
- return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40
-+ return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
++ return 157 * (aBuf[0] - 0xA4) + aBuf[1] - 0x40
else:
return -1
@@ -700,21 +705,22 @@ TypeError: unorderable types: int() >= str()
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
- def get_order(self, aStr):
+- def get_order(self, aStr):
- if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
- order = 188 * (ord(aStr[0]) - 0x81)
- elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'):
- order = 188 * (ord(aStr[0]) - 0xE0 + 31)
-+ if (aStr[0] >= 0x81) and (aStr[0] <= 0x9F):
-+ order = 188 * (aStr[0] - 0x81)
-+ elif (aStr[0] >= 0xE0) and (aStr[0] <= 0xEF):
-+ order = 188 * (aStr[0] - 0xE0 + 31)
++ def get_order(self, aBuf):
++ if (aBuf[0] >= 0x81) and (aBuf[0] <= 0x9F):
++ order = 188 * (aBuf[0] - 0x81)
++ elif (aBuf[0] >= 0xE0) and (aBuf[0] <= 0xEF):
++ order = 188 * (aBuf[0] - 0xE0 + 31)
else:
return -1;
- order = order + ord(aStr[1]) - 0x40
- if aStr[1] > '\x7F':
-+ order = order + aStr[1] - 0x40
-+ if aStr[1] > 0x7F:
++ order = order + aBuf[1] - 0x40
++ if aBuf[1] > 0x7F:
order =- 1
return order
@@ -725,11 +731,12 @@ TypeError: unorderable types: int() >= str()
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
- def get_order(self, aStr):
+- def get_order(self, aStr):
- if aStr[0] >= '\xA0':
- return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xA1
-+ if aStr[0] >= 0xA0:
-+ return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xA1
++ def get_order(self, aBuf):
++ if aBuf[0] >= 0xA0:
++ return 94 * (aBuf[0] - 0xA1) + aBuf[1] - 0xA1
else:
return -1
'reduce' is not defined