diff --git a/case-study-porting-chardet-to-python-3.html b/case-study-porting-chardet-to-python-3.html index 9bc8ed7..d6f121c 100755 --- a/case-study-porting-chardet-to-python-3.html +++ b/case-study-porting-chardet-to-python-3.html @@ -669,135 +669,8 @@ TypeError: unorderable types: int() >= str() + return aBuf[1] - 0xA1, charLen return -1, charLen -

Searching the entire codebase for occurrences of the ord() function uncovers the same problem in chardistribution.py: -

C:\home\chardet> python test.py tests\*\*
-tests\ascii\howto.diveintomark.org.xml                       ascii with confidence 1.0
-tests\Big5\0804.blogspot.com.xml
-Traceback (most recent call last):
-  File "test.py", line 10, in <module>
-    u.feed(line)
-  File "C:\home\chardet\chardet\universaldetector.py", line 117, in feed
-    if prober.feed(aBuf) == constants.eFoundIt:
-  File "C:\home\chardet\chardet\charsetgroupprober.py", line 60, in feed
-    st = prober.feed(aBuf)
-  File "C:\home\chardet\chardet\sjisprober.py", line 72, in feed
-    self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)
-  File "C:\home\chardet\chardet\chardistribution.py", line 56, in feed
-    order = self.get_order(aStr)
-  File "C:\home\chardet\chardet\chardistribution.py", line 174, in get_order
-    if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
-TypeError: unorderable types: int() >= str()
-

The fix is the same: -

  class EUCTWDistributionAnalysis(CharDistributionAnalysis):
-      def __init__(self):
-          CharDistributionAnalysis.__init__(self)
-          self._mCharToFreqOrder = EUCTWCharToFreqOrder
-          self._mTableSize = EUCTW_TABLE_SIZE
-          self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
+

Searching the entire codebase for occurrences of the ord() function uncovers the same problem in chardistribution.py (specifically, in the EUCTWDistributionAnalysis, EUCKRDistributionAnalysis, GB2312DistributionAnalysis, Big5DistributionAnalysis, SJISDistributionAnalysis, and EUCJPDistributionAnalysis classes). In each case, the fix is similar to the change we made to the EUCJPContextAnalysis and SJISContextAnalysis classes in jpcntx.py. -- def get_order(self, aStr): -- if aStr[0] >= '\xC4': -- return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1 -+ def get_order(self, aBuf): -+ if aBuf[0] >= 0xC4: -+ return 94 * (aBuf[0] - 0xC4) + aBuf[1] - 0xA1 - else: - return -1 - - class EUCKRDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - CharDistributionAnalysis.__init__(self) - self._mCharToFreqOrder = EUCKRCharToFreqOrder - self._mTableSize = EUCKR_TABLE_SIZE - self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO - -- def get_order(self, aStr): -- if aStr[0] >= '\xB0': -- return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 -+ def get_order(self, aBuf): -+ if aBuf[0] >= '\xB0': -+ return 94 * (aBuf[0] - 0xB0) + aBuf[1] - 0xA1 - else: - return -1; - - class GB2312DistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - CharDistributionAnalysis.__init__(self) - self._mCharToFreqOrder = GB2312CharToFreqOrder - self._mTableSize = GB2312_TABLE_SIZE - self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO - -- def get_order(self, aStr): -- if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'): -- return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 -+ def get_order(self, aBuf): -+ if (aBuf[0] >= 0xB0) and (aBuf[1] >= 0xA1): -+ return 94 * (aBuf[0] - 0xB0) + aBuf[1] - 0xA1 - else: - return -1; - - class Big5DistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - CharDistributionAnalysis.__init__(self) - self._mCharToFreqOrder = Big5CharToFreqOrder - self._mTableSize = BIG5_TABLE_SIZE - 
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO - -- def get_order(self, aStr): -- if aStr[0] >= '\xA4': -- if aStr[1] >= '\xA1': -- return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63 -+ def get_order(self, aBuf): -+ if aBuf[0] >= 0xA4: -+ if aBuf[1] >= 0xA1: -+ return 157 * (aBuf[0] - 0xA4) + aBuf[1] - 0xA1 + 63 - else: -- return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40 -+ return 157 * (aBuf[0] - 0xA4) + aBuf[1] - 0x40 - else: - return -1 - - class SJISDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - CharDistributionAnalysis.__init__(self) - self._mCharToFreqOrder = JISCharToFreqOrder - self._mTableSize = JIS_TABLE_SIZE - self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO - -- def get_order(self, aStr): -- if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'): -- order = 188 * (ord(aStr[0]) - 0x81) -- elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'): -- order = 188 * (ord(aStr[0]) - 0xE0 + 31) -+ def get_order(self, aBuf): -+ if (aBuf[0] >= 0x81) and (aBuf[0] <= 0x9F): -+ order = 188 * (aBuf[0] - 0x81) -+ elif (aBuf[0] >= 0xE0) and (aBuf[0] <= 0xEF): -+ order = 188 * (aBuf[0] - 0xE0 + 31) - else: - return -1; -- order = order + ord(aStr[1]) - 0x40 -- if aStr[1] > '\x7F': -+ order = order + aBuf[1] - 0x40 -+ if aBuf[1] > 0x7F: - order =- 1 - return order - - class EUCJPDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - CharDistributionAnalysis.__init__(self) - self._mCharToFreqOrder = JISCharToFreqOrder - self._mTableSize = JIS_TABLE_SIZE - self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO - -- def get_order(self, aStr): -- if aStr[0] >= '\xA0': -- return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xA1 -+ def get_order(self, aBuf): -+ if aBuf[0] >= 0xA0: -+ return 94 * (aBuf[0] - 0xA1) + aBuf[1] - 0xA1 - else: - return -1

Global name 'reduce' is not defined

Once more into the breach…

C:\home\chardet> python test.py tests\*\*