diff --git a/case-study-porting-chardet-to-python-3.html b/case-study-porting-chardet-to-python-3.html index db3033b..a2ca212 100644 --- a/case-study-porting-chardet-to-python-3.html +++ b/case-study-porting-chardet-to-python-3.html @@ -5,7 +5,7 @@ Case study: porting chardet to Python 3 - Dive into Python 3 diff --git a/case-study-porting-chardet-to-python-3.txt b/case-study-porting-chardet-to-python-3.txt deleted file mode 100644 index d68c7d8..0000000 --- a/case-study-porting-chardet-to-python-3.txt +++ /dev/null @@ -1,657 +0,0 @@ -C:\home\chardet>python c:\Python30\Tools\Scripts\2to3.py -w chardet\ -RefactoringTool: Skipping implicit fixer: buffer -RefactoringTool: Skipping implicit fixer: idioms -RefactoringTool: Skipping implicit fixer: set_literal -RefactoringTool: Skipping implicit fixer: ws_comma ---- chardet\__init__.py (original) -+++ chardet\__init__.py (refactored) -@@ -18,7 +18,7 @@ - __version__ = "1.0.1" - - def detect(aBuf): -- import universaldetector -+ from . import universaldetector - u = universaldetector.UniversalDetector() - u.reset() - u.feed(aBuf) ---- chardet\big5prober.py (original) -+++ chardet\big5prober.py (refactored) -@@ -25,10 +25,10 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import Big5DistributionAnalysis --from mbcssm import Big5SMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import Big5DistributionAnalysis -+from .mbcssm import Big5SMModel - - class Big5Prober(MultiByteCharSetProber): - def __init__(self): ---- chardet\chardistribution.py (original) -+++ chardet\chardistribution.py (refactored) -@@ -25,12 +25,12 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --import constants --from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO --from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO --from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO --from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO --from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO -+from . import constants -+from .euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO -+from .euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO -+from .gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO -+from .big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO -+from .jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO - - ENOUGH_DATA_THRESHOLD = 1024 - SURE_YES = 0.99 ---- chardet\charsetgroupprober.py (original) -+++ chardet\charsetgroupprober.py (refactored) -@@ -26,7 +26,7 @@ - ######################### END LICENSE BLOCK ######################### - - import constants, sys --from charsetprober import CharSetProber -+from .charsetprober import CharSetProber - - class CharSetGroupProber(CharSetProber): - def __init__(self): ---- chardet\codingstatemachine.py (original) -+++ chardet\codingstatemachine.py (refactored) -@@ -25,7 +25,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from constants import eStart, eError, eItsMe -+from .constants import eStart, eError, eItsMe - - class CodingStateMachine: - def __init__(self, sm): ---- chardet\constants.py (original) -+++ chardet\constants.py (refactored) -@@ -38,10 +38,10 @@ - - SHORTCUT_THRESHOLD = 0.95 - --import __builtin__ -+import builtins - if not hasattr(__builtin__, 'False'): - False = 0 - True = 1 - else: -- False = __builtin__.False -- True = __builtin__.True -+ False = builtins.False -+ True = builtins.True ---- chardet\escprober.py (original) -+++ chardet\escprober.py (refactored) -@@ -26,9 +26,9 @@ - ######################### END LICENSE BLOCK ######################### - - import constants, sys --from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel --from charsetprober import CharSetProber --from codingstatemachine import CodingStateMachine -+from .escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel -+from .charsetprober import CharSetProber -+from .codingstatemachine import CodingStateMachine - - class EscCharSetProber(CharSetProber): - def __init__(self): ---- chardet\escsm.py (original) -+++ chardet\escsm.py (refactored) -@@ -25,7 +25,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from constants import eStart, eError, eItsMe -+from .constants import eStart, eError, eItsMe - - HZ_cls = ( \ - 1,0,0,0,0,0,0,0, # 00 - 07 ---- chardet\eucjpprober.py (original) -+++ chardet\eucjpprober.py (refactored) -@@ -26,12 +26,12 @@ - ######################### END LICENSE BLOCK ######################### - - import constants, sys --from constants import eStart, eError, eItsMe --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import EUCJPDistributionAnalysis --from jpcntx import EUCJPContextAnalysis --from mbcssm import EUCJPSMModel -+from .constants import eStart, eError, eItsMe -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import EUCJPDistributionAnalysis -+from .jpcntx import EUCJPContextAnalysis -+from .mbcssm import EUCJPSMModel - - class EUCJPProber(MultiByteCharSetProber): - def __init__(self): ---- chardet\euckrprober.py (original) -+++ chardet\euckrprober.py (refactored) -@@ -25,10 +25,10 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import EUCKRDistributionAnalysis --from mbcssm import EUCKRSMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import EUCKRDistributionAnalysis -+from .mbcssm import EUCKRSMModel - - class EUCKRProber(MultiByteCharSetProber): - def __init__(self): ---- chardet\euctwprober.py (original) -+++ chardet\euctwprober.py (refactored) -@@ -25,10 +25,10 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import EUCTWDistributionAnalysis --from mbcssm import EUCTWSMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import EUCTWDistributionAnalysis -+from .mbcssm import EUCTWSMModel - - class EUCTWProber(MultiByteCharSetProber): - def __init__(self): ---- chardet\gb2312prober.py (original) -+++ chardet\gb2312prober.py (refactored) -@@ -25,10 +25,10 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import GB2312DistributionAnalysis --from mbcssm import GB2312SMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import GB2312DistributionAnalysis -+from .mbcssm import GB2312SMModel - - class GB2312Prober(MultiByteCharSetProber): - def __init__(self): ---- chardet\hebrewprober.py (original) -+++ chardet\hebrewprober.py (refactored) -@@ -25,8 +25,8 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from charsetprober import CharSetProber --import constants -+from .charsetprober import CharSetProber -+from . import constants - - # This prober doesn't actually recognize a language or a charset. - # It is a helper prober for the use of the Hebrew model probers ---- chardet\jpcntx.py (original) -+++ chardet\jpcntx.py (refactored) -@@ -25,7 +25,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --import constants -+from . import constants - - NUM_OF_CATEGORY = 6 - DONT_KNOW = -1 ---- chardet\langbulgarianmodel.py (original) -+++ chardet\langbulgarianmodel.py (refactored) -@@ -25,7 +25,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --import constants -+from . import constants - - # 255: Control characters that usually does not exist in any text - # 254: Carriage/Return ---- chardet\langcyrillicmodel.py (original) -+++ chardet\langcyrillicmodel.py (refactored) -@@ -25,7 +25,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --import constants -+from . import constants - - # KOI8-R language model - # Character Mapping Table: ---- chardet\langgreekmodel.py (original) -+++ chardet\langgreekmodel.py (refactored) -@@ -25,7 +25,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --import constants -+from . import constants - - # 255: Control characters that usually does not exist in any text - # 254: Carriage/Return ---- chardet\langhebrewmodel.py (original) -+++ chardet\langhebrewmodel.py (refactored) -@@ -27,7 +27,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --import constants -+from . import constants - - # 255: Control characters that usually does not exist in any text - # 254: Carriage/Return ---- chardet\langhungarianmodel.py (original) -+++ chardet\langhungarianmodel.py (refactored) -@@ -25,7 +25,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --import constants -+from . import constants - - # 255: Control characters that usually does not exist in any text - # 254: Carriage/Return ---- chardet\langthaimodel.py (original) -+++ chardet\langthaimodel.py (refactored) -@@ -25,7 +25,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --import constants -+from . import constants - - # 255: Control characters that usually does not exist in any text - # 254: Carriage/Return ---- chardet\latin1prober.py (original) -+++ chardet\latin1prober.py (refactored) -@@ -26,8 +26,8 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from charsetprober import CharSetProber --import constants -+from .charsetprober import CharSetProber -+from . import constants - import operator - - FREQ_CAT_NUM = 4 ---- chardet\mbcharsetprober.py (original) -+++ chardet\mbcharsetprober.py (refactored) -@@ -28,8 +28,8 @@ - ######################### END LICENSE BLOCK ######################### - - import constants, sys --from constants import eStart, eError, eItsMe --from charsetprober import CharSetProber -+from .constants import eStart, eError, eItsMe -+from .charsetprober import CharSetProber - - class MultiByteCharSetProber(CharSetProber): - def __init__(self): ---- chardet\mbcsgroupprober.py (original) -+++ chardet\mbcsgroupprober.py (refactored) -@@ -27,14 +27,14 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from charsetgroupprober import CharSetGroupProber --from utf8prober import UTF8Prober --from sjisprober import SJISProber --from eucjpprober import EUCJPProber --from gb2312prober import GB2312Prober --from euckrprober import EUCKRProber --from big5prober import Big5Prober --from euctwprober import EUCTWProber -+from .charsetgroupprober import CharSetGroupProber -+from .utf8prober import UTF8Prober -+from .sjisprober import SJISProber -+from .eucjpprober import EUCJPProber -+from .gb2312prober import GB2312Prober -+from .euckrprober import EUCKRProber -+from .big5prober import Big5Prober -+from .euctwprober import EUCTWProber - - class MBCSGroupProber(CharSetGroupProber): - def __init__(self): ---- chardet\mbcssm.py (original) -+++ chardet\mbcssm.py (refactored) -@@ -25,7 +25,7 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from constants import eStart, eError, eItsMe -+from .constants import eStart, eError, eItsMe - - # BIG5 - ---- chardet\sbcharsetprober.py (original) -+++ chardet\sbcharsetprober.py (refactored) -@@ -27,7 +27,7 @@ - ######################### END LICENSE BLOCK ######################### - - import constants, sys --from charsetprober import CharSetProber -+from .charsetprober import CharSetProber - - SAMPLE_SIZE = 64 - SB_ENOUGH_REL_THRESHOLD = 1024 ---- chardet\sbcsgroupprober.py (original) -+++ chardet\sbcsgroupprober.py (refactored) -@@ -27,15 +27,15 @@ - ######################### END LICENSE BLOCK ######################### - - import constants, sys --from charsetgroupprober import CharSetGroupProber --from sbcharsetprober import SingleByteCharSetProber --from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model --from langgreekmodel import Latin7GreekModel, Win1253GreekModel --from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel --from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel --from langthaimodel import TIS620ThaiModel --from langhebrewmodel import Win1255HebrewModel --from hebrewprober import HebrewProber -+from .charsetgroupprober import CharSetGroupProber -+from .sbcharsetprober import SingleByteCharSetProber -+from .langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model -+from .langgreekmodel import Latin7GreekModel, Win1253GreekModel -+from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel -+from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel -+from .langthaimodel import TIS620ThaiModel -+from .langhebrewmodel import Win1255HebrewModel -+from .hebrewprober import HebrewProber - - class SBCSGroupProber(CharSetGroupProber): - def __init__(self): ---- chardet\sjisprober.py (original) -+++ chardet\sjisprober.py (refactored) -@@ -25,13 +25,13 @@ - # 02110-1301 USA - ######################### END LICENSE BLOCK ######################### - --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import SJISDistributionAnalysis --from jpcntx import SJISContextAnalysis --from mbcssm import SJISSMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import SJISDistributionAnalysis -+from .jpcntx import SJISContextAnalysis -+from .mbcssm import SJISSMModel - import constants, sys --from constants import eStart, eError, eItsMe -+from .constants import eStart, eError, eItsMe - - class SJISProber(MultiByteCharSetProber): - def __init__(self): ---- chardet\universaldetector.py (original) -+++ chardet\universaldetector.py (refactored) -@@ -27,10 +27,10 @@ - ######################### END LICENSE BLOCK ######################### - - import constants, sys --from latin1prober import Latin1Prober # windows-1252 --from mbcsgroupprober import MBCSGroupProber # multi-byte character sets --from sbcsgroupprober import SBCSGroupProber # single-byte character sets --from escprober import EscCharSetProber # ISO-2122, etc. -+from .latin1prober import Latin1Prober # windows-1252 -+from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets -+from .sbcsgroupprober import SBCSGroupProber # single-byte character sets -+from .escprober import EscCharSetProber # ISO-2122, etc. - import re - - MINIMUM_THRESHOLD = 0.20 ---- chardet\utf8prober.py (original) -+++ chardet\utf8prober.py (refactored) -@@ -26,10 +26,10 @@ - ######################### END LICENSE BLOCK ######################### - - import constants, sys --from constants import eStart, eError, eItsMe --from charsetprober import CharSetProber --from codingstatemachine import CodingStateMachine --from mbcssm import UTF8SMModel -+from .constants import eStart, eError, eItsMe -+from .charsetprober import CharSetProber -+from .codingstatemachine import CodingStateMachine -+from .mbcssm import UTF8SMModel - - ONE_CHAR_PROB = 0.5 - -RefactoringTool: Files that were modified: -RefactoringTool: chardet\__init__.py -RefactoringTool: chardet\big5prober.py -RefactoringTool: chardet\chardistribution.py -RefactoringTool: chardet\charsetgroupprober.py -RefactoringTool: chardet\codingstatemachine.py -RefactoringTool: chardet\constants.py -RefactoringTool: chardet\escprober.py -RefactoringTool: chardet\escsm.py -RefactoringTool: chardet\eucjpprober.py -RefactoringTool: chardet\euckrprober.py -RefactoringTool: chardet\euctwprober.py -RefactoringTool: chardet\gb2312prober.py -RefactoringTool: chardet\hebrewprober.py -RefactoringTool: chardet\jpcntx.py -RefactoringTool: chardet\langbulgarianmodel.py -RefactoringTool: chardet\langcyrillicmodel.py -RefactoringTool: chardet\langgreekmodel.py -RefactoringTool: chardet\langhebrewmodel.py -RefactoringTool: chardet\langhungarianmodel.py -RefactoringTool: chardet\langthaimodel.py -RefactoringTool: chardet\latin1prober.py -RefactoringTool: chardet\mbcharsetprober.py -RefactoringTool: chardet\mbcsgroupprober.py -RefactoringTool: chardet\mbcssm.py -RefactoringTool: chardet\sbcharsetprober.py -RefactoringTool: chardet\sbcsgroupprober.py -RefactoringTool: chardet\sjisprober.py -RefactoringTool: chardet\universaldetector.py -RefactoringTool: chardet\utf8prober.py - - - - -Now run the 2to3 script on the testing harness, test.py. - -C:\home\chardet>python c:\Python30\Tools\Scripts\2to3.py -w test.py -RefactoringTool: Skipping implicit fixer: buffer -RefactoringTool: Skipping implicit fixer: idioms -RefactoringTool: Skipping implicit fixer: set_literal -RefactoringTool: Skipping implicit fixer: ws_comma ---- test.py (original) -+++ test.py (refactored) -@@ -4,7 +4,7 @@ - count = 0 - u = UniversalDetector() - for f in glob.glob(sys.argv[1]): -- print f.ljust(60), -+ print(f.ljust(60), end=' ') - u.reset() - for line in file(f, 'rb'): - u.feed(line) -@@ -12,8 +12,8 @@ - u.close() - result = u.result - if result['encoding']: -- print result['encoding'], 'with confidence', result['confidence'] -+ print(result['encoding'], 'with confidence', result['confidence']) - else: -- print '******** no result' -+ print('******** no result') - count += 1 --print count, 'tests' -+print(count, 'tests') -RefactoringTool: Files that were modified: -RefactoringTool: test.py - - - - - -Well, that wasn't so hard! Just a few imports and print statements to convert. Time to run the new version. - -C:\home\chardet>python test.py tests\*\* -Traceback (most recent call last): - File "test.py", line 1, in - from chardet.universaldetector import UniversalDetector - File "C:\home\chardet\chardet\universaldetector.py", line 51 - self.done = constants.False - ^ -SyntaxError: invalid syntax - - - - -Hmm, a small snag. In Python 3, False is a reserved word, so you can't use it as a variable name. Let's look at constants.py to see where it's defined. Here's the original version from constants.py, before the 2to3 script changed it: - -import __builtin__ -if not hasattr(__builtin__, 'False'): - False = 0 - True = 1 -else: - False = __builtin__.False - True = __builtin__.True - -This piece of code is designed to allow this library to run under older versions of Python 2. Prior to Python 2.3 [FIXME-LINK], Python had no built-in Boolean type. This code detects the absence of the built-in constants True and False, and defines them if necessary. - -However, Python 3 will always have a Boolean type, so this entire code snippet is unnecessary. The simplest solution is to replace all instances of "constants.True" and "constants.False" with "True" and "False", respectively, then delete this dead code from constants.py. - -So this line in universaldetector.py: - - self.done = constants.False - -Becomes - - self.done = False - -Ah, wasn't that satisfying? The code is shorter and more readable already. - -OK, what's next? Time to run test.py again and see how far it gets. - -C:\home\chardet>python test.py tests\*\* -Traceback (most recent call last): - File "test.py", line 1, in - from chardet.universaldetector import UniversalDetector - File "C:\home\chardet\chardet\universaldetector.py", line 29, in - import constants, sys -ImportError: No module named constants - -What's that you say? No module named constants? Of course there's a module named constants. ... Oh wait, no there isn't. Remember when the 2to3 script fixed up all those import statements? This library has a lot of relative imports -- that is, modules that import other modules within the library. In Python 3, all import statements are absolute by default [FIXME-LINK PEP 0328]. To do relative imports, you must do something like this: - - from . import constants - -But wait. Wasn't the 2to3 script supposed to take care of these for you? Well, it did, but this particular import statement combined two different types of imports into one line: a relative import of the constants module within the library, and an absolute import of the sys module that is pre-installed in the Python standard library. In Python 2, you could combine these into one import statement. In Python 3, you can't, and the 2to3 script is not smart enough to split the import statement into two. - -The solution is to split the import statement manually. So this two-in-one import: - - import constants, sys - -Needs to become two separate imports: - - from . import constants - import sys - -There are variations of this problem scattered throughout the chardet library. In some places it's "import constants, sys"; in other places, it's "import constants, re". The fix is the same: manually split the import statement into two lines, one for the relative import, the other for the absolute import. - -Onward! - -C:\home\chardet>python test.py tests\*\* -tests\ascii\howto.diveintomark.org.xml -Traceback (most recent call last): - File "test.py", line 9, in - for line in file(f, 'rb'): -NameError: name 'file' is not defined - -This one surprised me, because I've been using this idiom as long as I can remember. In Python 2, the global file() function was an alias for open(), which was the standard way of opening files for reading. In Python 3, the entire system for reading and writing files has been refactored into the io module. [FIXME-LINK PEP 3116] I'll cover the new I/O module in more detail in Chapter FIXME, but for now, the important bit is that the global file() function no longer exists. However, the open() function does still exist. (Technically, it's an alias for io.open(), but never mind that right now.) - -Thus, the simple solution to the problem of the missing file() is to call open() instead: - - for line in open(f, 'rb'): - -And that's all I have to say about that. - -C:\home\chardet>python test.py tests\*\* -tests\ascii\howto.diveintomark.org.xml -Traceback (most recent call last): - File "test.py", line 10, in - u.feed(line) - File "C:\home\chardet\chardet\universaldetector.py", line 98, in feed - if self._highBitDetector.search(aBuf): -TypeError: can't use a string pattern on a bytes-like object - -Now things are starting to get interesting. And by "interesting," I mean "confusing as all hell." - -First, let's see what self._highBitDetector is. It's defined in the __init__ method of the UniversalDetector class: - -class UniversalDetector: - def __init__(self): - self._highBitDetector = re.compile(r'[\x80-\xFF]') - -This pre-compiles a regular expression designed to find non-ASCII characters in the range 128-255 (0x80-0xFF). Wait, that's not quite right; I need to be more precise with my terminology. This pattern is designed to find non-ASCII bytes in the range 128-255. And therein lies the problem. - -In Python 2, a string was an array of bytes whose character encoding was tracked separately. If you wanted Python 2 to keep track of the character encoding, you had to use a Unicode string (u'') instead. But in Python 3, a string is always what Python 2 called a Unicode string -- that is, an array of Unicode characters (of possibly varying byte lengths). Since this regular expression is defined by a string pattern, it can only be used to search a string -- again, an array of characters. But what we're searching is not a string, it's a byte array. Looking at the traceback, this error occurred in universaldetector.py: - -def feed(self, aBuf): - . - . - . - if self._mInputState == ePureAscii: - if self._highBitDetector.search(aBuf): - -And what is aBuf? Let's backtrack further to a place that calls UniversalDetector.feed(). One place that calls it is the test harness, test.py. - -u = UniversalDetector() -. -. -. -for line in open(f, 'rb'): - u.feed(line) - -And we have our answer: in the UniversalDetector.feed() method, aBuf is a line read from a file on disk. Look carefully at the parameters used to open the file: 'rb'. 'r' is for read; OK, big deal, we're reading the file. Ah, but 'b' is for 'bytes'. Without the 'b' flag, this for loop would read the file, line by line, and convert each line into a string -- an array of Unicode characters -- according to the system default character encoding. (You could override the system encoding with another parameter to open(), but never mind that for now.) But with the 'b' flag, this for loop reads the file, line by line, and stores each line exactly as it appears in the file, as an array of bytes. That byte array gets passed to UniversalDetector.feed(), and eventually gets passed to the pre-compiled regular expression, self._highBitDetector, to search for high-bit... characters. But we don't have characters; we have bytes. - -What we need this regular expression to search is not an array of characters, but an array of bytes. - -Once you realize that, the solution is not difficult. Regular expressions defined with strings can search strings. Regular expressions defined with byte arrays can search byte arrays. To define a byte array pattern, we simply change the type of the argument we use to define the regular expression to a byte array. So instead of this: - - self._highBitDetector = re.compile(r'[\x80-\xFF]') - -We now have this: - - self._highBitDetector = re.compile(b'[\x80-\xFF]') - -There is one other case of this same problem, on the very next line: - - self._escDetector = re.compile(r'(\033|~{)') - -Again, this is going to be used to search a byte array (the same aBuf variable, in fact), so the regular expression pattern needs to be defined as a byte array: - - self._escDetector = re.compile(b'(\033|~{)') - -Curiouser and curiouser... - -C:\home\chardet>python test.py tests\*\* -tests\ascii\howto.diveintomark.org.xml -Traceback (most recent call last): - File "test.py", line 10, in - u.feed(line) - File "C:\home\chardet\chardet\universaldetector.py", line 100, in feed - elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf): -TypeError: Can't convert 'bytes' object to str implicitly - diff --git a/dip3.css b/dip3.css index d2e68ff..c6088f8 100644 --- a/dip3.css +++ b/dip3.css @@ -4,9 +4,11 @@ a{background:transparent;text-decoration:none;border-bottom:1px dotted} a:hover{border-bottom:1px solid} a:link{color:#1b67c9} a:visited{color:darkorchid} +a[href^="http:"]:before,a[href^="https:"]:before{content:"\27A6 "} h1 a,h2 a,h3 a,#nav a{color:inherit !important} abbr,.p{border:0;letter-spacing:0.1em;text-transform:lowercase;font-variant:small-caps} h1,h2,h3,p,ul,ol,#nav{margin:1.75em 0} +li ol{margin:0} h1,h2,h3{font-size:medium} h1{background:papayawhip;color:#000;width:100%;margin:0} #index h2{margin-left:1.75em} @@ -20,14 +22,11 @@ blockquote{font-size:small;font-style:oblique;margin-left:2.154em} blockquote p{margin:2.154em 0} .c{text-align:center;clear:both;font-size:small} p.fancy:first-letter{float:left;background:transparent;color:gainsboro;padding:0.11em 4px 0 0;font:normal 4em/0.68 serif} -#arc{width:100%,border-collapse:collapse} -#arc th,#arc td{list-style:none;margin:0;padding:0} -#arc th{padding:0 1.75em 0 0;text-align:right;vertical-align:baseline} figure{display:block;text-align:center;margin:1.75em 0} figure img{display:block;margin:0 auto} section,article,footer{display:block} var{font-family:monospace;font-style:normal} -a.skip{font-size:small;display:block;margin:auto;text-align:center;border:0} +a.skip{font-size:small;display:block;margin:auto;text-align:right;border:0} table{width:100%;border-collapse:collapse} th{text-align:left;padding:0 0.5em;vertical-align:baseline;border:1px dotted} th,td{width:45%;vertical-align:top} @@ -36,6 +35,7 @@ th:first-child{width:10%;text-align:center} td{border:1px dotted;padding:0 0.5em} body{counter-reset:h1} h1:before{counter-increment:h1;content:counter(h1) ". "} +.appendix h1:before{content:""} h1{counter-reset:h2} h2:before{counter-increment:h2;content:counter(h1) "." counter(h2) ". "} h2{counter-reset:h3} diff --git a/humansize.py b/humansize.py index 77db8ff..ef94c93 100644 --- a/humansize.py +++ b/humansize.py @@ -1,36 +1,39 @@ -from optparse import OptionParser +"""Convert file sizes to human-readable form. -SUFFIXES = ('KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB') -MULTIPLES = {True: 1024, False: 1000} +Available functions: +human_size(size, a_kilobyte_is_1024_bytes) + takes a file size and returns a human-readable string -def human_size(size, use_binary_multiples=True): +Examples: +>>> human_size(1024) +'1.0 KiB' +>>> human_size(1000, False) +'1.0 KB' +""" + +SUFFIXES = {1000: ('KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'), + 1024: ('KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')} + +def human_size(size, a_kilobyte_is_1024_bytes=True): """Convert a file size to human-readable form. Keyword arguments: size -- file size in bytes - use_binary_multiples -- if False, use multiples of 1000 - if True, use multiples of 1024 (default=True) + a_kilobyte_is_1024_bytes -- if True (default), use multiples of 1024 + if False, use multiples of 1000 Returns: string """ - multiple = MULTIPLES[use_binary_multiples] - for suffix in SUFFIXES: + if size < 0: + raise ValueError('number must be non-negative') + multiple = 1024 if a_kilobyte_is_1024_bytes else 1000 + for suffix in SUFFIXES[multiple]: size /= multiple if size < multiple: return "{0:.1f} {1}".format(size, suffix) - return "Too large to contemplate!" + raise ValueError('number too large') if __name__ == "__main__": - parser = OptionParser() - parser.add_option("-d", "--decimal", - action="store_false", - dest="binary_multiples", - default=True, - help="use multiples of 1000 instead of 1024") - (options, args) = parser.parse_args() - if args: - print(human_size(int(args[0]), options.binary_multiples)) - else: - parser.print_help() - + print(human_size(1000000000000, False)) + print(human_size(1000000000000)) diff --git a/index.html b/index.html index 70b11c8..0348c57 100644 --- a/index.html +++ b/index.html @@ -55,9 +55,13 @@
-

Documenting your code

+

Writing readable code

+

Why bother?

Docstrings

+

Function annotations

+

Style conventions

+

...

@@ -582,179 +586,6 @@ -
-

Porting code to Python 3 with 2to3

- -
-

print statement

-
- -
-

<> comparison

-
- -
-

has_key() dictionary method

-
- -
-

Dictionary methods

-
- -
-

Modules that have been renamed or reorganized

-
- -
-

apply() global function

-
- -
-

intern() global function

-
- -
-

exec statement

-
- -
-

repr literals (backticks)

-
- -
-

try...except statement

-
- -
-

raise statement

-
- -
-

throw statement

-
- -
-

long data type

-
- -
-

xrange() global function

-
- -
-

raw_input() global function

-
- -
-

func_* function attributes

-
- -
-

xreadlines() I/O method

-
- -
-

lambda functions with multiple parameters

-
- -
-

__class__ special class attribute

-
- -
-

next() iterator method

-
- -
-

__nonzero__ special class attribute

-
- -
-

Number literals

-
- -
-

sys.maxint

-
- -
-

unicode() global function

-
- -
-

Unicode string literals

-
- -
-

callable() global function

-
- -
-

filter() global function

-
- -
-

map() global function

-
- -
-

zip() global function

-
- -
-

StandardError() exception

-
- -
-

types module constants

-
- -
-

basestring datatype

-
- -
-

itertools module

-
- -
-

Relative imports

-
- -
-

sys.exc_type, sys.exc_value, sys.exc_traceback

-
- -
-

List comprehensions over tuples

-
- -
-

os.getcwdu() function

-
- -
-

Metaclasses

-
- -
-

set() literals

-
- -
-

buffer() global function

-
- -
-

Whitespace around commas

-
- -
-

Common idioms

-
- -
-

Case study: porting chardet to Python 3

@@ -791,6 +622,8 @@

Packaging Python libraries

+ +

A brief history of packaging (and why it's harder than you think)

@@ -815,7 +648,6 @@

Platform-specific packaging

Packaging by Linux distributions

Py2exe

-

Psyco

@@ -824,7 +656,7 @@

Creating graphics with the Python Imaging Library

-

...if it gets ported...

+

...will likely get ported in time...

@@ -898,6 +730,10 @@ + +