diff --git a/case-study-porting-chardet-to-python-3.html b/case-study-porting-chardet-to-python-3.html new file mode 100644 index 0000000..1c0a3aa --- /dev/null +++ b/case-study-porting-chardet-to-python-3.html @@ -0,0 +1,701 @@ + + + + +Case study: porting chardet to Python 3 - Dive into Python 3 + + + +

Case study: porting chardet to Python 3

+ + + +
+ +

Running 2to3

+ +

We're going to migrate the chardet module from Python 2 to Python 3. Python 3 comes with a utility script to help with this, called 2to3. 2to3 takes your actual Python 2 source code as input, and auto-converts as much as it can to Python 3. [FIXME reference 2to3 chapter once it's done]

+ +

The chardet library is split across several different files, all in the same directory. The 2to3 script makes it easy to convert multiple files at once: just pass a directory as a command line argument, and 2to3 will convert each of the files in turn.

+ +
C:\home\chardet>python c:\Python30\Tools\Scripts\2to3.py -w chardet\
+RefactoringTool: Skipping implicit fixer: buffer
+RefactoringTool: Skipping implicit fixer: idioms
+RefactoringTool: Skipping implicit fixer: set_literal
+RefactoringTool: Skipping implicit fixer: ws_comma
+--- chardet\__init__.py (original)
++++ chardet\__init__.py (refactored)
+@@ -18,7 +18,7 @@
+ __version__ = "1.0.1"
+
+ def detect(aBuf):
+-    import universaldetector
++    from . import universaldetector
+     u = universaldetector.UniversalDetector()
+     u.reset()
+     u.feed(aBuf)
+--- chardet\big5prober.py (original)
++++ chardet\big5prober.py (refactored)
+@@ -25,10 +25,10 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from mbcharsetprober import MultiByteCharSetProber
+-from codingstatemachine import CodingStateMachine
+-from chardistribution import Big5DistributionAnalysis
+-from mbcssm import Big5SMModel
++from .mbcharsetprober import MultiByteCharSetProber
++from .codingstatemachine import CodingStateMachine
++from .chardistribution import Big5DistributionAnalysis
++from .mbcssm import Big5SMModel
+
+ class Big5Prober(MultiByteCharSetProber):
+     def __init__(self):
+--- chardet\chardistribution.py (original)
++++ chardet\chardistribution.py (refactored)
+@@ -25,12 +25,12 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-import constants
+-from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
+-from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
+-from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
+-from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
+-from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
++from . import constants
++from .euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
++from .euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
++from .gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
++from .big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
++from .jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
+
+ ENOUGH_DATA_THRESHOLD = 1024
+ SURE_YES = 0.99
+--- chardet\charsetgroupprober.py (original)
++++ chardet\charsetgroupprober.py (refactored)
+@@ -26,7 +26,7 @@
+ ######################### END LICENSE BLOCK #########################
+
+ import constants, sys
+-from charsetprober import CharSetProber
++from .charsetprober import CharSetProber
+
+ class CharSetGroupProber(CharSetProber):
+     def __init__(self):
+--- chardet\codingstatemachine.py (original)
++++ chardet\codingstatemachine.py (refactored)
+@@ -25,7 +25,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from constants import eStart, eError, eItsMe
++from .constants import eStart, eError, eItsMe
+
+ class CodingStateMachine:
+     def __init__(self, sm):
+--- chardet\constants.py (original)
++++ chardet\constants.py (refactored)
+@@ -38,10 +38,10 @@
+
+ SHORTCUT_THRESHOLD = 0.95
+
+-import __builtin__
++import builtins
+ if not hasattr(__builtin__, 'False'):
+     False = 0
+     True = 1
+ else:
+-    False = __builtin__.False
+-    True = __builtin__.True
++    False = builtins.False
++    True = builtins.True
+--- chardet\escprober.py (original)
++++ chardet\escprober.py (refactored)
+@@ -26,9 +26,9 @@
+ ######################### END LICENSE BLOCK #########################
+
+ import constants, sys
+-from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
+-from charsetprober import CharSetProber
+-from codingstatemachine import CodingStateMachine
++from .escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
++from .charsetprober import CharSetProber
++from .codingstatemachine import CodingStateMachine
+
+ class EscCharSetProber(CharSetProber):
+     def __init__(self):
+--- chardet\escsm.py (original)
++++ chardet\escsm.py (refactored)
+@@ -25,7 +25,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from constants import eStart, eError, eItsMe
++from .constants import eStart, eError, eItsMe
+
+ HZ_cls = ( \
+ 1,0,0,0,0,0,0,0,  # 00 - 07
+--- chardet\eucjpprober.py (original)
++++ chardet\eucjpprober.py (refactored)
+@@ -26,12 +26,12 @@
+ ######################### END LICENSE BLOCK #########################
+
+ import constants, sys
+-from constants import eStart, eError, eItsMe
+-from mbcharsetprober import MultiByteCharSetProber
+-from codingstatemachine import CodingStateMachine
+-from chardistribution import EUCJPDistributionAnalysis
+-from jpcntx import EUCJPContextAnalysis
+-from mbcssm import EUCJPSMModel
++from .constants import eStart, eError, eItsMe
++from .mbcharsetprober import MultiByteCharSetProber
++from .codingstatemachine import CodingStateMachine
++from .chardistribution import EUCJPDistributionAnalysis
++from .jpcntx import EUCJPContextAnalysis
++from .mbcssm import EUCJPSMModel
+
+ class EUCJPProber(MultiByteCharSetProber):
+     def __init__(self):
+--- chardet\euckrprober.py (original)
++++ chardet\euckrprober.py (refactored)
+@@ -25,10 +25,10 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from mbcharsetprober import MultiByteCharSetProber
+-from codingstatemachine import CodingStateMachine
+-from chardistribution import EUCKRDistributionAnalysis
+-from mbcssm import EUCKRSMModel
++from .mbcharsetprober import MultiByteCharSetProber
++from .codingstatemachine import CodingStateMachine
++from .chardistribution import EUCKRDistributionAnalysis
++from .mbcssm import EUCKRSMModel
+
+ class EUCKRProber(MultiByteCharSetProber):
+     def __init__(self):
+--- chardet\euctwprober.py (original)
++++ chardet\euctwprober.py (refactored)
+@@ -25,10 +25,10 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from mbcharsetprober import MultiByteCharSetProber
+-from codingstatemachine import CodingStateMachine
+-from chardistribution import EUCTWDistributionAnalysis
+-from mbcssm import EUCTWSMModel
++from .mbcharsetprober import MultiByteCharSetProber
++from .codingstatemachine import CodingStateMachine
++from .chardistribution import EUCTWDistributionAnalysis
++from .mbcssm import EUCTWSMModel
+
+ class EUCTWProber(MultiByteCharSetProber):
+     def __init__(self):
+--- chardet\gb2312prober.py (original)
++++ chardet\gb2312prober.py (refactored)
+@@ -25,10 +25,10 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from mbcharsetprober import MultiByteCharSetProber
+-from codingstatemachine import CodingStateMachine
+-from chardistribution import GB2312DistributionAnalysis
+-from mbcssm import GB2312SMModel
++from .mbcharsetprober import MultiByteCharSetProber
++from .codingstatemachine import CodingStateMachine
++from .chardistribution import GB2312DistributionAnalysis
++from .mbcssm import GB2312SMModel
+
+ class GB2312Prober(MultiByteCharSetProber):
+     def __init__(self):
+--- chardet\hebrewprober.py (original)
++++ chardet\hebrewprober.py (refactored)
+@@ -25,8 +25,8 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from charsetprober import CharSetProber
+-import constants
++from .charsetprober import CharSetProber
++from . import constants
+
+ # This prober doesn't actually recognize a language or a charset.
+ # It is a helper prober for the use of the Hebrew model probers
+--- chardet\jpcntx.py (original)
++++ chardet\jpcntx.py (refactored)
+@@ -25,7 +25,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-import constants
++from . import constants
+
+ NUM_OF_CATEGORY = 6
+ DONT_KNOW = -1
+--- chardet\langbulgarianmodel.py (original)
++++ chardet\langbulgarianmodel.py (refactored)
+@@ -25,7 +25,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-import constants
++from . import constants
+
+ # 255: Control characters that usually does not exist in any text
+ # 254: Carriage/Return
+--- chardet\langcyrillicmodel.py (original)
++++ chardet\langcyrillicmodel.py (refactored)
+@@ -25,7 +25,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-import constants
++from . import constants
+
+ # KOI8-R language model
+ # Character Mapping Table:
+--- chardet\langgreekmodel.py (original)
++++ chardet\langgreekmodel.py (refactored)
+@@ -25,7 +25,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-import constants
++from . import constants
+
+ # 255: Control characters that usually does not exist in any text
+ # 254: Carriage/Return
+--- chardet\langhebrewmodel.py (original)
++++ chardet\langhebrewmodel.py (refactored)
+@@ -27,7 +27,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-import constants
++from . import constants
+
+ # 255: Control characters that usually does not exist in any text
+ # 254: Carriage/Return
+--- chardet\langhungarianmodel.py (original)
++++ chardet\langhungarianmodel.py (refactored)
+@@ -25,7 +25,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-import constants
++from . import constants
+
+ # 255: Control characters that usually does not exist in any text
+ # 254: Carriage/Return
+--- chardet\langthaimodel.py (original)
++++ chardet\langthaimodel.py (refactored)
+@@ -25,7 +25,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-import constants
++from . import constants
+
+ # 255: Control characters that usually does not exist in any text
+ # 254: Carriage/Return
+--- chardet\latin1prober.py (original)
++++ chardet\latin1prober.py (refactored)
+@@ -26,8 +26,8 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from charsetprober import CharSetProber
+-import constants
++from .charsetprober import CharSetProber
++from . import constants
+ import operator
+
+ FREQ_CAT_NUM = 4
+--- chardet\mbcharsetprober.py (original)
++++ chardet\mbcharsetprober.py (refactored)
+@@ -28,8 +28,8 @@
+ ######################### END LICENSE BLOCK #########################
+
+ import constants, sys
+-from constants import eStart, eError, eItsMe
+-from charsetprober import CharSetProber
++from .constants import eStart, eError, eItsMe
++from .charsetprober import CharSetProber
+
+ class MultiByteCharSetProber(CharSetProber):
+     def __init__(self):
+--- chardet\mbcsgroupprober.py (original)
++++ chardet\mbcsgroupprober.py (refactored)
+@@ -27,14 +27,14 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from charsetgroupprober import CharSetGroupProber
+-from utf8prober import UTF8Prober
+-from sjisprober import SJISProber
+-from eucjpprober import EUCJPProber
+-from gb2312prober import GB2312Prober
+-from euckrprober import EUCKRProber
+-from big5prober import Big5Prober
+-from euctwprober import EUCTWProber
++from .charsetgroupprober import CharSetGroupProber
++from .utf8prober import UTF8Prober
++from .sjisprober import SJISProber
++from .eucjpprober import EUCJPProber
++from .gb2312prober import GB2312Prober
++from .euckrprober import EUCKRProber
++from .big5prober import Big5Prober
++from .euctwprober import EUCTWProber
+
+ class MBCSGroupProber(CharSetGroupProber):
+     def __init__(self):
+--- chardet\mbcssm.py (original)
++++ chardet\mbcssm.py (refactored)
+@@ -25,7 +25,7 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from constants import eStart, eError, eItsMe
++from .constants import eStart, eError, eItsMe
+
+ # BIG5
+
+--- chardet\sbcharsetprober.py (original)
++++ chardet\sbcharsetprober.py (refactored)
+@@ -27,7 +27,7 @@
+ ######################### END LICENSE BLOCK #########################
+
+ import constants, sys
+-from charsetprober import CharSetProber
++from .charsetprober import CharSetProber
+
+ SAMPLE_SIZE = 64
+ SB_ENOUGH_REL_THRESHOLD = 1024
+--- chardet\sbcsgroupprober.py (original)
++++ chardet\sbcsgroupprober.py (refactored)
+@@ -27,15 +27,15 @@
+ ######################### END LICENSE BLOCK #########################
+
+ import constants, sys
+-from charsetgroupprober import CharSetGroupProber
+-from sbcharsetprober import SingleByteCharSetProber
+-from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
+-from langgreekmodel import Latin7GreekModel, Win1253GreekModel
+-from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
+-from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
+-from langthaimodel import TIS620ThaiModel
+-from langhebrewmodel import Win1255HebrewModel
+-from hebrewprober import HebrewProber
++from .charsetgroupprober import CharSetGroupProber
++from .sbcharsetprober import SingleByteCharSetProber
++from .langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
++from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
++from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
++from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
++from .langthaimodel import TIS620ThaiModel
++from .langhebrewmodel import Win1255HebrewModel
++from .hebrewprober import HebrewProber
+
+ class SBCSGroupProber(CharSetGroupProber):
+     def __init__(self):
+--- chardet\sjisprober.py (original)
++++ chardet\sjisprober.py (refactored)
+@@ -25,13 +25,13 @@
+ # 02110-1301  USA
+ ######################### END LICENSE BLOCK #########################
+
+-from mbcharsetprober import MultiByteCharSetProber
+-from codingstatemachine import CodingStateMachine
+-from chardistribution import SJISDistributionAnalysis
+-from jpcntx import SJISContextAnalysis
+-from mbcssm import SJISSMModel
++from .mbcharsetprober import MultiByteCharSetProber
++from .codingstatemachine import CodingStateMachine
++from .chardistribution import SJISDistributionAnalysis
++from .jpcntx import SJISContextAnalysis
++from .mbcssm import SJISSMModel
+ import constants, sys
+-from constants import eStart, eError, eItsMe
++from .constants import eStart, eError, eItsMe
+
+ class SJISProber(MultiByteCharSetProber):
+     def __init__(self):
+--- chardet\universaldetector.py (original)
++++ chardet\universaldetector.py (refactored)
+@@ -27,10 +27,10 @@
+ ######################### END LICENSE BLOCK #########################
+
+ import constants, sys
+-from latin1prober import Latin1Prober # windows-1252
+-from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
+-from sbcsgroupprober import SBCSGroupProber # single-byte character sets
+-from escprober import EscCharSetProber # ISO-2122, etc.
++from .latin1prober import Latin1Prober # windows-1252
++from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
++from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
++from .escprober import EscCharSetProber # ISO-2122, etc.
+ import re
+
+ MINIMUM_THRESHOLD = 0.20
+--- chardet\utf8prober.py (original)
++++ chardet\utf8prober.py (refactored)
+@@ -26,10 +26,10 @@
+ ######################### END LICENSE BLOCK #########################
+
+ import constants, sys
+-from constants import eStart, eError, eItsMe
+-from charsetprober import CharSetProber
+-from codingstatemachine import CodingStateMachine
+-from mbcssm import UTF8SMModel
++from .constants import eStart, eError, eItsMe
++from .charsetprober import CharSetProber
++from .codingstatemachine import CodingStateMachine
++from .mbcssm import UTF8SMModel
+
+ ONE_CHAR_PROB = 0.5
+
+RefactoringTool: Files that were modified:
+RefactoringTool: chardet\__init__.py
+RefactoringTool: chardet\big5prober.py
+RefactoringTool: chardet\chardistribution.py
+RefactoringTool: chardet\charsetgroupprober.py
+RefactoringTool: chardet\codingstatemachine.py
+RefactoringTool: chardet\constants.py
+RefactoringTool: chardet\escprober.py
+RefactoringTool: chardet\escsm.py
+RefactoringTool: chardet\eucjpprober.py
+RefactoringTool: chardet\euckrprober.py
+RefactoringTool: chardet\euctwprober.py
+RefactoringTool: chardet\gb2312prober.py
+RefactoringTool: chardet\hebrewprober.py
+RefactoringTool: chardet\jpcntx.py
+RefactoringTool: chardet\langbulgarianmodel.py
+RefactoringTool: chardet\langcyrillicmodel.py
+RefactoringTool: chardet\langgreekmodel.py
+RefactoringTool: chardet\langhebrewmodel.py
+RefactoringTool: chardet\langhungarianmodel.py
+RefactoringTool: chardet\langthaimodel.py
+RefactoringTool: chardet\latin1prober.py
+RefactoringTool: chardet\mbcharsetprober.py
+RefactoringTool: chardet\mbcsgroupprober.py
+RefactoringTool: chardet\mbcssm.py
+RefactoringTool: chardet\sbcharsetprober.py
+RefactoringTool: chardet\sbcsgroupprober.py
+RefactoringTool: chardet\sjisprober.py
+RefactoringTool: chardet\universaldetector.py
+RefactoringTool: chardet\utf8prober.py
+ +

Now run the 2to3 script on the testing harness, test.py.

+ +
C:\home\chardet>python c:\Python30\Tools\Scripts\2to3.py -w test.py
+RefactoringTool: Skipping implicit fixer: buffer
+RefactoringTool: Skipping implicit fixer: idioms
+RefactoringTool: Skipping implicit fixer: set_literal
+RefactoringTool: Skipping implicit fixer: ws_comma
+--- test.py (original)
++++ test.py (refactored)
+@@ -4,7 +4,7 @@
+ count = 0
+ u = UniversalDetector()
+ for f in glob.glob(sys.argv[1]):
+-    print f.ljust(60),
++    print(f.ljust(60), end=' ')
+     u.reset()
+     for line in file(f, 'rb'):
+         u.feed(line)
+@@ -12,8 +12,8 @@
+     u.close()
+     result = u.result
+     if result['encoding']:
+-        print result['encoding'], 'with confidence', result['confidence']
++        print(result['encoding'], 'with confidence', result['confidence'])
+     else:
+-        print '******** no result'
++        print('******** no result')
+     count += 1
+-print count, 'tests'
++print(count, 'tests')
+RefactoringTool: Files that were modified:
+RefactoringTool: test.py
+ +

Well, that wasn't so hard. Just a few imports and print statements to convert. Time to run the new version. Do you think it'll work?

+
+ +
+

False is invalid syntax

+ +

Now for the real test: running the test harness against the test suite. Since the test suite is designed to cover all the possible code paths, it's a good way to test our ported code to make sure there aren't any bugs lurking anywhere.

+ +
C:\home\chardet>python test.py tests\*\*
+Traceback (most recent call last):
+  File "test.py", line 1, in <module>
+    from chardet.universaldetector import UniversalDetector
+  File "C:\home\chardet\chardet\universaldetector.py", line 51
+    self.done = constants.False
+                              ^
+SyntaxError: invalid syntax
+ +

Hmm, a small snag. In Python 3, False is a reserved word, so you can't use it as a variable name -- or, as in constants.False, as an attribute name. Let's look at constants.py to see where it's defined. Here's the original version from constants.py, before the 2to3 script changed it:

+ +
import __builtin__
+if not hasattr(__builtin__, 'False'):
+    False = 0
+    True = 1
+else:
+    False = __builtin__.False
+    True = __builtin__.True
+ +

This piece of code is designed to allow this library to run under older versions of Python 2. Prior to Python 2.3 [FIXME-LINK], Python had no built-in Boolean type. This code detects the absence of the built-in constants True and False, and defines them if necessary.

+ +

However, Python 3 will always have a Boolean type, so this entire code snippet is unnecessary. The simplest solution is to replace all instances of "constants.True" and "constants.False" with "True" and "False", respectively, then delete this dead code from constants.py.

+ +

So this line in universaldetector.py:

+ +
self.done = constants.False
+ +

Becomes

+ +
self.done = False
+ +

Ah, wasn't that satisfying? The code is shorter and more readable already.

+
+ +
+

No module named constants

+ +

Time to run test.py again and see how far it gets.

+ +
C:\home\chardet>python test.py tests\*\*
+Traceback (most recent call last):
+  File "test.py", line 1, in <module>
+    from chardet.universaldetector import UniversalDetector
+  File "C:\home\chardet\chardet\universaldetector.py", line 29, in <module>
+    import constants, sys
+ImportError: No module named constants
+ +

What's that you say? No module named constants? Of course there's a module named constants. ... Oh wait, no there isn't. Remember when the 2to3 script fixed up all those import statements? This library has a lot of relative imports -- that is, modules that import other modules within the library. In Python 3, all import statements are absolute by default [FIXME-LINK PEP 0328]. To do relative imports, you need to do something like this instead:

+ +
from . import constants
+ +

But wait. Wasn't the 2to3 script supposed to take care of these for you? Well, it did, but this particular import statement combines two different types of imports into one line: a relative import of the constants module within the library, and an absolute import of the sys module that is pre-installed in the Python standard library. In Python 2, you could combine these into one import statement. In Python 3, you can't, and the 2to3 script is not smart enough to split the import statement into two.

+ +

The solution is to split the import statement manually. So this two-in-one import:

+ +
import constants, sys
+ +

Needs to become two separate imports:

+ +
from . import constants
+import sys
+ +

There are variations of this problem scattered throughout the chardet library. In some places it's "import constants, sys"; in other places, it's "import constants, re". The fix is the same: manually split the import statement into two lines, one for the relative import, the other for the absolute import.

+ +

Onward!

+
+ +
+

Name 'file' is not defined

+ +
C:\home\chardet>python test.py tests\*\*
+tests\ascii\howto.diveintomark.org.xml
+Traceback (most recent call last):
+  File "test.py", line 9, in <module>
+    for line in file(f, 'rb'):
+NameError: name 'file' is not defined
+ +

This one surprised me, because I've been using this idiom as long as I can remember. In Python 2, the global file() function was an alias for open(), which was the standard way of opening files for reading. In Python 3, the entire system for reading and writing files has been refactored into the io module. [FIXME-LINK PEP 3116] I'll cover the new I/O module in more detail in Chapter FIXME, but for now, the important bit is that the global file() function no longer exists. However, the open() function does still exist. (Technically, it's an alias for io.open(), but never mind that right now.)

+ +

Thus, the simplest solution to the problem of the missing file() is to call open() instead:

+ +
for line in open(f, 'rb'):
+ +

And that's all I have to say about that.

+
+ +
+

Can't use a string pattern on a bytes-like object

+ +

FIXME intro

+ +
C:\home\chardet>python test.py tests\*\*
+tests\ascii\howto.diveintomark.org.xml
+Traceback (most recent call last):
+  File "test.py", line 10, in <module>
+    u.feed(line)
+  File "C:\home\chardet\chardet\universaldetector.py", line 98, in feed
+    if self._highBitDetector.search(aBuf):
+TypeError: can't use a string pattern on a bytes-like object
+ +

Now things are starting to get interesting. And by "interesting," I mean "confusing as all hell."

+ +

First, let's see what self._highBitDetector is. It's defined in the __init__ method of the UniversalDetector class:

+ +
class UniversalDetector:
+    def __init__(self):
+        self._highBitDetector = re.compile(r'[\x80-\xFF]')
+ +

This pre-compiles a regular expression designed to find non-ASCII characters in the range 128-255 (0x80-0xFF). Wait, that's not quite right; I need to be more precise with my terminology. This pattern is designed to find non-ASCII bytes in the range 128-255.

+ +

And therein lies the problem.

+ +

In Python 2, a string was an array of bytes whose character encoding was tracked separately. If you wanted Python 2 to keep track of the character encoding, you had to use a Unicode string (u'') instead. But in Python 3, a string is always what Python 2 called a Unicode string -- that is, an array of Unicode characters (of possibly varying byte lengths). Since this regular expression is defined by a string pattern, it can only be used to search a string -- again, an array of characters. But what we're searching is not a string, it's a byte array. Looking at the traceback, this error occurred in universaldetector.py:

+ +
def feed(self, aBuf):
+    .
+    .
+    .
+    if self._mInputState == ePureAscii:
+        if self._highBitDetector.search(aBuf):
+ +

And what is aBuf? Let's backtrack further to a place that calls UniversalDetector.feed(). One place that calls it is the test harness, test.py.

+ +
u = UniversalDetector()
+.
+.
+.
+for line in open(f, 'rb'):
+    u.feed(line)
+ +

And here we find our answer: in the UniversalDetector.feed() method, aBuf is a line read from a file on disk. Look carefully at the parameters used to open the file: 'rb'. 'r' is for "read"; OK, big deal, we're reading the file. Ah, but 'b' is for "binary." Without the 'b' flag, this for loop would read the file, line by line, and convert each line into a string -- an array of Unicode characters -- according to the system default character encoding. (You could override the system encoding with another parameter to open(), but never mind that for now.) But with the 'b' flag, this for loop reads the file, line by line, and stores each line exactly as it appears in the file, as an array of bytes. That byte array gets passed to UniversalDetector.feed(), and eventually gets passed to the pre-compiled regular expression, self._highBitDetector, to search for high-bit... characters. But we don't have characters; we have bytes. Oops.

+ +

What we need this regular expression to search is not an array of characters, but an array of bytes.

+ +

Once you realize that, the solution is not difficult. Regular expressions defined with strings can search strings. Regular expressions defined with byte arrays can search byte arrays. To define a byte array pattern, we simply change the type of the argument we use to define the regular expression to a byte array. So instead of this:

+ +
self._highBitDetector = re.compile(r'[\x80-\xFF]')
+ +

We now have this:

+ +
self._highBitDetector = re.compile(b'[\x80-\xFF]')
+ +

There is one other case of this same problem, on the very next line:

+ +
self._escDetector = re.compile(r'(\033|~{)')
+ +

Again, this is going to be used to search a byte array (the same aBuf variable, in fact), so the regular expression pattern needs to be defined as a byte array:

+ +
self._escDetector = re.compile(b'(\033|~{)')
+
+ +
+

Can't convert 'bytes' object to str implicitly

+ +

Curiouser and curiouser...

+ +
C:\home\chardet>python test.py tests\*\*
+tests\ascii\howto.diveintomark.org.xml
+Traceback (most recent call last):
+  File "test.py", line 10, in <module>
+    u.feed(line)
+  File "C:\home\chardet\chardet\universaldetector.py", line 100, in feed
+    elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
+TypeError: Can't convert 'bytes' object to str implicitly
+ +
+ + diff --git a/dip2.html b/dip2.html index 1c33d67..ce33600 100644 --- a/dip2.html +++ b/dip2.html @@ -346,7 +346,7 @@ several months behind in updating their ActivePython installer when new version PythonWin 2.2.2 (#37, Nov 26 2002, 10:24:37) [MSC 32 bit (Intel)] on win32. Portions Copyright 1994-2001 Mark Hammond (mhammond@skippinet.com.au) - see 'Help/About PythonWin' for further copyright information. - class="prompt">>>> +>>>

Procedure 1.2. Option 2: Installing Python from Python.org

@@ -383,7 +383,7 @@ Type "copyright", "credits" or "license()" for more information. **************************************************************** IDLE 1.0 - class="prompt">>>> +>>>
@@ -418,7 +418,7 @@ Welcome to Darwin! Python 2.2 (#1, 07/14/02, 23:25:09) [GCC Apple cpp-precomp 6.14] on darwin Type "help", "copyright", "credits", or "license" for more information. - class="prompt">>>> [press Ctrl+D to get back to the command prompt] +>>> [press Ctrl+D to get back to the command prompt] [localhost:~] you%
@@ -458,7 +458,7 @@ Window->Python Interactive (Cmd-0). The opening win [GCC 3.1 20020420 (prerelease)] Type "copyright", "credits" or "license" for more information. MacPython IDE 1.0.1 - class="prompt">>>> +>>>

Note that once you install the latest version, the pre-installed version is still present. If you are running scripts from the command line, you need to be aware which version of Python you are using.

@@ -467,12 +467,12 @@ the command line, you need to be aware which version of Python you are using.

Python 2.2 (#1, 07/14/02, 23:25:09) [GCC Apple cpp-precomp 6.14] on darwin Type "help", "copyright", "credits", or "license" for more information. - class="prompt">>>> [press Ctrl+D to get back to the command prompt] +>>> [press Ctrl+D to get back to the command prompt] [localhost:~] you% /usr/local/bin/python Python 2.3 (#2, Jul 30 2003, 11:45:28) [GCC 3.1 20020420 (prerelease)] on darwin Type "help", "copyright", "credits", or "license" for more information. - class="prompt">>>> [press Ctrl+D to get back to the command prompt] +>>> [press Ctrl+D to get back to the command prompt] [localhost:~] you% @@ -512,7 +512,7 @@ Window->Python Interactive (Cmd-0). You'll see a sc [GCC 3.1 20020420 (prerelease)] Type "copyright", "credits" or "license" for more information. MacPython IDE 1.0.1 - class="prompt">>>> +>>>
@@ -529,19 +529,19 @@ Connecting to python.org[194.109.137.226]:80... connected. HTTP request sent, awaiting response... 200 OK Length: 7,495,111 [application/octet-stream] ... - class="prompt">[root@localhost root]# rpm -Uvh python2.3-2.3-5pydotorg.i386.rpm +[root@localhost root]# rpm -Uvh python2.3-2.3-5pydotorg.i386.rpm Preparing... ########################################### [100%] 1:python2.3 ########################################### [100%] - class="prompt">[root@localhost root]# python 1 +[root@localhost root]# python 1 Python 2.2.2 (#1, Feb 24 2003, 19:13:11) [GCC 3.2.2 20030222 (Red Hat Linux 3.2.2-4)] on linux2 Type "help", "copyright", "credits", or "license" for more information. - class="prompt">>>> [press Ctrl+D to exit] +>>> [press Ctrl+D to exit] [root@localhost root]# python2.3 2 Python 2.3 (#1, Sep 12 2003, 10:53:56) [GCC 3.2.2 20030222 (Red Hat Linux 3.2.2-5)] on linux2 Type "help", "copyright", "credits", or "license" for more information. - class="prompt">>>> [press Ctrl+D to exit] +>>> [press Ctrl+D to exit] [root@localhost root]# which python2.3 3 /usr/bin/python2.3
@@ -586,7 +586,7 @@ The following NEW packages will be installed: 0 upgraded, 2 newly installed, 0 to remove and 3 not upgraded. Need to get 0B/2880kB of archives. After unpacking 9351kB of additional disk space will be used. - class="prompt">Do you want to continue? [Y/n] Y +Do you want to continue? [Y/n] Y Selecting previously deselected package python2.3. (Reading database ... 22848 files and directories currently installed.) Unpacking python2.3 (from .../python2.3_2.3.1-1_i386.deb) ... @@ -596,13 +596,13 @@ Setting up python (2.3.1-1) ... Setting up python2.3 (2.3.1-1) ... Compiling python modules in /usr/lib/python2.3 ... Compiling optimized python modules in /usr/lib/python2.3 ... - class="prompt">localhost:~# exit +localhost:~# exit logout localhost:~$ python Python 2.3.1 (#2, Sep 24 2003, 11:39:14) [GCC 3.3.2 20030908 (Debian prerelease)] on linux2 Type "help", "copyright", "credits" or "license" for more information. - class="prompt">>>> [press Ctrl+D to exit] +>>> [press Ctrl+D to exit]
@@ -617,14 +617,14 @@ Connecting to www.python.org[194.109.137.226]:80... connected. HTTP request sent, awaiting response... 200 OK Length: 8,436,880 [application/x-tar] ... - class="prompt">localhost:~# tar xfz Python-2.3.tgz +localhost:~# tar xfz Python-2.3.tgz localhost:~# cd Python-2.3 localhost:~/Python-2.3# ./configure checking MACHDEP... linux2 checking EXTRAPLATDIR... checking for --without-gcc... no ... - class="prompt">localhost:~/Python-2.3# make +localhost:~/Python-2.3# make gcc -pthread -c -fno-strict-aliasing -DNDEBUG -g -O3 -Wall -Wstrict-prototypes -I. -I./Include -DPy_BUILD_CORE -o Modules/python.o Modules/python.c gcc -pthread -c -fno-strict-aliasing -DNDEBUG -g -O3 -Wall -Wstrict-prototypes @@ -632,10 +632,10 @@ gcc -pthread -c -fno-strict-aliasing -DNDEBUG -g -O3 -Wall -Wstrict-prototypes gcc -pthread -c -fno-strict-aliasing -DNDEBUG -g -O3 -Wall -Wstrict-prototypes -I. -I./Include -DPy_BUILD_CORE -o Parser/grammar1.o Parser/grammar1.c ... - class="prompt">localhost:~/Python-2.3# make install +localhost:~/Python-2.3# make install /usr/bin/install -c python /usr/local/bin/python2.3 ... - class="prompt">localhost:~/Python-2.3# exit +localhost:~/Python-2.3# exit logout localhost:~$ which python /usr/local/bin/python @@ -643,7 +643,7 @@ logout Python 2.3.1 (#2, Sep 24 2003, 11:39:14) [GCC 3.3.2 20030908 (Debian prerelease)] on linux2 Type "help", "copyright", "credits" or "license" for more information. - class="prompt">>>> [press Ctrl+D to get back to the command prompt] +>>> [press Ctrl+D to get back to the command prompt] localhost:~$
@@ -892,7 +892,7 @@ Returns string.
['', '/usr/local/lib/python2.2', '/usr/local/lib/python2.2/plat-linux2', '/usr/local/lib/python2.2/lib-dynload', '/usr/local/lib/python2.2/site-packages', '/usr/local/lib/python2.2/site-packages/PIL', '/usr/local/lib/python2.2/site-packages/piddle'] - class="prompt">>>> sys 3 +>>> sys 3 <module 'sys' (built-in)> >>> sys.path.append('/my/new/path') 4
@@ -1227,7 +1227,7 @@ KeyError: mpilgrim

Example 3.5. Deleting Items from a Dictionary

>>> d
 {'server': 'mpilgrim', 'uid': 'sa', 'database': 'master',
 42: 'douglas', 'retrycount': 3}
- class="prompt">>>> del d[42] 1
+>>> del d[42] 1
 >>> d
 {'server': 'mpilgrim', 'uid': 'sa', 'database': 'master', 'retrycount': 3}
 >>> d.clear() 2
@@ -1504,7 +1504,7 @@ KeyError: mpilgrim
Traceback (innermost last): File "<interactive input>", line 1, in ? ValueError: list.index(x): x not in list - class="prompt">>>> "c" in li 4 +>>> "c" in li 4 False
@@ -1573,7 +1573,7 @@ False
Traceback (innermost last): File "<interactive input>", line 1, in ? ValueError: list.remove(x): x not in list - class="prompt">>>> li.pop() 4 +>>> li.pop() 4 'elements' >>> li ['a', 'b', 'mpilgrim', 'example', 'new', 'two']
@@ -1705,15 +1705,15 @@ ValueError: list.remove(x): x not in list Traceback (innermost last): File "<interactive input>", line 1, in ? AttributeError: 'tuple' object has no attribute 'append' - class="prompt">>>> t.remove("z") 2 +>>> t.remove("z") 2 Traceback (innermost last): File "<interactive input>", line 1, in ? AttributeError: 'tuple' object has no attribute 'remove' - class="prompt">>>> t.index("example") 3 +>>> t.index("example") 3 Traceback (innermost last): File "<interactive input>", line 1, in ? AttributeError: 'tuple' object has no attribute 'index' - class="prompt">>>> "z" in t 4 +>>> "z" in t 4 True
@@ -1809,7 +1809,7 @@ a matter of style.

Traceback (innermost last): File "<interactive input>", line 1, in ? NameError: There is no variable named 'x' - class="prompt">>>> x = 1 +>>> x = 1 >>> x 1

You will thank Python for this one day.

@@ -2487,7 +2487,7 @@ True
>>> dir(li) 1 ['append', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort'] - class="prompt">>>> d = {} +>>> d = {} >>> dir(d) 2 ['clear', 'copy', 'get', 'has_key', 'items', 'keys', 'setdefault', 'update', 'values'] >>> import odbchelper @@ -3035,7 +3035,7 @@ a lambda function; if you need something more compl >>> print s this is a test - class="prompt">>>> print s.split() 2 +>>> print s.split() 2 ['this', 'is', 'a', 'test'] >>> print " ".join(s.split()) 3 'this is a test'
@@ -3450,7 +3450,7 @@ can import individual items or use from Traceback (innermost last): File "<interactive input>", line 1, in ? NameError: There is no variable named 'FunctionType' - class="prompt">>>> from types import FunctionType 3 +>>> from types import FunctionType 3 >>> FunctionType 4 <type 'function'>
@@ -4077,7 +4077,7 @@ provide a way to map non-method-calling syntax into method calls.

{'album': 'Rave Mix', 'artist': '***DJ MARY-JANE***', 'genre': 31, 'title': 'KAIRO****THE BEST GOA', 'name': '/music/_singles/kairo.mp3', 'year': '2000', 'comment': 'http://mp3.com/DJMARYJANE'} - class="prompt">>>> mp3file["name"] = "/music/_singles/sidewinder.mp3" 3 +>>> mp3file["name"] = "/music/_singles/sidewinder.mp3" 3 >>> mp3file {'album': '', 'artist': 'The Cynic Project', 'genre': 18, 'title': 'Sidewinder', 'name': '/music/_singles/sidewinder.mp3', 'year': '2000', @@ -4206,7 +4206,7 @@ class MP3FileInfo(FileInfo): 'year': (93, 97, <function stripnulls at 0260C8D4>), 'comment': (97, 126, <function stripnulls at 0260C8D4>), 'album': (63, 93, <function stripnulls at 0260C8D4>)} - class="prompt">>>> m = fileinfo.MP3FileInfo() 3 +>>> m = fileinfo.MP3FileInfo() 3 >>> m.tagDataMap {'title': (3, 33, <function stripnulls at 0260C8D4>), 'genre': (127, 128, <built-in function ord>), @@ -4421,7 +4421,7 @@ a line of code may raise an exception, you should handle the exception using a < Traceback (innermost last): File "<interactive input>", line 1, in ? IOError: [Errno 2] No such file or directory: '/notthere' - class="prompt">>>> try: +>>> try: ... fsock = open("/notthere") 2 ... except IOError: 3 ... print "The file does not exist, exiting gracefully" @@ -4597,7 +4597,7 @@ exceptions, errors occur immediately, and you can handle them in a standard way >>> tagData 'TAGKAIRO****THE BEST GOA ***DJ MARY-JANE*** Rave Mix 2000http://mp3.com/DJMARYJANE \037' - class="prompt">>>> f.tell() 5 +>>> f.tell() 5 7543037
@@ -4657,15 +4657,15 @@ True Traceback (innermost last): File "<interactive input>", line 1, in ? ValueError: I/O operation on closed file - class="prompt">>>> f.tell() +>>> f.tell() Traceback (innermost last): File "<interactive input>", line 1, in ? ValueError: I/O operation on closed file - class="prompt">>>> f.read() +>>> f.read() Traceback (innermost last): File "<interactive input>", line 1, in ? ValueError: I/O operation on closed file - class="prompt">>>> f.close() 5
+>>> f.close() 5
1 @@ -4849,7 +4849,7 @@ or other iteratable entities. But in Python, a for a b e - class="prompt">>>> print "\n".join(li) 3 +>>> print "\n".join(li) 3 a b e
@@ -4884,7 +4884,7 @@ e
2 3 4 - class="prompt">>>> li = ['a', 'b', 'c', 'd', 'e'] +>>> li = ['a', 'b', 'c', 'd', 'e'] >>> for i in range(len(li)): 2 ... print li[i] a @@ -4921,7 +4921,7 @@ COMPUTERNAME=MPILGRIM USERNAME=mpilgrim [...snip...] - class="prompt">>>> print "\n".join(["%s=%s" % (k, v) +>>> print "\n".join(["%s=%s" % (k, v) ... for k, v in os.environ.items()]) 3 USERPROFILE=C:\Documents and Settings\mpilgrim OS=Windows_NT @@ -5044,7 +5044,7 @@ site signal UserDict stat - class="prompt">>>> fileinfo +>>> fileinfo <module 'fileinfo' from 'fileinfo.pyc'> >>> sys.modules["fileinfo"] 2 <module 'fileinfo' from 'fileinfo.pyc'>
@@ -5228,18 +5228,18 @@ stat ['a_time_long_forgotten_con.mp3', 'hellraiser.mp3', 'kairo.mp3', 'long_way_home1.mp3', 'sidewinder.mp3', 'spinning.mp3'] - class="prompt">>>> dirname = "c:\\" +>>> dirname = "c:\\" >>> os.listdir(dirname) 2 ['AUTOEXEC.BAT', 'boot.ini', 'CONFIG.SYS', 'cygwin', 'docbook', 'Documents and Settings', 'Incoming', 'Inetpub', 'IO.SYS', 'MSDOS.SYS', 'Music', 'NTDETECT.COM', 'ntldr', 'pagefile.sys', 'Program Files', 'Python20', 'RECYCLER', 'System Volume Information', 'TEMP', 'WINNT'] - class="prompt">>>> [f for f in os.listdir(dirname) +>>> [f for f in os.listdir(dirname) ... if os.path.isfile(os.path.join(dirname, f))] 3 ['AUTOEXEC.BAT', 'boot.ini', 'CONFIG.SYS', 'IO.SYS', 'MSDOS.SYS', 'NTDETECT.COM', 'ntldr', 'pagefile.sys'] - class="prompt">>>> [f for f in os.listdir(dirname) +>>> [f for f in os.listdir(dirname) ... if os.path.isdir(os.path.join(dirname, f))] 4 ['cygwin', 'docbook', 'Documents and Settings', 'Incoming', 'Inetpub', 'Music', 'Program Files', 'Python20', 'RECYCLER', @@ -5331,7 +5331,7 @@ may already be familiar with from working on the command line.

['a_time_long_forgotten_con.mp3', 'hellraiser.mp3', 'kairo.mp3', 'long_way_home1.mp3', 'sidewinder.mp3', 'spinning.mp3'] - class="prompt">>>> import glob +>>> import glob >>> glob.glob('c:\\music\\_singles\\*.mp3') 2 ['c:\\music\\_singles\\a_time_long_forgotten_con.mp3', 'c:\\music\\_singles\\hellraiser.mp3', @@ -5339,10 +5339,10 @@ may already be familiar with from working on the command line.

'c:\\music\\_singles\\long_way_home1.mp3', 'c:\\music\\_singles\\sidewinder.mp3', 'c:\\music\\_singles\\spinning.mp3']
- class="prompt">>>> glob.glob('c:\\music\\_singles\\s*.mp3') 3 +>>> glob.glob('c:\\music\\_singles\\s*.mp3') 3 ['c:\\music\\_singles\\sidewinder.mp3', 'c:\\music\\_singles\\spinning.mp3'] - class="prompt">>>> glob.glob('c:\\music\\*\\*.mp3')4 +>>> glob.glob('c:\\music\\*\\*.mp3')4
@@ -6100,7 +6100,7 @@ it a verbose regular expression. This example shows how.

# or 5-8 (V, followed by 0 to 3 I's) $ # end of string """ - class="prompt">>>> re.search(pattern, 'M', re.VERBOSE) 1 +>>> re.search(pattern, 'M', re.VERBOSE) 1 <_sre.SRE_Match object at 0x008EEB48> >>> re.search(pattern, 'MCMLXXXIX', re.VERBOSE) 2 <_sre.SRE_Match object at 0x008EEB48> @@ -6440,7 +6440,7 @@ you made.

(\d*) # extension is optional and can be any number of digits $ # end of string ''', re.VERBOSE) - class="prompt">>>> phonePattern.search('work 1-(800) 555.1212 #1234').groups() 1 +>>> phonePattern.search('work 1-(800) 555.1212 #1234').groups() 1 ('800', '555', '1212', '1234') >>> phonePattern.search('800-555-1212') 2 ('800', '555', '1212', '') @@ -8366,21 +8366,21 @@ package architecture. It's one of the many things Python is good at, so take ad >>> grammarNode.childNodes1[<DOM Text node "\n">, <DOM Element: ref at 17533332>, \ <DOM Text node "\n">, <DOM Element: ref at 17549660>, <DOM Text node "\n">] - class="prompt">>>> print grammarNode.firstChild.toxml() 2 +>>> print grammarNode.firstChild.toxml() 2 - class="prompt">>>> print grammarNode.childNodes[1].toxml() 3 +>>> print grammarNode.childNodes[1].toxml() 3<ref id="bit"> <p>0</p> <p>1</p> </ref> - class="prompt">>>> print grammarNode.childNodes[3].toxml() 4 +>>> print grammarNode.childNodes[3].toxml() 4<ref id="byte"> <p><xref id="bit"/><xref id="bit"/><xref id="bit"/><xref id="bit"/>\ <xref id="bit"/><xref id="bit"/><xref id="bit"/><xref id="bit"/></p> </ref> - class="prompt">>>> print grammarNode.lastChild.toxml() 5 +>>> print grammarNode.lastChild.toxml() 5
@@ -8428,7 +8428,7 @@ package architecture. It's one of the many things Python is good at, so take ad [<DOM Text node "\n">, <DOM Text node " ">, <DOM Element: p at 19315844>, \ <DOM Text node "\n">, <DOM Text node " ">, \ <DOM Element: p at 19462036>, <DOM Text node "\n">] - class="prompt">>>> pNode = refNode.childNodes[2] +>>> pNode = refNode.childNodes[2] >>> pNode <DOM Element: p at 19315844> >>> print pNode.toxml() 3 @@ -8525,7 +8525,7 @@ Dive in
Traceback (innermost last): File "<interactive input>", line 1, in ? UnicodeError: ASCII encoding error: ordinal not in range(128) - class="prompt">>>> print s.encode('latin-1') 3 +>>> print s.encode('latin-1') 3 La Peña
@@ -8639,7 +8639,7 @@ u'\u041f\u0440\u0435\u0434\u0438\u0441\u043b\u043e\u0432\u0438\u0435' Traceback (innermost last): File "<interactive input>", line 1, in ? UnicodeError: ASCII encoding error: ordinal not in range(128) - class="prompt">>>> convertedtitle = title.encode('koi8-r') 4 +>>> convertedtitle = title.encode('koi8-r') 4>>> convertedtitle '\xf0\xd2\xc5\xc4\xc9\xd3\xcc\xcf\xd7\xc9\xc5' >>> print convertedtitle 5 @@ -8724,7 +8724,7 @@ in Python. If your XML documents are all 7-bit ASCI <p>0</p> <p>1</p> </ref> - class="prompt">>>> print reflist[1].toxml() +>>> print reflist[1].toxml() <ref id="byte"> <p><xref id="bit"/><xref id="bit"/><xref id="bit"/><xref id="bit"/>\ <xref id="bit"/><xref id="bit"/><xref id="bit"/><xref id="bit"/></p> @@ -8747,7 +8747,7 @@ in Python. If your XML documents are all 7-bit ASCI <p>0</p> <p>1</p> </ref> - class="prompt">>>> plist = firstref.getElementsByTagName("p") 2 +>>> plist = firstref.getElementsByTagName("p") 2 >>> plist [<DOM Element: p at 136140116>, <DOM Element: p at 136142172>] >>> print plist[0].toxml() 3 @@ -8832,7 +8832,7 @@ in Python. If your XML documents are all 7-bit ASCI <p>0</p> <p>1</p> </ref> - class="prompt">>>> bitref.attributes 1 +>>> bitref.attributes 1 <xml.dom.minidom.NamedNodeMap instance at 0x81e0c9c> >>> bitref.attributes.keys() 2 3 [u'id'] @@ -9213,7 +9213,7 @@ with a window-based Python IDE, stdout< Dive in Dive in Dive in - class="prompt">>>> import sys +>>> import sys >>> for i in range(3): ... sys.stdout.write('Dive in') 2 Dive inDive inDive in @@ -9385,7 +9385,7 @@ one program's output to the next program's input.

<xref id="bit"/><xref id="bit"/><xref id="bit"/><xref id="bit"/></p> </ref> </grammar> - class="prompt">[you@localhost kgp]$ cat binary.xml | python kgp.py -g - 3 4 +[you@localhost kgp]$ cat binary.xml | python kgp.py -g - 3 4 10110001
@@ -9681,10 +9681,10 @@ argecho.py argecho.py abc def - class="prompt">[you@localhost py]$ python argecho.py --help 3 +[you@localhost py]$ python argecho.py --help 3 argecho.py --help - class="prompt">[you@localhost py]$ python argecho.py -m kant.xml 4 +[you@localhost py]$ python argecho.py -m kant.xml 4 argecho.py -m kant.xml
@@ -10414,7 +10414,7 @@ turn it off by setting httplib.HTTPConnection.debuglevel = 'content-length': '15955', 'accept-ranges': 'bytes', 'connection': 'close'} - class="prompt">>>> request.add_header('If-Modified-Since', +>>> request.add_header('If-Modified-Since', ... firstdatastream.headers.get('Last-Modified')) 2 >>> seconddatastream = opener.open(request) 3 Traceback (most recent call last): @@ -10556,7 +10556,7 @@ class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): dive into mark</title> <link rel="alternate" type="text/html" href="http://diveintomark.org/"/> <-- rest of feed omitted for brevity --> - class="prompt">>>> request.add_header('If-None-Match', +>>> request.add_header('If-None-Match', ... firstdatastream.headers.get('ETag')) 3 >>> seconddatastream = opener.open(request) >>> seconddatastream.status 4 @@ -10646,7 +10646,7 @@ header: Accept-Ranges: bytes header: Content-Length: 15955 header: Connection: close header: Content-Type: application/atom+xml - class="prompt">>>> f.url 6 +>>> f.url 6 'http://diveintomark.org/xml/atom.xml' >>> f.headers.dict {'content-length': '15955', @@ -10657,7 +10657,7 @@ header: Content-Type: application/atom+xml 'etag': '"e842a-3e53-55d97640"', 'date': 'Thu, 15 Apr 2004 22:06:25 GMT', 'content-type': 'application/atom+xml'} - class="prompt">>>> f.status +>>> f.status Traceback (most recent call last): File "<stdin>", line 1, in ? 
AttributeError: addinfourl instance has no attribute 'status' @@ -10786,7 +10786,7 @@ header: Content-Length: 15955 header: Connection: close header: Content-Type: application/atom+xml - class="prompt">>>> f.status 3 +>>> f.status 3 301 >>> f.url 'http://diveintomark.org/xml/atom.xml' @@ -10848,7 +10848,7 @@ header: Accept-Ranges: bytes header: Content-Length: 15955 header: Connection: close header: Content-Type: application/atom+xml - class="prompt">>>> f.status 4 +>>> f.status 4 302 >>> f.url http://diveintomark.org/xml/atom.xml @@ -10963,7 +10963,7 @@ header: Content-Type: application/atom+xml <title mode="escaped">dive into mark</title> <link rel="alternate" type="text/html" href="http://diveintomark.org/"/> <-- rest of feed omitted for brevity --> - class="prompt">>>> len(data) +>>> len(data) 15955
@@ -11186,7 +11186,7 @@ def fetch(source, etag=None, last_modified=None, agent=USER_AGENT): 'data': '<?xml version="1.0" encoding="iso-8859-1"?> <feed version="0.3" <-- rest of data omitted for brevity -->'} - class="prompt">>>> if params['status'] == 301:3 +>>> if params['status'] == 301:3 ... url = params['url'] >>> newparams = openanything.fetch( ... url, params['etag'], params['lastmodified'], useragent) 4 @@ -11526,7 +11526,7 @@ region.

</SOAP-ENV:Envelope> ************************************************************************ - class="prompt">>>> temperature +>>> temperature 80.0
@@ -11784,7 +11784,7 @@ u'return' </SOAP-ENV:Envelope> ************************************************************************ - class="prompt">>>> temperature +>>> temperature 66.0
@@ -11949,7 +11949,7 @@ and you can access it programmatically too.

{'fullViewableName': 'Top/Arts/Literature/World_Literature/American/19th_Century/Twain,_Mark', 'specialEncoding': ''}] - class="prompt">>>> results.directoryCategories[0].fullViewableName +>>> results.directoryCategories[0].fullViewableName 'Top/Arts/Literature/World_Literature/American/19th_Century/Twain,_Mark'
@@ -13295,7 +13295,7 @@ def fromRoman(s): File "roman3.py", line 27, in toRoman raise OutOfRangeError, "number out of range (must be 1..3999)" OutOfRangeError: number out of range (must be 1..3999) - class="prompt">>>> roman3.toRoman(1.5) +>>> roman3.toRoman(1.5) Traceback (most recent call last): File "<interactive input>", line 1, in ? File "roman3.py", line 29, in toRoman @@ -14828,11 +14828,11 @@ print 'full path =', os.path.abspath(pathname) sys.argv[0] = /home/you/diveintopython3/common/py/fullpath.py path = /home/you/diveintopython3/common/py full path = /home/you/diveintopython3/common/py - class="prompt">[you@localhost diveintopython3]$ python common/py/fullpath.py 2 +[you@localhost diveintopython3]$ python common/py/fullpath.py 2 sys.argv[0] = common/py/fullpath.py path = common/py full path = /home/you/diveintopython3/common/py - class="prompt">[you@localhost diveintopython3]$ cd common/py +[you@localhost diveintopython3]$ cd common/py [you@localhost py]$ python fullpath.py 3 sys.argv[0] = fullpath.py path = @@ -15164,7 +15164,7 @@ to doesn't need to match the module name, either. You could import a series of <module 'os' from 'c:\Python22\lib\os.pyc'>, <module 're' from 'c:\Python22\lib\re.pyc'>, <module 'unittest' from 'c:\Python22\lib\unittest.pyc'>] - class="prompt">>>> modules[0].version 4 +>>> modules[0].version 4 '2.2.2 (#37, Nov 26 2002, 10:24:37) [MSC 32 bit (Intel)]' >>> import sys >>> sys.version @@ -15318,7 +15318,7 @@ return unittest.TestSuite(map(load, modules)) <module 'odbchelpertest' from 'odbchelpertest.py'>, <module 'pluraltest' from 'pluraltest.py'>, <module 'romantest' from 'romantest.py'>] - class="prompt">>>> modules[-1] 3 +>>> modules[-1] 3 <module 'romantest' from 'romantest.py'>
@@ -15356,7 +15356,7 @@ return unittest.TestSuite(map(load, modules)) ... ] ] - class="prompt">>>> unittest.TestSuite(map(load, modules)) 2 +>>> unittest.TestSuite(map(load, modules)) 2
@@ -16023,10 +16023,10 @@ def plural(noun, language='en'): >>> counter.next() 4 entering make_counter 2 - class="prompt">>>> counter.next() 5 +>>> counter.next() 5 incrementing x 3 - class="prompt">>>> counter.next() 6 +>>> counter.next() 6 incrementing x 4
diff --git a/dip3.css b/dip3.css index be5cd64..6fcb32c 100644 --- a/dip3.css +++ b/dip3.css @@ -1,5 +1,4 @@ -/*dive into minimalism(c)2008 Mark Pilgrim,MIT-licensed*/ -html{background:white;color:black} +html{background:#fff;color:#000} body{font:normal medium 'Gill Sans','Gill Sans MT','Ikarius ADF',Candara,Jara,sans-serif;margin:1.75em auto;width:40em;line-height:1.75;word-spacing:0.1em} a{background:transparent;text-decoration:none;border-bottom:1px dotted} a:hover{border-bottom:1px solid} @@ -9,21 +8,22 @@ h1 a,h2 a,h3 a,#nav a{color:inherit !important} abbr,.p{border:0;letter-spacing:0.1em;text-transform:lowercase;font-variant:small-caps} h1,h2,h3,p,ul,ol,#nav{margin:1.75em 0} h1,h2,h3{font-size:medium} -h1{background:papayawhip;color:black;width:100%;margin:0} -h2{margin-left:1.75em} -h3{margin-left:3.5em} +h1{background:papayawhip;color:#000;width:100%;margin:0} +#index h2{margin-left:1.75em} +#index h3{margin-left:3.5em} pre,tt{white-space:pre-wrap;font-size:medium;line-height:2.154} cite{font-style:normal} img{border:0} .framed{border:1px solid} -blockquote{font-size:small;line-height:2.154;margin:2.154em 0;padding:0} -blockquote{font-style:oblique;border-left:1px dotted;margin-left:2.154em;padding-left:2.154em} +pre,blockquote{line-height:2.154;margin:2.154em 0;padding:0 0 0 2.154em;border-left:1px dotted} +blockquote{font-size:small;font-style:oblique;margin-left:2.154em} blockquote p{margin:2.154em 0} .f,.c{text-align:center;clear:both} -article + p:first-letter{float:left;color:gainsboro;padding:0.11em 4px 0 0;font:normal 4em/0.68 serif} +section h2 + p:first-letter{float:left;background:transparent;color:gainsboro;padding:0.11em 4px 0 0;font:normal 4em/0.68 serif} #arc{width:100%,border-collapse:collapse} #arc th,#arc td{list-style:none;margin:0;padding:0} #arc th{padding:0 1.75em 0 0;text-align:right;vertical-align:baseline} figure{display:block;text-align:center;margin:1.75em 0} figure img{display:block;margin:0 auto} 
section,article,footer{display:block} +var{font-family:monospace;font-style:normal} diff --git a/index.html b/index.html index 8e8d881..9203663 100644 --- a/index.html +++ b/index.html @@ -8,7 +8,7 @@ - +

Dive into Python 3 will cover Python 3 and its differences from Python 2. Compared to the original Dive into Python, it will be about 50% revised and 50% new material.

I will publish drafts online as I go. The final book will be published on paper by Apress. The book will remain online under the CC-BY-3.0 license.

Below is the draft table of contents. There is no text yet.

@@ -17,72 +17,72 @@

Installing Python

-
+

Python on Windows

-
+
-
+

Python on Mac OS X

-
+ -
+

Python on Linux

-
+ -
+

Python from source

-
+ -
+

The interactive shell

-
+ -
+

Summary

-
+

Your first Python program

-
+

Diving in

-
+
-
+

Declaring functions

How Python's datatypes compare to other programming languages

-
+ -
+

Documenting functions

-
+ -
+

Everything is an object

The import search path

What's an object?

-
+ -
+

Indenting code

-
+ -
+

Testing modules

-
+ -
+

Summary

-
+

Native Python datatypes

-
+

Lists

Differences from Python 2

Creating a new list

@@ -92,9 +92,9 @@

List operators

Looping through a list (list comprehensions)

Tuples

-
+
-
+

Dictionaries

Differences from Python 2

Creating a new dictionary

@@ -102,9 +102,9 @@

Deleting items from a dictionary

Looping through a dictionary (dictionary comprehensions)

Dictionary views

-
+ -
+

Sets

Differences from Python 2

Creating a new set

@@ -112,9 +112,9 @@

Deleting elements from a set

Common set operations: union, intersection, and difference

Frozen sets

-
+ -
+

Numbers

Differences from Python 2

Integers

@@ -122,527 +122,527 @@

Floating point numbers

Complex numbers

Common numerical operations

-
+

Strings

-
+

There ain't no such thing as "plain text"

A brief history of character encoding

What's a character?

How strings are stored in memory

Converting between different character encodings

-
+
-
+

Differences from Python 2

-
+ -
+

Formatting strings

-
+ -
+

What's my string?

-
+ -
+

Lists and strings

-
+ -
+

Historical note on the string module

-
+ -
+

Byte streams

-
+ -
+

Summary

-
+

The power of introspection

-
+

Diving in

-
+
-
+

Using optional and named arguments

Keyword-only arguments

-
+ -
+

Using type, str, dir, and other built-in functions

The type function

The str function

Built-in functions

-
+ -
+

Getting object references with getattr

getattr with modules

getattr as a dispatcher

-
+ -
+

Filtering lists

-
+ -
+

The peculiar nature of and and or

Using the and-or trick

-
+ -
+

Using lambda functions

Real-world lambda functions

-
+ -
+

Putting it all together

-
+ -
+

Summary

-
+

Objects and object-orientation

-
+

...major changes afoot...

-
+

Exceptions

-
+

...

-
+

Files

-
+

File objects

-
+
-
+

Reading files

-
+ -
+

Close your files... or don't

-
+ -
+

Handling I/O errors

-
+ -
+

Writing to files

-
+

Regular expressions

-
+

Diving in

-
+
-
+

Case study: street addresses

-
+ -
+

Case study: Roman numerals

Checking for thousands

Checking for hundreds

-
+ -
+

Using the {n,m} syntax

Checking for tens and ones

-
+ -
+

Verbose regular expressions

-
+ -
+

Case study: parsing phone numbers

-
+ -
+

Summary

-
+

HTML processing

-
+

Diving in

-
+
-
+

html5lib

Installing html5lib

Using html5lib

-
+ -
+

Extracting data from HTML documents

-
+ -
+

Building HTML documents

-
+ -
+

Putting it all together

-
+ -
+

Summary

-
+

XML processing

-
+

...major changes afoot...

-
+

Scripts and streams

-
+

...will be folded into other chapters...

-
+

HTTP web services

-
+

Diving in

-
+
-
+

How not to fetch data over HTTP

-
+ -
+

Features of HTTP

User-Agent

Redirects

Last-Modified/If-Modified-Since

ETag/If-None-Match

Compression

-
+ -
+

Differences from Python 2

-
+ -
+

httplib2 (note: needs port)

Installing httplib2

Why httplib2 is better than http.client

-
+ -
+

Debugging HTTP web services

-
+ -
+

Setting the User-Agent

-
+ -
+

Handling Last-Modified and ETag

-
+ -
+

Handling redirects

-
+ -
+

Handling compressed data

-
+ -
+

Putting it all together

-
+ -
+

Summary

-
+

SOAP web services

-
+

...no one will miss you...

-
+

Unit testing

-
+

Introduction to Roman numerals

-
+
-
+

Diving in

-
+ -
+

Introducing romantest.py

-
+ -
+

Testing for success

-
+ -
+

Testing for failure

-
+ -
+

Testing for sanity

-
+

Test-first programming

-
+

roman.py, stage 1

-
+
-
+

roman.py, stage 2

-
+ -
+

roman.py, stage 3

-
+ -
+

roman.py, stage 4

-
+ -
+

roman.py, stage 5

-
+

Refactoring your code

-
+

Handling bugs

-
+
-
+

Handling changing requirements

-
+ -
+

The art of refactoring

-
+ -
+

Postscript

-
+ -
+

Summary

-
+

Functional programming

-
+

...bits and pieces will be folded into other chapters...

-
+

Dynamic functions

-
+

Diving in

-
+
-
+

plural.py, stage 1

-
+ -
+

plural.py, stage 2

-
+ -
+

plural.py, stage 3

-
+ -
+

plural.py, stage 4

-
+ -
+

plural.py, stage 5

-
+ -
+

plural.py, stage 6

-
+ -
+

Summary

-
+

Metaclasses

-
+

...once I figure out WTF metaclasses are...

-
+

Performance tuning

-
+

Diving in

-
+
-
+

Using the timeit module

-
+ -
+

Optimizing regular expressions

-
+ -
+

Optimizing dictionary lookups

-
+ -
+

Optimizing list operations

-
+ -
+

Optimizing string manipulation

-
+ -
+

Summary

-
+

Migrating old code to Python 3

-
+

Diving in

-
+
-
+

The 2to3 migration tool

-
+ -
+

Things the 2to3 tool won't catch

-
+ -
+

Case study: feedparser

Just shoot me

-
+ -
+

Summary

-
+

Creating graphics with the Python Imaging Library

-
+

...if it gets ported...

-
+

Packaging Python libraries

-
+

A brief history of packaging (and why it's harder than you think)

-
+
-
+

setuptools

-
+ -
+

distutils

-
+ -
+

Eggs

-
+ -
+

pip

-
+ -
+

Platform-specific packaging

Packaging by Linux distributions

Py2exe

Psyco

-
+ @@ -650,41 +650,41 @@

Where to go from here

Tentative because most of these have not been ported to Python 3 yet.

-
+

WSGI

-
+ -
+

Django

-
+ -
+

Pylons

-
+ -
+

TurboGears

-
+ -
+

AppEngine

-
+ -
+

IronPython

-
+ -
+

Jython

-
+ -
+

PyPy

-
+ -
+

Stackless Python

-
+