From 7e2b1808a814ccce0a01628455110a86ccd86767 Mon Sep 17 00:00:00 2001 From: Mark Pilgrim Date: Mon, 1 Jun 2009 12:24:15 -0700 Subject: [PATCH] clarifications [thanks G.P.] --- advanced-classes.html | 2 +- case-study-porting-chardet-to-python-3.html | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/advanced-classes.html b/advanced-classes.html index 3ae009d..c60ac02 100644 --- a/advanced-classes.html +++ b/advanced-classes.html @@ -107,7 +107,7 @@ class OrderedDict(dict, collections.MutableMapping):
 >>> import ordereddict
 >>> od = ordereddict.OrderedDict()
->>> klass = od.__class__
+>>> klass = od.__class__       
 >>> type(klass)
 <class 'abc.ABCMeta'>
 >>> klass.__name__
diff --git a/case-study-porting-chardet-to-python-3.html b/case-study-porting-chardet-to-python-3.html
index bea9baa..4e4b180 100644
--- a/case-study-porting-chardet-to-python-3.html
+++ b/case-study-porting-chardet-to-python-3.html
@@ -600,8 +600,8 @@ if not hasattr(__builtin__, 'False'):
 else:
     False = __builtin__.False
     True = __builtin__.True
-

This piece of code is designed to allow this library to run under older versions of Python 2. Prior to Python 2.3 [FIXME-LINK], Python had no built-in Boolean type. This code detects the absence of the built-in constants True and False, and defines them if necessary. -

However, Python 3 will always have a Boolean type, so this entire code snippet is unnecessary. The simplest solution is to replace all instances of constants.True and constants.False with True and False, respectively, then delete this dead code from constants.py. +

This piece of code is designed to allow this library to run under older versions of Python 2. Prior to Python 2.3 [FIXME-LINK], Python had no built-in bool type. This code detects the absence of the built-in constants True and False, and defines them if necessary. +

However, Python 3 will always have a bool type, so this entire code snippet is unnecessary. The simplest solution is to replace all instances of constants.True and constants.False with True and False, respectively, then delete this dead code from constants.py.

So this line in universaldetector.py:

self.done = constants.False

Becomes @@ -635,8 +635,8 @@ import sys File "test.py", line 9, in <module> for line in file(f, 'rb'): NameError: name 'file' is not defined -

This one surprised me, because I’ve been using this idiom as long as I can remember. In Python 2, the global file() function was an alias for open(), which was the standard way of opening files for reading. In Python 3, the entire system for reading and writing files has been refactored into the io module. [FIXME-LINK PEP 3116] I’ll cover the new I/O module in more detail in Chapter FIXME, but for now, the important bit is that the global file() function no longer exists. However, the open() function does still exist. (Technically, it’s an alias for io.open(), but never mind that right now.) -

Thus, the simplest solution to the problem of the missing file() is to call open() instead: +

This one surprised me, because I’ve been using this idiom as long as I can remember. In Python 2, the global file() function was an alias for the open() function, which was the standard way of opening files for reading. In Python 3, the entire system for reading and writing files has been refactored into the io module. [FIXME-LINK PEP 3116] I’ll cover the new I/O module in more detail in Chapter FIXME, but for now, the important bit is that the global file() function no longer exists. However, the open() function does still exist. (Technically, it’s an alias for io.open(), but never mind that right now.) +

Thus, the simplest solution to the problem of the missing file() is to call the open() function instead:

for line in open(f, 'rb'):

And that’s all I have to say about that.

Can’t use a string pattern on a bytes-like object

@@ -670,7 +670,7 @@ TypeError: can't use a string pattern on a bytes-like object for line in open(f, 'rb'): u.feed(line) -

And here we find our answer: in the UniversalDetector.feed() method, aBuf is a line read from a file on disk. Look carefully at the parameters used to open the file: 'rb'. 'r' is for “read”; OK, big deal, we’re reading the file. Ah, but 'b' is for “binary.” Without the 'b' flag, this for loop would read the file, line by line, and convert each line into a string — an array of Unicode characters — according to the system default character encoding. (You could override the system encoding with another parameter to open(), but never mind that for now.) But with the 'b' flag, this for loop reads the file, line by line, and stores each line exactly as it appears in the file, as an array of bytes. That byte array gets passed to UniversalDetector.feed(), and eventually gets passed to the pre-compiled regular expression, self._highBitDetector, to search for high-bit… characters. But we don’t have characters; we have bytes. Oops. +

And here we find our answer: in the UniversalDetector.feed() method, aBuf is a line read from a file on disk. Look carefully at the parameters used to open the file: 'rb'. 'r' is for “read”; OK, big deal, we’re reading the file. Ah, but 'b' is for “binary.” Without the 'b' flag, this for loop would read the file, line by line, and convert each line into a string — an array of Unicode characters — according to the system default character encoding. (You could override the system encoding with another parameter to the open() function, but never mind that for now.) But with the 'b' flag, this for loop reads the file, line by line, and stores each line exactly as it appears in the file, as an array of bytes. That byte array gets passed to UniversalDetector.feed(), and eventually gets passed to the pre-compiled regular expression, self._highBitDetector, to search for high-bit… characters. But we don’t have characters; we have bytes. Oops.

What we need this regular expression to search is not an array of characters, but an array of bytes.

Once you realize that, the solution is not difficult. Regular expressions defined with strings can search strings. Regular expressions defined with byte arrays can search byte arrays. To define a byte array pattern, we simply change the type of the argument we use to define the regular expression to a byte array. (There is one other case of this same problem, on the very next line.)

  class UniversalDetector: