diff --git a/advanced-iterators.html b/advanced-iterators.html index a20cdcc..98eee7a 100755 --- a/advanced-iterators.html +++ b/advanced-iterators.html @@ -45,7 +45,7 @@ E = 4
In this chapter, we’ll dive into an incredible Python program originally written by Raymond Hettinger. This program solves alphametic puzzles in just 14 lines of code.
import re
+import re
import itertools
def solve(puzzle):
@@ -150,7 +150,7 @@ if __name__ == '__main__':
The alphametics solver uses this technique to get a list of all the unique characters in the puzzle.
-
unique_characters = set(''.join(words))
+unique_characters = set(''.join(words))
This list is later used to assign digits to characters as the solver iterates through the possible solutions.
@@ -178,11 +178,11 @@ AssertionError: Only for very large values of 2
Therefore, this line of code:
-
assert len(unique_characters) <= 10, 'Too many letters'
+assert len(unique_characters) <= 10, 'Too many letters'
…is equivalent to this:
-
if len(unique_characters) > 10:
+if len(unique_characters) > 10:
raise AssertionError('Too many letters')
The alphametics solver uses this exact assert statement to bail out early if the puzzle contains more than ten unique letters. Since each letter is assigned a unique digit, and there are only ten digits, a puzzle with more than ten unique letters cannot possibly have a solution.
@@ -217,7 +217,7 @@ AssertionError: Only for very large values of 2
Here’s another way to accomplish the same thing, using a generator function:
-
def ord_map(a_string):
+def ord_map(a_string):
for c in a_string:
yield ord(c)
@@ -413,7 +413,7 @@ Wesley
The alphametics solver uses this technique to create a dictionary that maps letters in the puzzle to digits in the solution, for each possible solution.
-
characters = tuple(ord(c) for c in sorted_characters)
+characters = tuple(ord(c) for c in sorted_characters)
digits = tuple(ord(c) for c in '0123456789')
...
for guess in itertools.permutations(digits, len(characters)):
diff --git a/case-study-porting-chardet-to-python-3.html b/case-study-porting-chardet-to-python-3.html
index a89259d..e433412 100755
--- a/case-study-porting-chardet-to-python-3.html
+++ b/case-study-porting-chardet-to-python-3.html
@@ -228,7 +228,7 @@ RefactoringTool: test.py
Let’s take a peek in that __init__.py file.
-
def detect(aBuf): ①
+def detect(aBuf): ①
from . import universaldetector ②
u = universaldetector.UniversalDetector()
u.reset()
@@ -242,7 +242,7 @@ RefactoringTool: test.py
The answer lies in that odd-looking import statement:
-
from . import universaldetector
+from . import universaldetector
Translated into English, that means “import the universaldetector module; that’s in the same directory I am,” where “I” is the chardet/__init__.py file. This is called a relative import. It’s a way for the files within a multi-file module to reference each other, without worrying about naming conflicts with other modules you may have installed in your import search path. This import statement will only look for the universaldetector module within the chardet/ directory itself.
@@ -267,7 +267,7 @@ RefactoringTool: test.py
^
SyntaxError: invalid syntax
Hmm, a small snag. In Python 3, False is a reserved word, so you can’t use it as a variable name. Let’s look at constants.py to see where it’s defined. Here’s the original version from constants.py, before the 2to3 script changed it:
-
import __builtin__
+import __builtin__
if not hasattr(__builtin__, 'False'):
False = 0
True = 1
@@ -277,9 +277,9 @@ else:
This piece of code is designed to allow this library to run under older versions of Python 2. Prior to Python 2.3, Python had no built-in bool type. This code detects the absence of the built-in constants True and False, and defines them if necessary.
However, Python 3 will always have a bool type, so this entire code snippet is unnecessary. The simplest solution is to replace all instances of constants.True and constants.False with True and False, respectively, then delete this dead code from constants.py.
So this line in universaldetector.py:
-
self.done = constants.False
+self.done = constants.False
Becomes
-
self.done = False
+self.done = False
Ah, wasn’t that satisfying? The code is shorter and more readable already.
No module named constants
Time to run test.py again and see how far it gets.
@@ -293,12 +293,12 @@ ImportError: No module named constants
What’s that you say? No module named constants? Of course there’s a module named constants. It’s right there, in chardet/constants.py.
Remember when the 2to3 script fixed up all those import statements? This library has a lot of relative imports — that is, modules that import other modules within the same library — but the logic behind relative imports has changed in Python 3. In Python 2, you could just import constants and it would look in the chardet/ directory first. In Python 3, all import statements are absolute by default. If you want to do a relative import in Python 3, you need to be explicit about it:
-
from . import constants
+from . import constants
But wait. Wasn’t the 2to3 script supposed to take care of these for you? Well, it did, but this particular import statement combines two different types of imports into one line: a relative import of the constants module within the library, and an absolute import of the sys module that is pre-installed in the Python standard library. In Python 2, you could combine these into one import statement. In Python 3, you can’t, and the 2to3 script is not smart enough to split the import statement into two.
The solution is to split the import statement manually. So this two-in-one import:
-
import constants, sys
+import constants, sys
Needs to become two separate imports:
-
from . import constants
+from . import constants
import sys
There are variations of this problem scattered throughout the chardet library. In some places it’s “import constants, sys”; in other places, it’s “import constants, re”. The fix is the same: manually split the import statement into two lines, one for the relative import, the other for the absolute import.
Onward!
@@ -313,7 +313,7 @@ import sys
NameError: name 'file' is not defined
This one surprised me, because I’ve been using this idiom as long as I can remember. In Python 2, the global file() function was an alias for the open() function, which was the standard way of opening text files for reading. In Python 3, the global file() function no longer exists, but the open() function still exists.
Thus, the simplest solution to the problem of the missing file() is to call the open() function instead:
-
for line in open(f, 'rb'):
+for line in open(f, 'rb'):
And that’s all I have to say about that.
Can’t use a string pattern on a bytes-like object
Now things are starting to get interesting. And by “interesting,” I mean “confusing as all hell.”
@@ -326,20 +326,20 @@ NameError: name 'file' is not defined
if self._highBitDetector.search(aBuf):
TypeError: can't use a string pattern on a bytes-like object
To debug this, let’s see what self._highBitDetector is. It’s defined in the __init__ method of the UniversalDetector class: -
class UniversalDetector:
+class UniversalDetector:
def __init__(self):
self._highBitDetector = re.compile(r'[\x80-\xFF]')
This pre-compiles a regular expression designed to find non-ASCII characters in the range 128–255 (0x80–0xFF). Wait, that’s not quite right; I need to be more precise with my terminology. This pattern is designed to find non-ASCII bytes in the range 128-255.
And therein lies the problem.
In Python 2, a string was an array of bytes whose character encoding was tracked separately. If you wanted Python 2 to keep track of the character encoding, you had to use a Unicode string (u'') instead. But in Python 3, a string is always what Python 2 called a Unicode string — that is, an array of Unicode characters (of possibly varying byte lengths). Since this regular expression is defined by a string pattern, it can only be used to search a string — again, an array of characters. But what we’re searching is not a string, it’s a byte array. Looking at the traceback, this error occurred in universaldetector.py:
-
def feed(self, aBuf):
+def feed(self, aBuf):
.
.
.
if self._mInputState == ePureAscii:
if self._highBitDetector.search(aBuf):
And what is aBuf? Let’s backtrack further to a place that calls UniversalDetector.feed(). One place that calls it is the test harness, test.py.
-
u = UniversalDetector()
+u = UniversalDetector()
.
.
.
@@ -349,7 +349,7 @@ for line in open(f, 'rb'):
And here we find our answer: in the UniversalDetector.feed() method, aBuf is a line read from a file on disk. Look carefully at the parameters used to open the file: 'rb'. 'r' is for “read”; OK, big deal, we’re reading the file. Ah, but 'b' is for “binary.” Without the 'b' flag, this for loop would read the file, line by line, and convert each line into a string — an array of Unicode characters — according to the system default character encoding. But with the 'b' flag, this for loop reads the file, line by line, and stores each line exactly as it appears in the file, as an array of bytes. That byte array gets passed to UniversalDetector.feed(), and eventually gets passed to the pre-compiled regular expression, self._highBitDetector, to search for high-bit… characters. But we don’t have characters; we have bytes. Oops.
What we need this regular expression to search is not an array of characters, but an array of bytes.
Once you realize that, the solution is not difficult. Regular expressions defined with strings can search strings. Regular expressions defined with byte arrays can search byte arrays. To define a byte array pattern, we simply change the type of the argument we use to define the regular expression to a byte array. (There is one other case of this same problem, on the very next line.)
-
class UniversalDetector:
+ class UniversalDetector:
def __init__(self):
- self._highBitDetector = re.compile(r'[\x80-\xFF]')
- self._escDetector = re.compile(r'(\033|~{)')
@@ -359,7 +359,7 @@ for line in open(f, 'rb'):
self._mCharSetProbers = []
self.reset()
Searching the entire codebase for other uses of the re module turns up two more instances, in charsetprober.py. Again, the code is defining regular expressions as strings but executing them on aBuf, which is a byte array. The solution is the same: define the regular expression patterns as byte arrays.
-
class CharSetProber:
+ class CharSetProber:
.
.
.
@@ -384,7 +384,7 @@ for line in open(f, 'rb'):
elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
TypeError: Can't convert 'bytes' object to str implicitly
There’s an unfortunate clash of coding style and Python interpreter here. The TypeError could be anywhere on that line, but the traceback doesn’t tell you exactly where it is. It could be in the first conditional or the second, and the traceback would look the same. To narrow it down, you should split the line in half, like this:
-
elif (self._mInputState == ePureAscii) and \
+elif (self._mInputState == ePureAscii) and \
self._escDetector.search(self._mLastChar + aBuf):
And re-run the test:
C:\home\chardet> python test.py tests\*\*
@@ -397,7 +397,7 @@ TypeError: Can't convert 'bytes' object to str implicitly
TypeError: Can't convert 'bytes' object to str implicitly
Aha! The problem was not in the first conditional (self._mInputState == ePureAscii) but in the second one. So what could cause a TypeError there? Perhaps you’re thinking that the search() method is expecting a value of a different type, but that wouldn’t generate this traceback. Python functions can take any value; if you pass the right number of arguments, the function will execute. It may crash if you pass it a value of a different type than it’s expecting, but if that happened, the traceback would point to somewhere inside the function. But this traceback says it never got as far as calling the search() method. So the problem must be in that + operation, as it’s trying to construct the value that it will eventually pass to the search() method.
We know from previous debugging that aBuf is a byte array. So what is self._mLastChar? It’s an instance variable, defined in the reset() method, which is actually called from the __init__() method.
-
class UniversalDetector:
+class UniversalDetector:
def __init__(self):
self._highBitDetector = re.compile(b'[\x80-\xFF]')
self._escDetector = re.compile(b'(\033|~{)')
@@ -414,7 +414,7 @@ TypeError: Can't convert 'bytes' object to str implicitly
self._mLastChar = ''
And now we have our answer. Do you see it? self._mLastChar is a string, but aBuf is a byte array. And you can’t concatenate a string to a byte array — not even a zero-length string.
So what is self._mLastChar anyway? It’s assigned in the feed() method, just a few lines down from where the traceback occurred.
-
if self._mInputState == ePureAscii:
+if self._mInputState == ePureAscii:
if self._highBitDetector.search(aBuf):
self._mInputState = eHighbyte
elif (self._mInputState == ePureAscii) and \
@@ -423,14 +423,14 @@ TypeError: Can't convert 'bytes' object to str implicitly
self._mLastChar = aBuf[-1]
The calling function calls this feed() method over and over again with a few bytes at a time. The method processes the bytes it was given (passed in as aBuf), then stores the last byte in self._mLastChar in case it’s needed during the next call. (In a multi-byte encoding, the feed() method might get called with half of a character, then called again with the other half.) But because aBuf is now a byte array instead of a string, self._mLastChar needs to be a byte array as well. Thus:
-
def reset(self):
+ def reset(self):
.
.
.
- self._mLastChar = ''
+ self._mLastChar = b''
Searching the entire codebase for “mLastChar” turns up a similar problem in mbcharsetprober.py, but instead of tracking the last character, it tracks the last two characters. The MultiByteCharSetProber class uses a list of 1-character strings to track the last two characters. In Python 3, it needs to use a list of integers, because it’s not really tracking characters, it’s tracking bytes. (Bytes are just integers from 0-255.)
-
class MultiByteCharSetProber(CharSetProber):
+ class MultiByteCharSetProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
self._mDistributionAnalyzer = None
@@ -459,7 +459,7 @@ TypeError: unsupported operand type(s) for +: 'int' and 'bytes'
…The bad news is it doesn’t always feel like progress.
But this is progress! Really! Even though the traceback calls out the same line of code, it’s a different error than it used to be. Progress! So what’s the problem now? The last time I checked, this line of code didn’t try to concatenate an int with a byte array (bytes). In fact, you just spent a lot of time ensuring that self._mLastChar was a byte array. How did it turn into an int?
The answer lies not in the previous lines of code, but in the following lines.
-
if self._mInputState == ePureAscii:
+if self._mInputState == ePureAscii:
if self._highBitDetector.search(aBuf):
self._mInputState = eHighbyte
elif (self._mInputState == ePureAscii) and \
@@ -496,7 +496,7 @@ TypeError: unsupported operand type(s) for +: 'int' and 'bytes'
Concatenating a byte array of length 1 with a byte array of length 3 returns a new byte array of length 4.
So, to ensure that the feed() method in universaldetector.py continues to work no matter how often it’s called, you need to initialize self._mLastChar as a 0-length byte array, then make sure it stays a byte array.
-
self._escDetector.search(self._mLastChar + aBuf):
+ self._escDetector.search(self._mLastChar + aBuf):
self._mInputState = eEscAscii
- self._mLastChar = aBuf[-1]
@@ -519,25 +519,25 @@ tests\Big5\0804.blogspot.com.xml
byteCls = self._mModel['classTable'][ord(c)]
TypeError: ord() expected string of length 1, but int found
OK, so c is an int, but the ord() function was expecting a 1-character string. Fair enough. Where is c defined?
-
# codingstatemachine.py
+# codingstatemachine.py
def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
byteCls = self._mModel['classTable'][ord(c)]
That’s no help; it’s just passed into the function. Let’s pop the stack.
-
# utf8prober.py
+# utf8prober.py
def feed(self, aBuf):
for c in aBuf:
codingState = self._mCodingSM.next_state(c)
Do you see it? In Python 2, aBuf was a string, so c was a 1-character string. (That’s what you get when you iterate over a string — all the characters, one by one.) But now, aBuf is a byte array, so c is an int, not a 1-character string. In other words, there’s no need to call the ord() function because c is already an int!
Thus:
-
def next_state(self, c):
+ def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
- byteCls = self._mModel['classTable'][ord(c)]
+ byteCls = self._mModel['classTable'][c]
Searching the entire codebase for instances of “ord(c)” uncovers similar problems in sbcharsetprober.py…
-
# sbcharsetprober.py
+# sbcharsetprober.py
def feed(self, aBuf):
if not self._mModel['keepEnglishLetter']:
aBuf = self.filter_without_english_letters(aBuf)
@@ -547,13 +547,13 @@ def feed(self, aBuf):
for c in aBuf:
order = self._mModel['charToOrderMap'][ord(c)]
…and latin1prober.py…
-
# latin1prober.py
+# latin1prober.py
def feed(self, aBuf):
aBuf = self.filter_with_english_letters(aBuf)
for c in aBuf:
charClass = Latin1_CharToClass[ord(c)]
c is iterating over aBuf, which means it is an integer, not a 1-character string. The solution is the same: change ord(c) to just plain c.
-
# sbcharsetprober.py
+ # sbcharsetprober.py
def feed(self, aBuf):
if not self._mModel['keepEnglishLetter']:
aBuf = self.filter_without_english_letters(aBuf)
@@ -591,7 +591,7 @@ tests\Big5\0804.blogspot.com.xml
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
TypeError: unorderable types: int() >= str()
So what’s this all about? “Unorderable types”? Once again, the difference between byte arrays and strings is rearing its ugly head. Take a look at the code:
-
class SJISContextAnalysis(JapaneseContextAnalysis):
+class SJISContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr):
if not aStr: return -1, 1
# find out current char's byte length
@@ -601,7 +601,7 @@ TypeError: unorderable types: int() >= str()
else:
charLen = 1
And where does aStr come from? Let’s pop the stack:
-
def feed(self, aBuf, aLen):
+def feed(self, aBuf, aLen):
.
.
.
@@ -611,7 +611,7 @@ TypeError: unorderable types: int() >= str()
Oh look, it’s our old friend, aBuf. As you might have guessed from every other issue we’ve encountered in this chapter, aBuf is a byte array. Here, the feed() method isn’t just passing it on wholesale; it’s slicing it. But as you saw earlier in this chapter, slicing a byte array returns a byte array, so the aStr parameter that gets passed to the get_order() method is still a byte array.
And what is this code trying to do with aStr? It’s taking the first element of the byte array and comparing it to a string of length 1. In Python 2, that worked, because aStr and aBuf were strings, and aStr[0] would be a string, and you can compare strings for inequality. But in Python 3, aStr and aBuf are byte arrays, aStr[0] is an integer, and you can’t compare integers and strings for inequality without explicitly coercing one of them.
In this case, there’s no need to make the code more complicated by adding an explicit coercion. aStr[0] yields an integer; the things you’re comparing to are all constants. Let’s change them from 1-character strings to integers. And while we’re at it, let’s change aStr to aBuf, since it’s not actually a string.
-
class SJISContextAnalysis(JapaneseContextAnalysis):
+ class SJISContextAnalysis(JapaneseContextAnalysis):
- def get_order(self, aStr):
- if not aStr: return -1, 1
+ def get_order(self, aBuf):
@@ -688,7 +688,7 @@ tests\Big5\0804.blogspot.com.xml
if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
TypeError: unorderable types: int() >= str()
The fix is the same:
-
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
+ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
self._mCharToFreqOrder = EUCTWCharToFreqOrder
@@ -812,21 +812,21 @@ tests\Big5\0804.blogspot.com.xml
total = reduce(operator.add, self._mFreqCounter)
NameError: global name 'reduce' is not defined
According to the official What’s New In Python 3.0 guide, the reduce() function has been moved out of the global namespace and into the functools module. Quoting the guide: “Use functools.reduce() if you really need it; however, 99 percent of the time an explicit for loop is more readable.” You can read more about the decision from Guido van Rossum’s weblog: The fate of reduce() in Python 3000.
-
def get_confidence(self):
+def get_confidence(self):
if self.get_state() == constants.eNotMe:
return 0.01
total = reduce(operator.add, self._mFreqCounter)
The reduce() function takes two arguments — a function and a list (strictly speaking, any iterable object will do) — and applies the function cumulatively to each item of the list. In other words, this is a fancy and roundabout way of adding up all the items in a list and returning the result.
This monstrosity was so common that Python added a global sum() function.
-
def get_confidence(self):
+ def get_confidence(self):
if self.get_state() == constants.eNotMe:
return 0.01
- total = reduce(operator.add, self._mFreqCounter)
+ total = sum(self._mFreqCounter)
Since you’re no longer using the operator module, you can remove that import from the top of the file as well.
-
from .charsetprober import CharSetProber
+ from .charsetprober import CharSetProber
from . import constants
- import operator
I CAN HAZ TESTZ?
diff --git a/dip3.css b/dip3.css
index cb93d52..cf1125d 100755
--- a/dip3.css
+++ b/dip3.css
@@ -200,7 +200,6 @@ a.hl:hover, h2[id]:hover a.hl, h3[id]:hover a.hl {
/* code blocks */
pre {
- white-space: pre-wrap;
padding-left: 2.154em;
border-left: 1px solid #ddd;
}
@@ -323,10 +322,10 @@ aside a {
border: 0;
display: block;
}
-.v a:first-child {
+.v a {
float: left;
}
-.v a:last-child {
+.v a + a {
float: right;
}
.v span {
diff --git a/files.html b/files.html
index a21c1cf..01fca94 100644
--- a/files.html
+++ b/files.html
@@ -26,7 +26,7 @@ body{counter-reset:h1 11}
Before you can read from a file, you need to open it. Opening a file in Python couldn’t be easier:
-
a_file = open('examples/chinese.txt', encoding='utf-8')
+a_file = open('examples/chinese.txt', encoding='utf-8')
Python has a built-in open() function, which takes a filename as an argument. Here the filename is 'examples/chinese.txt'. There are five interesting things about this filename:
@@ -207,7 +207,7 @@ ValueError: I/O operation on closed file.
Python 2 had a solution for this: the try..finally block. That still works in Python 3, and you may see it in other people’s code or in older code that was ported to Python 3. But Python 3 also adds a cleaner solution: the with statement.
-
with open('examples/chinese.txt', encoding='utf-8') as a_file:
+with open('examples/chinese.txt', encoding='utf-8') as a_file:
a_file.seek(17)
a_character = a_file.read(1)
print(a_character)
@@ -235,7 +235,7 @@ ValueError: I/O operation on closed file.
So, how do you actually do it? Read a file one line at a time, that is. It’s so simple, it’s beautiful.
line_number = 0
+line_number = 0
with open('examples/favorite-people.txt', encoding='utf-8') as a_file: ①
for a_line in a_file: ②
line_number += 1
@@ -450,7 +450,7 @@ IOError: not readable
So sys.stdout and sys.stderr are file-like objects, albeit ones that only support writing. But they’re not constants; they’re variables. That means you can assign them a new value — another file object, or another file-like object — and redirect their output.
import sys
+import sys
class RedirectStdoutTo:
def __init__(self, out_new):
@@ -479,7 +479,7 @@ C
Let’s take the last part first.
-
+
print('A') ①
with open('out.log', mode='w', encoding='utf-8') as a_file, RedirectStdoutTo(a_file): ②
print('B') ③
@@ -493,7 +493,7 @@ C
Now take a look at the RedirectStdoutTo class. It is a custom context manager. Upon entering the context, it redirects sys.stdout to a given file-like object. Upon exiting the context, it restores sys.stdout to its original value.
-
class RedirectStdoutTo:
+class RedirectStdoutTo:
def __init__(self, out_new): ①
self.out_new = out_new
diff --git a/generators.html b/generators.html
index 46b67c2..e2cedde 100755
--- a/generators.html
+++ b/generators.html
@@ -38,7 +38,7 @@ body{counter-reset:h1 6}
I Know, Let’s Use Regular Expressions!
So you’re looking at words, which, at least in English, means you’re looking at strings of characters. You have rules that say you need to find different combinations of characters, then do different things to them. This sounds like a job for regular expressions!
import re
+import re
def plural(noun):
if re.search('[sxz]$', noun): ①
@@ -74,7 +74,7 @@ def plural(noun):
And now, back to the plural() function…
-
def plural(noun):
+def plural(noun):
if re.search('[sxz]$', noun):
return re.sub('$', 'es', noun) ①
elif re.search('[^aeioudgkprt]h$', noun): ②
@@ -126,7 +126,7 @@ def plural(noun):
Now you’re going to add a level of abstraction. You started by defining a list of rules: if this, do that, otherwise go to the next rule. Let’s temporarily complicate part of the program so you can simplify another part.
import re
+import re
def match_sxz(noun):
return re.search('[sxz]$', noun)
@@ -174,7 +174,7 @@ def plural(noun):
If this additional level of abstraction is confusing, try unrolling the function to see the equivalence. The entire for loop is equivalent to the following:
-
+
def plural(noun):
if match_sxz(noun):
return apply_sxz(noun)
@@ -206,7 +206,7 @@ def plural(noun):
Defining separate named functions for each match and apply rule isn’t really necessary. You never call them directly; you add them to the rules sequence and call them through there. Furthermore, each function follows one of two patterns. All the match functions call re.search(), and all the apply functions call re.sub(). Let’s factor out the patterns so that defining new rules can be easier.
import re
+import re
def build_match_and_apply_functions(pattern, search, replace):
def matches_rule(word): ①
@@ -222,7 +222,7 @@ def build_match_and_apply_functions(pattern, search, replace):
If this is incredibly confusing (and it should be, this is weird stuff), it may become clearer when you see how to use it.
-
patterns = \ ①
+patterns = \ ①
(
('[sxz]$', '$', 'es'),
('[^aeioudgkprt]h$', '$', 'es'),
@@ -239,7 +239,7 @@ def build_match_and_apply_functions(pattern, search, replace):
Rounding out this version of the script is the main entry point, the plural() function.
-
def plural(noun):
+def plural(noun):
for matches_rule, apply_rule in rules: ①
if matches_rule(noun):
return apply_rule(noun)
@@ -256,7 +256,7 @@ def build_match_and_apply_functions(pattern, search, replace):
First, let’s create a text file that contains the rules you want. No fancy data structures, just whitespace-delimited strings in three columns. Let’s call it plural4-rules.txt.
[download plural4-rules.txt]
-
[sxz]$ $ es
+[sxz]$ $ es
[^aeioudgkprt]h$ $ es
[^aeiou]y$ y$ ies
$ $ s
@@ -264,7 +264,7 @@ $ $ s
Now let’s see how you can use this rules file.
import re
+import re
def build_match_and_apply_functions(pattern, search, replace): ①
def matches_rule(word):
@@ -295,7 +295,7 @@ rules = []
Wouldn’t it be grand to have a generic plural() function that parses the rules file? Get rules, check for a match, apply appropriate transformation, go to next rule. That’s all the plural() function has to do, and that’s all the plural() function should do.
def rules(rules_filename):
+def rules(rules_filename):
with open('plural5-rules.txt', encoding='utf-8') as pattern_file:
for line in pattern_file:
pattern, search, replace = line.split(None, 3)
@@ -343,7 +343,7 @@ def plural(noun, rules_filename='plural5-rules.txt'):
A Fibonacci Generator
def fib(max):
+def fib(max):
a, b = 0, 1 ①
while a < max:
yield a ②
@@ -375,7 +375,7 @@ def plural(noun, rules_filename='plural5-rules.txt'):
Let’s go back to plural5.py and see how this version of the plural() function works.
-
def rules(rules_filename):
+def rules(rules_filename):
with open(rules_filename, encoding='utf-8') as pattern_file:
for line in pattern_file:
pattern, search, replace = line.split(None, 3) ①
diff --git a/iterators.html b/iterators.html
index 50eb560..4974cde 100755
--- a/iterators.html
+++ b/iterators.html
@@ -25,7 +25,7 @@ body{counter-reset:h1 7}
Remember the Fibonacci generator? Here it is as a built-from-scratch iterator:
class Fib:
+class Fib:
'''iterator that yields numbers in the Fibonacci sequence'''
def __init__(self, max):
@@ -45,7 +45,7 @@ body{counter-reset:h1 7}
Let’s take that one line at a time.
-
class Fib:
+class Fib:
class? What’s a class?
@@ -57,7 +57,7 @@ body{counter-reset:h1 7}
Defining a class in Python is simple. As with functions, there is no separate interface definition. Just define the class and start coding. A Python class starts with the reserved word class, followed by the class name. Technically, that’s all that’s required, since a class doesn’t need to inherit from any other class.
-
class PapayaWhip: ①
+class PapayaWhip: ①
pass ②
- The name of this class is
PapayaWhip, and it doesn’t inherit from any other class. Class names are usually capitalized, EachWordLikeThis, but this is only a convention, not a requirement.
@@ -76,7 +76,7 @@ body{counter-reset:h1 7}
This example shows the initialization of the Fib class using the __init__ method.
-
class Fib:
+class Fib:
'''iterator that yields numbers in the Fibonacci sequence''' ①
def __init__(self, max): ②
@@ -120,14 +120,14 @@ body{counter-reset:h1 7}
On to the next line:
-
class Fib:
+class Fib:
def __init__(self, max):
self.max = max ①
- What is self.max? It’s an instance variable. It is completely separate from max, which was passed into the
__init__() method as an argument. self.max is “global” to the instance. That means that you can access it from other methods.
-class Fib:
+class Fib:
def __init__(self, max):
self.max = max ①
.
@@ -163,7 +163,7 @@ All three of these class methods, __init__, __iter__,
class Fib: ①
+class Fib: ①
def __init__(self, max): ②
self.max = max
@@ -214,7 +214,7 @@ All three of these class methods, __init__, __iter__,
Now it’s time for the finale. Let’s rewrite the plural rules generator as an iterator.
class LazyRules:
+class LazyRules:
rules_filename = 'plural6-rules.txt'
def __init__(self):
@@ -250,7 +250,7 @@ rules = LazyRules()
Let’s take the class one bite at a time.
-
class LazyRules:
+class LazyRules:
rules_filename = 'plural6-rules.txt'
def __init__(self):
@@ -297,7 +297,7 @@ rules = LazyRules()
And now back to our show.
-
def __iter__(self): ①
+ def __iter__(self): ①
self.cache_index = 0 ②
return self ③
@@ -307,7 +307,7 @@ rules = LazyRules()
- Finally, the
__iter__() method returns self, which signals that this class will take care of returning its own values throughout an iteration.
- def __next__(self): ①
+ def __next__(self): ①
.
.
.
@@ -324,7 +324,7 @@ rules = LazyRules()
Moving backwards…
-
def __next__(self):
+ def __next__(self):
.
.
.
@@ -343,7 +343,7 @@ rules = LazyRules()
Moving backwards all the way to the start of the __next__() method…
-
def __next__(self):
+ def __next__(self):
self.cache_index += 1
if len(self.cache) >= self.cache_index:
return self.cache[self.cache_index - 1] ①
diff --git a/j/prettify.js b/j/prettify.js
index 879900f..f63bbae 100644
--- a/j/prettify.js
+++ b/j/prettify.js
@@ -11,11 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-//
-// Changes from upstream:
-// - use class=pp instead of class=prettyprint to declare blocks-to-colorize
-// - removed support for
-// - added support for and
+
/**
* @fileoverview
@@ -36,6 +32,9 @@
*
* 2) define style rules. See the example page for examples.
* 3) mark the <pre> and <code> tags in your source with class=pp.
+ * You can also use the (html deprecated) <xmp> tag, but the pretty printer
+ * needs to do more substantial DOM manipulations to support that, so some
+ * css styles may not be preserved.
* That's it. I wanted to keep the API as simple as possible, so there's no
* need to specify which language the code is in.
*
@@ -269,6 +268,11 @@ window['_pr_isIE6'] = function () {
.replace(pr_nbspEnt, ' ');
}
+ /** is the given node's innerHTML normally unescaped? */
+ function isRawContent(node) {
+ return 'XMP' === node.tagName;
+ }
+
function normalizedHtml(node, out) {
switch (node.nodeType) {
case 1: // an element
@@ -541,6 +545,10 @@ window['_pr_isIE6'] = function () {
if (PR_innerHtmlWorks) {
var content = node.innerHTML;
+ // XMP tags contain unescaped entities so require special handling.
+ if (isRawContent(node)) {
+ content = textToHtml(content);
+ }
return content;
}
@@ -603,13 +611,14 @@ window['_pr_isIE6'] = function () {
'[^<]+' // A run of characters other than '<'
+ '|<\!--[\\s\\S]*?--\>' // an HTML comment
+ '|' // a CDATA section
- + '|?[a-zA-Z][^>]*>' // a probable tag that should not be highlighted
+ // a probable tag that should not be highlighted
+ + '|<\/?[a-zA-Z](?:[^>\"\']|\'[^\']*\'|\"[^\"]*\")*>'
+ '|<', // A '<' that does not begin a larger chunk
'g');
var pr_commentPrefix = /^<\!--/;
- var pr_cdataPrefix = /^<\[CDATA\[/;
+ var pr_cdataPrefix = /^= 0) {
nested = true;
break;
@@ -1348,7 +1359,31 @@ window['_pr_isIE6'] = function () {
var cs = prettyPrintingJob.sourceNode;
// push the prettified html back into the tag.
- cs.innerHTML = newContent;
+ if (!isRawContent(cs)) {
+ // just replace the old html with the new
+ cs.innerHTML = newContent;
+ } else {
+ // we need to change the tag to a <pre> since <xmp>s do not allow
+ // embedded tags such as the span tags used to attach styles to
+ // sections of source code.
+ var pre = document.createElement('PRE');
+ for (var i = 0; i < cs.attributes.length; ++i) {
+ var a = cs.attributes[i];
+ if (a.specified) {
+ var aname = a.name.toLowerCase();
+ if (aname === 'class') {
+ pre.className = a.value; // For IE 6
+ } else {
+ pre.setAttribute(a.name, a.value);
+ }
+ }
+ }
+ pre.innerHTML = newContent;
+
+ // remove the old
+ cs.parentNode.replaceChild(pre, cs);
+ cs = pre;
+ }
// Replace <br>s with line-feeds so that copying and pasting works
// on IE 6.
diff --git a/native-datatypes.html b/native-datatypes.html
index f752685..fa398a0 100755
--- a/native-datatypes.html
+++ b/native-datatypes.html
@@ -40,7 +40,7 @@ body{counter-reset:h1 2}
Booleans are either true or false. Python has two constants, cleverly named True and False, which can be used to assign boolean values directly. Expressions can also evaluate to a boolean value. In certain places (like if statements), Python expects an expression to evaluate to a boolean value. These places are called boolean contexts. You can use virtually any expression in a boolean context, and Python will try to determine its truth value. Different datatypes have different rules about which values are true or false in a boolean context. (This will make more sense once you see some concrete examples later in this chapter.)
For example, take this snippet from humansize.py:
-
if size < 0:
+if size < 0:
raise ValueError('number must be non-negative')
size is an integer, 0 is an integer, and < is a numerical operator. The result of the expression size < 0 is always a boolean. You can test this yourself in the Python interactive shell:
@@ -865,7 +865,7 @@ KeyError: 'db.diveintopython3.org'
Mixed-Value Dictionaries
Dictionaries aren’t just for strings. Dictionary values can be any datatype, including integers, booleans, arbitrary objects, or even other dictionaries. And within a single dictionary, the values don’t all need to be the same type; you can mix and match as needed. Dictionary keys are more restricted, but they can be strings, integers, and a few other types. You can also mix and match key datatypes within a dictionary.
In fact, you’ve already seen a dictionary with non-string keys and values, in your first Python program.
-
SUFFIXES = {1000: ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'],
+SUFFIXES = {1000: ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'],
1024: ['KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']}
Let's tear that apart in the interactive shell.
diff --git a/packaging.html b/packaging.html
index 6b37659..9f162c0 100644
--- a/packaging.html
+++ b/packaging.html
@@ -29,7 +29,7 @@ mark{display:inline}
In this chapter, you’ll learn how the setup scripts for chardet and httplib2 work, and you’ll step through the process of releasing your own Python software.
-
# chardet's setup.py
+# chardet's setup.py
from distutils.core import setup
setup(
name = "chardet",
@@ -157,7 +157,7 @@ chardet/
The first line of every Distutils setup script is always the same:
-
from distutils.core import setup
+from distutils.core import setup
This imports the setup() function, which is the main entry point into Distutils. 95% of all Distutils setup scripts consist of a single call to setup() and nothing else. (I totally just made up that statistic, but if your Distutils setup script is doing more than calling the Distutils setup() function, you should have a good reason. Do you have a good reason? I didn’t think so.)
@@ -187,7 +187,7 @@ chardet/
Now let’s look at the chardet setup script. It has all of these required and recommended parameters, plus one I haven’t mentioned yet: packages.
-
from distutils.core import setup
+from distutils.core import setup
setup(
name = 'chardet',
packages = ['chardet'],
diff --git a/porting-code-to-python-3-with-2to3.html b/porting-code-to-python-3-with-2to3.html
index 682cfc7..0db8976 100644
--- a/porting-code-to-python-3-with-2to3.html
+++ b/porting-code-to-python-3-with-2to3.html
@@ -246,7 +246,7 @@ td pre{padding:0;border:0}
import cookielib
import http.cookiejar
④
- import BaseHTTPServer
+import BaseHTTPServer
import SimpleHTTPServer
import CGIHttpServer
import http.server
@@ -280,14 +280,14 @@ import CGIHttpServer
import robotparser
import urllib.robotparser
⑤
- from urllib import FancyURLopener
+from urllib import FancyURLopener
from urllib import urlencode
-from urllib.request import FancyURLopener
+from urllib.request import FancyURLopener
from urllib.parse import urlencode
⑥
- from urllib2 import Request
+from urllib2 import Request
from urllib2 import HTTPError
-from urllib.request import Request
+from urllib.request import Request
from urllib.error import HTTPError
@@ -307,9 +307,9 @@ from urllib.error import HTTPError
Python 2
Python 3
- import urllib
+import urllib
print urllib.urlopen('http://diveintopython3.org/').read()
-import urllib.request, urllib.parse, urllib.error
+import urllib.request, urllib.parse, urllib.error
print(urllib.request.urlopen('http://diveintopython3.org/').read())
@@ -334,7 +334,7 @@ print(urllib.request.urlopen('http://diveintopython3.org/').read())
import dumbdbm
import dbm.dumb
- import anydbm
+import anydbm
import whichdb
import dbm
@@ -351,7 +351,7 @@ import whichdb
import xmlrpclib
import xmlrpc.client
- import DocXMLRPCServer
+import DocXMLRPCServer
import SimpleXMLRPCServer
import xmlrpc.server
@@ -363,13 +363,13 @@ import SimpleXMLRPCServer
Python 2
Python 3
①
- try:
+try:
import cStringIO as StringIO
except ImportError:
import StringIO
import io
②
- try:
+try:
import cPickle as pickle
except ImportError:
import pickle
@@ -456,22 +456,22 @@ except ImportError:
a_function_that_returns_an_iterator().next()
next(a_function_that_returns_an_iterator())
③
- class A:
+class A:
def next(self):
pass
-class A:
+class A:
def __next__(self):
pass
④
- class A:
+class A:
def next(self, x, y):
pass
no change
⑤
- next = 42
+next = 42
for an_iterator in a_sequence_of_iterators:
an_iterator.next()
-next = 42
+next = 42
for an_iterator in a_sequence_of_iterators:
an_iterator.__next__()
@@ -560,7 +560,7 @@ for an_iterator in a_sequence_of_iterators:
Python 3
reduce(a, b, c)
-from functools import reduce
+from functools import reduce
reduce(a, b, c)
@@ -674,31 +674,31 @@ reduce(a, b, c)
Python 2
Python 3
①
- try:
+try:
import mymodule
except ImportError, e
pass
-try:
+try:
import mymodule
except ImportError as e:
pass
②
- try:
+try:
import mymodule
except (RuntimeError, ImportError), e
pass
-try:
+try:
import mymodule
except (RuntimeError, ImportError) as e:
pass
③
- try:
+try:
import mymodule
except ImportError:
pass
no change
④
- try:
+try:
import mymodule
except:
pass
@@ -951,14 +951,14 @@ except:
Python 2
Python 3
①
- class A:
+class A:
def __nonzero__(self):
pass
-class A:
+class A:
def __bool__(self):
pass
②
- class A:
+class A:
def __nonzero__(self, x, y):
pass
no change
@@ -1233,18 +1233,18 @@ except:
Python 2
Python 3
①
- class C(metaclass=PapayaMeta):
+class C(metaclass=PapayaMeta):
pass
unchanged
②
- class Whip:
+class Whip:
__metaclass__ = PapayaMeta
-class Whip(metaclass=PapayaMeta):
+class Whip(metaclass=PapayaMeta):
pass
③
- class C(Whipper, Beater):
+class C(Whipper, Beater):
__metaclass__ = PapayaMeta
-class C(Whipper, Beater, metaclass=PapayaMeta):
+class C(Whipper, Beater, metaclass=PapayaMeta):
pass
@@ -1335,9 +1335,9 @@ except:
After
- while 1:
+while 1:
do_stuff()
-while True:
+while True:
do_stuff()
type(x) == T
@@ -1346,10 +1346,10 @@ except:
type(x) is T
isinstance(x, T)
- a_list = list(a_sequence)
+