diff --git a/advanced-classes.html b/advanced-classes.html index 7b5f163..05b7f6c 100644 --- a/advanced-classes.html +++ b/advanced-classes.html @@ -105,29 +105,29 @@ class OrderedDict(dict, collections.MutableMapping):
FIXME
->>> import ordereddict ->>> od = ordereddict.OrderedDict() ->>> klass = od.__class__ ① ->>> type(klass) -<class 'abc.ABCMeta'> ->>> klass.__name__ -'OrderedDict' +>>> import ordereddict +>>> od = ordereddict.OrderedDict() +>>> klass = od.__class__ ① +>>> type(klass) +<class 'abc.ABCMeta'> +>>> klass.__name__ +'OrderedDict' ->>> klass.__module__ -'ordereddict' ->>> klass.__bases__ -(<class 'dict'>, <class '_abcoll.MutableMapping'>)+>>> klass.__module__ +'ordereddict' +>>> klass.__bases__ +(<class 'dict'>, <class '_abcoll.MutableMapping'>)
# continued from previous example
->>> klass.__dict__
-{'__abstractmethods__': frozenset(),
+>>> klass.__dict__
+{'__abstractmethods__': frozenset(),
'__delitem__': <function __delitem__ at 0x00DCB6A8>,
'__dict__': <attribute '__dict__' of 'OrderedDict' objects>,
'__doc__': None,
diff --git a/advanced-iterators.html b/advanced-iterators.html
index 10b2529..c626212 100644
--- a/advanced-iterators.html
+++ b/advanced-iterators.html
@@ -90,11 +90,11 @@ if __name__ == '__main__':
The first thing this alphametics solver does is find all the letters (A–Z) in the puzzle.
->>> import re
->>> re.findall('[0-9]+', '16 2-by-4s in rows of 8') ①
-['16', '2', '4', '8']
->>> re.findall('[A-Z]+', 'SEND + MORE == MONEY') ②
-['SEND', 'MORE', 'MONEY']
+>>> import re
+>>> re.findall('[0-9]+', '16 2-by-4s in rows of 8') ①
+['16', '2', '4', '8']
+>>> re.findall('[A-Z]+', 'SEND + MORE == MONEY') ②
+['SEND', 'MORE', 'MONEY']
The re module is Python’s implementation of regular expressions. It has a nifty function called findall() which takes a regular expression pattern and a string, and finds all occurrences of the pattern within the string. In this case, the pattern matches sequences of numbers. The findall() function returns a list of all the substrings that matched the pattern.
Set comprehensions make it trivial to find the unique items in a sequence. [FIXME-not sure if I’m going to cover set comprehensions in an earlier chapter; if not, this is certainly an abrupt and inadequate introduction to the topic.]
->>> a_list = ['a', 'c', 'b', 'a', 'd', 'b'] ->>> {c for c in a_list} ① -{'a', 'c', 'b', 'd'} ->>> a_string = 'EAST IS EAST' ->>> {c for c in a_string} ② -{'A', ' ', 'E', 'I', 'S', 'T'} ->>> words = ['SEND', 'MORE', 'MONEY'] ->>> ''.join(words) ③ -'SENDMOREMONEY' ->>> {c for c in ''.join(words)} ④ -{'E', 'D', 'M', 'O', 'N', 'S', 'R', 'Y'}+>>> a_list = ['a', 'c', 'b', 'a', 'd', 'b'] +>>> {c for c in a_list} ① +{'a', 'c', 'b', 'd'} +>>> a_string = 'EAST IS EAST' +>>> {c for c in a_string} ② +{'A', ' ', 'E', 'I', 'S', 'T'} +>>> words = ['SEND', 'MORE', 'MONEY'] +>>> ''.join(words) ③ +'SENDMOREMONEY' +>>> {c for c in ''.join(words)} ④ +{'E', 'D', 'M', 'O', 'N', 'S', 'R', 'Y'}
for loop. Take the first item from the list, put it in the set. Second. Third. Fourth — wait, that’s in the set already, so it only gets listed once. Fifth. Sixth — again, a duplicate, so it only gets listed once. The end result? All the unique items in the original list, without any duplicates. The original list doesn’t even need to be sorted first.
Like many programming languages, Python has an assert statement. Here’s how it works.
->>> assert 1 + 1 == 2 ① ->>> assert 1 + 1 == 3 ② +>>> assert 1 + 1 == 2 ① +>>> assert 1 + 1 == 3 ② Traceback (most recent call last): File "<stdin>", line 1, in@@ -168,16 +168,16 @@ AssertionErrorAssertionError
A generator expression is like a generator function without the function.
->>> unique_characters = {'E', 'D', 'M', 'O', 'N', 'S', 'R', 'Y'}
->>> gen = (ord(c) for c in unique_characters) ①
->>> gen ②
-<generator object <genexpr> at 0x00BADC10>
->>> next(gen) ③
-69
->>> next(gen)
-68
->>> tuple(ord(c) for c in unique_characters) ④
-(69, 68, 77, 79, 78, 83, 82, 89)
+>>> unique_characters = {'E', 'D', 'M', 'O', 'N', 'S', 'R', 'Y'}
+>>> gen = (ord(c) for c in unique_characters) ①
+>>> gen ②
+<generator object <genexpr> at 0x00BADC10>
+>>> next(gen) ③
+69
+>>> next(gen)
+68
+>>> tuple(ord(c) for c in unique_characters) ④
+(69, 68, 77, 79, 78, 83, 82, 89)
The idea is that you take a list of things (could be numbers, could be letters, could be dancing bears) and find all the possible ways to split them up into smaller lists. All the smaller lists have the same size, which can be as small as 1 and as large as the total number of items. Oh, and nothing can be repeated. Mathematicians say things like “let’s find the permutations of 3 different items taken 2 at a time,” which means you have a sequence of 3 items and you want to find all the possible ordered pairs.
->>> import itertools ① ->>> perms = itertools.permutations([1, 2, 3], 2) ② ->>> next(perms) ③ -(1, 2) ->>> next(perms) -(1, 3) ->>> next(perms) -(2, 1) ④ ->>> next(perms) -(2, 3) ->>> next(perms) -(3, 1) ->>> next(perms) -(3, 2) ->>> next(perms) ⑤ +>>> import itertools ① +>>> perms = itertools.permutations([1, 2, 3], 2) ② +>>> next(perms) ③ +(1, 2) +>>> next(perms) +(1, 3) +>>> next(perms) +(2, 1) ④ +>>> next(perms) +(2, 3) +>>> next(perms) +(3, 1) +>>> next(perms) +(3, 2) +>>> next(perms) ⑤ Traceback (most recent call last): File "<stdin>", line 1, in@@ -231,26 +231,26 @@ StopIterationStopIteration
The permutations() function doesn’t have to take a list. It can take any sequence — even a string.
->>> import itertools ->>> perms = itertools.permutations('ABC', 3) ① ->>> next(perms) -('A', 'B', 'C') ② ->>> next(perms) -('A', 'C', 'B') ->>> next(perms) -('B', 'A', 'C') ->>> next(perms) -('B', 'C', 'A') ->>> next(perms) -('C', 'A', 'B') ->>> next(perms) -('C', 'B', 'A') ->>> next(perms) +>>> import itertools +>>> perms = itertools.permutations('ABC', 3) ① +>>> next(perms) +('A', 'B', 'C') ② +>>> next(perms) +('A', 'C', 'B') +>>> next(perms) +('B', 'A', 'C') +>>> next(perms) +('B', 'C', 'A') +>>> next(perms) +('C', 'A', 'B') +>>> next(perms) +('C', 'B', 'A') +>>> next(perms) Traceback (most recent call last): File "<stdin>", line 1, inStopIteration ->>> list(itertools.permutations('ABC', 3)) ③ -[('A', 'B', 'C'), ('A', 'C', 'B'), +>>> list(itertools.permutations('ABC', 3)) ③ +[('A', 'B', 'C'), ('A', 'C', 'B'), ('B', 'A', 'C'), ('B', 'C', 'A'), ('C', 'A', 'B'), ('C', 'B', 'A')]
itertools Module->>> import itertools ->>> list(itertools.product('ABC', '123')) ① -[('A', '1'), ('A', '2'), ('A', '3'), +>>> import itertools +>>> list(itertools.product('ABC', '123')) ① +[('A', '1'), ('A', '2'), ('A', '3'), ('B', '1'), ('B', '2'), ('B', '3'), ('C', '1'), ('C', '2'), ('C', '3')] ->>> list(itertools.combinations('ABC', 2)) ② -[('A', 'B'), ('A', 'C'), ('B', 'C')]+>>> list(itertools.combinations('ABC', 2)) ② +[('A', 'B'), ('A', 'C'), ('B', 'C')]
itertools.product() function returns an iterator containing the Cartesian product of two sequences.
itertools.combinations() function returns an iterator containing all the possible combinations of the given sequence of the given length. This is like the itertools.permutations() function, except combinations don’t include items that are duplicates of other items in a different order. So itertools.permutations('ABC', 2) will return both ('A', 'B') and ('B', 'A') (among others), but itertools.combinations('ABC', 2) will not return ('B', 'A') because it is a duplicate of ('A', 'B') in a different order.
@@ -277,21 +277,21 @@ StopIteration
[download favorite-people.txt]
->>> names = list(open('examples/favorite-people.txt')) ① ->>> names -['Dora\n', 'Ethan\n', 'Wesley\n', 'John\n', 'Anne\n', +>>> names = list(open('examples/favorite-people.txt')) ① +>>> names +['Dora\n', 'Ethan\n', 'Wesley\n', 'John\n', 'Anne\n', 'Mike\n', 'Chris\n', 'Sarah\n', 'Alex\n', 'Lizzie\n'] ->>> names = [name.rstrip() for name in names] ② ->>> names -['Dora', 'Ethan', 'Wesley', 'John', 'Anne', +>>> names = [name.rstrip() for name in names] ② +>>> names +['Dora', 'Ethan', 'Wesley', 'John', 'Anne', 'Mike', 'Chris', 'Sarah', 'Alex', 'Lizzie'] ->>> names = sorted(names) ③ ->>> names -['Alex', 'Anne', 'Chris', 'Dora', 'Ethan', +>>> names = sorted(names) ③ +>>> names +['Alex', 'Anne', 'Chris', 'Dora', 'Ethan', 'John', 'Lizzie', 'Mike', 'Sarah', 'Wesley'] ->>> names = sorted(names, key=len) ④ ->>> names -['Alex', 'Anne', 'Dora', 'John', 'Mike', +>>> names = sorted(names, key=len) ④ +>>> names +['Alex', 'Anne', 'Dora', 'John', 'Mike', 'Chris', 'Ethan', 'Sarah', 'Lizzie', 'Wesley']
…continuing from the previous interactive shell… ->>> import itertools ->>> groups = itertools.groupby(names, len) ① ->>> groups -<itertools.groupby object at 0x00BB20C0> ->>> list(groups) -[(4, <itertools._grouper object at 0x00BA8BF0>), +>>> import itertools +>>> groups = itertools.groupby(names, len) ① +>>> groups +<itertools.groupby object at 0x00BB20C0> +>>> list(groups) +[(4, <itertools._grouper object at 0x00BA8BF0>), (5, <itertools._grouper object at 0x00BB4050>), (6, <itertools._grouper object at 0x00BB4030>)] ->>> groups = itertools.groupby(names, len) ->>> for name_length, name_iter in groups: ② -... print('Names with {0:d} letters:'.format(name_length)) -... for name in name_iter: -... print(name) +>>> groups = itertools.groupby(names, len) +>>> for name_length, name_iter in groups: ② +... print('Names with {0:d} letters:'.format(name_length)) +... for name in name_iter: +... print(name) ... Names with 4 letters: Alex @@ -338,18 +338,18 @@ Wesley
Are you watching closely?
->>> list(range(0, 3)) -[0, 1, 2] ->>> list(range(10, 13)) -[10, 11, 12] ->>> list(itertools.chain(range(0, 3), range(10, 13))) ① -[0, 1, 2, 10, 11, 12] ->>> list(zip(range(0, 3), range(10, 13))) ② -[(0, 10), (1, 11), (2, 12)] ->>> list(zip(range(0, 3), range(10, 14))) ③ -[(0, 10), (1, 11), (2, 12)] ->>> list(itertools.zip_longest(range(0, 3), range(10, 14))) ④ -[(0, 10), (1, 11), (2, 12), (None, 13)]+>>> list(range(0, 3)) +[0, 1, 2] +>>> list(range(10, 13)) +[10, 11, 12] +>>> list(itertools.chain(range(0, 3), range(10, 13))) ① +[0, 1, 2, 10, 11, 12] +>>> list(zip(range(0, 3), range(10, 13))) ② +[(0, 10), (1, 11), (2, 12)] +>>> list(zip(range(0, 3), range(10, 14))) ③ +[(0, 10), (1, 11), (2, 12)] +>>> list(itertools.zip_longest(range(0, 3), range(10, 14))) ④ +[(0, 10), (1, 11), (2, 12), (None, 13)]
itertools.chain() function takes two iterators and returns an iterator that contains all the items from the first iterator, followed by all the items from the second iterator. (Actually, it can take any number of iterators, and it chains them all in the order they were passed to the function.)
The zip() function does something prosaic that turns out to be extremely useful: it takes any number of sequences and returns an iterator with the first items of each sequence, then the second items of each, then the third, and so on.
@@ -360,13 +360,13 @@ Wesley
OK, that was all very interesting, but how does it relate to the alphametics solver? Here’s how:
->>> characters = ('S', 'M', 'E', 'D', 'O', 'N', 'R', 'Y')
->>> guess = ('1', '2', '0', '3', '4', '5', '6', '7')
->>> tuple(zip(characters, guess)) ①
-(('S', '1'), ('M', '2'), ('E', '0'), ('D', '3'),
+>>> characters = ('S', 'M', 'E', 'D', 'O', 'N', 'R', 'Y')
+>>> guess = ('1', '2', '0', '3', '4', '5', '6', '7')
+>>> tuple(zip(characters, guess)) ①
+(('S', '1'), ('M', '2'), ('E', '0'), ('D', '3'),
('O', '4'), ('N', '5'), ('R', '6'), ('Y', '7'))
->>> dict(zip(characters, guess)) ②
-{'E': '0', 'D': '3', 'M': '2', 'O': '4',
+>>> dict(zip(characters, guess)) ②
+{'E': '0', 'D': '3', 'M': '2', 'O': '4',
'N': '5', 'S': '1', 'R': '6', 'Y': '7'}
zip function will create a pairing of letters and digits, in order.
@@ -391,11 +391,11 @@ for guess in itertools.permutations(digits, len(characters)):
Python strings have many methods. You learned about some of those methods in the Strings chapter: lower(), count(), and format(). Now I want to introduce you to a powerful but little-known string manipulation technique: the translate() method.
->>> translation_table = {ord('A'): ord('O')} ① ->>> translation_table ② -{65: 79} ->>> 'MARK'.translate(translation_table) ③ -'MORK'+>>> translation_table = {ord('A'): ord('O')} ① +>>> translation_table ② +{65: 79} +>>> 'MARK'.translate(translation_table) ③ +'MORK'
ord() function returns the ASCII value of a character, which, in the case of A–Z, is always a byte from 65 to 90.
@@ -405,17 +405,17 @@ for guess in itertools.permutations(digits, len(characters)):
What does this have to do with solving alphametic puzzles? As it turns out, everything.
->>> characters = tuple(ord(c) for c in 'SMEDONRY') ① ->>> characters -(83, 77, 69, 68, 79, 78, 82, 89) ->>> guess = tuple(ord(c) for c in '91570682') ② ->>> guess -(57, 49, 53, 55, 48, 54, 56, 50) ->>> translation_table = dict(zip(characters, guess)) ③ ->>> translation_table -{68: 55, 69: 53, 77: 49, 78: 54, 79: 48, 82: 56, 83: 57, 89: 50} ->>> 'SEND + MORE == MONEY'.translate(translation_table) ④ -'9567 + 1085 == 10652'+>>> characters = tuple(ord(c) for c in 'SMEDONRY') ① +>>> characters +(83, 77, 69, 68, 79, 78, 82, 89) +>>> guess = tuple(ord(c) for c in '91570682') ② +>>> guess +(57, 49, 53, 55, 48, 54, 56, 50) +>>> translation_table = dict(zip(characters, guess)) ③ +>>> translation_table +{68: 55, 69: 53, 77: 49, 78: 54, 79: 48, 82: 56, 83: 57, 89: 50} +>>> 'SEND + MORE == MONEY'.translate(translation_table) ④ +'9567 + 1085 == 10652'
alphametics.solve() function.
itertools.permutations() function in the alphametics.solve() function.
@@ -432,36 +432,36 @@ for guess in itertools.permutations(digits, len(characters)):
This is the final piece of the puzzle (or rather, the final piece of the puzzle solver). After all that fancy string manipulation, we’re left with a string like '9567 + 1085 == 10652'. But that’s a string, and what good is a string? Enter eval(), the universal Python evaluation tool.
->>> eval('1 + 1 == 2')
-True
->>> eval('1 + 1 == 3')
-False
->>> eval('9567 + 1085 == 10652')
-True
+>>> eval('1 + 1 == 2')
+True
+>>> eval('1 + 1 == 3')
+False
+>>> eval('9567 + 1085 == 10652')
+True
But wait, there’s more! The eval() function isn’t limited to boolean expressions. It can handle any Python expression and returns any datatype.
->>> eval('"A" + "B"')
-'AB'
->>> eval('"MARK".translate({65: 79})')
-'MORK'
->>> eval('"AAAAA".count("A")')
-5
->>> eval('["*"] * 5')
-['*', '*', '*', '*', '*']
+>>> eval('"A" + "B"')
+'AB'
+>>> eval('"MARK".translate({65: 79})')
+'MORK'
+>>> eval('"AAAAA".count("A")')
+5
+>>> eval('["*"] * 5')
+['*', '*', '*', '*', '*']
But wait, that’s not all!
->>> x = 5 ->>> eval("x * 5") ① -25 ->>> eval("pow(x, 2)") ② -25 ->>> import math ->>> eval("math.sqrt(x)") ③ -2.2360679774997898+>>> x = 5 +>>> eval("x * 5") ① +25 +>>> eval("pow(x, 2)") ② +25 +>>> import math +>>> eval("math.sqrt(x)") ③ +2.2360679774997898
The expression that eval() takes can reference global variables defined outside the eval(). If called within a function, it can reference local variables too.
Hey, wait a minute…
->>> import subprocess ->>> eval("subprocess.getoutput('ls ~')") ① -'Desktop Library Pictures \ +>>> import subprocess +>>> eval("subprocess.getoutput('ls ~')") ① +'Desktop Library Pictures \ Documents Movies Public \ Music Sites' ->>> eval("subprocess.getoutput('rm -rf /')") ②+>>> eval("subprocess.getoutput('rm -rf /')") ②
subprocess module allows you to run arbitrary shell commands and get the result as a Python string.
It’s even worse than that, because there’s a global __import__() function that takes a module name as a string, imports the module, and returns a reference to it. Combined with the power of eval(), you can construct a single expression that will wipe out all your files:
->>> eval("__import__('subprocess').getoutput('rm -rf /')") ①+>>> eval("__import__('subprocess').getoutput('rm -rf /')") ①
But surely there’s some way to evaluate expressions safely? To put eval() in a sandbox where it can’t access or harm the outside world? Well, yeah, but it’s tricky.
->>> x = 5 ->>> eval("x * 5", {}, {}) ① +>>> x = 5 +>>> eval("x * 5", {}, {}) ① Traceback (most recent call last): File "<stdin>", line 1, in <module> File "<string>", line 1, in <module> NameError: name 'x' is not defined ->>> eval("x * 5", {"x": x}, {}) ② ->>> import math ->>> eval("math.sqrt(x)", {"x": x}, {}) ② +>>> eval("x * 5", {"x": x}, {}) ② +>>> import math +>>> eval("math.sqrt(x)", {"x": x}, {}) ② Traceback (most recent call last): File "<stdin>", line 1, in <module> File "<string>", line 1, in <module> @@ -519,10 +519,10 @@ NameError: name 'math' is not defined
Gee, that was easy. Lemme make an alphametics web service now!
->>> eval("pow(5, 2)", {}, {}) ① -25 ->>> eval("__import__('math').sqrt(5)", {}, {}) ② -2.2360679774997898+>>> eval("pow(5, 2)", {}, {}) ① +25 +>>> eval("__import__('math').sqrt(5)", {}, {}) ② +2.2360679774997898
pow(5, 2) works, because 5 and 2 are literals, and pow() is a built-in function.
__import__() function is also a built-in function, so it works too.
@@ -531,7 +531,7 @@ NameError: name 'math' is not defined
Yeah, that means you can still do nasty things, even if you explicitly set the global and local namespaces to empty dictionaries when calling eval():
->>> eval("__import__('subprocess').getoutput('rm -rf /')", {}, {}) ①+>>> eval("__import__('subprocess').getoutput('rm -rf /')", {}, {}) ①
Oops. I’m glad I didn’t make that alphametics web service. Is there any way to use eval() safely?
->>> eval("__import__('math').sqrt(5)",
-... {"__builtins__":None}, {}) ①
+>>> eval("__import__('math').sqrt(5)",
+... {"__builtins__":None}, {}) ①
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<string>", line 1, in <module>
NameError: name '__import__' is not defined
->>> eval("__import__('subprocess').getoutput('rm -rf /')",
-... {"__builtins__":None}, {}) ②
+>>> eval("__import__('subprocess').getoutput('rm -rf /')",
+... {"__builtins__":None}, {}) ②
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<string>", line 1, in <module>
diff --git a/case-study-porting-chardet-to-python-3.html b/case-study-porting-chardet-to-python-3.html
index 9601890..7f6c5bc 100644
--- a/case-study-porting-chardet-to-python-3.html
+++ b/case-study-porting-chardet-to-python-3.html
@@ -795,23 +795,23 @@ TypeError: unsupported operand type(s) for +: 'int' and 'bytes'
This error doesn’t occur the first time the feed() method gets called; it occurs the second time, after self._mLastChar has been set to the last byte of aBuf. Well, what’s the problem with that? Getting a single element from a byte array yields an integer, not a byte array. To see the difference, follow me to the interactive shell:
->>> aBuf = b'\xEF\xBB\xBF' ① ->>> len(aBuf) -3 ->>> mLastChar = aBuf[-1] ->>> mLastChar ② -191 ->>> type(mLastChar) ③ -<class 'int'> ->>> mLastChar + aBuf ④ +>>> aBuf = b'\xEF\xBB\xBF' ① +>>> len(aBuf) +3 +>>> mLastChar = aBuf[-1] +>>> mLastChar ② +191 +>>> type(mLastChar) ③ +<class 'int'> +>>> mLastChar + aBuf ④ Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: unsupported operand type(s) for +: 'int' and 'bytes' ->>> mLastChar = aBuf[-1:] ⑤ ->>> mLastChar -b'\xbf' ->>> mLastChar + aBuf ⑥ -b'\xbf\xef\xbb\xbf'+>>> mLastChar = aBuf[-1:] ⑤ +>>> mLastChar +b'\xbf' +>>> mLastChar + aBuf ⑥ +b'\xbf\xef\xbb\xbf'
C:\home\chardet> python test.py tests\*\*
tests\ascii\howto.diveintomark.org.xml ascii with confidence 1.0
tests\Big5\0804.blogspot.com.xml
-Traceback (most recent call last):
+Traceback (most recent call last):
File "test.py", line 10, in <module>
u.feed(line)
File "C:\home\chardet\chardet\universaldetector.py", line 116, in feed
diff --git a/dip3.css b/dip3.css
index 0dfb0e0..0777144 100644
--- a/dip3.css
+++ b/dip3.css
@@ -183,10 +183,8 @@ pre a, .w a {
.w a {
text-decoration: underline;
}
-kbd, mark {
- font-weight: bold;
-}
mark {
+ font-weight: bold;
display: inline-block;
width: 100%;
background: #ff8;
diff --git a/generators.html b/generators.html
index 73e3a9a..9bafb4a 100644
--- a/generators.html
+++ b/generators.html
@@ -56,15 +56,15 @@ def plural(noun):
Let’s look at regular expression substitutions in more detail.
->>> import re
->>> re.search('[abc]', 'Mark') ①
+>>> import re
+>>> re.search('[abc]', 'Mark') ①
<_sre.SRE_Match object at 0x001C1FA8>
->>> re.sub('[abc]', 'o', 'Mark') ②
-'Mork'
->>> re.sub('[abc]', 'o', 'rock') ③
-'rook'
->>> re.sub('[abc]', 'o', 'caps') ④
-'oops'
+>>> re.sub('[abc]', 'o', 'Mark') ②
+'Mork'
+>>> re.sub('[abc]', 'o', 'rock') ③
+'rook'
+>>> re.sub('[abc]', 'o', 'caps') ④
+'oops'
Does the string Mark contain a, b, or c? Yes, it contains a.
a, b, or c, and replace it with o. Mark becomes Mork.
@@ -92,14 +92,14 @@ def plural(noun):
Let’s look at negation regular expressions in more detail.
->>> import re ->>> re.search('[^aeiou]y$', 'vacancy') ① +>>> import re +>>> re.search('[^aeiou]y$', 'vacancy') ① <_sre.SRE_Match object at 0x001C1FA8> ->>> re.search('[^aeiou]y$', 'boy') ② +>>> re.search('[^aeiou]y$', 'boy') ② >>> ->>> re.search('[^aeiou]y$', 'day') +>>> re.search('[^aeiou]y$', 'day') >>> ->>> re.search('[^aeiou]y$', 'pita') ③ +>>> re.search('[^aeiou]y$', 'pita') ③ >>>
vacancy matches this regular expression, because it ends in cy, and c is not a, e, i, o, or u.
@@ -107,12 +107,12 @@ def plural(noun):
pita does not match, because it does not end in y.
->>> re.sub('y$', 'ies', 'vacancy') ① -'vacancies' ->>> re.sub('y$', 'ies', 'agency') -'agencies' ->>> re.sub('([^aeiou])y$', r'\1ies', 'vacancy') ② -'vacancies'+>>> re.sub('y$', 'ies', 'vacancy') ① +'vacancies' +>>> re.sub('y$', 'ies', 'agency') +'agencies' +>>> re.sub('([^aeiou])y$', r'\1ies', 'vacancy') ② +'vacancies'
vacancy into vacancies and agency into agencies, which is what you wanted. Note that it would also turn boy into boies, but that will never happen in the function because you did that re.search first to find out whether you should do this re.sub.
y. Then in the substitution string, you use a new syntax, \1, which means “hey, that first group you remembered? put it right here.” In this case, you remember the c before the y; when you do the substitution, you substitute c in place of c, and ies in place of y. (If you have more than one remembered group, you can use \2 and \3 and so on.)
@@ -313,23 +313,23 @@ def plural(noun):
How the heck does that work? Let’s look at an interactive example first.
->>> def make_counter(x):
-... print('entering make_counter')
-... while True:
-... yield x ①
-... print('incrementing x')
-... x = x + 1
+>>> def make_counter(x):
+... print('entering make_counter')
+... while True:
+... yield x ①
+... print('incrementing x')
+... x = x + 1
...
->>> counter = make_counter(2) ②
->>> counter ③
+>>> counter = make_counter(2) ②
+>>> counter ③
<generator object at 0x001C9C10>
->>> next(counter) ④
+>>> next(counter) ④
entering make_counter
2
->>> next(counter) ⑤
+>>> next(counter) ⑤
incrementing x
3
->>> next(counter) ⑥
+>>> next(counter) ⑥
incrementing x
4
So you have a function that spits out successive Fibonacci numbers. Sure, you could do that with recursion, but this way is easier to read. Also, it works well with for loops.
->>> from fibonacci import fib ->>> for n in fib(1000): ① -... print(n, end=' ') ② -0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987+>>> from fibonacci import fib +>>> for n in fib(1000): ① +... print(n, end=' ') ② +0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987
fib() in a for loop directly. The for loop will automatically call the next() function to get values from the fib() generator and assign them to the for loop index variable (n).
for loop, n gets a new value from the yield statement in fib(), and all you have to do is print it out. Once fib() runs out of numbers (a becomes bigger than max, which in this case is 1000), then the for loop exits gracefully.
diff --git a/http-web-services.html b/http-web-services.html
index 719cb65..537392b 100644
--- a/http-web-services.html
+++ b/http-web-services.html
@@ -187,10 +187,10 @@ Cache-Control: max-age=31536000, public
Let’s say you want to download a resource over HTTP, such as an Atom feed. Being a feed, you’re not just going to download it once; you’re going to download it over and over again. (Most feed readers will check for changes once an hour.) Let’s do it the quick-and-dirty way first, and then see how you can do better.
->>> import urllib.request ->>> data = urllib.request.urlopen('http://diveintopython3.org/examples/feed.xml').read() ① ->>> print(data) -<?xml version='1.0' encoding='utf-8'?> +>>> import urllib.request +>>> data = urllib.request.urlopen('http://diveintopython3.org/examples/feed.xml').read() ① +>>> print(data) +<?xml version='1.0' encoding='utf-8'?> <feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'> <title>dive into mark</title> <subtitle>currently between addictions</subtitle> @@ -212,10 +212,10 @@ Cache-Control: max-age=31536000, public
To see why this is inefficient and rude, let’s turn on the debugging features of Python’s HTTP library and see what’s being sent “on the wire.”
->>> from http.client import HTTPConnection ->>> HTTPConnection.debuglevel = 1 ① ->>> from urllib.request import urlopen ->>> response = urlopen('http://diveintopython3.org/examples/feed.xml') ② +>>> from http.client import HTTPConnection +>>> HTTPConnection.debuglevel = 1 ① +>>> from urllib.request import urlopen +>>> response = urlopen('http://diveintopython3.org/examples/feed.xml') ② send: b'GET /examples/feed.xml HTTP/1.1 ③ Host: diveintopython3.org ④ Accept-Encoding: identity ⑤ @@ -236,7 +236,7 @@ reply: 'HTTP/1.1 200 OK'# continued from previous example ->>> print(response.headers.as_string()) ① +>>> print(response.headers.as_string()) ① Date: Sun, 31 May 2009 19:23:06 GMT ② Server: Apache Last-Modified: Sun, 31 May 2009 06:39:55 GMT ③ @@ -248,9 +248,9 @@ Expires: Mon, 01 Jun 2009 19:23:06 GMT Vary: Accept-Encoding Connection: close Content-Type: application/xml ->>> data = response.read() ⑦ ->>> len(data) -3070+>>> data = response.read() ⑦ +>>> len(data) +3070
The response returned from the urllib.request.urlopen() function contains all the HTTP headers the server sent back. It also contains methods to download the actual data; we’ll get to that in a minute.
# continued from the previous example ->>> response2 = urlopen('http://diveintopython3.org/examples/feed.xml') +>>> response2 = urlopen('http://diveintopython3.org/examples/feed.xml') send: b'GET /examples/feed.xml HTTP/1.1 Host: diveintopython3.org Accept-Encoding: identity @@ -282,7 +282,7 @@ reply: 'HTTP/1.1 200 OK'# continued from the previous example ->>> print(response2.headers.as_string()) ① +>>> print(response2.headers.as_string()) ① Date: Mon, 01 Jun 2009 03:58:00 GMT Server: Apache Last-Modified: Sun, 31 May 2009 22:51:11 GMT @@ -294,11 +294,11 @@ Expires: Tue, 02 Jun 2009 03:58:00 GMT Vary: Accept-Encoding Connection: close Content-Type: application/xml ->>> data2 = response2.read() ->>> len(data2) ② -3070 ->>> data2 == data ③ -True+>>> data2 = response2.read() +>>> len(data2) ② +3070 +>>> data2 == data ③ +True
Cache-Control and Expires to allow caching, Last-Modified and ETag to enable “not-modified” tracking. Even the Vary: Accept-Encoding header hints that the server would support compression, if only you would ask for it. But you didn’t.
To use httplib2, create an instance of the httplib2.Http class.
->>> import httplib2 ->>> h = httplib2.Http('.cache') ① ->>> response, content = h.request('http://diveintopython3.org/examples/feed.xml') ② ->>> response.status ③ -200 ->>> content[:52] ④ -b"<?xml version='1.0' encoding='utf-8'?>\r\n<feed xmlns=" ->>> len(content) -3070+>>> import httplib2 +>>> h = httplib2.Http('.cache') ① +>>> response, content = h.request('http://diveintopython3.org/examples/feed.xml') ② +>>> response.status ③ +200 +>>> content[:52] ④ +b"<?xml version='1.0' encoding='utf-8'?>\r\n<feed xmlns=" +>>> len(content) +3070
The primary interface to httplib2 is the Http object. For reasons you’ll see in the next section, you should always pass a directory name when you create an Http object. The directory does not need to exist; httplib2 will create it if necessary.
Once you have an Http object, retrieving data is as simple as calling the request() method with the address of the data you want. This will issue an HTTP GET request for that URL. (Later in this chapter, you’ll see how to issue other HTTP requests, like POST.)
@@ -340,13 +340,13 @@ Content-Type: application/xml
# continued from the previous example ->>> response2, content2 = h.request('http://diveintopython3.org/examples/feed.xml') ① ->>> response2.status ② -200 ->>> content2[:52] ③ -b"<?xml version='1.0' encoding='utf-8'?>\r\n<feed xmlns=" ->>> len(content2) -3070+>>> response2, content2 = h.request('http://diveintopython3.org/examples/feed.xml') ① +>>> response2.status ② +200 +>>> content2[:52] ③ +b"<?xml version='1.0' encoding='utf-8'?>\r\n<feed xmlns=" +>>> len(content2) +3070
status is once again 200, just like last time.
@@ -359,16 +359,16 @@ Content-Type: application/xml
# NOT continued from previous example!
# Please exit out of the interactive shell
# and launch a new one.
->>> import httplib2
->>> httplib2.debuglevel = 1 ①
->>> h = httplib2.Http('.cache') ②
->>> response, content = h.request('http://diveintopython3.org/examples/feed.xml') ③
->>> len(content) ④
-3070
->>> response.status ⑤
-200
->>> response.fromcache ⑥
-True
+>>> import httplib2
+>>> httplib2.debuglevel = 1 ①
+>>> h = httplib2.Http('.cache') ②
+>>> response, content = h.request('http://diveintopython3.org/examples/feed.xml') ③
+>>> len(content) ④
+3070
+>>> response.status ⑤
+200
+>>> response.fromcache ⑥
+True
This is the httplib2 equivalent of turning on debugging in http.client. httplib2 will print all the data being sent to the server and some key information being sent back.
httplib2.Http object with the same directory name as before.
@@ -388,8 +388,8 @@ Content-Type: application/xml
# continued from the previous example
->>> response2, content2 = h.request('http://diveintopython3.org/examples/feed.xml',
-... headers={'cache-control':'no-cache'}) ①
+>>> response2, content2 = h.request('http://diveintopython3.org/examples/feed.xml',
+... headers={'cache-control':'no-cache'}) ①
connect: (diveintopython3.org, 80) ②
send: b'GET /examples/feed.xml HTTP/1.1
Host: diveintopython3.org
@@ -398,12 +398,12 @@ accept-encoding: deflate, gzip
cache-control: no-cache'
reply: 'HTTP/1.1 200 OK'
…further debugging information omitted…
->>> response2.status
-200
->>> response2.fromcache ③
-False
->>> print(dict(response2.items())) ④
-{'status': '200',
+>>> response2.status
+200
+>>> response2.fromcache ③
+False
+>>> print(dict(response2.items())) ④
+{'status': '200',
'content-length': '3070',
'content-location': 'http://diveintopython3.org/examples/feed.xml',
'accept-ranges': 'bytes',
@@ -431,18 +431,18 @@ reply: 'HTTP/1.1 200 OK'
But what about the case where the data might have changed, but hasn’t? HTTP defines Last-Modified and ETag headers for this purpose. These headers are called validators. If the local cache is no longer fresh, a client can send the validators with the next request to see if the data has actually changed. If the data hasn’t changed, the server sends back a 304 status code and no data. So there’s still a round-trip over the network, but you end up downloading fewer bytes.
->>> import httplib2
->>> httplib2.debuglevel = 1
->>> h = httplib2.Http('.cache')
->>> response, content = h.request('http://diveintopython3.org/') ①
+>>> import httplib2
+>>> httplib2.debuglevel = 1
+>>> h = httplib2.Http('.cache')
+>>> response, content = h.request('http://diveintopython3.org/') ①
connect: (diveintopython3.org, 80)
send: b'GET / HTTP/1.1
Host: diveintopython3.org
accept-encoding: deflate, gzip
user-agent: Python-httplib2/$Rev: 259 $'
reply: 'HTTP/1.1 200 OK'
->>> print(dict(response.items())) ②
-{'-content-encoding': 'gzip',
+>>> print(dict(response.items())) ②
+{'-content-encoding': 'gzip',
'accept-ranges': 'bytes',
'connection': 'close',
'content-length': '6657',
@@ -454,8 +454,8 @@ reply: 'HTTP/1.1 200 OK'
'server': 'Apache',
'status': '304',
'vary': 'Accept-Encoding,User-Agent'}
->>> len(content) ③
-6657
+>>> len(content) ③
+6657
httplib2 has little to work with, and it sends out a minimum of headers with the request.
ETag and Last-Modified header.
@@ -464,7 +464,7 @@ reply: 'HTTP/1.1 200 OK'
# continued from the previous example ->>> response, content = h.request('http://diveintopython3.org/') ① +>>> response, content = h.request('http://diveintopython3.org/') ① connect: (diveintopython3.org, 80) send: b'GET / HTTP/1.1 Host: diveintopython3.org @@ -473,14 +473,14 @@ Host: diveintopython3.org accept-encoding: deflate, gzip user-agent: Python-httplib2/$Rev: 259 $' reply: 'HTTP/1.1 304 Not Modified' ④ ->>> response.fromcache ⑤ -True ->>> response.status ⑥ -200 ->>> response.dict['status'] ⑦ -'304' ->>> len(content) ⑧ -6657+>>> response.fromcache ⑤ +True +>>> response.status ⑥ +200 +>>> response.dict['status'] ⑦ +'304' +>>> len(content) ⑧ +6657
Http object (and the same local cache).
httplib2 sends the ETag validator back to the server in the If-None-Match header.
@@ -497,15 +497,15 @@ user-agent: Python-httplib2/$Rev: 259 $'
HTTP supports two types of compression. httplib2 supports both of them.
->>> response, content = h.request('http://diveintopython3.org/')
+>>> response, content = h.request('http://diveintopython3.org/')
connect: (diveintopython3.org, 80)
send: b'GET / HTTP/1.1
Host: diveintopython3.org
accept-encoding: deflate, gzip ①
user-agent: Python-httplib2/$Rev: 259 $'
reply: 'HTTP/1.1 200 OK'
->>> print(dict(response.items()))
-{'-content-encoding': 'gzip', ②
+>>> print(dict(response.items()))
+{'-content-encoding': 'gzip', ②
'accept-ranges': 'bytes',
'connection': 'close',
'content-length': '6657',
@@ -524,57 +524,76 @@ reply: 'HTTP/1.1 200 OK'
How httplib2 Handles Redirects
-FIXME
+
HTTP defines two kinds of redirects: temporary and permanent. There’s nothing special to do with temporary redirects except follow them, which httplib2 does automatically.
->>> response, content = h.request('http://diveintopython3.org/examples/feed-302.xml')
+>>> response, content = h.request('http://diveintopython3.org/examples/feed-302.xml') ①
connect: (diveintopython3.org, 80)
-send: b'GET /examples/feed-302.xml HTTP/1.1
+send: b'GET /examples/feed-302.xml HTTP/1.1 ②
Host: diveintopython3.org
accept-encoding: deflate, gzip
user-agent: Python-httplib2/$Rev: 259 $'
-reply: 'HTTP/1.1 302 Found'
-send: b'GET /examples/feed.xml HTTP/1.1
+reply: 'HTTP/1.1 302 Found' ③
+send: b'GET /examples/feed.xml HTTP/1.1 ④
Host: diveintopython3.org
accept-encoding: deflate, gzip
user-agent: Python-httplib2/$Rev: 259 $'
-reply: 'HTTP/1.1 200 OK'
->>> print(dict(response.items()))
-{'status': '200',
+reply: 'HTTP/1.1 200 OK'
+
+-
+
-
+
-
+
-
+
+
+
+# continued from the previous example
+>>> print(dict(response.items())) ①
+{'status': '200',
'content-length': '3070',
- 'content-location': 'http://diveintopython3.org/examples/feed.xml',
+ 'content-location': 'http://diveintopython3.org/examples/feed.xml', ②
'accept-ranges': 'bytes',
'expires': 'Thu, 04 Jun 2009 02:21:41 GMT',
'vary': 'Accept-Encoding',
'server': 'Apache',
'last-modified': 'Wed, 03 Jun 2009 02:20:15 GMT',
'connection': 'close',
- '-content-encoding': 'gzip',
+ '-content-encoding': 'gzip', ③
'etag': '"bfe-4cbbf5c0"',
'cache-control': 'max-age=86400',
'date': 'Wed, 03 Jun 2009 02:21:41 GMT',
- 'content-type': 'application/xml'}
->>> response, content = h.request('http://diveintopython3.org/examples/feed-302.xml')
-connect: (diveintopython3.org, 80)
-send: b'GET /examples/feed-302.xml HTTP/1.1
-Host: diveintopython3.org
-accept-encoding: deflate, gzip
-user-agent: Python-httplib2/$Rev: 259 $'
-reply: 'HTTP/1.1 302 Found'
+ 'content-type': 'application/xml'}
->>> response, content = h.request('http://diveintopython3.org/examples/feed-301.xml')
+# continued from the previous example
+>>> response, content = h.request('http://diveintopython3.org/examples/feed-302.xml') ①
+connect: (diveintopython3.org, 80)
+send: b'GET /examples/feed-302.xml HTTP/1.1 ②
+Host: diveintopython3.org
+accept-encoding: deflate, gzip
+user-agent: Python-httplib2/$Rev: 259 $'
+reply: 'HTTP/1.1 302 Found' ③
+
+>>> response, content = h.request('http://diveintopython3.org/examples/feed-301.xml')
connect: (diveintopython3.org, 80)
send: b'GET /examples/feed-301.xml HTTP/1.1
Host: diveintopython3.org
accept-encoding: deflate, gzip
user-agent: Python-httplib2/$Rev: 259 $'
reply: 'HTTP/1.1 301 Moved Permanently'
->>> print(dict(response.items()))
-{'status': '200',
+>>> print(dict(response.items()))
+{'status': '200',
'content-length': '3070',
'content-location': 'http://diveintopython3.org/examples/feed.xml',
'accept-ranges': 'bytes',
@@ -588,9 +607,9 @@ reply: 'HTTP/1.1 301 Moved Permanently'
'cache-control': 'max-age=86400',
'date': 'Wed, 03 Jun 2009 02:21:41 GMT',
'content-type': 'application/xml'}
->>> response2, content2 = h.request('http://diveintopython3.org/examples/feed-301.xml')
->>> response2.fromcache
-True
+>>> response2, content2 = h.request('http://diveintopython3.org/examples/feed-301.xml')
+>>> response2.fromcache
+True
FIXME
->>> import httplib2
->>> from urllib.parse import urlencode
->>> h = httplib2.Http('.cache')
->>> data = {'status': 'Test update from Python 3'}
->>> h.add_credentials('diveintomark', 'MY_SECRET_PASSWORD')
->>> resp, content = h.request('http://twitter.com/statuses/update.xml', 'POST', urlencode(data))
->>> resp.status
-200
->>> from xml.etree import ElementTree as etree
->>> tree = etree.fromstring(content)
->>> print(etree.tostring(tree))
-<status>
+>>> import httplib2
+>>> from urllib.parse import urlencode
+>>> h = httplib2.Http('.cache')
+>>> data = {'status': 'Test update from Python 3'}
+>>> h.add_credentials('diveintomark', 'MY_SECRET_PASSWORD')
+>>> resp, content = h.request('http://twitter.com/statuses/update.xml', 'POST', urlencode(data))
+>>> resp.status
+200
+>>> from xml.etree import ElementTree as etree
+>>> tree = etree.fromstring(content)
+>>> print(etree.tostring(tree))
+<status>
<created_at>Sat May 30 19:11:38 +0000 2009</created_at>
<id>1973974228</id>
<text>Test update from Python 3</text>
@@ -662,11 +681,11 @@ reply: 'HTTP/1.1 301 Moved Permanently'
# continued from the previous example
->>> tree.findtext('id')
-'1973974228'
->>> resp, delete_content = h.request('http://twitter.com/statuses/destroy/{0}.xml'.format(tree.findtext('id')), 'DELETE')
->>> resp.status
-200
+>>> tree.findtext('id')
+'1973974228'
+>>> resp, delete_content = h.request('http://twitter.com/statuses/destroy/{0}.xml'.format(tree.findtext('id')), 'DELETE')
+>>> resp.status
+200
⁂ diff --git a/iterators.html b/iterators.html index f2554c9..9d5ed19 100644 --- a/iterators.html +++ b/iterators.html @@ -95,14 +95,14 @@ body{counter-reset:h1 6}
Instantiating classes in Python is straightforward. To instantiate a class, simply call the class as if it were a function, passing the arguments that the __init__() method requires. The return value will be the newly created object.
->>> import fibonacci2 ->>> fib = fibonacci2.Fib(100) ① ->>> fib ② -<fibonacci2.Fib object at 0x00DB8810> ->>> fib.__class__ ③ -<class 'fibonacci2.Fib'> ->>> fib.__doc__ ④ -'iterator that yields numbers in the Fibonacci sequence'+>>> import fibonacci2 +>>> fib = fibonacci2.Fib(100) ① +>>> fib ② +<fibonacci2.Fib object at 0x00DB8810> +>>> fib.__class__ ③ +<class 'fibonacci2.Fib'> +>>> fib.__doc__ ④ +'iterator that yields numbers in the Fibonacci sequence'
Fib class (defined in the fibonacci2 module) and assigning the newly created instance to the variable fib. You are passing one parameter, 100, which will end up as the max argument in Fib’s __init__() method.
Fib class.
@@ -144,13 +144,13 @@ body{counter-reset:h1 6}
Instance variables are specific to one instance of a class. For example, if you create two Fib instances with different maximum values, they will each remember their own values.
->>> import fibonacci2 ->>> fib1 = fibonacci2.Fib(100) ->>> fib2 = fibonacci2.Fib(200) ->>> fib1.max -100 ->>> fib2.max -200+>>> import fibonacci2 +>>> fib1 = fibonacci2.Fib(100) +>>> fib2 = fibonacci2.Fib(200) +>>> fib1.max +100 +>>> fib2.max +200
⁂ @@ -185,10 +185,10 @@ body{counter-reset:h1 6}
Thoroughly confused yet? Excellent. Let’s see how to call this iterator:
->>> from fibonacci2 import Fib ->>> for n in Fib(1000): -... print(n, end=' ') -0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987+>>> from fibonacci2 import Fib +>>> for n in Fib(1000): +... print(n, end=' ') +0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987
Why, it’s exactly the same! Byte for byte identical to how you called Fibonacci-as-a-generator (modulo one capital letter). But how? @@ -260,20 +260,20 @@ rules = LazyRules()
Before we continue, let’s take a closer look at rules_filename. It’s not defined within the __init__() method. In fact, it’s not defined within any method. It’s defined at the class level. It’s a class variable, and although you can access it just like an instance variable (self.rules_filename), it is shared across all instances of the LazyRules class.
->>> import plural6 ->>> r1 = plural6.LazyRules() ->>> r2 = plural6.LazyRules() ->>> r1.rules_filename ① -'plural6-rules.txt' ->>> r2.rules_filename -'plural6-rules.txt' ->>> r1.__class__.rules_filename ② -'plural6-rules.txt' ->>> r1.__class__.rules_filename = 'papayawhip.txt' ③ ->>> r1.rules_filename -'papayawhip.txt' ->>> r2.rules_filename ④ -'papayawhip.txt'+>>> import plural6 +>>> r1 = plural6.LazyRules() +>>> r2 = plural6.LazyRules() +>>> r1.rules_filename ① +'plural6-rules.txt' +>>> r2.rules_filename +'plural6-rules.txt' +>>> r1.__class__.rules_filename ② +'plural6-rules.txt' +>>> r1.__class__.rules_filename = 'papayawhip.txt' ③ +>>> r1.rules_filename +'papayawhip.txt' +>>> r2.rules_filename ④ +'papayawhip.txt'
and tags in your source with class=pp.
- * You can also use the (html deprecated) tag, but the pretty printer
- * needs to do more substantial DOM manipulations to support that, so some
- * css styles may not be preserved.
* That's it. I wanted to keep the API as simple as possible, so there's no
* need to specify which language the code is in.
*
@@ -271,11 +269,6 @@ window['_pr_isIE6'] = function () {
.replace(pr_nbspEnt, ' ');
}
- /** is the given node's innerHTML normally unescaped? */
- function isRawContent(node) {
- return 'XMP' === node.tagName;
- }
-
function normalizedHtml(node, out) {
switch (node.nodeType) {
case 1: // an element
@@ -548,10 +541,6 @@ window['_pr_isIE6'] = function () {
if (PR_innerHtmlWorks) {
var content = node.innerHTML;
- // XMP tags contain unescaped entities so require special handling.
- if (isRawContent(node)) {
- content = textToHtml(content);
- }
return content;
}
@@ -1283,7 +1272,8 @@ window['_pr_isIE6'] = function () {
var codeSegments = [
document.getElementsByTagName('pre'),
document.getElementsByTagName('code'),
- document.getElementsByTagName('xmp') ];
+ document.getElementsByTagName('kbd'),
+ document.getElementsByTagName('samp') ];
var elements = [];
for (var i = 0; i < codeSegments.length; ++i) {
for (var j = 0, n = codeSegments[i].length; j < n; ++j) {
@@ -1321,7 +1311,7 @@ window['_pr_isIE6'] = function () {
var nested = false;
for (var p = cs.parentNode; p; p = p.parentNode) {
if ((p.tagName === 'pre' || p.tagName === 'code' ||
- p.tagName === 'xmp') &&
+ p.tagName === 'kbd' || p.tagName === 'samp') &&
p.className && p.className.indexOf('pp') >= 0) {
nested = true;
break;
@@ -1358,31 +1348,7 @@ window['_pr_isIE6'] = function () {
var cs = prettyPrintingJob.sourceNode;
// push the prettified html back into the tag.
- if (!isRawContent(cs)) {
- // just replace the old html with the new
- cs.innerHTML = newContent;
- } else {
- // we need to change the tag to a since s do not allow
- // embedded tags such as the span tags used to attach styles to
- // sections of source code.
- var pre = document.createElement('PRE');
- for (var i = 0; i < cs.attributes.length; ++i) {
- var a = cs.attributes[i];
- if (a.specified) {
- var aname = a.name.toLowerCase();
- if (aname === 'class') {
- pre.className = a.value; // For IE 6
- } else {
- pre.setAttribute(a.name, a.value);
- }
- }
- }
- pre.innerHTML = newContent;
-
- // remove the old
- cs.parentNode.replaceChild(pre, cs);
- cs = pre;
- }
+ cs.innerHTML = newContent;
// Replace <br>s with line-feeds so that copying and pasting works
// on IE 6.
diff --git a/native-datatypes.html b/native-datatypes.html
index 7e9ac98..b9760f5 100644
--- a/native-datatypes.html
+++ b/native-datatypes.html
@@ -43,28 +43,28 @@ body{counter-reset:h1 2}
raise ValueError('number must be non-negative')
size is an integer, 0 is an integer, and < is a numerical operator. The result of the expression size < 0 is always a boolean. You can test this yourself in the Python interactive shell:
->>> size = 1 ->>> size < 0 -False ->>> size = 0 ->>> size < 0 -False ->>> size = -1 ->>> size < 0 -True+>>> size = 1 +>>> size < 0 +False +>>> size = 0 +>>> size < 0 +False +>>> size = -1 +>>> size < 0 +True
⁂
Numbers are awesome. There are so many to choose from. Python supports both integers and floating point numbers. There’s no type declaration to distinguish them; Python tells them apart by the presence or absence of a decimal point.
->>> type(1) ① -<class 'int'> ->>> 1 + 1 ② -2 ->>> 1 + 1.0 ③ -2.0 ->>> type(2.0) -<class 'float'>+>>> type(1) ① +<class 'int'> +>>> 1 + 1 ② +2 +>>> 1 + 1.0 ③ +2.0 +>>> type(2.0) +<class 'float'>
type() function to check the type of any value or variable. As you might expect, 1 is an int.
int to an int yields an int.
@@ -73,18 +73,18 @@ body{counter-reset:h1 2}
As you just saw, some operators (like addition) will coerce integers to floating point numbers as needed. You can also coerce them by yourself.
->>> float(2) ① -2.0 ->>> int(2.0) ② -2 ->>> int(2.5) ③ -2 ->>> int(-2.5) ④ --2 ->>> 1.12345678901234567890 ⑤ -1.1234567890123457 ->>> type(1000000000000000) ⑥ -<class 'int'>+>>> float(2) ① +2.0 +>>> int(2.0) ② +2 +>>> int(2.5) ③ +2 +>>> int(-2.5) ④ +-2 +>>> 1.12345678901234567890 ⑤ +1.1234567890123457 +>>> type(1000000000000000) ⑥ +<class 'int'>
int to a float by calling the float() function.
float to an int by calling int().
@@ -99,18 +99,18 @@ body{counter-reset:h1 2}
You can do all kinds of things with numbers.
->>> 11 / 2 ① -5.5 ->>> 11 // 2 ② -5 ->>> −11 // 2 ③ -−6 ->>> 11.0 // 2 ④ -5.0 ->>> 11 ** 2 ⑤ -121 ->>> 11 % 2 ⑥ -1 +>>> 11 / 2 ① +5.5 +>>> 11 // 2 ② +5 +>>> −11 // 2 ③ +−6 +>>> 11.0 // 2 ④ +5.0 +>>> 11 ** 2 ⑤ +121 +>>> 11 % 2 ⑥ +1
/ operator performs floating point division. It returns a float even if both the numerator and denominator are ints.
@@ -126,14 +126,14 @@ body{counter-reset:h1 2}
Python isn’t limited to integers and floating point numbers. It can also do all the fancy math you learned in high school and promptly forgot about.
->>> import fractions ① ->>> x = fractions.Fraction(1, 3) ② ->>> x -Fraction(1, 3) ->>> x * 2 ③ -Fraction(2, 3) ->>> fractions.Fraction(6, 4) ④ -Fraction(3, 2)+>>> import fractions ① +>>> x = fractions.Fraction(1, 3) ② +>>> x +Fraction(1, 3) +>>> x * 2 ③ +Fraction(2, 3) +>>> fractions.Fraction(6, 4) ④ +Fraction(3, 2)
fractions module.
Fraction object and pass in the numerator and denominator.
@@ -143,13 +143,13 @@ body{counter-reset:h1 2}
You can also do basic trigonometry in Python.
->>> import math ->>> math.pi ① -3.1415926535897931 ->>> math.sin(math.pi / 2) ② -1.0 ->>> math.tan(math.pi / 4) ③ -0.99999999999999989+>>> import math +>>> math.pi ① +3.1415926535897931 +>>> math.sin(math.pi / 2) ② +1.0 +>>> math.tan(math.pi / 4) ③ +0.99999999999999989
math module has a constant for π, the ratio of a circle’s circumference to its diameter.
math module has all the basic trigonometric functions, including sin(), cos(), tan(), and variants like asin().
@@ -159,26 +159,26 @@ body{counter-reset:h1 2}
You can use numbers in a boolean context, such as an if statement. Zero values are false, and non-zero values are true.
->>> def is_it_true(anything): ① -... if anything: -... print('yes, it's true') -... else: -... print('no, it's false') +>>> def is_it_true(anything): ① +... if anything: +... print('yes, it's true') +... else: +... print('no, it's false') ... ->>> is_it_true(1) ② +>>> is_it_true(1) ② yes, it's true ->>> is_it_true(-1) +>>> is_it_true(-1) yes, it's true ->>> is_it_true(0) +>>> is_it_true(0) no, it's false ->>> is_it_true(0.1) ③ +>>> is_it_true(0.1) ③ yes, it's true ->>> is_it_true(0.0) +>>> is_it_true(0.0) no, it's false ->>> import fractions ->>> is_it_true(fractions.Fraction(1, 2)) ④ +>>> import fractions +>>> is_it_true(fractions.Fraction(1, 2)) ④ yes, it's true ->>> is_it_true(fractions.Fraction(0, 1)) +>>> is_it_true(fractions.Fraction(0, 1)) no, it's false
Creating a list is easy: use square brackets to wrap a comma-separated list of values.
->>> a_list = ['a', 'b', 'mpilgrim', 'z', 'example'] ① ->>> a_list +>>> a_list = ['a', 'b', 'mpilgrim', 'z', 'example'] ① +>>> a_list ['a', 'b', 'mpilgrim', 'z', 'example'] ->>> a_list[0] ② -'a' ->>> a_list[4] ③ -'example' ->>> a_list[-1] ④ -'example' ->>> a_list[-3] ⑤ -'mpilgrim'+>>> a_list[0] ② +'a' +>>> a_list[4] ③ +'example' +>>> a_list[-1] ④ +'example' +>>> a_list[-3] ⑤ +'mpilgrim'
a_list[0].
@@ -221,19 +221,19 @@ body{counter-reset:h1 2}
Once you’ve defined a list, you can get any part of it as a new list. This is called slicing the list.
->>> a_list -['a', 'b', 'mpilgrim', 'z', 'example'] ->>> a_list[1:3] ① -['b', 'mpilgrim'] ->>> a_list[1:-1] ② -['b', 'mpilgrim', 'z'] ->>> a_list[0:3] ③ -['a', 'b', 'mpilgrim'] ->>> a_list[:3] ④ -['a', 'b', 'mpilgrim'] ->>> a_list[3:] ⑤ -['z', 'example'] ->>> a_list[:] ⑥ +>>> a_list +['a', 'b', 'mpilgrim', 'z', 'example'] +>>> a_list[1:3] ① +['b', 'mpilgrim'] +>>> a_list[1:-1] ② +['b', 'mpilgrim', 'z'] +>>> a_list[0:3] ③ +['a', 'b', 'mpilgrim'] +>>> a_list[:3] ④ +['a', 'b', 'mpilgrim'] +>>> a_list[3:] ⑤ +['z', 'example'] +>>> a_list[:] ⑥ ['a', 'b', 'mpilgrim', 'z', 'example']
a_list[1]), up to but not including the second slice index (in this case a_list[3]).
@@ -246,19 +246,19 @@ body{counter-reset:h1 2}
There are four ways to add items to a list.
->>> a_list = ['a'] ->>> a_list = a_list + [2.0, 3] ① ->>> a_list -['a', 2.0, 3] ->>> a_list.append(True) ② ->>> a_list -['a', 2.0, 3, True] ->>> a_list.extend(['four', 'e']) ③ ->>> a_list -['a', 2.0, 3, True, 'four', 'e'] ->>> a_list.insert(1, 'a') ④ ->>> a_list -['a', 'a', 2.0, 3, True, 'four', 'e']+>>> a_list = ['a'] +>>> a_list = a_list + [2.0, 3] ① +>>> a_list +['a', 2.0, 3] +>>> a_list.append(True) ② +>>> a_list +['a', 2.0, 3, True] +>>> a_list.extend(['four', 'e']) ③ +>>> a_list +['a', 2.0, 3, True, 'four', 'e'] +>>> a_list.insert(1, 'a') ④ +>>> a_list +['a', 'a', 2.0, 3, True, 'four', 'e']
+ operator concatenates lists. A list can contain any number of items; there is no size limit (other than available memory). A list can contain items of any datatype; they don’t all need to be the same type. Here we have a list containing a string, a floating point number, and an integer.
append() method adds a single item to the end of the list. (Now we have four different datatypes in the list!)
@@ -267,21 +267,21 @@ body{counter-reset:h1 2}
Let’s look closer at the difference between append() and extend().
->>> a_list = ['a', 'b', 'c'] ->>> a_list.extend(['d', 'e', 'f']) ① ->>> a_list -['a', 'b', 'c', 'd', 'e', 'f'] ->>> len(a_list) ② -6 ->>> a_list[-1] -'f' ->>> a_list.append(['g', 'h', 'i']) ③ ->>> a_list -['a', 'b', 'c', 'd', 'e', 'f', ['g', 'h', 'i']] ->>> len(a_list) ④ -7 ->>> a_list[-1] -['g', 'h', 'i']+>>> a_list = ['a', 'b', 'c'] +>>> a_list.extend(['d', 'e', 'f']) ① +>>> a_list +['a', 'b', 'c', 'd', 'e', 'f'] +>>> len(a_list) ② +6 +>>> a_list[-1] +'f' +>>> a_list.append(['g', 'h', 'i']) ③ +>>> a_list +['a', 'b', 'c', 'd', 'e', 'f', ['g', 'h', 'i']] +>>> len(a_list) ④ +7 +>>> a_list[-1] +['g', 'h', 'i']
extend() method takes a single argument, which is always a list, and adds each of the items of that list to a_list.
->>> a_list = ['a', 'b', 'new', 'mpilgrim', 'new'] ->>> 'mpilgrim' in a_list ① -True ->>> a_list.index('mpilgrim') ② -3 ->>> a_list.index('new') ③ -2 ->>> 'c' in a_list ④ -False ->>> a_list.index('c') ⑤ +>>> a_list = ['a', 'b', 'new', 'mpilgrim', 'new'] +>>> 'mpilgrim' in a_list ① +True +>>> a_list.index('mpilgrim') ② +3 +>>> a_list.index('new') ③ +2 +>>> 'c' in a_list ④ +False +>>> a_list.index('c') ⑤ Traceback (innermost last): File "<interactive input>", line 1, in ? ValueError: list.index(x): x not in list@@ -314,17 +314,17 @@ ValueError: list.index(x): x not in list
You can also use a list in a boolean context, such as an if statement.
->>> def is_it_true(anything):
-... if anything:
-... print('yes, it's true')
-... else:
-... print('no, it's false')
+>>> def is_it_true(anything):
+...     if anything:
+...         print("yes, it's true")
+...     else:
+...         print("no, it's false")
...
->>> is_it_true([]) ②
+>>> is_it_true([]) ②
no, it's false
->>> is_it_true(['a']) ③
+>>> is_it_true(['a']) ③
yes, it's true
->>> is_it_true([False]) ④
+>>> is_it_true([False]) ④
yes, it's true
Creating a dictionary is easy. The syntax is similar to sets, but instead of values, you have key-value pairs. Once you have a dictionary, you can look up values by their key.
->>> a_dict = {'server':'db.diveintopython3.org', 'database':'mysql'} ① ->>> a_dict -{'server': 'db.diveintopython3.org', 'database': 'mysql'} ->>> a_dict['server'] ② +>>> a_dict = {'server':'db.diveintopython3.org', 'database':'mysql'} ① +>>> a_dict +{'server': 'db.diveintopython3.org', 'database': 'mysql'} +>>> a_dict['server'] ② 'db.diveintopython3.org' ->>> a_dict['database'] ③ +>>> a_dict['database'] ③ 'mysql' ->>> a_dict['db.diveintopython3.org'] ④ +>>> a_dict['db.diveintopython3.org'] ④ Traceback (most recent call last): File "<stdin>", line 1, in <module> KeyError: 'db.diveintopython3.org'@@ -367,20 +367,20 @@ KeyError: 'db.diveintopython3.org'
Dictionaries do not have any predefined size limit. You can add new key-value pairs to a dictionary at any time, or you can modify the value of an existing key. Continuing from the previous example:
->>> a_dict
-{'server': 'db.diveintopython3.org', 'database': 'mysql'}
->>> a_dict['database'] = 'blog' ①
->>> a_dict
-{'server': 'db.diveintopython3.org', 'database': 'blog'}
->>> a_dict['user'] = 'mark' ②
->>> a_dict ③
-{'server': 'db.diveintopython3.org', 'user': 'mark', 'database': 'blog'}
->>> a_dict['user'] = 'dora' ④
->>> a_dict
-{'server': 'db.diveintopython3.org', 'user': 'dora', 'database': 'blog'}
->>> a_dict['User'] = 'mark' ⑤
->>> a_dict
-{'User': 'mark', 'server': 'db.diveintopython3.org', 'user': 'dora', 'database': 'blog'}
+>>> a_dict
+{'server': 'db.diveintopython3.org', 'database': 'mysql'}
+>>> a_dict['database'] = 'blog' ①
+>>> a_dict
+{'server': 'db.diveintopython3.org', 'database': 'blog'}
+>>> a_dict['user'] = 'mark' ②
+>>> a_dict ③
+{'server': 'db.diveintopython3.org', 'user': 'mark', 'database': 'blog'}
+>>> a_dict['user'] = 'dora' ④
+>>> a_dict
+{'server': 'db.diveintopython3.org', 'user': 'dora', 'database': 'blog'}
+>>> a_dict['User'] = 'mark' ⑤
+>>> a_dict
+{'User': 'mark', 'server': 'db.diveintopython3.org', 'user': 'dora', 'database': 'blog'}
Let's tear that apart in the interactive shell.
->>> SUFFIXES = {1000: ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'],
-... 1024: ['KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']}
->>> len(SUFFIXES) ①
-2
->>> SUFFIXES[1000] ②
-['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
->>> SUFFIXES[1024] ③
-['KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
->>> SUFFIXES[1000][3] ④
-'TB'
+>>> SUFFIXES = {1000: ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'],
+... 1024: ['KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']}
+>>> len(SUFFIXES) ①
+2
+>>> SUFFIXES[1000] ②
+['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
+>>> SUFFIXES[1024] ③
+['KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
+>>> SUFFIXES[1000][3] ④
+'TB'
len() function gives you the number of items in a dictionary.
1000 is a key in the SUFFIXES dictionary; its value is a list of eight items (eight strings, to be precise).
@@ -415,15 +415,15 @@ KeyError: 'db.diveintopython3.org'
You can also use a dictionary in a boolean context, such as an if statement.
->>> def is_it_true(anything):
-... if anything:
-... print('yes, it's true')
-... else:
-... print('no, it's false')
+>>> def is_it_true(anything):
+...     if anything:
+...         print("yes, it's true")
+...     else:
+...         print("no, it's false")
...
->>> is_it_true({}) ①
+>>> is_it_true({}) ①
no, it's false
->>> is_it_true({'a': 1}) ②
+>>> is_it_true({'a': 1}) ②
yes, it's true
None is a special constant in Python. It is a null value. None is not the same as False. None is not 0. None is not an empty string. Comparing None to anything other than None will always return False.
None is the only null value. It has its own datatype (NoneType). You can assign None to any variable, but you can not create other NoneType objects. All variables whose value is None are equal to each other.
->>> type(None) -<class 'NoneType'> ->>> None == False -False ->>> None == 0 -False ->>> None == '' -False ->>> None == None -True ->>> x = None ->>> x == None -True ->>> y = None ->>> x == y -True +>>> type(None) +<class 'NoneType'> +>>> None == False +False +>>> None == 0 +False +>>> None == '' +False +>>> None == None +True +>>> x = None +>>> x == None +True +>>> y = None +>>> x == y +True
None In A Boolean Context
In a boolean context, None is false and not None is true.
->>> def is_it_true(anything):
-... if anything:
-... print('yes, it's true')
-... else:
-... print('no, it's false')
+>>> def is_it_true(anything):
+...     if anything:
+...         print("yes, it's true")
+...     else:
+...         print("no, it's false")
...
->>> is_it_true(None)
+>>> is_it_true(None)
no, it's false
->>> is_it_true(not None)
+>>> is_it_true(not None)
yes, it's true
⁂ diff --git a/refactoring.html b/refactoring.html index d819acf..0b4b54d 100644 --- a/refactoring.html +++ b/refactoring.html @@ -22,9 +22,9 @@ body{counter-reset:h1 10}
Despite your best efforts to write comprehensive unit tests, bugs happen. What do I mean by “bug”? A bug is a test case you haven’t written yet. -
>>> import roman7 ->>> roman7.from_roman('') ① -0+
>>> import roman7 +>>> roman7.from_roman('') ① +0
InvalidRomanNumeralError exception just like any other sequence of characters that don’t represent a valid Roman numeral.
This series of examples was inspired by a real-life problem I had in my day job several years ago, when I needed to scrub and standardize street addresses exported from a legacy system before importing them into a newer system. (See, I don’t just make this stuff up; it’s actually useful.) This example shows how I approached the problem.
->>> s = '100 NORTH MAIN ROAD' ->>> s.replace('ROAD', 'RD.') ① -'100 NORTH MAIN RD.' ->>> s = '100 NORTH BROAD ROAD' ->>> s.replace('ROAD', 'RD.') ② -'100 NORTH BRD. RD.' ->>> s[:-4] + s[-4:].replace('ROAD', 'RD.') ③ -'100 NORTH BROAD RD.' ->>> import re ④ ->>> re.sub('ROAD$', 'RD.', s) ⑤ -'100 NORTH BROAD RD.'+>>> s = '100 NORTH MAIN ROAD' +>>> s.replace('ROAD', 'RD.') ① +'100 NORTH MAIN RD.' +>>> s = '100 NORTH BROAD ROAD' +>>> s.replace('ROAD', 'RD.') ② +'100 NORTH BRD. RD.' +>>> s[:-4] + s[-4:].replace('ROAD', 'RD.') ③ +'100 NORTH BROAD RD.' +>>> import re ④ +>>> re.sub('ROAD$', 'RD.', s) ⑤ +'100 NORTH BROAD RD.'
'ROAD' is always abbreviated as 'RD.'. At first glance, I thought this was simple enough that I could just use the string method replace(). After all, all the data was already uppercase, so case mismatches would not be a problem. And the search string, 'ROAD', was a constant. And in this deceptively simple example, s.replace() does indeed work.
'ROAD' appears twice in the address, once as part of the street name 'BROAD' and once as its own word. The replace() method sees these two occurrences and blindly replaces both of them; meanwhile, I see my addresses getting destroyed.
@@ -52,18 +52,18 @@ body{counter-reset:h1 4}
Continuing with my story of scrubbing addresses, I soon discovered that the previous example, matching 'ROAD' at the end of the address, was not good enough, because not all addresses included a street designation at all. Some addresses simply ended with the street name. I got away with it most of the time, but if the street name was 'BROAD', then the regular expression would match 'ROAD' at the end of the string as part of the word 'BROAD', which is not what I wanted.
->>> s = '100 BROAD'
->>> re.sub('ROAD$', 'RD.', s)
-'100 BRD.'
->>> re.sub('\\bROAD$', 'RD.', s) ①
-'100 BROAD'
->>> re.sub(r'\bROAD$', 'RD.', s) ②
-'100 BROAD'
->>> s = '100 BROAD ROAD APT. 3'
->>> re.sub(r'\bROAD$', 'RD.', s) ③
-'100 BROAD ROAD APT. 3'
->>> re.sub(r'\bROAD\b', 'RD.', s) ④
-'100 BROAD RD. APT 3'
+>>> s = '100 BROAD'
+>>> re.sub('ROAD$', 'RD.', s)
+'100 BRD.'
+>>> re.sub('\\bROAD$', 'RD.', s) ①
+'100 BROAD'
+>>> re.sub(r'\bROAD$', 'RD.', s) ②
+'100 BROAD'
+>>> s = '100 BROAD ROAD APT. 3'
+>>> re.sub(r'\bROAD$', 'RD.', s) ③
+'100 BROAD ROAD APT. 3'
+>>> re.sub(r'\bROAD\b', 'RD.', s) ④
+'100 BROAD RD. APT. 3'
'ROAD' when it was at the end of the string and it was its own word (and not a part of some larger word). To express this in a regular expression, you use \b, which means “a word boundary must occur right here.” In Python, this is complicated by the fact that the '\' character in a string must itself be escaped. This is sometimes referred to as the backslash plague, and it is one reason why regular expressions are easier in Perl than in Python. On the down side, Perl mixes regular expressions with other syntax, so if you have a bug, it may be hard to tell whether it’s a bug in syntax or a bug in your regular expression.
r. This tells Python that nothing in this string should be escaped; '\t' is a tab character, but r'\t' is really the backslash character \ followed by the letter t. I recommend always using raw strings when dealing with regular expressions; otherwise, things get too confusing too quickly (and regular expressions are confusing enough already).
@@ -95,17 +95,17 @@ body{counter-reset:h1 4}
What would it take to validate that an arbitrary string is a valid Roman numeral? Let’s take it one digit at a time. Since Roman numerals are always written highest to lowest, let’s start with the highest: the thousands place. For numbers 1000 and higher, the thousands are represented by a series of M characters.
->>> import re ->>> pattern = '^M?M?M?$' ① ->>> re.search(pattern, 'M') ② -<SRE_Match object at 0106FB58> ->>> re.search(pattern, 'MM') ③ -<SRE_Match object at 0106C290> ->>> re.search(pattern, 'MMM') ④ -<SRE_Match object at 0106AA38> ->>> re.search(pattern, 'MMMM') ⑤ ->>> re.search(pattern, '') ⑥ -<SRE_Match object at 0106F4A8>+>>> import re +>>> pattern = '^M?M?M?$' ① +>>> re.search(pattern, 'M') ② +<SRE_Match object at 0106FB58> +>>> re.search(pattern, 'MM') ③ +<SRE_Match object at 0106C290> +>>> re.search(pattern, 'MMM') ④ +<SRE_Match object at 0106AA38> +>>> re.search(pattern, 'MMMM') ⑤ +>>> re.search(pattern, '') ⑥ +<SRE_Match object at 0106F4A8>
^ matches what follows only at the beginning of the string. If this were not specified, the pattern would match no matter where the M characters were, which is not what you want. You want to make sure that the M characters, if they’re there, are at the beginning of the string. M? optionally matches a single M character. Since this is repeated three times, you’re matching anywhere from zero to three M characters in a row. And $ matches the end of the string. When combined with the ^ character at the beginning, this means that the pattern must match the entire string, with no other characters before or after the M characters.
re module is the search() function, that takes a regular expression (pattern) and a string ('M') to try to match against the regular expression. If a match is found, search() returns an object which has various methods to describe the match; if no match is found, search() returns None, the Python null value. All you care about at the moment is whether the pattern matches, which you can tell by just looking at the return value of search(). 'M' matches this regular expression, because the first optional M matches and the second and third optional M characters are ignored.
@@ -141,17 +141,17 @@ body{counter-reset:h1 4}
This example shows how to validate the hundreds place of a Roman numeral.
->>> import re ->>> pattern = '^M?M?M?(CM|CD|D?C?C?C?)$' ① ->>> re.search(pattern, 'MCM') ② -<SRE_Match object at 01070390> ->>> re.search(pattern, 'MD') ③ -<SRE_Match object at 01073A50> ->>> re.search(pattern, 'MMMCCC') ④ -<SRE_Match object at 010748A8> ->>> re.search(pattern, 'MCMC') ⑤ ->>> re.search(pattern, '') ⑥ -<SRE_Match object at 01071D98>+>>> import re +>>> pattern = '^M?M?M?(CM|CD|D?C?C?C?)$' ① +>>> re.search(pattern, 'MCM') ② +<SRE_Match object at 01070390> +>>> re.search(pattern, 'MD') ③ +<SRE_Match object at 01073A50> +>>> re.search(pattern, 'MMMCCC') ④ +<SRE_Match object at 010748A8> +>>> re.search(pattern, 'MCMC') ⑤ +>>> re.search(pattern, '') ⑥ +<SRE_Match object at 01071D98>
^), then the thousands place (M?M?M?). Then it has the new part, in parentheses, which defines a set of three mutually exclusive patterns, separated by vertical bars: CM, CD, and D?C?C?C? (which is an optional D followed by zero to three optional C characters). The regular expression parser checks for each of these patterns in order (from left to right), takes the first one that matches, and ignores the rest.
'MCM' matches because the first M matches, the second and third M characters are ignored, and the CM matches (so the CD and D?C?C?C? patterns are never even considered). MCM is the Roman numeral representation of 1900.
@@ -167,17 +167,17 @@ body{counter-reset:h1 4}
In the previous section, you were dealing with a pattern where the same character could be repeated up to three times. There is another way to express this in regular expressions, which some people find more readable. First look at the method we already used in the previous example.
->>> import re ->>> pattern = '^M?M?M?$' ->>> re.search(pattern, 'M') ① -<_sre.SRE_Match object at 0x008EE090> ->>> pattern = '^M?M?M?$' ->>> re.search(pattern, 'MM') ② -<_sre.SRE_Match object at 0x008EEB48> ->>> pattern = '^M?M?M?$' ->>> re.search(pattern, 'MMM') ③ -<_sre.SRE_Match object at 0x008EE090> ->>> re.search(pattern, 'MMMM') ④ +>>> import re +>>> pattern = '^M?M?M?$' +>>> re.search(pattern, 'M') ① +<_sre.SRE_Match object at 0x008EE090> +>>> pattern = '^M?M?M?$' +>>> re.search(pattern, 'MM') ② +<_sre.SRE_Match object at 0x008EEB48> +>>> pattern = '^M?M?M?$' +>>> re.search(pattern, 'MMM') ③ +<_sre.SRE_Match object at 0x008EE090> +>>> re.search(pattern, 'MMMM') ④ >>>
M, but not the second and third M (but that’s okay because they’re optional), and then the end of the string.
@@ -186,14 +186,14 @@ body{counter-reset:h1 4}
M, but then does not match the end of the string (because there is still one unmatched M), so the pattern does not match and returns None.
->>> pattern = '^M{0,3}$' ① ->>> re.search(pattern, 'M') ② -<_sre.SRE_Match object at 0x008EEB48> ->>> re.search(pattern, 'MM') ③ -<_sre.SRE_Match object at 0x008EE090> ->>> re.search(pattern, 'MMM') ④ -<_sre.SRE_Match object at 0x008EEDA8> ->>> re.search(pattern, 'MMMM') ⑤ +>>> pattern = '^M{0,3}$' ① +>>> re.search(pattern, 'M') ② +<_sre.SRE_Match object at 0x008EEB48> +>>> re.search(pattern, 'MM') ③ +<_sre.SRE_Match object at 0x008EE090> +>>> re.search(pattern, 'MMM') ④ +<_sre.SRE_Match object at 0x008EEDA8> +>>> re.search(pattern, 'MMMM') ⑤ >>>
M characters, then the end of the string.” The 0 and 3 can be any numbers; if you want to match at least one but no more than three M characters, you could say M{1,3}.
@@ -205,16 +205,16 @@ body{counter-reset:h1 4}
Now let’s expand the Roman numeral regular expression to cover the tens and ones place. This example shows the check for tens.
->>> pattern = '^M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)$' ->>> re.search(pattern, 'MCMXL') ① -<_sre.SRE_Match object at 0x008EEB48> ->>> re.search(pattern, 'MCML') ② -<_sre.SRE_Match object at 0x008EEB48> ->>> re.search(pattern, 'MCMLX') ③ -<_sre.SRE_Match object at 0x008EEB48> ->>> re.search(pattern, 'MCMLXXX') ④ -<_sre.SRE_Match object at 0x008EEB48> ->>> re.search(pattern, 'MCMLXXXX') ⑤ +>>> pattern = '^M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)$' +>>> re.search(pattern, 'MCMXL') ① +<_sre.SRE_Match object at 0x008EEB48> +>>> re.search(pattern, 'MCML') ② +<_sre.SRE_Match object at 0x008EEB48> +>>> re.search(pattern, 'MCMLX') ③ +<_sre.SRE_Match object at 0x008EEB48> +>>> re.search(pattern, 'MCMLXXX') ④ +<_sre.SRE_Match object at 0x008EEB48> +>>> re.search(pattern, 'MCMLXXXX') ⑤ >>>
M, then CM, then XL, then the end of the string. Remember, the (A|B|C) syntax means “match exactly one of A, B, or C”. You match XL, so you ignore the XC and L?X?X?X? choices, and then move on to the end of the string. MCMXL is the Roman numeral representation of 1940.
@@ -226,18 +226,18 @@ body{counter-reset:h1 4}
The expression for the ones place follows the same pattern. I’ll spare you the details and show you the end result.
->>> pattern = '^M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)(IX|IV|V?I?I?I?)$' +>>> pattern = '^M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)(IX|IV|V?I?I?I?)$'
So what does that look like using this alternate {n,m} syntax? This example shows the new syntax.
->>> pattern = '^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
->>> re.search(pattern, 'MDLV') ①
-<_sre.SRE_Match object at 0x008EEB48>
->>> re.search(pattern, 'MMDCLXVI') ②
-<_sre.SRE_Match object at 0x008EEB48>
->>> re.search(pattern, 'MMMDCCCLXXXVIII') ③
-<_sre.SRE_Match object at 0x008EEB48>
->>> re.search(pattern, 'I') ④
-<_sre.SRE_Match object at 0x008EEB48>
+>>> pattern = '^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
+>>> re.search(pattern, 'MDLV') ①
+<_sre.SRE_Match object at 0x008EEB48>
+>>> re.search(pattern, 'MMDCLXVI') ②
+<_sre.SRE_Match object at 0x008EEB48>
+>>> re.search(pattern, 'MMMDCCCLXXXVIII') ③
+<_sre.SRE_Match object at 0x008EEB48>
+>>> re.search(pattern, 'I') ④
+<_sre.SRE_Match object at 0x008EEB48>
M characters, then D?C{0,3}. Of that, it matches the optional D and zero of three possible C characters. Moving on, it matches L?X{0,3} by matching the optional L and zero of three possible X characters. Then it matches V?I{0,3} by matching the optional V and zero of three possible I characters, and finally the end of the string. MDLV is the Roman numeral representation of 1555.
M characters, then the D?C{0,3} with a D and one of three possible C characters; then L?X{0,3} with an L and one of three possible X characters; then V?I{0,3} with a V and one of three possible I characters; then the end of the string. MMDCLXVI is the Roman numeral representation of 2666.
@@ -257,7 +257,7 @@ body{counter-reset:h1 4}
This will be more clear with an example. Let’s revisit the compact regular expression you’ve been working with, and make it a verbose regular expression. This example shows how.
->>> pattern = '''
+>>> pattern = '''
^ # beginning of string
M{0,3} # thousands - 0 to 3 M's
(CM|CD|D?C{0,3}) # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 C's),
@@ -268,13 +268,13 @@ body{counter-reset:h1 4}
# or 5-8 (V, followed by 0 to 3 I's)
$ # end of string
'''
->>> re.search(pattern, 'M', re.VERBOSE) ①
-<_sre.SRE_Match object at 0x008EEB48>
->>> re.search(pattern, 'MCMLXXXIX', re.VERBOSE) ②
-<_sre.SRE_Match object at 0x008EEB48>
->>> re.search(pattern, 'MMMDCCCLXXXVIII', re.VERBOSE) ③
-<_sre.SRE_Match object at 0x008EEB48>
->>> re.search(pattern, 'M') ④
+>>> re.search(pattern, 'M', re.VERBOSE) ①
+<_sre.SRE_Match object at 0x008EEB48>
+>>> re.search(pattern, 'MCMLXXXIX', re.VERBOSE) ②
+<_sre.SRE_Match object at 0x008EEB48>
+>>> re.search(pattern, 'MMMDCCCLXXXVIII', re.VERBOSE) ③
+<_sre.SRE_Match object at 0x008EEB48>
+>>> re.search(pattern, 'M') ④
re.VERBOSE is a constant defined in the re module that signals that the pattern should be treated as a verbose regular expression. As you can see, this pattern has quite a bit of whitespace (all of which is ignored), and several comments (all of which are ignored). Once you ignore the whitespace and the comments, this is exactly the same regular expression as you saw in the previous section, but it’s a lot more readable.
M, then CM, then L and three of a possible three X, then IX, then the end of the string.
@@ -302,10 +302,10 @@ body{counter-reset:h1 4}
Quite a variety! In each of these cases, I need to know that the area code was 800, the trunk was 555, and the rest of the phone number was 1212. For those with an extension, I need to know that the extension was 1234.
Let’s work through developing a solution for phone number parsing. This example shows the first step.
->>> phonePattern = re.compile(r'^(\d{3})-(\d{3})-(\d{4})$') ① ->>> phonePattern.search('800-555-1212').groups() ② -('800', '555', '1212') ->>> phonePattern.search('800-555-1212-1234') ③ +>>> phonePattern = re.compile(r'^(\d{3})-(\d{3})-(\d{4})$') ① +>>> phonePattern.search('800-555-1212').groups() ② +('800', '555', '1212') +>>> phonePattern.search('800-555-1212-1234') ③ >>>
(\d{3}). What’s \d{3}? Well, the {3} means “match exactly three numeric digits”; it’s a variation on the {n,m} syntax you saw earlier. \d means “any numeric digit” (0 through 9). Putting it in parentheses means “match exactly three numeric digits, and then remember them as a group that I can ask for later”. Then match a literal hyphen. Then match another group of exactly three digits. Then another literal hyphen. Then another group of exactly four digits. Then match the end of the string.
@@ -313,12 +313,12 @@ body{counter-reset:h1 4}
->>> phonePattern = re.compile(r'^(\d{3})-(\d{3})-(\d{4})-(\d+)$') ① ->>> phonePattern.search('800-555-1212-1234').groups() ② -('800', '555', '1212', '1234') ->>> phonePattern.search('800 555 1212 1234') ③ +>>> phonePattern = re.compile(r'^(\d{3})-(\d{3})-(\d{4})-(\d+)$') ① +>>> phonePattern.search('800-555-1212-1234').groups() ② +('800', '555', '1212', '1234') +>>> phonePattern.search('800 555 1212 1234') ③ >>> ->>> phonePattern.search('800-555-1212') ④ +>>> phonePattern.search('800-555-1212') ④ >>>
The next example shows the regular expression to handle separators between the different parts of the phone number.
->>> phonePattern = re.compile(r'^(\d{3})\D+(\d{3})\D+(\d{4})\D+(\d+)$') ① ->>> phonePattern.search('800 555 1212 1234').groups() ② -('800', '555', '1212', '1234') ->>> phonePattern.search('800-555-1212-1234').groups() ③ -('800', '555', '1212', '1234') ->>> phonePattern.search('80055512121234') ④ +>>> phonePattern = re.compile(r'^(\d{3})\D+(\d{3})\D+(\d{4})\D+(\d+)$') ① +>>> phonePattern.search('800 555 1212 1234').groups() ② +('800', '555', '1212', '1234') +>>> phonePattern.search('800-555-1212-1234').groups() ③ +('800', '555', '1212', '1234') +>>> phonePattern.search('80055512121234') ④ >>> ->>> phonePattern.search('800-555-1212') ⑤ +>>> phonePattern.search('800-555-1212') ⑤ >>>
\D+. What the heck is that? Well, \D matches any character except a numeric digit, and + means “1 or more”. So \D+ matches one or more characters that are not digits. This is what you’re using instead of a literal hyphen, to try to match different separators.
@@ -346,14 +346,14 @@ body{counter-reset:h1 4}
The next example shows the regular expression for handling phone numbers without separators.
->>> phonePattern = re.compile(r'^(\d{3})\D*(\d{3})\D*(\d{4})\D*(\d*)$') ① ->>> phonePattern.search('80055512121234').groups() ② -('800', '555', '1212', '1234') ->>> phonePattern.search('800.555.1212 x1234').groups() ③ -('800', '555', '1212', '1234') ->>> phonePattern.search('800-555-1212').groups() ④ -('800', '555', '1212', '') ->>> phonePattern.search('(800)5551212 x1234') ⑤ +>>> phonePattern = re.compile(r'^(\d{3})\D*(\d{3})\D*(\d{4})\D*(\d*)$') ① +>>> phonePattern.search('80055512121234').groups() ② +('800', '555', '1212', '1234') +>>> phonePattern.search('800.555.1212 x1234').groups() ③ +('800', '555', '1212', '1234') +>>> phonePattern.search('800-555-1212').groups() ④ +('800', '555', '1212', '') +>>> phonePattern.search('(800)5551212 x1234') ⑤ >>>
+ to *. Instead of \D+ between the parts of the phone number, you now match on \D*. Remember that + means “1 or more”? Well, * means “zero or more”. So now you should be able to parse phone numbers even when there is no separator character at all.
@@ -364,12 +364,12 @@ body{counter-reset:h1 4}
The next example shows how to handle leading characters in phone numbers.
->>> phonePattern = re.compile(r'^\D*(\d{3})\D*(\d{3})\D*(\d{4})\D*(\d*)$') ① ->>> phonePattern.search('(800)5551212 ext. 1234').groups() ② -('800', '555', '1212', '1234') ->>> phonePattern.search('800-555-1212').groups() ③ -('800', '555', '1212', '') ->>> phonePattern.search('work 1-(800) 555.1212 #1234') ④ +>>> phonePattern = re.compile(r'^\D*(\d{3})\D*(\d{3})\D*(\d{4})\D*(\d*)$') ① +>>> phonePattern.search('(800)5551212 ext. 1234').groups() ② +('800', '555', '1212', '1234') +>>> phonePattern.search('800-555-1212').groups() ③ +('800', '555', '1212', '') +>>> phonePattern.search('work 1-(800) 555.1212 #1234') ④ >>>
\D*, zero or more non-numeric characters, before the first remembered group (the area code). Notice that you’re not remembering these non-numeric characters (they’re not in parentheses). If you find them, you’ll just skip over them and then start remembering the area code whenever you get to it.
@@ -379,13 +379,13 @@ body{counter-reset:h1 4}
Let’s back up for a second. So far the regular expressions have all matched from the beginning of the string. But now you see that there may be an indeterminate amount of stuff at the beginning of the string that you want to ignore. Rather than trying to match it all just so you can skip over it, let’s take a different approach: don’t explicitly match the beginning of the string at all. This approach is shown in the next example.
->>> phonePattern = re.compile(r'(\d{3})\D*(\d{3})\D*(\d{4})\D*(\d*)$') ① ->>> phonePattern.search('work 1-(800) 555.1212 #1234').groups() ② -('800', '555', '1212', '1234') ->>> phonePattern.search('800-555-1212') ③ -('800', '555', '1212', '') ->>> phonePattern.search('80055512121234') ④ -('800', '555', '1212', '1234')+>>> phonePattern = re.compile(r'(\d{3})\D*(\d{3})\D*(\d{4})\D*(\d*)$') ① +>>> phonePattern.search('work 1-(800) 555.1212 #1234').groups() ② +('800', '555', '1212', '1234') +>>> phonePattern.search('800-555-1212') ③ +('800', '555', '1212', '') +>>> phonePattern.search('80055512121234') ④ +('800', '555', '1212', '1234')
^ in this regular expression. You are not matching the beginning of the string anymore. There’s nothing that says you need to match the entire input with your regular expression. The regular expression engine will do the hard work of figuring out where the input string starts to match, and go from there.
See how quickly a regular expression can get out of control? Take a quick glance at any of the previous iterations. Can you tell the difference between one and the next?
While you still understand the final answer (and it is the final answer; if you’ve discovered a case it doesn’t handle, I don’t want to know about it), let’s write it out as a verbose regular expression, before you forget why you made the choices you made.
->>> phonePattern = re.compile(r'''
+>>> phonePattern = re.compile(r'''
# don't match beginning of string, number can start anywhere
(\d{3}) # area code is 3 digits (e.g. '800')
\D* # optional separator is any number of non-digits
@@ -406,10 +406,10 @@ body{counter-reset:h1 4}
(\d*) # extension is optional and can be any number of digits
$ # end of string
''', re.VERBOSE)
->>> phonePattern.search('work 1-(800) 555.1212 #1234').groups() ①
-('800', '555', '1212', '1234')
->>> phonePattern.search('800-555-1212') ②
-('800', '555', '1212', '')
+>>> phonePattern.search('work 1-(800) 555.1212 #1234').groups() ①
+('800', '555', '1212', '1234')
+>>> phonePattern.search('800-555-1212').groups() ②
+('800', '555', '1212', '')
__getattr__() method as a string. If the name is 'color', the method returns a value. (In this case, it’s just a hard-coded string, but you would normally do some sort of computation and return the result.)
__getattr__() method needs to raise an AttributeError exception, otherwise your code will silently fail when accessing undefined attributes. (Technically, if the method doesn’t raise an exception or explicitly return a value, it returns None, the Python null value. This means that all attributes not explicitly defined will be None, which is almost certainly not what you want.)
@@ -170,12 +170,12 @@ td a:link, td a:visited{border:0}
else:
raise AttributeError
->>> dyn = SuperDynamo()
->>> dyn.color ①
-'PapayaWhip'
->>> dyn.color = 'LemonChiffon'
->>> dyn.color ②
-'PapayaWhip'
+>>> dyn = SuperDynamo()
+>>> dyn.color ①
+'PapayaWhip'
+>>> dyn.color = 'LemonChiffon'
+>>> dyn.color ②
+'PapayaWhip'
__getattribute__() method is called to provide a value for dyn.color.
__getattribute__() method is still called to provide a value for dyn.color. If present, the __getattribute__() method is called unconditionally for every attribute and method lookup, even for attributes that you explicitly set after creating an instance.
@@ -194,8 +194,8 @@ td a:link, td a:visited{border:0}
def swim(self):
pass
->>> hero = Rastan()
->>> hero.swim() ②
+>>> hero = Rastan()
+>>> hero.swim() ②
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<stdin>", line 3, in __getattribute__
@@ -361,10 +361,10 @@ class FieldStorage:
Using the appropriate special methods, you can define your own classes that act like numbers. That is, you can add them, subtract them, and perform other mathematical operations on them. This is how fractions are implemented — the Fraction class implements these special methods, then you can do things like this:
->>> from fractions import Fraction ->>> x = Fraction(1, 3) ->>> x / 3 -Fraction(1, 9)+>>> from fractions import Fraction +>>> x = Fraction(1, 3) +>>> x / 3 +Fraction(1, 9)
Here is the comprehensive list of special methods you need to implement a number-like class. @@ -430,10 +430,10 @@ class FieldStorage:
That’s all well and good if x is an instance of a class that implements those methods. But what if it doesn’t implement one of them? Or worse, what if it implements it, but it can’t handle certain kinds of arguments? For example:
->>> from fractions import Fraction ->>> x = Fraction(1, 3) ->>> 1 / x -Fraction(3, 1)+>>> from fractions import Fraction +>>> x = Fraction(1, 3) +>>> 1 / x +Fraction(3, 1)
This is not a case of taking a Fraction and dividing it by an integer (as in the previous example). That case was straightforward: x / 3 calls x.__truediv__(3), and the __truediv__() method of the Fraction class handles all the math. But integers don’t “know” how to do arithmetic operations with fractions. So why does this example work?
diff --git a/strings.html b/strings.html
index c802a39..87e5405 100644
--- a/strings.html
+++ b/strings.html
@@ -84,13 +84,13 @@ My alphabet starts where your alphabet ends! ❞
&m
In Python 3, all strings are sequences of Unicode characters. There is no such thing as a Python string encoded in UTF-8, or a Python string encoded as CP-1252. “Is this string UTF-8?” is an invalid question. UTF-8 is a way of encoding characters as a sequence of bytes. If you want to take a string and turn it into a sequence of bytes in a particular character encoding, Python 3 can help you with that. If you want to take a sequence of bytes and turn it into a string, Python 3 can help you with that too. Bytes are not characters; bytes are bytes. Characters are an abstraction. A string is a sequence of those abstractions.
->>> s = '深入 Python' ① ->>> len(s) ② -9 ->>> s[0] ③ -'深' ->>> s + ' 3' ④ -'深入 Python 3'+>>> s = '深入 Python' ① +>>> len(s) ② +9 +>>> s[0] ③ +'深' +>>> s + ' 3' ④ +'深入 Python 3'
') or double quotes (").
len() function returns the length of the string, i.e. the number of characters. This is the same function you use to find the length of a list. A string is like a list of characters.
@@ -141,10 +141,10 @@ def approximate_size(size, a_kilobyte_is_1024_bytes=True):
Python 3 supports formatting values into strings. Although this can include very complicated expressions, the most basic usage is to insert a value into a string with a single placeholder.
->>> username = 'mark' ->>> password = 'PapayaWhip' ① ->>> "{0}'s password is {1}".format(username, password) ② -"mark's password is PapayaWhip"+>>> username = 'mark' +>>> password = 'PapayaWhip' ① +>>> "{0}'s password is {1}".format(username, password) ② +"mark's password is PapayaWhip"
{0} and {1} are replacement fields, which are replaced by the arguments passed to the format() method.
@@ -155,12 +155,12 @@ def approximate_size(size, a_kilobyte_is_1024_bytes=True):
The previous example shows the simplest case, where the replacement fields are simply integers. Integer replacement fields are treated as positional indices into the argument list of the format() method. That means that {0} is replaced by the first argument (username in this case), {1} is replaced by the second argument (password), &c. You can have as many positional indices as you have arguments, and you can have as many arguments as you want. But replacement fields are much more powerful than that.
->>> import humansize ->>> si_suffixes = humansize.SUFFIXES[1000] ① ->>> si_suffixes -['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'] ->>> '1000{0[0]} = 1{0[1]}'.format(si_suffixes) ② -'1000KB = 1MB' +>>> import humansize +>>> si_suffixes = humansize.SUFFIXES[1000] ① +>>> si_suffixes +['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'] +>>> '1000{0[0]} = 1{0[1]}'.format(si_suffixes) ② +'1000KB = 1MB'
humansize module, you’re just grabbing one of the data structures it defines: the list of “SI” (powers-of-1000) suffixes.
@@ -181,10 +181,10 @@ def approximate_size(size, a_kilobyte_is_1024_bytes=True):
Just to blow your mind, here’s an example that combines all of the above:
->>> import humansize
->>> import sys
->>> '1MB = 1000{0.modules[humansize].SUFFIXES[1000][0]}'.format(sys)
-'1MB = 1000KB'
+>>> import humansize
+>>> import sys
+>>> '1MB = 1000{0.modules[humansize].SUFFIXES[1000][0]}'.format(sys)
+'1MB = 1000KB'
Here’s how it works: @@ -213,8 +213,8 @@ def approximate_size(size, a_kilobyte_is_1024_bytes=True):
Within a replacement field, a colon (:) marks the start of the format specifier. The format specifier “.1” means “round to the nearest tenth” (i.e. display only one digit after the decimal point). The format specifier “f” means “fixed-point number” (as opposed to exponential notation or some other decimal representation). Thus, given a size of 698.24 and suffix of 'GB', the formatted string would be '698.2 GB', because 698.24 gets rounded to one decimal place, then the suffix is appended after the number.
->>> '{0:.1f} {1}'.format(698.25, 'GB')
-'698.3 GB'
+>>> '{0:.1f} {1}'.format(698.24, 'GB')
+'698.2 GB'
For all the gory details on format specifiers, consult the Format Specification Mini-Language in the official Python documentation.
@@ -229,18 +229,18 @@ def approximate_size(size, a_kilobyte_is_1024_bytes=True):
... sult of years of scientif-
... ic study combined with the
... experience of years.'''
->>> s.splitlines() ②
-['Finished files are the re-',
+>>> s.splitlines() ②
+['Finished files are the re-',
'sult of years of scientif-',
'ic study combined with the',
'experience of years.']
->>> print(s.lower()) ③
+>>> print(s.lower()) ③
finished files are the re-
sult of years of scientif-
ic study combined with the
experience of years.
->>> s.lower().count('f') ④
-6
+>>> s.lower().count('f') ④
+6
splitlines() method takes one multiline string and returns a list of strings, one for each line of the original. Note that the carriage returns at the end of each line are not included.
@@ -251,16 +251,16 @@ experience of years.
Here’s another common case. Let’s say you have a list of key-value pairs in the form key1=value1&key2=value2, and you want to split them up and make a dictionary of the form {key1: value1, key2: value2}.
->>> query = 'user=pilgrim&database=master&password=PapayaWhip' ->>> a_list = query.split('&') ① ->>> a_list -['user=pilgrim', 'database=master', 'password=PapayaWhip'] ->>> a_list_of_lists = [v.split('=', 1) for v in a_list] ② ->>> a_list_of_lists -[['user', 'pilgrim'], ['database', 'master'], ['password', 'PapayaWhip']] ->>> a_dict = dict(a_list_of_lists) ③ ->>> a_dict -{'password': 'PapayaWhip', 'user': 'pilgrim', 'database': 'master'}+>>> query = 'user=pilgrim&database=master&password=PapayaWhip' +>>> a_list = query.split('&') ① +>>> a_list +['user=pilgrim', 'database=master', 'password=PapayaWhip'] +>>> a_list_of_lists = [v.split('=', 1) for v in a_list] ② +>>> a_list_of_lists +[['user', 'pilgrim'], ['database', 'master'], ['password', 'PapayaWhip']] +>>> a_dict = dict(a_list_of_lists) ③ +>>> a_dict +{'password': 'PapayaWhip', 'user': 'pilgrim', 'database': 'master'}
split() string method takes one argument, a delimiter, and splits a string into a list of strings based on the delimiter. Here, the delimiter is an ampersand character, but it could be anything.
@@ -275,21 +275,21 @@ experience of years.Bytes are bytes; characters are an abstraction. An immutable sequence of Unicode characters is called a string. An immutable sequence of numbers-between-0-and-255 is called a bytes object.
->>> by = b'abcd\x65' ① ->>> by -b'abcde' ->>> type(by) ② -<class 'bytes'> ->>> len(by) ③ -5 ->>> by += b'\xff' ④ ->>> by -b'abcde\xff' ->>> len(by) ⑤ -6 ->>> by[0] ⑥ -97 ->>> by[0] = 102 ⑦ +>>> by = b'abcd\x65' ① +>>> by +b'abcde' +>>> type(by) ② +<class 'bytes'> +>>> len(by) ③ +5 +>>> by += b'\xff' ④ +>>> by +b'abcde\xff' +>>> len(by) ⑤ +6 +>>> by[0] ⑥ +97 +>>> by[0] = 102 ⑦ Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: 'bytes' object does not support item assignment@@ -304,15 +304,15 @@ TypeError: 'bytes' object does not support item assignment
->>> by = b'abcd\x65' ->>> barr = bytearray(by) ① ->>> barr -bytearray(b'abcde') ->>> len(barr) ② -5 ->>> barr[0] = 102 ③ ->>> barr -bytearray(b'fbcde')+>>> by = b'abcd\x65' +>>> barr = bytearray(by) ① +>>> barr +bytearray(b'abcde') +>>> len(barr) ② +5 +>>> barr[0] = 102 ③ +>>> barr +bytearray(b'fbcde')
bytes object into a mutable bytearray object, use the built-in bytearray() function.
bytes object, you can do on a bytearray object too.
@@ -322,18 +322,18 @@ TypeError: 'bytes' object does not support item assignment
The one thing you can never do is mix bytes and strings.
->>> by = b'd' ->>> s = 'abcde' ->>> by + s ① +>>> by = b'd' +>>> s = 'abcde' +>>> by + s ① Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: can't concat bytes to str ->>> s.count(by) ② +>>> s.count(by) ② Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: Can't convert 'bytes' object to str implicitly ->>> s.count(by.decode('ascii')) ③ -1+>>> s.count(by.decode('ascii')) ③ +1
And here is the link between strings and bytes: bytes objects have a decode() method that takes a character encoding and returns a string, and strings have an encode() method that takes a character encoding and returns a bytes object. In the previous example, the decoding was relatively straightforward — converting a sequence of bytes in the ASCII encoding into a string of characters. But the same process works with any encoding that supports the characters of the string — even legacy (non-Unicode) encodings.
->>> a_string = '深入 Python' ① ->>> len(a_string) -9 ->>> by = a_string.encode('utf-8') ② ->>> by -b'\xe6\xb7\xb1\xe5\x85\xa5 Python' ->>> len(by) -13 ->>> by = a_string.encode('gb18030') ③ ->>> by -b'\xc9\xee\xc8\xeb Python' ->>> len(by) -11 ->>> by = a_string.encode('big5') ④ ->>> by -b'\xb2`\xa4J Python' ->>> len(by) -11 ->>> roundtrip = by.decode('big5') ⑤ ->>> roundtrip -'深入 Python' ->>> a_string == roundtrip -True+>>> a_string = '深入 Python' ① +>>> len(a_string) +9 +>>> by = a_string.encode('utf-8') ② +>>> by +b'\xe6\xb7\xb1\xe5\x85\xa5 Python' +>>> len(by) +13 +>>> by = a_string.encode('gb18030') ③ +>>> by +b'\xc9\xee\xc8\xeb Python' +>>> len(by) +11 +>>> by = a_string.encode('big5') ④ +>>> by +b'\xb2`\xa4J Python' +>>> len(by) +11 +>>> roundtrip = by.decode('big5') ⑤ +>>> roundtrip +'深入 Python' +>>> a_string == roundtrip +True
bytes object. It has 13 bytes. It is the sequence of bytes you get when you take a_string and encode it in UTF-8.
diff --git a/unit-testing.html b/unit-testing.html
index fdf94a5..e952f79 100644
--- a/unit-testing.html
+++ b/unit-testing.html
@@ -202,8 +202,8 @@ while n >= integer:
print('subtracting {0} from input, adding {1} to output'.format(integer, numeral))
With the debug print() statements, the output looks like this:
->>> import roman1 ->>> roman1.to_roman(1424) +>>> import roman1 +>>> roman1.to_roman(1424) subtracting 1000 from input, adding M to output subtracting 400 from input, adding CD to output subtracting 10 from input, adding X to output @@ -229,13 +229,13 @@ OK
It is not enough to test that functions succeed when given good input; you must also test that they fail when given bad input. And not just any sort of failure; they must fail in the way you expect.
->>> import roman1 ->>> roman1.to_roman(4000) -'MMMM' ->>> roman1.to_roman(5000) -'MMMMM' ->>> roman1.to_roman(9000) ① -'MMMMMMMMM'+>>> import roman1 +>>> roman1.to_roman(4000) +'MMMM' +>>> roman1.to_roman(5000) +'MMMMM' +>>> roman1.to_roman(9000) ① +'MMMMMMMMM'
Along with testing numbers that are too large, you need to test numbers that are too small. As we noted in our functional requirements, Roman numerals cannot express 0 or negative numbers.
->>> import roman2 ->>> roman2.to_roman(0) -'' ->>> roman2.to_roman(-1) -''+>>> import roman2 +>>> roman2.to_roman(0) +'' +>>> roman2.to_roman(-1) +''
Well that’s not good. Let’s add tests for each of these conditions. @@ -441,11 +441,11 @@ OK
There was one more functional requirement for converting numbers to Roman numerals: dealing with non-integers.
->>> import roman3 ->>> roman3.to_roman(0.5) ① -'' ->>> roman3.to_roman(1.5) ② -'I'+>>> import roman3 +>>> roman3.to_roman(0.5) ① +'' +>>> roman3.to_roman(1.5) ② +'I'
->>> import xml.etree.ElementTree as etree ① ->>> tree = etree.parse('examples/feed.xml') ② ->>> root = tree.getroot() ③ ->>> root ④ -<Element {http://www.w3.org/2005/Atom}feed at cd1eb0>+>>> import xml.etree.ElementTree as etree ① +>>> tree = etree.parse('examples/feed.xml') ② +>>> root = tree.getroot() ③ +>>> root ④ +<Element {http://www.w3.org/2005/Atom}feed at cd1eb0>
xml.etree.ElementTree.
parse() function, which can take a filename or a file-like object [FIXME xref]. This function parses the entire document at once. If memory is tight, there are ways to parse an XML document incrementally instead.
@@ -276,14 +276,14 @@ mark{display:inline}
# continued from the previous example ->>> root.tag ① -'{http://www.w3.org/2005/Atom}feed' ->>> len(root) ② -8 ->>> for child in root: ③ -... print(child) ④ +>>> root.tag ① +'{http://www.w3.org/2005/Atom}feed' +>>> len(root) ② +8 +>>> for child in root: ③ +... print(child) ④ ... -<Element {http://www.w3.org/2005/Atom}title at e2b5d0> +<Element {http://www.w3.org/2005/Atom}title at e2b5d0> <Element {http://www.w3.org/2005/Atom}subtitle at e2b4e0> <Element {http://www.w3.org/2005/Atom}id at e2b6c0> <Element {http://www.w3.org/2005/Atom}updated at e2b6f0> @@ -306,18 +306,18 @@ mark{display:inline}# continuing from the previous example ->>> root.attrib ① -{'{http://www.w3.org/XML/1998/namespace}lang': 'en'} ->>> root[4] ② -<Element {http://www.w3.org/2005/Atom}link at e181b0> ->>> root[4].attrib ③ -{'href': 'http://diveintomark.org/', +>>> root.attrib ① +{'{http://www.w3.org/XML/1998/namespace}lang': 'en'} +>>> root[4] ② +<Element {http://www.w3.org/2005/Atom}link at e181b0> +>>> root[4].attrib ③ +{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'} ->>> root[3] ④ -<Element {http://www.w3.org/2005/Atom}updated at e2b4e0> ->>> root[3].attrib ⑤ -{}+>>> root[3] ④ +<Element {http://www.w3.org/2005/Atom}updated at e2b4e0> +>>> root[3].attrib ⑤ +{}
attrib property is a dictionary of the element’s attributes. The original markup here was <feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'>. The xml: prefix refers to a built-in namespace that every XML document can use without declaring it.
[4] in a 0-based list — is the link element.
@@ -333,19 +333,19 @@ mark{display:inline}
So far, we’ve worked with this XML document “from the top down,” starting with the root element, getting its child elements, and so on throughout the document. But many uses of XML require you to find specific elements. Etree can do that, too.
->>> import xml.etree.ElementTree as etree
->>> tree = etree.parse('examples/feed.xml')
->>> root = tree.getroot()
->>> root.findall('{http://www.w3.org/2005/Atom}entry') ①
-[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>,
+>>> import xml.etree.ElementTree as etree
+>>> tree = etree.parse('examples/feed.xml')
+>>> root = tree.getroot()
+>>> root.findall('{http://www.w3.org/2005/Atom}entry') ①
+[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>,
<Element {http://www.w3.org/2005/Atom}entry at e2b510>,
<Element {http://www.w3.org/2005/Atom}entry at e2b540>]
->>> root.tag
-'{http://www.w3.org/2005/Atom}feed'
->>> root.findall('{http://www.w3.org/2005/Atom}feed') ②
-[]
->>> root.findall('{http://www.w3.org/2005/Atom}author') ③
-[]
+>>> root.tag
+'{http://www.w3.org/2005/Atom}feed'
+>>> root.findall('{http://www.w3.org/2005/Atom}feed') ②
+[]
+>>> root.findall('{http://www.w3.org/2005/Atom}author') ③
+[]
findall() method finds child elements that match a specific query. (More on the query format in a minute.)
findall() method. It finds all matching elements among the element’s children. But why aren’t there any results? Although it may not be obvious, this particular query only searches the element’s children. Since the root feed element has no child named feed, this query returns an empty list.
@@ -353,12 +353,12 @@ mark{display:inline}
->>> tree.findall('{http://www.w3.org/2005/Atom}entry') ① -[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, +>>> tree.findall('{http://www.w3.org/2005/Atom}entry') ① +[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, <Element {http://www.w3.org/2005/Atom}entry at e2b510>, <Element {http://www.w3.org/2005/Atom}entry at e2b540>] ->>> tree.findall('{http://www.w3.org/2005/Atom}author') ② -[] +>>> tree.findall('{http://www.w3.org/2005/Atom}author') ② +[]
tree object (returned from the etree.parse() function) has several methods that mirror the methods on the root element. The results are the same as if you had called the tree.getroot().findall() method.
@@ -368,26 +368,26 @@ mark{display:inline}
There is a way to search for descendant elements, i.e. children, grandchildren, and any element at any nesting level.
->>> all_links = tree.findall('//{http://www.w3.org/2005/Atom}link') ① ->>> all_links -[<Element {http://www.w3.org/2005/Atom}link at e181b0>, +>>> all_links = tree.findall('//{http://www.w3.org/2005/Atom}link') ① +>>> all_links +[<Element {http://www.w3.org/2005/Atom}link at e181b0>, <Element {http://www.w3.org/2005/Atom}link at e2b570>, <Element {http://www.w3.org/2005/Atom}link at e2b480>, <Element {http://www.w3.org/2005/Atom}link at e2b5a0>] ->>> all_links[0].attrib ② -{'href': 'http://diveintomark.org/', +>>> all_links[0].attrib ② +{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'} ->>> all_links[1].attrib ③ -{'href': 'http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition', +>>> all_links[1].attrib ③ +{'href': 'http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition', 'type': 'text/html', 'rel': 'alternate'} ->>> all_links[2].attrib -{'href': 'http://diveintomark.org/archives/2009/03/21/accessibility-is-a-harsh-mistress', +>>> all_links[2].attrib +{'href': 'http://diveintomark.org/archives/2009/03/21/accessibility-is-a-harsh-mistress', 'type': 'text/html', 'rel': 'alternate'} ->>> all_links[3].attrib -{'href': 'http://diveintomark.org/archives/2008/12/18/give-part-1-container-formats', +>>> all_links[3].attrib +{'href': 'http://diveintomark.org/archives/2008/12/18/give-part-1-container-formats', 'type': 'text/html', 'rel': 'alternate'}
# continuing from the previous example ->>> it = tree.getiterator('{http://www.w3.org/2005/Atom}link') ① ->>> next(it) ② +>>> it = tree.getiterator('{http://www.w3.org/2005/Atom}link') ① +>>> next(it) ② <Element {http://www.w3.org/2005/Atom}link at 122f1b0> ->>> next(it) +>>> next(it) <Element {http://www.w3.org/2005/Atom}link at 122f1e0> ->>> next(it) +>>> next(it) <Element {http://www.w3.org/2005/Atom}link at 122f210> ->>> next(it) +>>> next(it) <Element {http://www.w3.org/2005/Atom}link at 122f1b0> ->>> next(it) +>>> next(it) Traceback (most recent call last): File "<stdin>", line 1, in <module> StopIteration@@ -427,11 +427,11 @@ StopIteration
lxml is an open source third-party library that builds on the popular libxml2 parser. It provides a 100% compatible ElementTree API, then extends it with full XPath support and a few other niceties. There are installers available for Windows; Linux users should always try to use distribution-specific tools like yum or apt-get to install precompiled binaries from their repositories. Otherwise you’ll need to install lxml manually.
->>> from lxml import etree ① ->>> tree = etree.parse('examples/feed.xml') ② ->>> root = tree.getroot() ③ ->>> root.findall('{http://www.w3.org/2005/Atom}entry') ④ -[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, +>>> from lxml import etree ① +>>> tree = etree.parse('examples/feed.xml') ② +>>> root = tree.getroot() ③ +>>> root.findall('{http://www.w3.org/2005/Atom}entry') ④ +[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, <Element {http://www.w3.org/2005/Atom}entry at e2b510>, <Element {http://www.w3.org/2005/Atom}entry at e2b540>]
But lxml is more than just a faster ElementTree. Its findall() method includes support for more complicated expressions.
->>> import lxml.etree ① ->>> tree = lxml.etree.parse('examples/feed.xml') ->>> tree.findall('//{http://www.w3.org/2005/Atom}*[@href]') ② +>>> import lxml.etree ① +>>> tree = lxml.etree.parse('examples/feed.xml') +>>> tree.findall('//{http://www.w3.org/2005/Atom}*[@href]') ② [<Element {http://www.w3.org/2005/Atom}link at eeb8a0>, <Element {http://www.w3.org/2005/Atom}link at eeb990>, <Element {http://www.w3.org/2005/Atom}link at eeb960>, <Element {http://www.w3.org/2005/Atom}link at eeb9c0>] ->>> tree.findall("//{http://www.w3.org/2005/Atom}*[@href='http://diveintomark.org/']") ③ -[<Element {http://www.w3.org/2005/Atom}link at eeb930>] ->>> NS = '{http://www.w3.org/2005/Atom}' ->>> tree.findall('//{NS}author[{NS}uri]'.format(NS=NS)) ④ -[<Element {http://www.w3.org/2005/Atom}author at eeba80>, +>>> tree.findall("//{http://www.w3.org/2005/Atom}*[@href='http://diveintomark.org/']") ③ +[<Element {http://www.w3.org/2005/Atom}link at eeb930>] +>>> NS = '{http://www.w3.org/2005/Atom}' +>>> tree.findall('//{NS}author[{NS}uri]'.format(NS=NS)) ④ +[<Element {http://www.w3.org/2005/Atom}author at eeba80>, <Element {http://www.w3.org/2005/Atom}author at eebba0>]
import lxml.etree (instead of, say, from lxml import etree), to emphasize that these features are specific to lxml.
@@ -474,16 +474,16 @@ except ImportError:
Not enough for you? lxml also integrates support for arbitrary XPath expressions. I’m not going to go into depth about XPath syntax; that could be a whole book unto itself! But I will show you how it integrates into lxml.
->>> import lxml.etree
->>> tree = lxml.etree.parse('examples/feed.xml')
->>> NSMAP = {'atom': 'http://www.w3.org/2005/Atom'} ①
->>> entries = tree.xpath("//atom:category[@term='accessibility']/..", ②
-... namespaces=NSMAP)
->>> entries ③
-[<Element {http://www.w3.org/2005/Atom}entry at e2b630>]
->>> entry = entries[0]
->>> entry.xpath('./atom:title/text()', namespaces=nsmap) ④
-['Accessibility is a harsh mistress']
+>>> import lxml.etree
+>>> tree = lxml.etree.parse('examples/feed.xml')
+>>> NSMAP = {'atom': 'http://www.w3.org/2005/Atom'} ①
+>>> entries = tree.xpath("//atom:category[@term='accessibility']/..", ②
+... namespaces=NSMAP)
+>>> entries ③
+[<Element {http://www.w3.org/2005/Atom}entry at e2b630>]
+>>> entry = entries[0]
+>>> entry.xpath('./atom:title/text()', namespaces=NSMAP) ④
+['Accessibility is a harsh mistress']
category elements (in the Atom namespace) that contain a term attribute with the value accessibility. But that’s not actually the query result. Look at the very end of the query string; did you notice the /.. bit? That means “and then return the parent element of the category element you just found.” So this single XPath query will find all entries with a child element of <category term='accessibility'>.
@@ -498,11 +498,11 @@ except ImportError:
Python’s support for XML is not limited to parsing existing documents. You can also create XML documents from scratch.
->>> import xml.etree.ElementTree as etree ->>> new_feed = etree.Element('{http://www.w3.org/2005/Atom}feed', ① -... attrib={'{http://www.w3.org/XML/1998/namespace}lang': 'en'}) ② ->>> print(etree.tostring(new_feed)) ③ -<ns0:feed xmlns:ns0='http://www.w3.org/2005/Atom' xml:lang='en'/>+>>> import xml.etree.ElementTree as etree +>>> new_feed = etree.Element('{http://www.w3.org/2005/Atom}feed', ① +... attrib={'{http://www.w3.org/XML/1998/namespace}lang': 'en'}) ② +>>> print(etree.tostring(new_feed)) ③ +<ns0:feed xmlns:ns0='http://www.w3.org/2005/Atom' xml:lang='en'/>
Element class. You pass the element name (namespace + local name) as the first argument. This statement creates a feed element in the Atom namespace. This will be our new document’s root element.
{namespace}localname.
@@ -524,14 +524,14 @@ except ImportError:
The built-in ElementTree library does not offer this fine-grained control over serializing namespaced elements, but lxml does.
->>> import lxml.etree ->>> NSMAP = {None: 'http://www.w3.org/2005/Atom'} ① ->>> new_feed = lxml.etree.Element('feed', nsmap=NSMAP) ② ->>> print(lxml.etree.tounicode(new_feed)) ③ -<feed xmlns='http://www.w3.org/2005/Atom'/> ->>> new_feed.set('{http://www.w3.org/XML/1998/namespace}lang', 'en') ④ ->>> print(lxml.etree.tounicode(new_feed)) -<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'/>+>>> import lxml.etree +>>> NSMAP = {None: 'http://www.w3.org/2005/Atom'} ① +>>> new_feed = lxml.etree.Element('feed', nsmap=NSMAP) ② +>>> print(lxml.etree.tounicode(new_feed)) ③ +<feed xmlns='http://www.w3.org/2005/Atom'/> +>>> new_feed.set('{http://www.w3.org/XML/1998/namespace}lang', 'en') ④ +>>> print(lxml.etree.tounicode(new_feed)) +<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'/>
None as a prefix effectively declares a default namespace.
lxml-specific nsmap argument when you create an element, and lxml will respect the namespace prefixes you’ve defined.
@@ -542,15 +542,15 @@ except ImportError:
Are XML documents limited to one element per document? No, of course not. You can easily create child elements, too.
->>> title = lxml.etree.SubElement(new_feed, 'title', ① -... attrib={'type':'html'}) ② ->>> print(lxml.etree.tounicode(new_feed)) -<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'><title type='html'/></feed> ->>> title.text = 'dive into …' ③ ->>> print(lxml.etree.tounicode(new_feed)) ④ -<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'><title type='html'>dive into &hellip;</title></feed> ->>> print(lxml.etree.tounicode(new_feed, pretty_print=True)) ⑤ -<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'> +>>> title = lxml.etree.SubElement(new_feed, 'title', ① +... attrib={'type':'html'}) ② +>>> print(lxml.etree.tounicode(new_feed)) +<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'><title type='html'/></feed> +>>> title.text = 'dive into …' ③ +>>> print(lxml.etree.tounicode(new_feed)) ④ +<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'><title type='html'>dive into &hellip;</title></feed> +>>> print(lxml.etree.tounicode(new_feed, pretty_print=True)) ⑤ +<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'> <title type='html'>dive into&hellip;</title> </feed>
That’s an error, because the &hellip; entity is not defined in XML. (It is defined in HTML.) If you try to parse this broken feed with the default settings, lxml will choke on the undefined entity.
->>> import lxml.etree
->>> tree = lxml.etree.parse('examples/feed-broken.xml')
+>>> import lxml.etree
+>>> tree = lxml.etree.parse('examples/feed-broken.xml')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "lxml.etree.pyx", line 2693, in lxml.etree.parse (src/lxml/lxml.etree.c:52591)
@@ -600,17 +600,17 @@ lxml.etree.XMLSyntaxError: Entity 'hellip' not defined, line 3, column 28
To parse this broken XML document, despite its well-formedness error, you need to create a custom XML parser.
->>> parser = lxml.etree.XMLParser(recover=True) ①
->>> tree = lxml.etree.parse('examples/feed-broken.xml', parser) ②
->>> parser.error_log ③
+>>> parser = lxml.etree.XMLParser(recover=True) ①
+>>> tree = lxml.etree.parse('examples/feed-broken.xml', parser) ②
+>>> parser.error_log ③
examples/feed-broken.xml:3:28:FATAL:PARSER:ERR_UNDECLARED_ENTITY: Entity 'hellip' not defined
->>> tree.findall('{http://www.w3.org/2005/Atom}title')
-[<Element {http://www.w3.org/2005/Atom}title at ead510>]
->>> title = tree.findall('{http://www.w3.org/2005/Atom}title')[0]
->>> title.text ④
-'dive into '
->>> print(lxml.etree.tounicode(tree.getroot())) ⑤
-<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'>
+>>> tree.findall('{http://www.w3.org/2005/Atom}title')
+[<Element {http://www.w3.org/2005/Atom}title at ead510>]
+>>> title = tree.findall('{http://www.w3.org/2005/Atom}title')[0]
+>>> title.text ④
+'dive into '
+>>> print(lxml.etree.tounicode(tree.getroot())) ⑤
+<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'>
<title>dive into </title>
.
. [rest of serialization snipped for brevity]
diff --git a/your-first-python-program.html b/your-first-python-program.html
index 472bed9..7c18b6b 100644
--- a/your-first-python-program.html
+++ b/your-first-python-program.html
@@ -110,17 +110,17 @@ if __name__ == '__main__':
You can also pass values into a function by name.
->>> from humansize import approximate_size
->>> approximate_size(4000, a_kilobyte_is_1024_bytes=False) ①
-'4.0 KB'
->>> approximate_size(size=4000, a_kilobyte_is_1024_bytes=False) ②
-'4.0 KB'
->>> approximate_size(a_kilobyte_is_1024_bytes=False, size=4000) ③
-'4.0 KB'
->>> approximate_size(a_kilobyte_is_1024_bytes=False, 4000) ④
+>>> from humansize import approximate_size
+>>> approximate_size(4000, a_kilobyte_is_1024_bytes=False) ①
+'4.0 KB'
+>>> approximate_size(size=4000, a_kilobyte_is_1024_bytes=False) ②
+'4.0 KB'
+>>> approximate_size(a_kilobyte_is_1024_bytes=False, size=4000) ③
+'4.0 KB'
+>>> approximate_size(a_kilobyte_is_1024_bytes=False, 4000) ④
File "<stdin>", line 1
SyntaxError: non-keyword arg after keyword arg
->>> approximate_size(size=4000, False) ⑤
+>>> approximate_size(size=4000, False) ⑤
File "<stdin>", line 1
SyntaxError: non-keyword arg after keyword arg
@@ -163,10 +163,10 @@ SyntaxError: non-keyword arg after keyword arg
In case you missed it, I just said that Python functions have attributes, and that those attributes are available at runtime. A function, like everything else in Python, is an object.
Run the interactive Python shell and follow along:
->>> import humansize ①
->>> print(humansize.approximate_size(4096, True)) ②
-4.0 KiB
->>> print(humansize.approximate_size.__doc__) ③
+>>> import humansize ①
+>>> print(humansize.approximate_size(4096, True)) ②
+4.0 KiB
+>>> print(humansize.approximate_size.__doc__) ③
Convert a file size to human-readable form.
Keyword arguments:
@@ -188,20 +188,20 @@ SyntaxError: non-keyword arg after keyword arg
The import Search Path
Before this goes any further, I want to briefly mention the library search path. Python looks in several places when you try to import a module. Specifically, it looks in all the directories defined in sys.path. This is just a list, and you can easily view it or modify it with standard list methods. (You’ll learn more about lists in Native Datatypes.)
->>> import sys ①
->>> sys.path ②
-['',
+>>> import sys ①
+>>> sys.path ②
+['',
'/usr/lib/python30.zip',
'/usr/lib/python3.0',
'/usr/lib/python3.0/plat-linux2@EXTRAMACHDEPPATH@',
'/usr/lib/python3.0/lib-dynload',
'/usr/lib/python3.0/dist-packages',
'/usr/local/lib/python3.0/dist-packages']
->>> sys ③
-<module 'sys' (built-in)>
->>> sys.path.insert(0, '/home/mark/py') ④
->>> sys.path ⑤
-['/home/mark/py',
+>>> sys ③
+<module 'sys' (built-in)>
+>>> sys.path.insert(0, '/home/mark/py') ④
+>>> sys.path ⑤
+['/home/mark/py',
'',
'/usr/lib/python30.zip',
'/usr/lib/python3.0',
@@ -261,9 +261,9 @@ if __name__ == '__main__':
So what makes this if statement special? Well, modules are objects, and all modules have a built-in attribute __name__. A module’s __name__ depends on how you’re using the module. If you import the module, then __name__ is the module’s filename, without a directory path or file extension.
->>> import humansize
->>> humansize.__name__
-'humansize'
+>>> import humansize
+>>> humansize.__name__
+'humansize'
But you can also run the module directly as a standalone program, in which case __name__ will be a special default value, __main__. Python will evaluate this if statement, find a true expression, and execute the if code block. In this case, to print two values.
c:\home\diveintopython3> c:\python30\python.exe humansize.py