mirror of
https://github.com/kennethreitz/dive-into-python3.git
synced 2026-06-05 23:10:17 +00:00
new section in xml chapter, also entities-to-Unicode-characters in build script
This commit is contained in:
+17
-366
@@ -1,18 +1,31 @@
|
||||
"""Quick-and-dirty HTML minimizer"""
|
||||
|
||||
import sys
|
||||
import sys, re, html.entities
|
||||
|
||||
input_file = sys.argv[1]
|
||||
output_file = sys.argv[2]
|
||||
in_pre = False
|
||||
out = open(output_file, 'w')
|
||||
out = open(output_file, 'w', encoding="utf-8") # encoding argument! important!
|
||||
for line in open(input_file).readlines():
|
||||
# replace entities with Unicode characters
|
||||
for e in re.findall('&(.+?);', line):
|
||||
n = html.entities.name2codepoint.get(e)
|
||||
if not n:
|
||||
if e.count('#x'):
|
||||
# it's late, forgive me
|
||||
n = eval(e.replace('#', '0'))
|
||||
elif e.count('#'):
|
||||
n = int(e.replace('#', ''))
|
||||
else:
|
||||
continue
|
||||
line = line.replace('&' + e + ';', chr(n))
|
||||
|
||||
# strip leading and trailing whitespace, except inside <pre> blocks
|
||||
g = line.strip()
|
||||
if g.count('<pre'):
|
||||
in_pre = True
|
||||
if g.count('</pre'):
|
||||
# this will break if you have a </pre> then <pre>
|
||||
# on the same line, so don't do that
|
||||
# XXX this will break if you have </pre><pre> in one line
|
||||
in_pre = False
|
||||
g = line.rstrip()
|
||||
if in_pre:
|
||||
@@ -20,365 +33,3 @@ for line in open(input_file).readlines():
|
||||
else:
|
||||
out.write(g)
|
||||
out.close()
|
||||
|
||||
out = open(output_file)
|
||||
html = out.read()
|
||||
out.close()
|
||||
html = html.replace("å", "å")
|
||||
html = html.replace(">", ">")
|
||||
html = html.replace(">", ">")
|
||||
html = html.replace("⊃", "⊃")
|
||||
html = html.replace("⊃", "⊃")
|
||||
html = html.replace("Ñ", "Ñ")
|
||||
html = html.replace("ϒ", "ϒ")
|
||||
html = html.replace("ϒ", "ϒ")
|
||||
html = html.replace("Ý", "Ý")
|
||||
html = html.replace("Ã", "Ã")
|
||||
html = html.replace("√", "√")
|
||||
html = html.replace("⊗", "⊗")
|
||||
html = html.replace("⊗", "⊗")
|
||||
html = html.replace("æ", "æ")
|
||||
html = html.replace("Ψ", "Ψ")
|
||||
html = html.replace("Ψ", "Ψ")
|
||||
html = html.replace("Ε", "Ε")
|
||||
html = html.replace("Ε", "Ε")
|
||||
html = html.replace("Î", "Î")
|
||||
html = html.replace("É", "É")
|
||||
html = html.replace("Λ", "Λ")
|
||||
html = html.replace("Λ", "Λ")
|
||||
html = html.replace("″", "″")
|
||||
html = html.replace("Κ", "Κ")
|
||||
html = html.replace("Κ", "Κ")
|
||||
html = html.replace("ς", "ς")
|
||||
html = html.replace("ς", "ς")
|
||||
html = html.replace("‎", "‎")
|
||||
html = html.replace("‎", "‎")
|
||||
html = html.replace("¸", "¸")
|
||||
html = html.replace(" ", " ")
|
||||
html = html.replace(" ", " ")
|
||||
html = html.replace("Æ", "Æ")
|
||||
html = html.replace("′", "′")
|
||||
html = html.replace("Τ", "Τ")
|
||||
html = html.replace("Τ", "Τ")
|
||||
html = html.replace("⌈", "⌈")
|
||||
html = html.replace("⇓", "⇓")
|
||||
html = html.replace("⇓", "⇓")
|
||||
html = html.replace("≥", "≥")
|
||||
html = html.replace("≥", "≥")
|
||||
html = html.replace("⋅", "⋅")
|
||||
html = html.replace("⋅", "⋅")
|
||||
html = html.replace("⌊", "⌊")
|
||||
html = html.replace("⌊", "⌊")
|
||||
html = html.replace("⇐", "⇐")
|
||||
html = html.replace("⇐", "⇐")
|
||||
html = html.replace("¦", "¦")
|
||||
html = html.replace("Õ", "Õ")
|
||||
html = html.replace("Θ", "Θ")
|
||||
html = html.replace("Θ", "Θ")
|
||||
html = html.replace("Π", "Π")
|
||||
html = html.replace("Π", "Π")
|
||||
html = html.replace("Œ", "Œ")
|
||||
html = html.replace("Œ", "Œ")
|
||||
html = html.replace("Š", "Š")
|
||||
html = html.replace("Š", "Š")
|
||||
html = html.replace("è", "è")
|
||||
html = html.replace("⊂", "⊂")
|
||||
html = html.replace("⊂", "⊂")
|
||||
html = html.replace("¡", "¡")
|
||||
html = html.replace("∑", "∑")
|
||||
html = html.replace("∑", "∑")
|
||||
html = html.replace("ñ", "ñ")
|
||||
html = html.replace("ã", "ã")
|
||||
html = html.replace("θ", "θ")
|
||||
html = html.replace("θ", "θ")
|
||||
html = html.replace("⊄", "⊄")
|
||||
html = html.replace("⊄", "⊄")
|
||||
html = html.replace("⇔", "⇔")
|
||||
html = html.replace("⇔", "⇔")
|
||||
html = html.replace("Ø", "Ø")
|
||||
html = html.replace("Þ", "Þ")
|
||||
html = html.replace("Μ", "Μ")
|
||||
html = html.replace("Μ", "Μ")
|
||||
html = html.replace(" ", " ")
|
||||
html = html.replace(" ", " ")
|
||||
html = html.replace("ê", "ê")
|
||||
html = html.replace("„", "„")
|
||||
html = html.replace("Å", "Å")
|
||||
html = html.replace("∇", "∇")
|
||||
html = html.replace("‰", "‰")
|
||||
html = html.replace("‰", "‰")
|
||||
html = html.replace("Ù", "Ù")
|
||||
html = html.replace("η", "η")
|
||||
html = html.replace("η", "η")
|
||||
html = html.replace("À", "À")
|
||||
html = html.replace("∀", "∀")
|
||||
html = html.replace("∀", "∀")
|
||||
html = html.replace("ð", "ð")
|
||||
html = html.replace("ð", "ð")
|
||||
html = html.replace("⌉", "⌉")
|
||||
html = html.replace("È", "È")
|
||||
html = html.replace("÷", "÷")
|
||||
html = html.replace("ì", "ì")
|
||||
html = html.replace("õ", "õ")
|
||||
html = html.replace("£", "£")
|
||||
html = html.replace("⁄", "⁄")
|
||||
html = html.replace("Ð", "Ð")
|
||||
html = html.replace("Ð", "Ð")
|
||||
html = html.replace("∗", "∗")
|
||||
html = html.replace("∗", "∗")
|
||||
html = html.replace("χ", "χ")
|
||||
html = html.replace("χ", "χ")
|
||||
html = html.replace("Á", "Á")
|
||||
html = html.replace("Β", "Β")
|
||||
html = html.replace("⊥", "⊥")
|
||||
html = html.replace("⊥", "⊥")
|
||||
html = html.replace("∴", "∴")
|
||||
html = html.replace("∴", "∴")
|
||||
html = html.replace("π", "π")
|
||||
html = html.replace("π", "π")
|
||||
html = html.replace("∅", "∅")
|
||||
html = html.replace("∉", "∉")
|
||||
html = html.replace("î", "î")
|
||||
html = html.replace("•", "•")
|
||||
html = html.replace("•", "•")
|
||||
html = html.replace("υ", "υ")
|
||||
html = html.replace("υ", "υ")
|
||||
html = html.replace("Ó", "Ó")
|
||||
html = html.replace("κ", "κ")
|
||||
html = html.replace("κ", "κ")
|
||||
html = html.replace("ç", "ç")
|
||||
html = html.replace("∩", "∩")
|
||||
html = html.replace("∩", "∩")
|
||||
html = html.replace("μ", "μ")
|
||||
html = html.replace("μ", "μ")
|
||||
html = html.replace("°", "°")
|
||||
html = html.replace("°", "°")
|
||||
html = html.replace("τ", "τ")
|
||||
html = html.replace("τ", "τ")
|
||||
html = html.replace(" ", " ")
|
||||
html = html.replace(" ", " ")
|
||||
html = html.replace("…", "…")
|
||||
html = html.replace("…", "…")
|
||||
html = html.replace("û", "û")
|
||||
html = html.replace("ù", "ù")
|
||||
html = html.replace("≅", "≅")
|
||||
html = html.replace("≅", "≅")
|
||||
html = html.replace("Ι", "Ι")
|
||||
html = html.replace(""", """)
|
||||
html = html.replace(""", """)
|
||||
html = html.replace("→", "→")
|
||||
html = html.replace("→", "→")
|
||||
html = html.replace("Ρ", "Ρ")
|
||||
html = html.replace("Ρ", "Ρ")
|
||||
html = html.replace("ú", "ú")
|
||||
html = html.replace("â", "â")
|
||||
html = html.replace("∼", "∼")
|
||||
html = html.replace("∼", "∼")
|
||||
html = html.replace("φ", "φ")
|
||||
html = html.replace("φ", "φ")
|
||||
html = html.replace("♦", "♦")
|
||||
html = html.replace("Ç", "Ç")
|
||||
html = html.replace("Η", "Η")
|
||||
html = html.replace("Η", "Η")
|
||||
html = html.replace("Γ", "Γ")
|
||||
html = html.replace("Γ", "Γ")
|
||||
html = html.replace("€", "€")
|
||||
html = html.replace("€", "€")
|
||||
html = html.replace("ϑ", "ϑ")
|
||||
html = html.replace("ϑ", "ϑ")
|
||||
html = html.replace("“", "“")
|
||||
html = html.replace("♥", "♥")
|
||||
html = html.replace("♥", "♥")
|
||||
html = html.replace("ó", "ó")
|
||||
html = html.replace("‌", "‌")
|
||||
html = html.replace("‌", "‌")
|
||||
html = html.replace("¥", "¥")
|
||||
html = html.replace("¥", "¥")
|
||||
html = html.replace("ò", "ò")
|
||||
html = html.replace("Χ", "Χ")
|
||||
html = html.replace("Χ", "Χ")
|
||||
html = html.replace("™", "™")
|
||||
html = html.replace("ξ", "ξ")
|
||||
html = html.replace("ξ", "ξ")
|
||||
html = html.replace("˜", "˜")
|
||||
html = html.replace("˜", "˜")
|
||||
html = html.replace("‹", "‹")
|
||||
html = html.replace("‹", "‹")
|
||||
html = html.replace("œ", "œ")
|
||||
html = html.replace("œ", "œ")
|
||||
html = html.replace("≡", "≡")
|
||||
html = html.replace("≤", "≤")
|
||||
html = html.replace("≤", "≤")
|
||||
html = html.replace("∪", "∪")
|
||||
html = html.replace("∪", "∪")
|
||||
html = html.replace("Ÿ", "Ÿ")
|
||||
html = html.replace("<", "<")
|
||||
html = html.replace("<", "<")
|
||||
html = html.replace("Υ", "Υ")
|
||||
html = html.replace("Υ", "Υ")
|
||||
html = html.replace("–", "–")
|
||||
html = html.replace("ý", "ý")
|
||||
html = html.replace("ℜ", "ℜ")
|
||||
html = html.replace("ℜ", "ℜ")
|
||||
html = html.replace("ψ", "ψ")
|
||||
html = html.replace("ψ", "ψ")
|
||||
html = html.replace("›", "›")
|
||||
html = html.replace("›", "›")
|
||||
html = html.replace("↓", "↓")
|
||||
html = html.replace("↓", "↓")
|
||||
html = html.replace("Α", "Α")
|
||||
html = html.replace("Α", "Α")
|
||||
html = html.replace("¬", "¬")
|
||||
html = html.replace("¬", "¬")
|
||||
html = html.replace("&", "&")
|
||||
html = html.replace("ø", "ø")
|
||||
html = html.replace("´", "´")
|
||||
html = html.replace("‍", "‍")
|
||||
html = html.replace("‍", "‍")
|
||||
html = html.replace("«", "«")
|
||||
html = html.replace("”", "”")
|
||||
html = html.replace("Ì", "Ì")
|
||||
html = html.replace("µ", "µ")
|
||||
html = html.replace("­", "­")
|
||||
html = html.replace("­", "­")
|
||||
html = html.replace("⊇", "⊇")
|
||||
html = html.replace("⊇", "⊇")
|
||||
html = html.replace("ß", "ß")
|
||||
html = html.replace("♣", "♣")
|
||||
html = html.replace("à", "à")
|
||||
html = html.replace("Ô", "Ô")
|
||||
html = html.replace("↔", "↔")
|
||||
html = html.replace("↔", "↔")
|
||||
html = html.replace("←", "←")
|
||||
html = html.replace("←", "←")
|
||||
html = html.replace("½", "½")
|
||||
html = html.replace("∝", "∝")
|
||||
html = html.replace("∝", "∝")
|
||||
html = html.replace("ˆ", "ˆ")
|
||||
html = html.replace("ô", "ô")
|
||||
html = html.replace("≈", "≈")
|
||||
html = html.replace("¨", "¨")
|
||||
html = html.replace("¨", "¨")
|
||||
html = html.replace("∏", "∏")
|
||||
html = html.replace("∏", "∏")
|
||||
html = html.replace("®", "®")
|
||||
html = html.replace("®", "®")
|
||||
html = html.replace("‏", "‏")
|
||||
html = html.replace("‏", "‏")
|
||||
html = html.replace("∞", "∞")
|
||||
html = html.replace("Σ", "Σ")
|
||||
html = html.replace("Σ", "Σ")
|
||||
html = html.replace("—", "—")
|
||||
html = html.replace("↑", "↑")
|
||||
html = html.replace("↑", "↑")
|
||||
html = html.replace("×", "×")
|
||||
html = html.replace("⇒", "⇒")
|
||||
html = html.replace("⇒", "⇒")
|
||||
html = html.replace("∨", "∨")
|
||||
html = html.replace("∨", "∨")
|
||||
html = html.replace("γ", "γ")
|
||||
html = html.replace("γ", "γ")
|
||||
html = html.replace("λ", "λ")
|
||||
html = html.replace("λ", "λ")
|
||||
html = html.replace("〉", "⟩")
|
||||
html = html.replace("〉", "⟩")
|
||||
html = html.replace("†", "†")
|
||||
html = html.replace("†", "†")
|
||||
html = html.replace("ℑ", "ℑ")
|
||||
html = html.replace("ℵ", "ℵ")
|
||||
html = html.replace("ℵ", "ℵ")
|
||||
html = html.replace("⊆", "⊆")
|
||||
html = html.replace("⊆", "⊆")
|
||||
html = html.replace("α", "α")
|
||||
html = html.replace("α", "α")
|
||||
html = html.replace("Ν", "Ν")
|
||||
html = html.replace("Ν", "Ν")
|
||||
html = html.replace("±", "±")
|
||||
html = html.replace("¾", "¾")
|
||||
html = html.replace("‾", "‾")
|
||||
html = html.replace("Δ", "Δ")
|
||||
html = html.replace("Δ", "Δ")
|
||||
html = html.replace("◊", "◊")
|
||||
html = html.replace("◊", "◊")
|
||||
html = html.replace("ι", "ι")
|
||||
html = html.replace("í", "í")
|
||||
html = html.replace("ε", "ε")
|
||||
html = html.replace("ε", "ε")
|
||||
html = html.replace("℘", "℘")
|
||||
html = html.replace("℘", "℘")
|
||||
html = html.replace("∂", "∂")
|
||||
html = html.replace("∂", "∂")
|
||||
html = html.replace("δ", "δ")
|
||||
html = html.replace("δ", "δ")
|
||||
html = html.replace("ο", "ο")
|
||||
html = html.replace("ο", "ο")
|
||||
html = html.replace("Ξ", "Ξ")
|
||||
html = html.replace("Ξ", "Ξ")
|
||||
html = html.replace("‡", "‡")
|
||||
html = html.replace("‡", "‡")
|
||||
html = html.replace("Ò", "Ò")
|
||||
html = html.replace("Û", "Û")
|
||||
html = html.replace("š", "š")
|
||||
html = html.replace("š", "š")
|
||||
html = html.replace("‘", "‘")
|
||||
html = html.replace("∈", "∈")
|
||||
html = html.replace("∈", "∈")
|
||||
html = html.replace("Ζ", "Ζ")
|
||||
html = html.replace("−", "−")
|
||||
html = html.replace("∧", "∧")
|
||||
html = html.replace("∧", "∧")
|
||||
html = html.replace("∠", "∠")
|
||||
html = html.replace("∠", "∠")
|
||||
html = html.replace("¤", "¤")
|
||||
html = html.replace("∫", "∫")
|
||||
html = html.replace("∫", "∫")
|
||||
html = html.replace("⌋", "⌋")
|
||||
html = html.replace("⌋", "⌋")
|
||||
html = html.replace("↵", "↵")
|
||||
html = html.replace("∃", "∃")
|
||||
html = html.replace("⊕", "⊕")
|
||||
html = html.replace("Â", "Â")
|
||||
html = html.replace("ϖ", "ϖ")
|
||||
html = html.replace("ϖ", "ϖ")
|
||||
html = html.replace("∋", "∋")
|
||||
html = html.replace("∋", "∋")
|
||||
html = html.replace("Φ", "Φ")
|
||||
html = html.replace("Φ", "Φ")
|
||||
html = html.replace("Í", "Í")
|
||||
html = html.replace("Ú", "Ú")
|
||||
html = html.replace("Ο", "Ο")
|
||||
html = html.replace("Ο", "Ο")
|
||||
html = html.replace("≠", "≠")
|
||||
html = html.replace("≠", "≠")
|
||||
html = html.replace("¿", "¿")
|
||||
html = html.replace("‚", "‚")
|
||||
html = html.replace("Ê", "Ê")
|
||||
html = html.replace("ζ", "ζ")
|
||||
html = html.replace("Ω", "Ω")
|
||||
html = html.replace("Ω", "Ω")
|
||||
html = html.replace("ν", "ν")
|
||||
html = html.replace("ν", "ν")
|
||||
html = html.replace("¼", "¼")
|
||||
html = html.replace("á", "á")
|
||||
html = html.replace("⇑", "⇑")
|
||||
html = html.replace("⇑", "⇑")
|
||||
html = html.replace("β", "β")
|
||||
html = html.replace("ƒ", "ƒ")
|
||||
html = html.replace("ρ", "ρ")
|
||||
html = html.replace("ρ", "ρ")
|
||||
html = html.replace("é", "é")
|
||||
html = html.replace("ω", "ω")
|
||||
html = html.replace("ω", "ω")
|
||||
html = html.replace("·", "·")
|
||||
html = html.replace("〈", "⟨")
|
||||
html = html.replace("〈", "⟨")
|
||||
html = html.replace("♠", "♠")
|
||||
html = html.replace("♠", "♠")
|
||||
html = html.replace("’", "’")
|
||||
html = html.replace("þ", "þ")
|
||||
html = html.replace("»", "»")
|
||||
html = html.replace("σ", "σ")
|
||||
html = html.replace("σ", "σ")
|
||||
out = open(output_file, 'w')
|
||||
out.write(html)
|
||||
out.close()
|
||||
|
||||
@@ -7,7 +7,7 @@ cp robots.txt *.js *.css build/
|
||||
rm -f examples/*.pyc
|
||||
cp -R examples build/
|
||||
|
||||
# minimize HTML (note: this script is quite fragile and relies on knowledge of how I write HTML)
|
||||
# minimize HTML (XXX this script is quite fragile and relies on knowledge of how I write HTML)
|
||||
for f in *.html; do
|
||||
python htmlminimizer.py "$f" build/"$f"
|
||||
done
|
||||
@@ -41,7 +41,7 @@ sed -i -e "s|<link rel=stylesheet type=text/css media='only screen and (max-devi
|
||||
sed -i -e "s|dip3\.js|http://wearehugh.com/dip3/${revision}.js|g" build/*.html
|
||||
sed -i -e "s|html5\.js|http://wearehugh.com/dip3/html5.js|g" build/*.html
|
||||
|
||||
# images will be served from a separate domain
|
||||
# images would be served from a separate domain if we had any, which we currently don't
|
||||
#sed -i -e "s|bsb.png|http://wearehugh.com/dip3/bsb.png|g" build/*.html
|
||||
|
||||
# minimize URLs
|
||||
|
||||
@@ -244,20 +244,29 @@ mark{display:inline}
|
||||
|
||||
<h2 id=xml-parse>Parsing XML</h2>
|
||||
|
||||
<p>Python comes with an efficient XML parsing library called Etree.
|
||||
<p>Python can parse XML documents in several ways. It has traditional <a href=http://en.wikipedia.org/wiki/XML#DOM>DOM</a> and <a href=http://en.wikipedia.org/wiki/Simple_API_for_XML>SAX</a> parsers, but I will focus on a different library called Etree.
|
||||
|
||||
<p class=d>[<a href=examples/feed.xml>download <code>feed.xml</code></a>]
|
||||
<pre class=screen>
|
||||
>>> import xml.etree.ElementTree as etree
|
||||
>>> tree = etree.parse("examples/feed.xml")
|
||||
>>> root = tree.getroot()
|
||||
>>> root
|
||||
&lt;Element {http://www.w3.org/2005/Atom}feed at cd1eb0>
|
||||
</pre>
|
||||
<a><samp class=p>>>> </samp><kbd>import xml.etree.ElementTree as etree</kbd> <span>①</span></a>
|
||||
<a><samp class=p>>>> </samp><kbd>tree = etree.parse("examples/feed.xml")</kbd> <span>②</span></a>
|
||||
<a><samp class=p>>>> </samp><kbd>root = tree.getroot()</kbd> <span>③</span></a>
|
||||
<a><samp class=p>>>> </samp><kbd>root</kbd> <span>④</span></a>
|
||||
<samp>&lt;Element {http://www.w3.org/2005/Atom}feed at cd1eb0></samp></pre>
|
||||
<ol>
|
||||
<li>The Etree library is part of the Python standard library, in <code>xml.etree.ElementTree</code>.
|
||||
<li>The primary entry point for the Etree library is the <code>parse()</code> function, which can take a filename or a file-like object [FIXME xref]. This function parses the entire document at once. If memory is tight, there are ways to parse an XML document incrementally instead.
|
||||
<li>The <code>parse()</code> function returns an object which represents the entire document. This is <em>not</em> the root element. To get a reference to the root element, call the <code>getroot()</code> method.
|
||||
<li>As expected, the root element is the <code>feed</code> element in the <code>http://www.w3.org/2005/Atom</code> namespace. The string representation of this object reinforces an important point: an XML element is a combination of its namespace and its tag name (also called the <i>local name</i>). Every element in this document is in the Atom namespace, so the root element is represented as <code>{http://www.w3.org/2005/Atom}feed</code>.
|
||||
</ol>
|
||||
|
||||
<blockquote class=note>
|
||||
<p><span>☞</span>Etree represents XML elements as <code>{<var>namespace</var>}<var>localname</var></code>. You’ll see and use this format in multiple places in the Etree library.
|
||||
</blockquote>
|
||||
|
||||
<h3 id=xml-elements>Elements Are Lists</h3>
|
||||
|
||||
<p>FIXME
|
||||
<p>In Etree, an element acts like a list. The items of the list are the element’s children.
|
||||
|
||||
<pre class=screen>
|
||||
>>> root.tag
|
||||
|
||||
Reference in New Issue
Block a user