diff --git a/htmlminimizer.py b/htmlminimizer.py index 670b556..4c90793 100644 --- a/htmlminimizer.py +++ b/htmlminimizer.py @@ -1,18 +1,31 @@ """Quick-and-dirty HTML minimizer""" -import sys +import sys, re, html.entities input_file = sys.argv[1] output_file = sys.argv[2] in_pre = False -out = open(output_file, 'w') +out = open(output_file, 'w', encoding="utf-8") # encoding argument! important! for line in open(input_file).readlines(): + # replace entities with Unicode characters + for e in re.findall('&(.+?);', line): + n = html.entities.name2codepoint.get(e) + if not n: + if e.count('#x'): + # it's late, forgive me + n = eval(e.replace('#', '0')) + elif e.count('#'): + n = int(e.replace('#', '')) + else: + continue + line = line.replace('&' + e + ';', chr(n)) + + # strip leading and trailing whitespace, except inside
 blocks
     g = line.strip()
     if g.count(' then 
-        # on the same line, so don't do that
+        # XXX this will break if you have <pre> and </pre> in one line
         in_pre = False
         g = line.rstrip()
     if in_pre:
@@ -20,365 +33,3 @@ for line in open(input_file).readlines():
     else:
         out.write(g)
 out.close()
-
-out = open(output_file)
-html = out.read()
-out.close()
-html = html.replace("å", "å")
-html = html.replace(">", ">")
-html = html.replace(">", ">")
-html = html.replace("⊃", "⊃")
-html = html.replace("⊃", "⊃")
-html = html.replace("Ñ", "Ñ")
-html = html.replace("ϒ", "ϒ")
-html = html.replace("ϒ", "ϒ")
-html = html.replace("Ý", "Ý")
-html = html.replace("Ã", "Ã")
-html = html.replace("√", "√")
-html = html.replace("⊗", "⊗")
-html = html.replace("⊗", "⊗")
-html = html.replace("æ", "æ")
-html = html.replace("Ψ", "Ψ")
-html = html.replace("Ψ", "Ψ")
-html = html.replace("Ε", "Ε")
-html = html.replace("Ε", "Ε")
-html = html.replace("Î", "Î")
-html = html.replace("É", "É")
-html = html.replace("Λ", "Λ")
-html = html.replace("Λ", "Λ")
-html = html.replace("″", "″")
-html = html.replace("Κ", "Κ")
-html = html.replace("Κ", "Κ")
-html = html.replace("ς", "ς")
-html = html.replace("ς", "ς")
-html = html.replace("‎", "‎")
-html = html.replace("‎", "‎")
-html = html.replace("¸", "¸")
-html = html.replace(" ", " ")
-html = html.replace(" ", " ")
-html = html.replace("Æ", "Æ")
-html = html.replace("′", "′")
-html = html.replace("Τ", "Τ")
-html = html.replace("Τ", "Τ")
-html = html.replace("⌈", "⌈")
-html = html.replace("⇓", "⇓")
-html = html.replace("⇓", "⇓")
-html = html.replace("≥", "≥")
-html = html.replace("≥", "≥")
-html = html.replace("⋅", "⋅")
-html = html.replace("⋅", "⋅")
-html = html.replace("⌊", "⌊")
-html = html.replace("⌊", "⌊")
-html = html.replace("⇐", "⇐")
-html = html.replace("⇐", "⇐")
-html = html.replace("¦", "¦")
-html = html.replace("Õ", "Õ")
-html = html.replace("Θ", "Θ")
-html = html.replace("Θ", "Θ")
-html = html.replace("Π", "Π")
-html = html.replace("Π", "Π")
-html = html.replace("Œ", "Œ")
-html = html.replace("Œ", "Œ")
-html = html.replace("Š", "Š")
-html = html.replace("Š", "Š")
-html = html.replace("è", "è")
-html = html.replace("⊂", "⊂")
-html = html.replace("⊂", "⊂")
-html = html.replace("¡", "¡")
-html = html.replace("∑", "∑")
-html = html.replace("∑", "∑")
-html = html.replace("ñ", "ñ")
-html = html.replace("ã", "ã")
-html = html.replace("θ", "θ")
-html = html.replace("θ", "θ")
-html = html.replace("⊄", "⊄")
-html = html.replace("⊄", "⊄")
-html = html.replace("⇔", "⇔")
-html = html.replace("⇔", "⇔")
-html = html.replace("Ø", "Ø")
-html = html.replace("Þ", "Þ")
-html = html.replace("Μ", "Μ")
-html = html.replace("Μ", "Μ")
-html = html.replace(" ", " ")
-html = html.replace(" ", " ")
-html = html.replace("ê", "ê")
-html = html.replace("„", "„")
-html = html.replace("Å", "Å")
-html = html.replace("∇", "∇")
-html = html.replace("‰", "‰")
-html = html.replace("‰", "‰")
-html = html.replace("Ù", "Ù")
-html = html.replace("η", "η")
-html = html.replace("η", "η")
-html = html.replace("À", "À")
-html = html.replace("∀", "∀")
-html = html.replace("∀", "∀")
-html = html.replace("ð", "ð")
-html = html.replace("ð", "ð")
-html = html.replace("⌉", "⌉")
-html = html.replace("È", "È")
-html = html.replace("÷", "÷")
-html = html.replace("ì", "ì")
-html = html.replace("õ", "õ")
-html = html.replace("£", "£")
-html = html.replace("⁄", "⁄")
-html = html.replace("Ð", "Ð")
-html = html.replace("Ð", "Ð")
-html = html.replace("∗", "∗")
-html = html.replace("∗", "∗")
-html = html.replace("χ", "χ")
-html = html.replace("χ", "χ")
-html = html.replace("Á", "Á")
-html = html.replace("Β", "Β")
-html = html.replace("⊥", "⊥")
-html = html.replace("⊥", "⊥")
-html = html.replace("∴", "∴")
-html = html.replace("∴", "∴")
-html = html.replace("π", "π")
-html = html.replace("π", "π")
-html = html.replace("∅", "∅")
-html = html.replace("∉", "∉")
-html = html.replace("î", "î")
-html = html.replace("•", "•")
-html = html.replace("•", "•")
-html = html.replace("υ", "υ")
-html = html.replace("υ", "υ")
-html = html.replace("Ó", "Ó")
-html = html.replace("κ", "κ")
-html = html.replace("κ", "κ")
-html = html.replace("ç", "ç")
-html = html.replace("∩", "∩")
-html = html.replace("∩", "∩")
-html = html.replace("μ", "μ")
-html = html.replace("μ", "μ")
-html = html.replace("°", "°")
-html = html.replace("°", "°")
-html = html.replace("τ", "τ")
-html = html.replace("τ", "τ")
-html = html.replace(" ", " ")
-html = html.replace(" ", " ")
-html = html.replace("…", "…")
-html = html.replace("…", "…")
-html = html.replace("û", "û")
-html = html.replace("ù", "ù")
-html = html.replace("≅", "≅")
-html = html.replace("≅", "≅")
-html = html.replace("Ι", "Ι")
-html = html.replace(""", """)
-html = html.replace(""", """)
-html = html.replace("→", "→")
-html = html.replace("→", "→")
-html = html.replace("Ρ", "Ρ")
-html = html.replace("Ρ", "Ρ")
-html = html.replace("ú", "ú")
-html = html.replace("â", "â")
-html = html.replace("∼", "∼")
-html = html.replace("∼", "∼")
-html = html.replace("φ", "φ")
-html = html.replace("φ", "φ")
-html = html.replace("♦", "♦")
-html = html.replace("Ç", "Ç")
-html = html.replace("Η", "Η")
-html = html.replace("Η", "Η")
-html = html.replace("Γ", "Γ")
-html = html.replace("Γ", "Γ")
-html = html.replace("€", "€")
-html = html.replace("€", "€")
-html = html.replace("ϑ", "ϑ")
-html = html.replace("ϑ", "ϑ")
-html = html.replace("“", "“")
-html = html.replace("♥", "♥")
-html = html.replace("♥", "♥")
-html = html.replace("ó", "ó")
-html = html.replace("‌", "‌")
-html = html.replace("‌", "‌")
-html = html.replace("¥", "¥")
-html = html.replace("¥", "¥")
-html = html.replace("ò", "ò")
-html = html.replace("Χ", "Χ")
-html = html.replace("Χ", "Χ")
-html = html.replace("™", "™")
-html = html.replace("ξ", "ξ")
-html = html.replace("ξ", "ξ")
-html = html.replace("˜", "˜")
-html = html.replace("˜", "˜")
-html = html.replace("‹", "‹")
-html = html.replace("‹", "‹")
-html = html.replace("œ", "œ")
-html = html.replace("œ", "œ")
-html = html.replace("≡", "≡")
-html = html.replace("≤", "≤")
-html = html.replace("≤", "≤")
-html = html.replace("∪", "∪")
-html = html.replace("∪", "∪")
-html = html.replace("Ÿ", "Ÿ")
-html = html.replace("<", "<")
-html = html.replace("<", "<")
-html = html.replace("Υ", "Υ")
-html = html.replace("Υ", "Υ")
-html = html.replace("–", "–")
-html = html.replace("ý", "ý")
-html = html.replace("ℜ", "ℜ")
-html = html.replace("ℜ", "ℜ")
-html = html.replace("ψ", "ψ")
-html = html.replace("ψ", "ψ")
-html = html.replace("›", "›")
-html = html.replace("›", "›")
-html = html.replace("↓", "↓")
-html = html.replace("↓", "↓")
-html = html.replace("Α", "Α")
-html = html.replace("Α", "Α")
-html = html.replace("¬", "¬")
-html = html.replace("¬", "¬")
-html = html.replace("&", "&")
-html = html.replace("ø", "ø")
-html = html.replace("´", "´")
-html = html.replace("‍", "‍")
-html = html.replace("‍", "‍")
-html = html.replace("«", "«")
-html = html.replace("”", "”")
-html = html.replace("Ì", "Ì")
-html = html.replace("µ", "µ")
-html = html.replace("­", "­")
-html = html.replace("­", "­")
-html = html.replace("⊇", "⊇")
-html = html.replace("⊇", "⊇")
-html = html.replace("ß", "ß")
-html = html.replace("♣", "♣")
-html = html.replace("à", "à")
-html = html.replace("Ô", "Ô")
-html = html.replace("↔", "↔")
-html = html.replace("↔", "↔")
-html = html.replace("←", "←")
-html = html.replace("←", "←")
-html = html.replace("½", "½")
-html = html.replace("∝", "∝")
-html = html.replace("∝", "∝")
-html = html.replace("ˆ", "ˆ")
-html = html.replace("ô", "ô")
-html = html.replace("≈", "≈")
-html = html.replace("¨", "¨")
-html = html.replace("¨", "¨")
-html = html.replace("∏", "∏")
-html = html.replace("∏", "∏")
-html = html.replace("®", "®")
-html = html.replace("®", "®")
-html = html.replace("‏", "‏")
-html = html.replace("‏", "‏")
-html = html.replace("∞", "∞")
-html = html.replace("Σ", "Σ")
-html = html.replace("Σ", "Σ")
-html = html.replace("—", "—")
-html = html.replace("↑", "↑")
-html = html.replace("↑", "↑")
-html = html.replace("×", "×")
-html = html.replace("⇒", "⇒")
-html = html.replace("⇒", "⇒")
-html = html.replace("∨", "∨")
-html = html.replace("∨", "∨")
-html = html.replace("γ", "γ")
-html = html.replace("γ", "γ")
-html = html.replace("λ", "λ")
-html = html.replace("λ", "λ")
-html = html.replace("〉", "⟩")
-html = html.replace("〉", "⟩")
-html = html.replace("†", "†")
-html = html.replace("†", "†")
-html = html.replace("ℑ", "ℑ")
-html = html.replace("ℵ", "ℵ")
-html = html.replace("ℵ", "ℵ")
-html = html.replace("⊆", "⊆")
-html = html.replace("⊆", "⊆")
-html = html.replace("α", "α")
-html = html.replace("α", "α")
-html = html.replace("Ν", "Ν")
-html = html.replace("Ν", "Ν")
-html = html.replace("±", "±")
-html = html.replace("¾", "¾")
-html = html.replace("‾", "‾")
-html = html.replace("Δ", "Δ")
-html = html.replace("Δ", "Δ")
-html = html.replace("◊", "◊")
-html = html.replace("◊", "◊")
-html = html.replace("ι", "ι")
-html = html.replace("í", "í")
-html = html.replace("ε", "ε")
-html = html.replace("ε", "ε")
-html = html.replace("℘", "℘")
-html = html.replace("℘", "℘")
-html = html.replace("∂", "∂")
-html = html.replace("∂", "∂")
-html = html.replace("δ", "δ")
-html = html.replace("δ", "δ")
-html = html.replace("ο", "ο")
-html = html.replace("ο", "ο")
-html = html.replace("Ξ", "Ξ")
-html = html.replace("Ξ", "Ξ")
-html = html.replace("‡", "‡")
-html = html.replace("‡", "‡")
-html = html.replace("Ò", "Ò")
-html = html.replace("Û", "Û")
-html = html.replace("š", "š")
-html = html.replace("š", "š")
-html = html.replace("‘", "‘")
-html = html.replace("∈", "∈")
-html = html.replace("∈", "∈")
-html = html.replace("Ζ", "Ζ")
-html = html.replace("−", "−")
-html = html.replace("∧", "∧")
-html = html.replace("∧", "∧")
-html = html.replace("∠", "∠")
-html = html.replace("∠", "∠")
-html = html.replace("¤", "¤")
-html = html.replace("∫", "∫")
-html = html.replace("∫", "∫")
-html = html.replace("⌋", "⌋")
-html = html.replace("⌋", "⌋")
-html = html.replace("↵", "↵")
-html = html.replace("∃", "∃")
-html = html.replace("⊕", "⊕")
-html = html.replace("Â", "Â")
-html = html.replace("ϖ", "ϖ")
-html = html.replace("ϖ", "ϖ")
-html = html.replace("∋", "∋")
-html = html.replace("∋", "∋")
-html = html.replace("Φ", "Φ")
-html = html.replace("Φ", "Φ")
-html = html.replace("Í", "Í")
-html = html.replace("Ú", "Ú")
-html = html.replace("Ο", "Ο")
-html = html.replace("Ο", "Ο")
-html = html.replace("≠", "≠")
-html = html.replace("≠", "≠")
-html = html.replace("¿", "¿")
-html = html.replace("‚", "‚")
-html = html.replace("Ê", "Ê")
-html = html.replace("ζ", "ζ")
-html = html.replace("Ω", "Ω")
-html = html.replace("Ω", "Ω")
-html = html.replace("ν", "ν")
-html = html.replace("ν", "ν")
-html = html.replace("¼", "¼")
-html = html.replace("á", "á")
-html = html.replace("⇑", "⇑")
-html = html.replace("⇑", "⇑")
-html = html.replace("β", "β")
-html = html.replace("ƒ", "ƒ")
-html = html.replace("ρ", "ρ")
-html = html.replace("ρ", "ρ")
-html = html.replace("é", "é")
-html = html.replace("ω", "ω")
-html = html.replace("ω", "ω")
-html = html.replace("·", "·")
-html = html.replace("〈", "⟨")
-html = html.replace("〈", "⟨")
-html = html.replace("♠", "♠")
-html = html.replace("♠", "♠")
-html = html.replace("’", "’")
-html = html.replace("þ", "þ")
-html = html.replace("»", "»")
-html = html.replace("σ", "σ")
-html = html.replace("σ", "σ")
-out = open(output_file, 'w')
-out.write(html)
-out.close()
diff --git a/publish b/publish
index bc41444..29e2904 100755
--- a/publish
+++ b/publish
@@ -7,7 +7,7 @@ cp robots.txt *.js *.css build/
 rm -f examples/*.pyc
 cp -R examples build/
 
-# minimize HTML (note: this script is quite fragile and relies on knowledge of how I write HTML)
+# minimize HTML (XXX this script is quite fragile and relies on knowledge of how I write HTML)
 for f in *.html; do
   python htmlminimizer.py "$f" build/"$f"
 done
@@ -41,7 +41,7 @@ sed -i -e "s|Parsing XML
 
-

Python comes with an efficient XML parsing library called Etree. +

Python can parse XML documents in several ways. It has traditional DOM and SAX parsers, but I will focus on a different library called Etree.

[download feed.xml]

->>> import xml.etree.ElementTree as etree
->>> tree = etree.parse("examples/feed.xml")
->>> root = tree.getroot()
->>> root
-<Element {http://www.w3.org/2005/Atom}feed at cd1eb0>
-
+>>> import xml.etree.ElementTree as etree +>>> tree = etree.parse("examples/feed.xml") +>>> root = tree.getroot() +>>> root +<Element {http://www.w3.org/2005/Atom}feed at cd1eb0>
+
    +
  1. The Etree library is part of the Python standard library, in xml.etree.ElementTree. +
  2. The primary entry point for the Etree library is the parse() function, which can take a filename or a file-like object [FIXME xref]. This function parses the entire document at once. If memory is tight, there are ways to parse an XML document incrementally instead. +
  3. The parse() function returns an object which represents the entire document. This is not the root element. To get a reference to the root element, call the getroot() method. +
  4. As expected, the root element is the feed element in the http://www.w3.org/2005/Atom namespace. The string representation of this object reinforces an important point: an XML element is a combination of its namespace and its tag name (also called the local name). Every element in this document is in the Atom namespace, so the root element is represented as {http://www.w3.org/2005/Atom}feed. +
+ +
+

Etree represents XML elements as {namespace}localname. You’ll see and use this format in multiple places in the Etree library. +

Elements Are Lists

-

FIXME +

In Etree, an element acts like a list. The items of the list are the element’s children.

 >>> root.tag