new section in xml chapter, also entities-to-Unicode-characters in build script

This commit is contained in:
Mark Pilgrim
2009-05-20 11:05:07 -04:00
parent ecb8cf0fee
commit 61a84f9b5b
3 changed files with 36 additions and 376 deletions
+17 -366
View File
@@ -1,18 +1,31 @@
"""Quick-and-dirty HTML minimizer"""
import sys
import sys, re, html.entities
input_file = sys.argv[1]
output_file = sys.argv[2]
in_pre = False
out = open(output_file, 'w')
out = open(output_file, 'w', encoding="utf-8") # encoding argument! important!
for line in open(input_file).readlines():
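# replace entities with Unicode characters
# NOTE(review): name2codepoint also contains lt, gt, amp and quot, so &lt; / &amp; etc. are unescaped too, which can corrupt the markup -- consider skipping those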
for e in re.findall('&(.+?);', line):
n = html.entities.name2codepoint.get(e)
if not n:
if e.count('#x'):
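# it's late, forgive me: '#x3e' becomes '0x3e' and eval() turns that hex literal into an int (int(e[2:], 16) would do the same without eval)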
n = eval(e.replace('#', '0'))
elif e.count('#'):
n = int(e.replace('#', ''))
else:
continue
line = line.replace('&' + e + ';', chr(n))
# strip leading and trailing whitespace, except inside <pre> blocks
g = line.strip()
if g.count('<pre'):
in_pre = True
if g.count('</pre'):
# this will break if you have a </pre> then <pre>
# on the same line, so don't do that
# XXX this will break if you have </pre><pre> in one line
in_pre = False
g = line.rstrip()
if in_pre:
@@ -20,365 +33,3 @@ for line in open(input_file).readlines():
else:
out.write(g)
out.close()
out = open(output_file)
html = out.read()
out.close()
html = html.replace("&aring;", "&#229;")
html = html.replace("&#62;", "&gt;")
html = html.replace("&#x3e;", "&gt;")
html = html.replace("&#8835;", "&sup;")
html = html.replace("&#x2283;", "&sup;")
html = html.replace("&Ntilde;", "&#209;")
html = html.replace("&#x3d2;", "&#978;")
html = html.replace("&upsih;", "&#978;")
html = html.replace("&Yacute;", "&#221;")
html = html.replace("&Atilde;", "&#195;")
html = html.replace("&#x221a;", "&#8730;")
html = html.replace("&#x2297;", "&#8855;")
html = html.replace("&otimes;", "&#8855;")
html = html.replace("&aelig;", "&#230;")
html = html.replace("&#936;", "&Psi;")
html = html.replace("&#x3a8;", "&Psi;")
html = html.replace("&#x395;", "&#917;")
html = html.replace("&Epsilon;", "&#917;")
html = html.replace("&Icirc;", "&#206;")
html = html.replace("&Eacute;", "&#201;")
html = html.replace("&#x39b;", "&#923;")
html = html.replace("&Lambda;", "&#923;")
html = html.replace("&#x2033;", "&#8243;")
html = html.replace("&#x39a;", "&#922;")
html = html.replace("&Kappa;", "&#922;")
html = html.replace("&#x3c2;", "&#962;")
html = html.replace("&sigmaf;", "&#962;")
html = html.replace("&#8206;", "&lrm;")
html = html.replace("&#x200e;", "&lrm;")
html = html.replace("&cedil;", "&#184;")
html = html.replace("&#8194;", "&ensp;")
html = html.replace("&#x2002;", "&ensp;")
html = html.replace("&AElig;", "&#198;")
html = html.replace("&#x2032;", "&#8242;")
html = html.replace("&#932;", "&Tau;")
html = html.replace("&#x3a4;", "&Tau;")
html = html.replace("&#x2308;", "&#8968;")
html = html.replace("&#8659;", "&dArr;")
html = html.replace("&#x21d3;", "&dArr;")
html = html.replace("&#8805;", "&ge;")
html = html.replace("&#x2265;", "&ge;")
html = html.replace("&#8901;", "&sdot;")
html = html.replace("&#x22c5;", "&sdot;")
html = html.replace("&#x230a;", "&#8970;")
html = html.replace("&lfloor;", "&#8970;")
html = html.replace("&#8656;", "&lArr;")
html = html.replace("&#x21d0;", "&lArr;")
html = html.replace("&brvbar;", "&#166;")
html = html.replace("&Otilde;", "&#213;")
html = html.replace("&#x398;", "&#920;")
html = html.replace("&Theta;", "&#920;")
html = html.replace("&#928;", "&Pi;")
html = html.replace("&#x3a0;", "&Pi;")
html = html.replace("&#x152;", "&#338;")
html = html.replace("&OElig;", "&#338;")
html = html.replace("&#x160;", "&#352;")
html = html.replace("&Scaron;", "&#352;")
html = html.replace("&egrave;", "&#232;")
html = html.replace("&#8834;", "&sub;")
html = html.replace("&#x2282;", "&sub;")
html = html.replace("&iexcl;", "&#161;")
html = html.replace("&#8721;", "&sum;")
html = html.replace("&#x2211;", "&sum;")
html = html.replace("&ntilde;", "&#241;")
html = html.replace("&atilde;", "&#227;")
html = html.replace("&#x3b8;", "&#952;")
html = html.replace("&theta;", "&#952;")
html = html.replace("&#8836;", "&nsub;")
html = html.replace("&#x2284;", "&nsub;")
html = html.replace("&#8660;", "&hArr;")
html = html.replace("&#x21d4;", "&hArr;")
html = html.replace("&Oslash;", "&#216;")
html = html.replace("&THORN;", "&#222;")
html = html.replace("&#924;", "&Mu;")
html = html.replace("&#x39c;", "&Mu;")
html = html.replace("&#x2009;", "&#8201;")
html = html.replace("&thinsp;", "&#8201;")
html = html.replace("&ecirc;", "&#234;")
html = html.replace("&#x201e;", "&#8222;")
html = html.replace("&Aring;", "&#197;")
html = html.replace("&#x2207;", "&#8711;")
html = html.replace("&#x2030;", "&#8240;")
html = html.replace("&permil;", "&#8240;")
html = html.replace("&Ugrave;", "&#217;")
html = html.replace("&#951;", "&eta;")
html = html.replace("&#x3b7;", "&eta;")
html = html.replace("&Agrave;", "&#192;")
html = html.replace("&#x2200;", "&#8704;")
html = html.replace("&forall;", "&#8704;")
html = html.replace("&#240;", "&eth;")
html = html.replace("&#xf0;", "&eth;")
html = html.replace("&#x2309;", "&#8969;")
html = html.replace("&Egrave;", "&#200;")
html = html.replace("&divide;", "&#247;")
html = html.replace("&igrave;", "&#236;")
html = html.replace("&otilde;", "&#245;")
html = html.replace("&pound;", "&#163;")
html = html.replace("&#x2044;", "&#8260;")
html = html.replace("&#208;", "&ETH;")
html = html.replace("&#xd0;", "&ETH;")
html = html.replace("&#x2217;", "&#8727;")
html = html.replace("&lowast;", "&#8727;")
html = html.replace("&#967;", "&chi;")
html = html.replace("&#x3c7;", "&chi;")
html = html.replace("&Aacute;", "&#193;")
html = html.replace("&#x392;", "&#914;")
html = html.replace("&#8869;", "&perp;")
html = html.replace("&#x22a5;", "&perp;")
html = html.replace("&#x2234;", "&#8756;")
html = html.replace("&there4;", "&#8756;")
html = html.replace("&#960;", "&pi;")
html = html.replace("&#x3c0;", "&pi;")
html = html.replace("&#x2205;", "&#8709;")
html = html.replace("&#x2209;", "&#8713;")
html = html.replace("&icirc;", "&#238;")
html = html.replace("&#8226;", "&bull;")
html = html.replace("&#x2022;", "&bull;")
html = html.replace("&#x3c5;", "&#965;")
html = html.replace("&upsilon;", "&#965;")
html = html.replace("&Oacute;", "&#211;")
html = html.replace("&#x3ba;", "&#954;")
html = html.replace("&kappa;", "&#954;")
html = html.replace("&ccedil;", "&#231;")
html = html.replace("&#8745;", "&cap;")
html = html.replace("&#x2229;", "&cap;")
html = html.replace("&#956;", "&mu;")
html = html.replace("&#x3bc;", "&mu;")
html = html.replace("&#176;", "&deg;")
html = html.replace("&#xb0;", "&deg;")
html = html.replace("&#964;", "&tau;")
html = html.replace("&#x3c4;", "&tau;")
html = html.replace("&#8195;", "&emsp;")
html = html.replace("&#x2003;", "&emsp;")
html = html.replace("&#x2026;", "&#8230;")
html = html.replace("&hellip;", "&#8230;")
html = html.replace("&ucirc;", "&#251;")
html = html.replace("&ugrave;", "&#249;")
html = html.replace("&#8773;", "&cong;")
html = html.replace("&#x2245;", "&cong;")
html = html.replace("&#x399;", "&#921;")
html = html.replace("&#x22;", "&#34;")
html = html.replace("&quot;", "&#34;")
html = html.replace("&#8594;", "&rarr;")
html = html.replace("&#x2192;", "&rarr;")
html = html.replace("&#929;", "&Rho;")
html = html.replace("&#x3a1;", "&Rho;")
html = html.replace("&uacute;", "&#250;")
html = html.replace("&acirc;", "&#226;")
html = html.replace("&#8764;", "&sim;")
html = html.replace("&#x223c;", "&sim;")
html = html.replace("&#966;", "&phi;")
html = html.replace("&#x3c6;", "&phi;")
html = html.replace("&#x2666;", "&#9830;")
html = html.replace("&Ccedil;", "&#199;")
html = html.replace("&#919;", "&Eta;")
html = html.replace("&#x397;", "&Eta;")
html = html.replace("&#x393;", "&#915;")
html = html.replace("&Gamma;", "&#915;")
html = html.replace("&#8364;", "&euro;")
html = html.replace("&#x20ac;", "&euro;")
html = html.replace("&#x3d1;", "&#977;")
html = html.replace("&thetasym;", "&#977;")
html = html.replace("&#x201c;", "&#8220;")
html = html.replace("&#x2665;", "&#9829;")
html = html.replace("&hearts;", "&#9829;")
html = html.replace("&oacute;", "&#243;")
html = html.replace("&#8204;", "&zwnj;")
html = html.replace("&#x200c;", "&zwnj;")
html = html.replace("&#165;", "&yen;")
html = html.replace("&#xa5;", "&yen;")
html = html.replace("&ograve;", "&#242;")
html = html.replace("&#935;", "&Chi;")
html = html.replace("&#x3a7;", "&Chi;")
html = html.replace("&#x2122;", "&#8482;")
html = html.replace("&#958;", "&xi;")
html = html.replace("&#x3be;", "&xi;")
html = html.replace("&#x2dc;", "&#732;")
html = html.replace("&tilde;", "&#732;")
html = html.replace("&#x2039;", "&#8249;")
html = html.replace("&lsaquo;", "&#8249;")
html = html.replace("&#x153;", "&#339;")
html = html.replace("&oelig;", "&#339;")
html = html.replace("&#x2261;", "&#8801;")
html = html.replace("&#8804;", "&le;")
html = html.replace("&#x2264;", "&le;")
html = html.replace("&#8746;", "&cup;")
html = html.replace("&#x222a;", "&cup;")
html = html.replace("&#x178;", "&#376;")
html = html.replace("&#60;", "&lt;")
html = html.replace("&#x3c;", "&lt;")
html = html.replace("&#x3a5;", "&#933;")
html = html.replace("&Upsilon;", "&#933;")
html = html.replace("&#x2013;", "&#8211;")
html = html.replace("&yacute;", "&#253;")
html = html.replace("&#8476;", "&real;")
html = html.replace("&#x211c;", "&real;")
html = html.replace("&#968;", "&psi;")
html = html.replace("&#x3c8;", "&psi;")
html = html.replace("&#x203a;", "&#8250;")
html = html.replace("&rsaquo;", "&#8250;")
html = html.replace("&#8595;", "&darr;")
html = html.replace("&#x2193;", "&darr;")
html = html.replace("&#x391;", "&#913;")
html = html.replace("&Alpha;", "&#913;")
html = html.replace("&#172;", "&not;")
html = html.replace("&#xac;", "&not;")
html = html.replace("&#x26;", "&#38;")
html = html.replace("&oslash;", "&#248;")
html = html.replace("&acute;", "&#180;")
html = html.replace("&#8205;", "&zwj;")
html = html.replace("&#x200d;", "&zwj;")
html = html.replace("&laquo;", "&#171;")
html = html.replace("&#x201d;", "&#8221;")
html = html.replace("&Igrave;", "&#204;")
html = html.replace("&micro;", "&#181;")
html = html.replace("&#173;", "&shy;")
html = html.replace("&#xad;", "&shy;")
html = html.replace("&#8839;", "&supe;")
html = html.replace("&#x2287;", "&supe;")
html = html.replace("&szlig;", "&#223;")
html = html.replace("&#x2663;", "&#9827;")
html = html.replace("&agrave;", "&#224;")
html = html.replace("&Ocirc;", "&#212;")
html = html.replace("&#8596;", "&harr;")
html = html.replace("&#x2194;", "&harr;")
html = html.replace("&#8592;", "&larr;")
html = html.replace("&#x2190;", "&larr;")
html = html.replace("&frac12;", "&#189;")
html = html.replace("&#8733;", "&prop;")
html = html.replace("&#x221d;", "&prop;")
html = html.replace("&#x2c6;", "&#710;")
html = html.replace("&ocirc;", "&#244;")
html = html.replace("&#x2248;", "&#8776;")
html = html.replace("&#168;", "&uml;")
html = html.replace("&#xa8;", "&uml;")
html = html.replace("&#8719;", "&prod;")
html = html.replace("&#x220f;", "&prod;")
html = html.replace("&#174;", "&reg;")
html = html.replace("&#xae;", "&reg;")
html = html.replace("&#8207;", "&rlm;")
html = html.replace("&#x200f;", "&rlm;")
html = html.replace("&#x221e;", "&#8734;")
html = html.replace("&#x3a3;", "&#931;")
html = html.replace("&Sigma;", "&#931;")
html = html.replace("&#x2014;", "&#8212;")
html = html.replace("&#8593;", "&uarr;")
html = html.replace("&#x2191;", "&uarr;")
html = html.replace("&times;", "&#215;")
html = html.replace("&#8658;", "&rArr;")
html = html.replace("&#x21d2;", "&rArr;")
html = html.replace("&#8744;", "&or;")
html = html.replace("&#x2228;", "&or;")
html = html.replace("&#x3b3;", "&#947;")
html = html.replace("&gamma;", "&#947;")
html = html.replace("&#x3bb;", "&#955;")
html = html.replace("&lambda;", "&#955;")
html = html.replace("&#9002;", "&rang;")
html = html.replace("&#x232a;", "&rang;")
html = html.replace("&#x2020;", "&#8224;")
html = html.replace("&dagger;", "&#8224;")
html = html.replace("&#x2111;", "&#8465;")
html = html.replace("&#x2135;", "&#8501;")
html = html.replace("&alefsym;", "&#8501;")
html = html.replace("&#8838;", "&sube;")
html = html.replace("&#x2286;", "&sube;")
html = html.replace("&#x3b1;", "&#945;")
html = html.replace("&alpha;", "&#945;")
html = html.replace("&#925;", "&Nu;")
html = html.replace("&#x39d;", "&Nu;")
html = html.replace("&plusmn;", "&#177;")
html = html.replace("&frac34;", "&#190;")
html = html.replace("&#x203e;", "&#8254;")
html = html.replace("&#x394;", "&#916;")
html = html.replace("&Delta;", "&#916;")
html = html.replace("&#9674;", "&loz;")
html = html.replace("&#x25ca;", "&loz;")
html = html.replace("&#x3b9;", "&#953;")
html = html.replace("&iacute;", "&#237;")
html = html.replace("&#x3b5;", "&#949;")
html = html.replace("&epsilon;", "&#949;")
html = html.replace("&#x2118;", "&#8472;")
html = html.replace("&weierp;", "&#8472;")
html = html.replace("&#8706;", "&part;")
html = html.replace("&#x2202;", "&part;")
html = html.replace("&#x3b4;", "&#948;")
html = html.replace("&delta;", "&#948;")
html = html.replace("&#x3bf;", "&#959;")
html = html.replace("&omicron;", "&#959;")
html = html.replace("&#926;", "&Xi;")
html = html.replace("&#x39e;", "&Xi;")
html = html.replace("&#x2021;", "&#8225;")
html = html.replace("&Dagger;", "&#8225;")
html = html.replace("&Ograve;", "&#210;")
html = html.replace("&Ucirc;", "&#219;")
html = html.replace("&#x161;", "&#353;")
html = html.replace("&scaron;", "&#353;")
html = html.replace("&#x2018;", "&#8216;")
html = html.replace("&#8712;", "&isin;")
html = html.replace("&#x2208;", "&isin;")
html = html.replace("&#x396;", "&#918;")
html = html.replace("&#x2212;", "&#8722;")
html = html.replace("&#8743;", "&and;")
html = html.replace("&#x2227;", "&and;")
html = html.replace("&#8736;", "&ang;")
html = html.replace("&#x2220;", "&ang;")
html = html.replace("&curren;", "&#164;")
html = html.replace("&#8747;", "&int;")
html = html.replace("&#x222b;", "&int;")
html = html.replace("&#x230b;", "&#8971;")
html = html.replace("&rfloor;", "&#8971;")
html = html.replace("&#x21b5;", "&#8629;")
html = html.replace("&#x2203;", "&#8707;")
html = html.replace("&#x2295;", "&#8853;")
html = html.replace("&Acirc;", "&#194;")
html = html.replace("&#982;", "&piv;")
html = html.replace("&#x3d6;", "&piv;")
html = html.replace("&#8715;", "&ni;")
html = html.replace("&#x220b;", "&ni;")
html = html.replace("&#934;", "&Phi;")
html = html.replace("&#x3a6;", "&Phi;")
html = html.replace("&Iacute;", "&#205;")
html = html.replace("&Uacute;", "&#218;")
html = html.replace("&#x39f;", "&#927;")
html = html.replace("&Omicron;", "&#927;")
html = html.replace("&#8800;", "&ne;")
html = html.replace("&#x2260;", "&ne;")
html = html.replace("&iquest;", "&#191;")
html = html.replace("&#x201a;", "&#8218;")
html = html.replace("&Ecirc;", "&#202;")
html = html.replace("&#x3b6;", "&#950;")
html = html.replace("&#x3a9;", "&#937;")
html = html.replace("&Omega;", "&#937;")
html = html.replace("&#957;", "&nu;")
html = html.replace("&#x3bd;", "&nu;")
html = html.replace("&frac14;", "&#188;")
html = html.replace("&aacute;", "&#225;")
html = html.replace("&#8657;", "&uArr;")
html = html.replace("&#x21d1;", "&uArr;")
html = html.replace("&#x3b2;", "&#946;")
html = html.replace("&#x192;", "&#402;")
html = html.replace("&#961;", "&rho;")
html = html.replace("&#x3c1;", "&rho;")
html = html.replace("&eacute;", "&#233;")
html = html.replace("&#x3c9;", "&#969;")
html = html.replace("&omega;", "&#969;")
html = html.replace("&middot;", "&#183;")
html = html.replace("&#9001;", "&lang;")
html = html.replace("&#x2329;", "&lang;")
html = html.replace("&#x2660;", "&#9824;")
html = html.replace("&spades;", "&#9824;")
html = html.replace("&#x2019;", "&#8217;")
html = html.replace("&thorn;", "&#254;")
html = html.replace("&raquo;", "&#187;")
html = html.replace("&#x3c3;", "&#963;")
html = html.replace("&sigma;", "&#963;")
out = open(output_file, 'w')
out.write(html)
out.close()
+2 -2
View File
@@ -7,7 +7,7 @@ cp robots.txt *.js *.css build/
rm -f examples/*.pyc
cp -R examples build/
# minimize HTML (note: this script is quite fragile and relies on knowledge of how I write HTML)
# minimize HTML (XXX this script is quite fragile and relies on knowledge of how I write HTML)
for f in *.html; do
python htmlminimizer.py "$f" build/"$f"
done
@@ -41,7 +41,7 @@ sed -i -e "s|<link rel=stylesheet type=text/css media='only screen and (max-devi
sed -i -e "s|dip3\.js|http://wearehugh.com/dip3/${revision}.js|g" build/*.html
sed -i -e "s|html5\.js|http://wearehugh.com/dip3/html5.js|g" build/*.html
# images will be served from a separate domain
# images would be served from a separate domain if we had any, which we currently don't
#sed -i -e "s|bsb.png|http://wearehugh.com/dip3/bsb.png|g" build/*.html
# minimize URLs
+17 -8
View File
@@ -244,20 +244,29 @@ mark{display:inline}
<h2 id=xml-parse>Parsing XML</h2>
<p>Python comes with an efficient XML parsing library called Etree.
<p>Python can parse XML documents in several ways. It has traditional <a href=http://en.wikipedia.org/wiki/XML#DOM>DOM</a> and <a href=http://en.wikipedia.org/wiki/Simple_API_for_XML>SAX</a> parsers, but I will focus on a different library called ElementTree (Etree for short).
<p class=d>[<a href=examples/feed.xml>download <code>feed.xml</code></a>]
<pre class=screen>
>>> import xml.etree.ElementTree as etree
>>> tree = etree.parse("examples/feed.xml")
>>> root = tree.getroot()
>>> root
&lt;Element {http://www.w3.org/2005/Atom}feed at cd1eb0>
</pre>
<a><samp class=p>>>> </samp><kbd>import xml.etree.ElementTree as etree</kbd> <span>&#x2460;</span></a>
<a><samp class=p>>>> </samp><kbd>tree = etree.parse("examples/feed.xml")</kbd> <span>&#x2461;</span></a>
<a><samp class=p>>>> </samp><kbd>root = tree.getroot()</kbd> <span>&#x2462;</span></a>
<a><samp class=p>>>> </samp><kbd>root</kbd> <span>&#x2463;</span></a>
<samp>&lt;Element {http://www.w3.org/2005/Atom}feed at cd1eb0></samp></pre>
<ol>
<li>The Etree library is part of the Python standard library, in <code>xml.etree.ElementTree</code>.
<li>The primary entry point for the Etree library is the <code>parse()</code> function, which can take a filename or a file-like object [FIXME xref]. This function parses the entire document at once. If memory is tight, there are ways to parse an XML document incrementally instead.
<li>The <code>parse()</code> function returns an object which represents the entire document. This is <em>not</em> the root element. To get a reference to the root element, call the <code>getroot()</code> method.
<li>As expected, the root element is the <code>feed</code> element in the <code>http://www.w3.org/2005/Atom</code> namespace. The string representation of this object reinforces an important point: an XML element is a combination of its namespace and its tag name (also called the <i>local name</i>). Every element in this document is in the Atom namespace, so the root element is represented as <code>{http://www.w3.org/2005/Atom}feed</code>.
</ol>
<blockquote class=note>
<p><span>&#x261E;</span>Etree represents XML elements as <code>{<var>namespace</var>}<var>localname</var></code>. You&#8217;ll see and use this format in multiple places in the Etree library.
</blockquote>
<h3 id=xml-elements>Elements Are Lists</h3>
<p>FIXME
<p>In Etree, an element acts like a list. The items of the list are the element&#8217;s children.
<pre class=screen>
>>> root.tag