diff --git a/xml.html b/xml.html index 59754d2..8c302f0 100644 --- a/xml.html +++ b/xml.html @@ -180,7 +180,7 @@ mark{display:inline}
At the top level is the root element, which every Atom feed shares: the feed element in the http://www.w3.org/2005/Atom namespace.
-
+
<feed xmlns="http://www.w3.org/2005/Atom" ①
xml:lang="en"> ②
@@ -190,7 +190,7 @@ mark{display:inline}
An Atom feed contains several pieces of information about the feed itself. These are declared as children of the root-level feed element.
-
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
+<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
<title>dive into mark</title> ①
<subtitle>currently between addictions</subtitle> ②
<id>tag:diveintomark.org,2001-07-29:/</id> ③
@@ -212,7 +212,7 @@ mark{display:inline}
After the feed-level metadata is the list of the most recent articles. An article looks like this:
-
<entry>
+<entry>
<author> ①
<name>Mark</name>
<uri>http://diveintomark.org/</uri>
@@ -325,62 +325,96 @@ mark{display:inline}
FIXME
->>> tree.findall("{http://www.w3.org/2005/Atom}entry")
-[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, <Element {http://www.w3.org/2005/Atom}entry at e2b510>, <Element {http://www.w3.org/2005/Atom}entry at e2b540>]
+>>> import xml.etree.ElementTree as etree
+>>> tree = etree.parse("examples/feed.xml")
+>>> tree.findall("{http://www.w3.org/2005/Atom}entry")
+[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>,
+ <Element {http://www.w3.org/2005/Atom}entry at e2b510>,
+ <Element {http://www.w3.org/2005/Atom}entry at e2b540>]
->>> feed_links = tree.findall("{http://www.w3.org/2005/Atom}link")
->>> feed_links
-[<Element {http://www.w3.org/2005/Atom}link at e181b0>, <Element {http://www.w3.org/2005/Atom}link at e2b4b0>]
->>> feed_links[0].attrib
-{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'}
->>> feed_links[1].attrib
-{'href': 'http://diveintomark.org/feed/', 'type': 'application/atom+xml', 'rel': 'self'}
+
+>>> feed_links = tree.findall("{http://www.w3.org/2005/Atom}link")
+>>> feed_links
+[<Element {http://www.w3.org/2005/Atom}link at e181b0>]
+>>> feed_links[0].attrib
+{'href': 'http://diveintomark.org/',
+ 'type': 'text/html',
+ 'rel': 'alternate'}
->>> all_links = tree.findall("//{http://www.w3.org/2005/Atom}link")
->>> all_links
-[<Element {http://www.w3.org/2005/Atom}link at e181b0>, <Element {http://www.w3.org/2005/Atom}link at e2b4b0>, <Element {http://www.w3.org/2005/Atom}link at e2b570>, <Element {http://www.w3.org/2005/Atom}link at e2b480>, <Element {http://www.w3.org/2005/Atom}link at e2b5a0>]
->>> all_links[0].attrib
-{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'}
->>> all_links[1].attrib
-{'href': 'http://diveintomark.org/feed/', 'type': 'application/atom+xml', 'rel': 'self'}
->>> all_links[2].attrib
-{'href': 'http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition', 'type': 'text/html', 'rel': 'alternate'}
->>> all_links[3].attrib
-{'href': 'http://diveintomark.org/archives/2009/03/21/accessibility-is-a-harsh-mistress', 'type': 'text/html', 'rel': 'alternate'}
->>> all_links[4].attrib
-{'href': 'http://diveintomark.org/archives/2008/12/18/give-part-1-container-formats', 'type': 'text/html', 'rel': 'alternate'}
-
+
+>>> all_links = tree.findall("//{http://www.w3.org/2005/Atom}link")
+>>> all_links
+[<Element {http://www.w3.org/2005/Atom}link at e181b0>,
+ <Element {http://www.w3.org/2005/Atom}link at e2b570>,
+ <Element {http://www.w3.org/2005/Atom}link at e2b480>,
+ <Element {http://www.w3.org/2005/Atom}link at e2b5a0>]
+>>> all_links[0].attrib
+{'href': 'http://diveintomark.org/',
+ 'type': 'text/html',
+ 'rel': 'alternate'}
+>>> all_links[1].attrib
+{'href': 'http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition',
+ 'type': 'text/html',
+ 'rel': 'alternate'}
+>>> all_links[2].attrib
+{'href': 'http://diveintomark.org/archives/2009/03/21/accessibility-is-a-harsh-mistress',
+ 'type': 'text/html',
+ 'rel': 'alternate'}
+>>> all_links[3].attrib
+{'href': 'http://diveintomark.org/archives/2008/12/18/give-part-1-container-formats',
+ 'type': 'text/html',
+ 'rel': 'alternate'}
Going Further With lxml
FIXME
->>> from lxml import etree
+>>> from lxml import etree
.
. FIXME (show how it's a drop-in replacement for everything we've done so far)
.
-
-from here on out, use lxml.etree explicitly because these functions are specific to lxml
->>> import lxml.etree
->>> nsmap = {"atom": "http://www.w3.org/2005/Atom"}
->>> tree = lxml.etree.parse("examples/feed.xml")
->>> entries = tree.xpath("//atom:category[@term='accessibility']/..", namespaces=nsmap)
->>> entries
-[<Element {http://www.w3.org/2005/Atom}entry at e2b630>]
->>> entry = entries[0]
->>> entry.xpath("./atom:title/text()", namespaces=nsmap)
-['Accessibility is a harsh mistress']
+FIXME: from here on out, we use lxml.etree explicitly because these functions are specific to lxml
+
+
+>>> import lxml.etree
+>>> tree = lxml.etree.parse("examples/feed.xml")
+>>> it = tree.iterfind("//{http://www.w3.org/2005/Atom}link")
+>>> next(it)
+<Element {http://www.w3.org/2005/Atom}link at 122f1b0>
+>>> next(it)
+<Element {http://www.w3.org/2005/Atom}link at 122f1e0>
+>>> next(it)
+<Element {http://www.w3.org/2005/Atom}link at 122f210>
+>>> next(it)
+<Element {http://www.w3.org/2005/Atom}link at 122f1b0>
+>>> next(it)
+Traceback (most recent call last):
+ File "<stdin>", line 1, in <module>
+StopIteration
+
+
+>>> NSMAP = {"atom": "http://www.w3.org/2005/Atom"}
+>>> entries = tree.xpath("//atom:category[@term='accessibility']/..", namespaces=NSMAP)
+>>> entries
+[<Element {http://www.w3.org/2005/Atom}entry at e2b630>]
+>>> entry = entries[0]
+>>> entry.xpath("./atom:title/text()", namespaces=NSMAP)
+['Accessibility is a harsh mistress']
+
Customizing Your XML Parser
FIXME
->>> import lxml.etree
->>> parser = lxml.etree.XMLParser(no_network=True, ns_clean=True, recover=True, remove_blank_text=True, remove_comments=True)
->>> tree = lxml.etree.parse("examples/feed.xml", parser)
+>>> import lxml.etree
+>>> parser = lxml.etree.XMLParser(no_network=True, ns_clean=True, recover=True, remove_blank_text=True, remove_comments=True)
+>>> tree = lxml.etree.parse("examples/feed.xml", parser)
+.
+.
+.
Incremental Parsing
@@ -392,38 +426,37 @@ from here on out, use lxml.etree explicitly because these functions are specific
FIXME
->>> import lxml.etree
->>> new_feed = lxml.etree.Element("{http://www.w3.org/2005/Atom}feed", attrib={"{http://www.w3.org/XML/1998/namespace}lang": "en"})
->>> print(lxml.etree.tounicode(new_feed))
-<ns0:feed xmlns:ns0="http://www.w3.org/2005/Atom" xml:lang="en"/>
-
+>>> import xml.etree.ElementTree as etree
+>>> new_feed = etree.Element("{http://www.w3.org/2005/Atom}feed",
+... attrib={"{http://www.w3.org/XML/1998/namespace}lang": "en"})
+>>> print(etree.tostring(new_feed))
+<ns0:feed xmlns:ns0="http://www.w3.org/2005/Atom" xml:lang="en"/>
FIXME
->>> import lxml.etree
->>> new_feed = lxml.etree.Element("feed", nsmap=NSMAP)
->>> print(lxml.etree.tounicode(new_feed))
-<feed xmlns="http://www.w3.org/2005/Atom"/>
->>> new_feed.set("{http://www.w3.org/XML/1998/namespace}lang", "en")
->>> print(lxml.etree.tounicode(new_feed))
-<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"/>
-
+>>> import lxml.etree
+>>> NSMAP = {None: "http://www.w3.org/2005/Atom"}
+>>> new_feed = lxml.etree.Element("feed", nsmap=NSMAP)
+>>> print(lxml.etree.tounicode(new_feed))
+<feed xmlns="http://www.w3.org/2005/Atom"/>
+>>> new_feed.set("{http://www.w3.org/XML/1998/namespace}lang", "en")
+>>> print(lxml.etree.tounicode(new_feed))
+<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"/>
FIXME
->>> title = lxml.etree.SubElement(new_feed, "title", attrib={"type":"html"})
->>> print(lxml.etree.tounicode(new_feed))
-<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html"/></feed>
->>> title.text = "dive into mark"
->>> print(lxml.etree.tounicode(new_feed))
-<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html">dive into mark</title></feed>
->>> print(lxml.etree.tounicode(new_feed, pretty_print=True))
-<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
+>>> title = lxml.etree.SubElement(new_feed, "title", attrib={"type":"html"})
+>>> print(lxml.etree.tounicode(new_feed))
+<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html"/></feed>
+>>> title.text = "dive into mark"
+>>> print(lxml.etree.tounicode(new_feed))
+<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html">dive into mark</title></feed>
+>>> print(lxml.etree.tounicode(new_feed, pretty_print=True))
+<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
<title type="html">dive into mark</title>
-</feed>
-
+</feed>
Further Reading