From 45f3effa978c5e514567933274c1adeefa59d609 Mon Sep 17 00:00:00 2001 From: Mark Pilgrim Date: Wed, 20 May 2009 22:28:26 -0400 Subject: [PATCH] more work on xml chapter --- xml.html | 161 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 97 insertions(+), 64 deletions(-) diff --git a/xml.html b/xml.html index 59754d2..8c302f0 100644 --- a/xml.html +++ b/xml.html @@ -180,7 +180,7 @@ mark{display:inline}

At the top level is the root element, which every Atom feed shares: the feed element in the http://www.w3.org/2005/Atom namespace. -


+

 <feed xmlns="http://www.w3.org/2005/Atom"  
       xml:lang="en">                       
    @@ -190,7 +190,7 @@ mark{display:inline}

    An Atom feed contains several pieces of information about the feed itself. These are declared as children of the root-level feed element. -

    <feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
    +
    <feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
       <title>dive into mark</title>                                             
       <subtitle>currently between addictions</subtitle>                         
       <id>tag:diveintomark.org,2001-07-29:/</id>                                
    @@ -212,7 +212,7 @@ mark{display:inline}
     
     

    After the feed-level metadata is the list of the most recent articles. An article looks like this: -

    <entry>
    +
    <entry>
       <author>                                                                 
         <name>Mark</name>
         <uri>http://diveintomark.org/</uri>
    @@ -325,62 +325,96 @@ mark{display:inline}
     

    FIXME

    ->>> tree.findall("{http://www.w3.org/2005/Atom}entry")
    -[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, <Element {http://www.w3.org/2005/Atom}entry at e2b510>, <Element {http://www.w3.org/2005/Atom}entry at e2b540>]
    +>>> import xml.etree.ElementTree as etree
    +>>> tree = etree.parse("examples/feed.xml")
    +>>> tree.findall("{http://www.w3.org/2005/Atom}entry")
    +[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>,
    + <Element {http://www.w3.org/2005/Atom}entry at e2b510>,
    + <Element {http://www.w3.org/2005/Atom}entry at e2b540>]
    ->>> feed_links = tree.findall("{http://www.w3.org/2005/Atom}link") ->>> feed_links -[<Element {http://www.w3.org/2005/Atom}link at e181b0>, <Element {http://www.w3.org/2005/Atom}link at e2b4b0>] ->>> feed_links[0].attrib -{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'} ->>> feed_links[1].attrib -{'href': 'http://diveintomark.org/feed/', 'type': 'application/atom+xml', 'rel': 'self'} +
    +>>> feed_links = tree.findall("{http://www.w3.org/2005/Atom}link")
    +>>> feed_links
    +[<Element {http://www.w3.org/2005/Atom}link at e181b0>]
    +>>> feed_links[0].attrib
    +{'href': 'http://diveintomark.org/',
    + 'type': 'text/html',
    + 'rel': 'alternate'}
    ->>> all_links = tree.findall("//{http://www.w3.org/2005/Atom}link") ->>> all_links -[<Element {http://www.w3.org/2005/Atom}link at e181b0>, <Element {http://www.w3.org/2005/Atom}link at e2b4b0>, <Element {http://www.w3.org/2005/Atom}link at e2b570>, <Element {http://www.w3.org/2005/Atom}link at e2b480>, <Element {http://www.w3.org/2005/Atom}link at e2b5a0>] ->>> all_links[0].attrib -{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'} ->>> all_links[1].attrib -{'href': 'http://diveintomark.org/feed/', 'type': 'application/atom+xml', 'rel': 'self'} ->>> all_links[2].attrib -{'href': 'http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition', 'type': 'text/html', 'rel': 'alternate'} ->>> all_links[3].attrib -{'href': 'http://diveintomark.org/archives/2009/03/21/accessibility-is-a-harsh-mistress', 'type': 'text/html', 'rel': 'alternate'} ->>> all_links[4].attrib -{'href': 'http://diveintomark.org/archives/2008/12/18/give-part-1-container-formats', 'type': 'text/html', 'rel': 'alternate'} -
    +
    +>>> all_links = tree.findall("//{http://www.w3.org/2005/Atom}link")
    +>>> all_links
    +[<Element {http://www.w3.org/2005/Atom}link at e181b0>,
    + <Element {http://www.w3.org/2005/Atom}link at e2b570>,
    + <Element {http://www.w3.org/2005/Atom}link at e2b480>,
    + <Element {http://www.w3.org/2005/Atom}link at e2b5a0>]
    +>>> all_links[0].attrib
    +{'href': 'http://diveintomark.org/',
    + 'type': 'text/html',
    + 'rel': 'alternate'}
    +>>> all_links[1].attrib
    +{'href': 'http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition',
    + 'type': 'text/html',
    + 'rel': 'alternate'}
    +>>> all_links[2].attrib
    +{'href': 'http://diveintomark.org/archives/2009/03/21/accessibility-is-a-harsh-mistress',
    + 'type': 'text/html',
    + 'rel': 'alternate'}
    +>>> all_links[3].attrib
    +{'href': 'http://diveintomark.org/archives/2008/12/18/give-part-1-container-formats',
    + 'type': 'text/html',
    + 'rel': 'alternate'}

    Going Further With lxml

    FIXME

    ->>> from lxml import etree
    +>>> from lxml import etree
     .
     .  FIXME (show how it's a drop-in replacement for everything we've done so far)
     .
    -
    -from here on out, use lxml.etree explicitly because these functions are specific to lxml
    ->>> import lxml.etree
    ->>> nsmap = {"atom": "http://www.w3.org/2005/Atom"}
    ->>> tree = lxml.etree.parse("examples/feed.xml")
    ->>> entries = tree.xpath("//atom:category[@term='accessibility']/..", namespaces=nsmap)
    ->>> entries
    -[<Element {http://www.w3.org/2005/Atom}entry at e2b630>]
    ->>> entry = entries[0]
    ->>> entry.xpath("./atom:title/text()", namespaces=nsmap)
    -['Accessibility is a harsh mistress']
     
    +

    FIXME: from here on out, we use lxml.etree explicitly because these functions are specific to lxml + +

    +>>> import lxml.etree
    +>>> tree = lxml.etree.parse("examples/feed.xml")
    +>>> it = tree.iterfind("//{http://www.w3.org/2005/Atom}link")
    +>>> next(it)
    +<Element {http://www.w3.org/2005/Atom}link at 122f1b0>
    +>>> next(it)
    +<Element {http://www.w3.org/2005/Atom}link at 122f1e0>
    +>>> next(it)
    +<Element {http://www.w3.org/2005/Atom}link at 122f210>
    +>>> next(it)
    +<Element {http://www.w3.org/2005/Atom}link at 122f1b0>
    +>>> next(it)
    +Traceback (most recent call last):
    +  File "<stdin>", line 1, in <module>
    +StopIteration
    + +
    +>>> NSMAP = {"atom": "http://www.w3.org/2005/Atom"}
    +>>> entries = tree.xpath("//atom:category[@term='accessibility']/..", namespaces=NSMAP)
    +>>> entries
    +[<Element {http://www.w3.org/2005/Atom}entry at e2b630>]
    +>>> entry = entries[0]
    +>>> entry.xpath("./atom:title/text()", namespaces=NSMAP)
    +['Accessibility is a harsh mistress']
    +

    Customizing Your XML Parser

    FIXME

    ->>> import lxml.etree
    ->>> parser = lxml.etree.XMLParser(no_network=True, ns_clean=True, recover=True, remove_blank_text=True, remove_comments=True)
    ->>> tree = lxml.etree.parse("examples/feed.xml", parser)
    +>>> import lxml.etree
    +>>> parser = lxml.etree.XMLParser(no_network=True, ns_clean=True, recover=True, remove_blank_text=True, remove_comments=True)
    +>>> tree = lxml.etree.parse("examples/feed.xml", parser)
    +.
    +.
    +.
     

    Incremental Parsing

    @@ -392,38 +426,37 @@ from here on out, use lxml.etree explicitly because these functions are specific

    FIXME

    ->>> import lxml.etree
    ->>> new_feed = lxml.etree.Element("{http://www.w3.org/2005/Atom}feed", attrib={"{http://www.w3.org/XML/1998/namespace}lang": "en"})
    ->>> print(lxml.etree.tounicode(new_feed))
    -<ns0:feed xmlns:ns0="http://www.w3.org/2005/Atom" xml:lang="en"/>
    -
    +>>> import xml.etree.ElementTree as etree +>>> new_feed = etree.Element("{http://www.w3.org/2005/Atom}feed", +... attrib={"{http://www.w3.org/XML/1998/namespace}lang": "en"}) +>>> print(etree.tostring(new_feed)) +<ns0:feed xmlns:ns0="http://www.w3.org/2005/Atom" xml:lang="en"/>

    FIXME

    ->>> import lxml.etree
    ->>> new_feed = lxml.etree.Element("feed", nsmap=NSMAP)
    ->>> print(lxml.etree.tounicode(new_feed))
    -<feed xmlns="http://www.w3.org/2005/Atom"/>
    ->>> new_feed.set("{http://www.w3.org/XML/1998/namespace}lang", "en")
    ->>> print(lxml.etree.tounicode(new_feed))
    -<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"/>
    -
    +>>> import lxml.etree +>>> NSMAP = {None: "http://www.w3.org/2005/Atom"} +>>> new_feed = lxml.etree.Element("feed", nsmap=NSMAP) +>>> print(lxml.etree.tounicode(new_feed)) +<feed xmlns="http://www.w3.org/2005/Atom"/> +>>> new_feed.set("{http://www.w3.org/XML/1998/namespace}lang", "en") +>>> print(lxml.etree.tounicode(new_feed)) +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"/>

    FIXME

    ->>> title = lxml.etree.SubElement(new_feed, "title", attrib={"type":"html"})
    ->>> print(lxml.etree.tounicode(new_feed))
    -<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html"/></feed>
    ->>> title.text = "dive into mark"
    ->>> print(lxml.etree.tounicode(new_feed))
    -<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html">dive into mark</title></feed>
    ->>> print(lxml.etree.tounicode(new_feed, pretty_print=True))
    -<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
    +>>> title = lxml.etree.SubElement(new_feed, "title", attrib={"type":"html"})
    +>>> print(lxml.etree.tounicode(new_feed))
    +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html"/></feed>
    +>>> title.text = "dive into mark"
    +>>> print(lxml.etree.tounicode(new_feed))
    +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html">dive into mark</title></feed>
    +>>> print(lxml.etree.tounicode(new_feed, pretty_print=True))
    +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
     <title type="html">dive into mark</title>
    -</feed>
    -
    +</feed>

    Further Reading