From 7638a551f97301ee731599b9e7510a601042edfa Mon Sep 17 00:00:00 2001 From: Mark Pilgrim Date: Fri, 15 May 2009 12:08:35 -0400 Subject: [PATCH] skeleton of XML chapter --- examples/feed.xml | 65 ++++++++++ index.html | 2 +- table-of-contents.html | 10 +- xml.html | 266 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 337 insertions(+), 6 deletions(-) create mode 100644 examples/feed.xml create mode 100644 xml.html diff --git a/examples/feed.xml b/examples/feed.xml new file mode 100644 index 0000000..4332135 --- /dev/null +++ b/examples/feed.xml @@ -0,0 +1,65 @@ + + + dive into mark + currently between addictions + tag:diveintomark.org,2001-07-29:/ + 2009-03-27T21:56:07Z + + + + + Mark + http://diveintomark.org/ + + <![CDATA[Dive into history, 2009 edition]]> + + tag:diveintomark.org,2009-03-27:/archives/20090327172042 + 2009-03-27T21:56:07Z + 2009-03-27T17:20:42Z + + + + Putting an entire chapter on one page sounds bloated, but + consider this: my longest chapter so far would be 75 printed pages, and it + loads in under 5 seconds. On dialup. + + + + Mark + http://diveintomark.org/ + + <![CDATA[Accessibility is a harsh mistress]]> + + tag:diveintomark.org,2009-03-21:/archives/20090321200928 + 2009-03-22T01:05:37Z + 2009-03-21T20:09:28Z + + The accessibility orthodoxy does not permit people to + question the value of features that are rarely useful and rarely used. + + + + Mark + http://diveintomark.org/ + + <![CDATA[A gentle introduction to video encoding, + part 1: container formats]]> + + tag:diveintomark.org,2008-12-18:/archives/20081218155422 + 2009-01-11T19:39:22Z + 2008-12-18T15:54:22Z + + + + + + + + + These notes will eventually become part of a + tech talk on video encoding. + + diff --git a/index.html b/index.html index fd67412..e29deeb 100644 --- a/index.html +++ b/index.html @@ -37,8 +37,8 @@ h1:before{content:""}
  • Refactoring
  • Advanced Classes
  • Files +
  • XML processing
  • HTML processing -
  • XML processing
  • Web services
  • Performance tuning
  • Packaging Python libraries diff --git a/table-of-contents.html b/table-of-contents.html index e9d621a..016747a 100644 --- a/table-of-contents.html +++ b/table-of-contents.html @@ -188,6 +188,11 @@ ul li ol{margin:0;padding:0 0 0 2.5em}
  • Handling errors (exceptions)
  • Writing to files +
  • XML Processing +
      +
    1. ...major changes afoot... +
    2. lxml 2.2 officially supports Python 3 +
  • HTML processing
    1. Diving in @@ -201,11 +206,6 @@ ul li ol{margin:0;padding:0 0 0 2.5em}
    2. Putting it all together
    3. Summary
    -
  • XML Processing -
      -
    1. ...major changes afoot... -
    2. lxml 2.2 officially supports Python 3 -
  • HTTP web services
    1. Diving in diff --git a/xml.html b/xml.html new file mode 100644 index 0000000..ead88b2 --- /dev/null +++ b/xml.html @@ -0,0 +1,266 @@ + + + +XML - Dive into Python 3 + + + + +
        
      +

      You are here: Home Dive Into Python 3 +

      Difficulty level: ♦♦♦♢♢ +

      XML

      +
      +

      FIXME
      — FIXME +

      +

        +

      Diving In

      +

      Most of the chapters in this book have centered around a piece of sample code. But XML isn’t about code; it’s about data. One common use of XML is “syndication feeds” that list the latest articles on a blog, forum, or other frequently-updated website. Most popular blogging software can produce a feed and update it whenever new articles, discussion threads, or blog posts are published. You can follow a blog by “subscribing” to its feed, and you can follow multiple blogs with a dedicated “feed aggregator” like Google Reader. + +

      Here, then, is the XML data we’ll be working with in this chapter. It’s a feed — specifically, an Atom syndication feed. + +

      [download feed.xml] +

      <?xml version="1.0" encoding="utf-8"?>
      +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
      +  <title type="text">dive into mark</title>
      +  <subtitle type="text">currently between addictions</subtitle>
      +  <id>tag:diveintomark.org,2001-07-29:/</id>
      +  <updated>2009-03-27T21:56:07Z</updated>
      +  <link rel="alternate" type="text/html" href="http://diveintomark.org/"/>
      +  <link rel="self" href="http://diveintomark.org/feed/" type="application/atom+xml"/>
      +  <entry>
      +    <author>
      +      <name>Mark</name>
      +      <uri>http://diveintomark.org/</uri>
      +    </author>
      +    <title type="html"><![CDATA[Dive into history, 2009 edition]]></title>
      +    <link rel="alternate" type="text/html"
      +      href="http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition"/>
      +    <id>tag:diveintomark.org,2009-03-27:/archives/20090327172042</id>
      +    <updated>2009-03-27T21:56:07Z</updated>
      +    <published>2009-03-27T17:20:42Z</published>
      +    <category scheme="http://diveintomark.org" term="diveintopython"/>
      +    <category scheme="http://diveintomark.org" term="docbook"/>
      +    <category scheme="http://diveintomark.org" term="html"/>
      +    <summary type="html">Putting an entire chapter on one page sounds bloated, but
      +      consider this: my longest chapter so far would be 75 printed pages, and it
      +      loads in under 5 seconds. On dialup.</summary>
      +  </entry>
      +  <entry>
      +    <author>
      +      <name>Mark</name>
      +      <uri>http://diveintomark.org/</uri>
      +    </author>
      +    <title type="html"><![CDATA[Accessibility is a harsh mistress]]></title>
      +    <link rel="alternate" type="text/html"
      +      href="http://diveintomark.org/archives/2009/03/21/accessibility-is-a-harsh-mistress"/>
      +    <id>tag:diveintomark.org,2009-03-21:/archives/20090321200928</id>
      +    <updated>2009-03-22T01:05:37Z</updated>
      +    <published>2009-03-21T20:09:28Z</published>
      +    <category scheme="http://diveintomark.org" term="accessibility"/>
      +    <summary type="html">The accessibility orthodoxy does not permit people to
      +      question the value of features that are rarely useful and rarely used.</summary>
      +  </entry>
      +  <entry>
      +    <author>
      +      <name>Mark</name>
      +      <uri>http://diveintomark.org/</uri>
      +    </author>
      +    <title type="html"><![CDATA[A gentle introduction to video encoding,
      +      part 1: container formats]]></title>
      +    <link rel="alternate" type="text/html"
      +      href="http://diveintomark.org/archives/2008/12/18/give-part-1-container-formats"/>
      +    <id>tag:diveintomark.org,2008-12-18:/archives/20081218155422</id>
      +    <updated>2009-01-11T19:39:22Z</updated>
      +    <published>2008-12-18T15:54:22Z</published>
      +    <category scheme="http://diveintomark.org" term="asf"/>
      +    <category scheme="http://diveintomark.org" term="avi"/>
      +    <category scheme="http://diveintomark.org" term="encoding"/>
      +    <category scheme="http://diveintomark.org" term="flv"/>
      +    <category scheme="http://diveintomark.org" term="GIVE"/>
      +    <category scheme="http://diveintomark.org" term="mp4"/>
      +    <category scheme="http://diveintomark.org" term="ogg"/>
      +    <category scheme="http://diveintomark.org" term="video"/>
      +    <summary type="html">These notes will eventually become part of a
      +      tech talk on video encoding.</summary>
      +  </entry>
      +</feed>
      + +

      The Structure Of An Atom Feed

      + +

      FIXME + +

      Parsing XML

      + +

      FIXME + +

      +>>> import xml.etree.ElementTree as etree
      +>>> tree = etree.parse("examples/feed.xml")
      +>>> root = tree.getroot()
      +>>> root
      +<Element {http://www.w3.org/2005/Atom}feed at cd1eb0>
      +
      + +

      Elements Are Lists

      + +

      FIXME + +

      +>>> root.tag
      +'{http://www.w3.org/2005/Atom}feed'
      +>>> len(root)
      +9
      +>>> for child in root:
      +...   print(child)
      +...
      +<Element {http://www.w3.org/2005/Atom}title at e2b5d0>
      +<Element {http://www.w3.org/2005/Atom}subtitle at e2b4e0>
      +<Element {http://www.w3.org/2005/Atom}id at e2b6c0>
      +<Element {http://www.w3.org/2005/Atom}updated at e2b6f0>
      +<Element {http://www.w3.org/2005/Atom}link at e181b0>
      +<Element {http://www.w3.org/2005/Atom}link at e2b4b0>
      +<Element {http://www.w3.org/2005/Atom}entry at e2b720>
      +<Element {http://www.w3.org/2005/Atom}entry at e2b510>
      +<Element {http://www.w3.org/2005/Atom}entry at e2b750>
      +
      + +

      Attributes Are Dictionaries

      + +

      FIXME + +

      +>>> root.attrib
      +{'{http://www.w3.org/XML/1998/namespace}lang': 'en'}
      +>>> root[4]
      +<Element {http://www.w3.org/2005/Atom}link at e181b0>
      +>>> root[4].attrib
      +{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'}
      +>>> root[3]
      +<Element {http://www.w3.org/2005/Atom}updated at e2b6f0>
      +>>> root[3].attrib
      +{}
      +
      + +

      Searching For Nodes Within An XML Document

      + +

      FIXME + +

      +>>> tree.findall("{http://www.w3.org/2005/Atom}entry")
      +[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, <Element {http://www.w3.org/2005/Atom}entry at e2b510>, <Element {http://www.w3.org/2005/Atom}entry at e2b540>]
      +
      +>>> feed_links = tree.findall("{http://www.w3.org/2005/Atom}link")
      +>>> feed_links
      +[<Element {http://www.w3.org/2005/Atom}link at e181b0>, <Element {http://www.w3.org/2005/Atom}link at e2b4b0>]
      +>>> feed_links[0].attrib
      +{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'}
      +>>> feed_links[1].attrib
      +{'href': 'http://diveintomark.org/feed/', 'type': 'application/atom+xml', 'rel': 'self'}
      +
      +>>> all_links = tree.findall("//{http://www.w3.org/2005/Atom}link")
      +>>> all_links
      +[<Element {http://www.w3.org/2005/Atom}link at e181b0>, <Element {http://www.w3.org/2005/Atom}link at e2b4b0>, <Element {http://www.w3.org/2005/Atom}link at e2b570>, <Element {http://www.w3.org/2005/Atom}link at e2b480>, <Element {http://www.w3.org/2005/Atom}link at e2b5a0>]
      +>>> all_links[0].attrib
      +{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'}
      +>>> all_links[1].attrib
      +{'href': 'http://diveintomark.org/feed/', 'type': 'application/atom+xml', 'rel': 'self'}
      +>>> all_links[2].attrib
      +{'href': 'http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition', 'type': 'text/html', 'rel': 'alternate'}
      +>>> all_links[3].attrib
      +{'href': 'http://diveintomark.org/archives/2009/03/21/accessibility-is-a-harsh-mistress', 'type': 'text/html', 'rel': 'alternate'}
      +>>> all_links[4].attrib
      +{'href': 'http://diveintomark.org/archives/2008/12/18/give-part-1-container-formats', 'type': 'text/html', 'rel': 'alternate'}
      +
      + +

      Going Further With lxml

      + +

      FIXME + +

      +>>> from lxml import etree
      +.
      +.  FIXME (show how it's a drop-in replacement for everything we've done so far)
      +.
      +
      +# From here on out, use lxml.etree explicitly, because these functions are specific to lxml.
      +>>> import lxml.etree
      +>>> nsmap = {"atom": "http://www.w3.org/2005/Atom"}
      +>>> tree = lxml.etree.parse("examples/feed.xml")
      +>>> entries = tree.xpath("//atom:category[@term='accessibility']/..", namespaces=nsmap)
      +>>> entries
      +[<Element {http://www.w3.org/2005/Atom}entry at e2b630>]
      +>>> entry = entries[0]
      +>>> entry.xpath("./atom:title/text()", namespaces=nsmap)
      +['Accessibility is a harsh mistress']
      +
      + +

      Customizing Your XML Parser

      + +

      FIXME + +

      +>>> import lxml.etree
      +>>> parser = lxml.etree.XMLParser(no_network=True, ns_clean=True, recover=True, remove_blank_text=True, remove_comments=True)
      +>>> tree = lxml.etree.parse("examples/feed.xml", parser)
      +
      + +

      Incremental Parsing

      + +

      FIXME + +

      Generating XML

      + +

      FIXME + +

      +>>> import lxml.etree
      +>>> new_feed = lxml.etree.Element("{http://www.w3.org/2005/Atom}feed", attrib={"{http://www.w3.org/XML/1998/namespace}lang": "en"})
      +>>> print(lxml.etree.tounicode(new_feed))
      +<ns0:feed xmlns:ns0="http://www.w3.org/2005/Atom" xml:lang="en"/>
      +
      + +

      FIXME + +

      +>>> import lxml.etree
      +>>> new_feed = lxml.etree.Element("feed", nsmap=NSMAP)
      +>>> print(lxml.etree.tounicode(new_feed))
      +<feed xmlns="http://www.w3.org/2005/Atom"/>
      +>>> new_feed.set("{http://www.w3.org/XML/1998/namespace}lang", "en")
      +>>> print(lxml.etree.tounicode(new_feed))
      +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"/>
      +
      + +

      FIXME + +

      +>>> title = lxml.etree.SubElement(new_feed, "title", attrib={"type":"html"})
      +>>> print(lxml.etree.tounicode(new_feed))
      +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html"/></feed>
      +>>> title.text = "dive into mark"
      +>>> print(lxml.etree.tounicode(new_feed))
      +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title type="html">dive into mark</title></feed>
      +>>> print(lxml.etree.tounicode(new_feed, pretty_print=True))
      +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
      +  <title type="html">dive into mark</title>
      +</feed>
      +
      + +

      Further Reading

      + + + +

      © 2001–9 Mark Pilgrim + +