base_class

Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
2026-06-05 23:00:20 +00:00 · 2018-02-26 07:54:17 -05:00
parent aab0dad8bb
commit 105c2e890e
1 changed files with 57 additions and 76 deletions
@@ -12,17 +12,17 @@ from parse import findall
 useragent = UserAgent()


-class Element:
-    """An element of HTML."""
-    def __init__(self, element):
+class BaseParser:
+    """docstring for BaseParser"""
+    def __init__(self, *, url, element, html=None):
        self.element = element
+        self.url = url
+        self.skip_anchors = True

-    def __repr__(self):
-        attrs = []
-        for attr in self.attrs:
-            attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
-
-        return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))
+        if not html:
+            self.html = etree.tostring(self.element).decode('utf-8').strip()
+        else:
+            self.html = html

    @property
    def pq(self):
@@ -31,12 +31,10 @@ class Element:

    @property
    def lxml(self):
-        return fromstring(self.html)
-
-    @property
-    def attrs(self):
-        """Returns a dictionary of the attributes of the element."""
-        return {k: self.pq.attr[k] for k in self.element.keys()}
+        if self.element:
+            return self.element
+        else:
+            return fromstring(self.html)

    @property
    def text(self):
@@ -48,16 +46,11 @@ class Element:
        """The full text content (including links) of the element."""
        return self.pq.text_content()

-    @property
-    def html(self):
-        """HTML representation of the element."""
-        return etree.tostring(self.element).decode('utf-8').strip()
-
    def find(self, selector, first=False):
        """Given a jQuery selector, returns a list of element objects."""
        def gen():
            for found in self.pq(selector):
-                yield Element(found)
+                yield Element(element=found, url=self.url)

        c = [g for g in gen()]

@@ -71,7 +64,7 @@ class Element:

    def xpath(self, selector):
        """Given an XPath selector, returns a list of element objects."""
-        return [Element(e) for e in self.lxml.xpath(selector)]
+        return [Element(element=e, url=self.url) for e in self.lxml.xpath(selector)]

    def search(self, template):
        """Searches the element for the given parse template."""
@@ -83,41 +76,6 @@ class Element:
        """
        return [r for r in findall(template, self.html)]

-
-class HTML:
-    """An HTML document."""
-    def __init__(self, response):
-        self.html = response.text
-        self.url = response.url
-        self.skip_anchors = True
-
-    def __repr__(self):
-        return "<HTML url={}>".format(repr(self.url))
-
-    def find(self, selector, first=False):
-        """Given a jQuery selector, returns a list of element objects."""
-        def gen():
-            for found in self.pq(selector):
-                yield Element(found)
-
-        c = [g for g in gen()]
-
-        if first:
-            try:
-                return c[0]
-            except IndexError:
-                return None
-        else:
-            return c
-
-    def search(self, template):
-        """Searches the page for the given parse template."""
-        return parse_search(template, self.html)
-
-    def search_all(self, template):
-        """Searches the page (multiple times) for the given parse template."""
-        return [r for r in findall(template, self.html)]
-
    @property
    def links(self):
        """All found links on page, in as–is form."""
@@ -132,15 +90,6 @@ class HTML:

        return set(g for g in gen())

-    @property
-    def base_url(self):
-        """The base URL for the page."""
-        url = '/'.join(self.url.split('/')[:-1])
-        if url.endswith('/'):
-            url = url[:-1]
-
-        return url
-
    @property
    def absolute_links(self):
        """All found links on page, in absolute form."""
@@ -160,18 +109,50 @@ class HTML:
        return set(g for g in gen())

    @property
-    def pq(self):
-        """PyQuery representation of the page."""
-        return PyQuery(self.html)
+    def base_url(self):
+        """The base URL for the page."""
+        url = '/'.join(self.url.split('/')[:-1])
+        if url.endswith('/'):
+            url = url[:-1]
+
+        return url
+
+
+class Element(BaseParser):
+    """An element of HTML."""
+    def __init__(self, *, element, url):
+        super(Element, self).__init__(element=element, url=url)
+        self.element = element
+
+    def __repr__(self):
+        attrs = []
+        for attr in self.attrs:
+            attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
+
+        return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))

    @property
-    def lxml(self):
-        """Etree representation of the page."""
-        return fromstring(self.html)
+    def attrs(self):
+        """Returns a dictionary of the attributes of the element."""
+        attrs = {k: self.pq.attr[k] for k in self.element.keys()}

-    def xpath(self, selector):
-        """Given an XPath selector, returns a list of element objects."""
-        return [Element(e) for e in self.lxml.xpath(selector)]
+        # Split class up, as there are ussually many of them:
+        if 'class' in attrs:
+            attrs['class'] = tuple(attrs['class'].split())
+        return attrs
+
+
+class HTML(BaseParser):
+    """An HTML document."""
+    def __init__(self, *, response):
+        super(HTML, self).__init__(
+            element=fromstring(self.html),
+            html=response.text,
+            url=response.url
+        )
+
+    def __repr__(self):
+        return "<HTML url={}>".format(repr(self.url))


 def _handle_response(response, **kwargs):
@@ -179,7 +160,7 @@ def _handle_response(response, **kwargs):
    objects.
    """

-    response.html = HTML(response)
+    response.html = HTML(response=response)
    return response