diff --git a/requests_html.py b/requests_html.py
index 10e1e6c..c90ce1a 100644
--- a/requests_html.py
+++ b/requests_html.py
@@ -12,17 +12,17 @@ from parse import findall
useragent = UserAgent()
-class Element:
- """An element of HTML."""
- def __init__(self, element):
+class BaseParser:
+ """Base for Element and HTML: stores the parsed element, its source URL and its HTML text."""
+ def __init__(self, *, url, element, html=None):
self.element = element
+ self.url = url
+ self.skip_anchors = True
- def __repr__(self):
- attrs = []
- for attr in self.attrs:
- attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
-
- return "".format(repr(self.element.tag), ' '.join(attrs))
+ if not html:  # no (or empty) markup supplied: serialize the element instead
+ self.html = etree.tostring(self.element).decode('utf-8').strip()
+ else:
+ self.html = html
@property
def pq(self):
@@ -31,12 +31,10 @@ class Element:
@property
def lxml(self):
- return fromstring(self.html)
-
- @property
- def attrs(self):
- """Returns a dictionary of the attributes of the element."""
- return {k: self.pq.attr[k] for k in self.element.keys()}
+ if self.element is not None:  # identity test: an lxml element with no children tests false
+ return self.element
+ else:  # no element stored: parse the HTML text instead
+ return fromstring(self.html)
@property
def text(self):
@@ -48,16 +46,11 @@ class Element:
"""The full text content (including links) of the element."""
return self.pq.text_content()
- @property
- def html(self):
- """HTML representation of the element."""
- return etree.tostring(self.element).decode('utf-8').strip()
-
def find(self, selector, first=False):
"""Given a jQuery selector, returns a list of element objects."""
def gen():
for found in self.pq(selector):
- yield Element(found)
+ yield Element(element=found, url=self.url)
c = [g for g in gen()]
@@ -71,7 +64,7 @@ class Element:
def xpath(self, selector):
"""Given an XPath selector, returns a list of element objects."""
- return [Element(e) for e in self.lxml.xpath(selector)]
+ return [Element(element=e, url=self.url) for e in self.lxml.xpath(selector)]  # each match keeps this parser's URL
def search(self, template):
"""Searches the element for the given parse template."""
@@ -83,41 +76,6 @@ class Element:
"""
return [r for r in findall(template, self.html)]
-
-class HTML:
- """An HTML document."""
- def __init__(self, response):
- self.html = response.text
- self.url = response.url
- self.skip_anchors = True
-
- def __repr__(self):
- return "".format(repr(self.url))
-
- def find(self, selector, first=False):
- """Given a jQuery selector, returns a list of element objects."""
- def gen():
- for found in self.pq(selector):
- yield Element(found)
-
- c = [g for g in gen()]
-
- if first:
- try:
- return c[0]
- except IndexError:
- return None
- else:
- return c
-
- def search(self, template):
- """Searches the page for the given parse template."""
- return parse_search(template, self.html)
-
- def search_all(self, template):
- """Searches the page (multiple times) for the given parse template."""
- return [r for r in findall(template, self.html)]
-
@property
def links(self):
"""All found links on page, in as–is form."""
@@ -132,15 +90,6 @@ class HTML:
return set(g for g in gen())
- @property
- def base_url(self):
- """The base URL for the page."""
- url = '/'.join(self.url.split('/')[:-1])
- if url.endswith('/'):
- url = url[:-1]
-
- return url
-
@property
def absolute_links(self):
"""All found links on page, in absolute form."""
@@ -160,18 +109,50 @@ class HTML:
return set(g for g in gen())
@property
- def pq(self):
- """PyQuery representation of the page."""
- return PyQuery(self.html)
+ def base_url(self):
+ """The base URL for the page: self.url without its last '/'-separated segment."""
+ url = '/'.join(self.url.split('/')[:-1])  # drop everything after the last '/'
+ if url.endswith('/'):  # the remainder may itself end in '/'
+ url = url[:-1]  # remove that single trailing slash
+
+ return url  # NOTE(review): a URL with no path ('https://host') reduces to 'https:' - confirm intended
+
+
+class Element(BaseParser):
+ """An HTML element paired with the URL it was parsed from."""
+ def __init__(self, *, element, url):
+ super(Element, self).__init__(element=element, url=url)
+ self.element = element
+
+ def __repr__(self):
+ # Read the property once: each access rebuilds the dict through PyQuery.
+ found = self.attrs
+ attrs = ['{}={}'.format(name, repr(found[name])) for name in found]
+
+ return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))
@property
- def lxml(self):
- """Etree representation of the page."""
- return fromstring(self.html)
+ def attrs(self):
+ """Returns a dictionary of the element's attributes; 'class' is split into a tuple of names."""
+ attrs = {k: self.pq.attr[k] for k in self.element.keys()}
- def xpath(self, selector):
- """Given an XPath selector, returns a list of element objects."""
- return [Element(e) for e in self.lxml.xpath(selector)]
+ # Split class up, as there are usually many of them:
+ if 'class' in attrs:
+ attrs['class'] = tuple(attrs['class'].split())
+ return attrs
+
+
+class HTML(BaseParser):
+ """An HTML document built from a response's text and URL."""
+ def __init__(self, *, response):
+ super(HTML, self).__init__(
+ element=fromstring(response.text),  # self.html is only set inside BaseParser.__init__
+ html=response.text,
+ url=response.url
+ )
+
+ def __repr__(self):
+ return "<HTML url={}>".format(repr(self.url))
def _handle_response(response, **kwargs):
@@ -179,7 +160,7 @@ def _handle_response(response, **kwargs):
objects.
"""
- response.html = HTML(response)
+ response.html = HTML(response=response)
return response