diff --git a/requests_html.py b/requests_html.py index 10e1e6c..c90ce1a 100644 --- a/requests_html.py +++ b/requests_html.py @@ -12,17 +12,17 @@ from parse import findall useragent = UserAgent() -class Element: - """An element of HTML.""" - def __init__(self, element): +class BaseParser: + """docstring for BaseParser""" + def __init__(self, *, url, element, html=None): self.element = element + self.url = url + self.skip_anchors = True - def __repr__(self): - attrs = [] - for attr in self.attrs: - attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) - - return "".format(repr(self.element.tag), ' '.join(attrs)) + if not html: + self.html = etree.tostring(self.element).decode('utf-8').strip() + else: + self.html = html @property def pq(self): @@ -31,12 +31,10 @@ class Element: @property def lxml(self): - return fromstring(self.html) - - @property - def attrs(self): - """Returns a dictionary of the attributes of the element.""" - return {k: self.pq.attr[k] for k in self.element.keys()} + if self.element: + return self.element + else: + return fromstring(self.html) @property def text(self): @@ -48,16 +46,11 @@ class Element: """The full text content (including links) of the element.""" return self.pq.text_content() - @property - def html(self): - """HTML representation of the element.""" - return etree.tostring(self.element).decode('utf-8').strip() - def find(self, selector, first=False): """Given a jQuery selector, returns a list of element objects.""" def gen(): for found in self.pq(selector): - yield Element(found) + yield Element(element=found, url=self.url) c = [g for g in gen()] @@ -71,7 +64,7 @@ class Element: def xpath(self, selector): """Given an XPath selector, returns a list of element objects.""" - return [Element(e) for e in self.lxml.xpath(selector)] + return [Element(element=e, url=self.url) for e in self.lxml.xpath(selector)] def search(self, template): """Searches the element for the given parse template.""" @@ -83,41 +76,6 @@ class Element: """ return [r for r in findall(template, self.html)] - -class HTML: - """An HTML document.""" - def __init__(self, response): - self.html = response.text - self.url = response.url - self.skip_anchors = True - - def __repr__(self): - return "".format(repr(self.url)) - - def find(self, selector, first=False): - """Given a jQuery selector, returns a list of element objects.""" - def gen(): - for found in self.pq(selector): - yield Element(found) - - c = [g for g in gen()] - - if first: - try: - return c[0] - except IndexError: - return None - else: - return c - - def search(self, template): - """Searches the page for the given parse template.""" - return parse_search(template, self.html) - - def search_all(self, template): - """Searches the page (multiple times) for the given parse template.""" - return [r for r in findall(template, self.html)] - @property def links(self): """All found links on page, in as–is form.""" @@ -132,15 +90,6 @@ class HTML: return set(g for g in gen()) - @property - def base_url(self): - """The base URL for the page.""" - url = '/'.join(self.url.split('/')[:-1]) - if url.endswith('/'): - url = url[:-1] - - return url - @property def absolute_links(self): """All found links on page, in absolute form.""" @@ -160,18 +109,50 @@ class HTML: return set(g for g in gen()) @property - def pq(self): - """PyQuery representation of the page.""" - return PyQuery(self.html) + def base_url(self): + """The base URL for the page.""" + url = '/'.join(self.url.split('/')[:-1]) + if url.endswith('/'): + url = url[:-1] + + return url + + +class Element(BaseParser): + """An element of HTML.""" + def __init__(self, *, element, url): + super(Element, self).__init__(element=element, url=url) + self.element = element + + def __repr__(self): + attrs = [] + for attr in self.attrs: + attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) + + return "".format(repr(self.element.tag), ' '.join(attrs)) @property - def lxml(self): - """Etree representation of the page.""" - return fromstring(self.html) + def attrs(self): + """Returns a dictionary of the attributes of the element.""" + attrs = {k: self.pq.attr[k] for k in self.element.keys()} - def xpath(self, selector): - """Given an XPath selector, returns a list of element objects.""" - return [Element(e) for e in self.lxml.xpath(selector)] + # Split class up, as there are ussually many of them: + if 'class' in attrs: + attrs['class'] = tuple(attrs['class'].split()) + return attrs + + +class HTML(BaseParser): + """An HTML document.""" + def __init__(self, *, response): + super(HTML, self).__init__( + element=fromstring(self.html), + html=response.text, + url=response.url + ) + + def __repr__(self): + return "".format(repr(self.url)) def _handle_response(response, **kwargs): @@ -179,7 +160,7 @@ def _handle_response(response, **kwargs): objects. """ - response.html = HTML(response) + response.html = HTML(response=response) return response