From d0887939b3de00607322dcb022fdbd25ed639482 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Sun, 25 Feb 2018 09:40:49 -0500 Subject: [PATCH] fixes Signed-off-by: Kenneth Reitz --- requests_html.py | 102 ++++++++++++++++++++++++++++++----------------- setup.py | 4 +- 2 files changed, 67 insertions(+), 39 deletions(-) diff --git a/requests_html.py b/requests_html.py index 0988a78..5135697 100644 --- a/requests_html.py +++ b/requests_html.py @@ -17,18 +17,32 @@ useragent = UserAgent() # xpath support next. # parse support. -class BaseParser: - """docstring for BaseParser""" +class Element: + """An element of HTML.""" + def __init__(self, element): + self.element = element + + def __repr__(self): + attrs = [] + for attr in self.attrs: + attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) + + return "".format(repr(self.element.tag), ' '.join(attrs)) + @property def pq(self): - """PyQuery representation of the page.""" - return PyQuery(self.html) + """PyQuery representation of the element.""" + return PyQuery(self.element) @property def lxml(self): - """Etree representation of the page.""" return fromstring(self.html) + @property + def attrs(self): + """Returns a dictionary of the attributes of the element.""" + return {k: self.pq.attr[k] for k in self.element.keys()} + @property def text(self): """The text content of the element.""" @@ -71,6 +85,38 @@ class BaseParser: """ return [r for r in findall(template, self.html)] + +class HTML: + """An HTML document.""" + def __init__(self, response): + self.html = response.text + self.url = response.url + self.skip_anchors = True + + def __repr__(self): + return "".format(repr(self.url)) + + def find(self, selector): + """Given a jQuery selector, returns a list of element objects.""" + def gen(): + for found in self.pq(selector): + yield Element(found) + + return [g for g in gen()] + + def search(self, template): + """Searches the page for the given parse template.""" + return parse_search(template, self.html) + + def search_all(self, template): + """Searches the page (multiple times) for the given parse template.""" + return [r for r in findall(template, self.html)] + + @property + def markdown(self): + """Markdown representation of the page.""" + return html2text.handle(self.html) + @property def links(self): """All found links on page, in as–is form.""" @@ -85,36 +131,6 @@ class BaseParser: return set(g for g in gen()) - -class Element(BaseParser): - """An element of HTML.""" - def __init__(self, element): - self.element = element - - def __repr__(self): - attrs = [] - for attr in self.attrs: - attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) - - return "".format(repr(self.element.tag), ' '.join(attrs)) - - @property - def attrs(self): - """Returns a dictionary of the attributes of the element.""" - return {k: self.pq.attr[k] for k in self.element.keys()} - - -class HTML(BaseParser): - """An HTML document.""" - - def __init__(self, response): - self.html = response.text - self.url = response.url - self.skip_anchors = True - - def __repr__(self): - return "".format(repr(self.url)) - @property def base_url(self): """The base URL for the page.""" @@ -142,6 +158,20 @@ class HTML(BaseParser): return set(g for g in gen()) + @property + def pq(self): + """PyQuery representation of the page.""" + return PyQuery(self.html) + + @property + def lxml(self): + """Etree representation of the page.""" + return fromstring(self.html) + + def xpath(self, selector): + """Given an XPath selector, returns a list of element objects.""" + return [Element(e) for e in self.lxml.xpath(selector)] + def _handle_response(response, **kwargs): """Requests HTTP Response handler. Attaches .html property to Response @@ -162,7 +192,6 @@ def user_agent(style=None): else: return useragent[style] - def get_session(mock_browser=True): """Returns a consumable session, for cookie persistience and connection pooling, amongst other things. @@ -180,5 +209,4 @@ def get_session(mock_browser=True): return session - session = get_session() diff --git a/setup.py b/setup.py index 6a083b3..de535c8 100644 --- a/setup.py +++ b/setup.py @@ -17,11 +17,11 @@ DESCRIPTION = 'HTML Parsing for Humans.' URL = 'https://github.com/requests/requests' EMAIL = 'me@kennethreitz.org' AUTHOR = 'Kenneth Reitz' -VERSION = '0.1.0' +VERSION = '0.1.1' # What packages are required for this module to be executed? REQUIRED = [ - 'requests', 'pyquery', 'html2text', 'fake-useragent', 'parse' + 'requests', 'pyquery', 'html2text', 'fake-useragent', 'parse', 'bs4' ] # The rest you shouldn't have to touch too much :)