diff --git a/README.rst b/README.rst index a650ebc..9eca455 100644 --- a/README.rst +++ b/README.rst @@ -61,6 +61,13 @@ Usage >>> r.html.search('Python is a {} language')[0] programming +More complex CSS Selector example: + + >>> r = session.get('https://github.com/') + >>> sel = 'body > div.application-main > div.jumbotron.jumbotron-codelines > div > div > div.col-md-7.text-center.text-md-left > p' + >>> print(r.html.find(sel)[0].text) + GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers. + Installation ============ diff --git a/requests_html.py b/requests_html.py index 4649739..0988a78 100644 --- a/requests_html.py +++ b/requests_html.py @@ -1,11 +1,12 @@ -from tempfile import TemporaryFile +from io import StringIO import html2text import requests from pyquery import PyQuery from fake_useragent import UserAgent -from lxml.etree import tostring +from lxml import etree +from lxml.html.soupparser import fromstring from parse import search as parse_search from parse import findall @@ -16,27 +17,17 @@ useragent = UserAgent() # xpath support next. # parse support. 
-class Element: - """An element of HTML.""" - def __init__(self, element): - self.element = element - - def __repr__(self): - attrs = [] - for attr in self.attrs: - attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) - - return "".format(repr(self.element.tag), ' '.join(attrs)) - +class BaseParser: + """Parsing helpers (CSS/XPath lookup, template search, links) shared by HTML and Element.""" @property def pq(self): - """PyQuery representation of the element.""" - return PyQuery(self.element) + """PyQuery representation of the page.""" + return PyQuery(self.html) @property - def attrs(self): - """Returns a dictionary of the attributes of the element.""" - return {k: self.pq.attr[k] for k in self.element.keys()} + def lxml(self): + """Etree representation of the page.""" + return fromstring(self.html) @property def text(self): @@ -56,7 +47,7 @@ class Element: @property def html(self): """HTML representation of the element.""" - return tostring(self.element).decode('utf-8').strip() + return etree.tostring(self.element).decode('utf-8').strip() def find(self, selector): """Given a jQuery selector, returns a list of element objects.""" @@ -66,6 +57,10 @@ class Element: return [g for g in gen()] + def xpath(self, selector): + """Given an XPath selector, returns a list of element objects.""" + return [Element(e) for e in self.lxml.xpath(selector)] + def search(self, template): """Searches the element for the given parse template.""" return parse_search(template, self.html) @@ -76,38 +71,6 @@ class Element: """ return [r for r in findall(template, self.html)] - -class HTML: - """An HTML document.""" - def __init__(self, response): - self.html = response.text - self.url = response.url - self.skip_anchors = True - - def __repr__(self): - return "".format(repr(self.url)) - - def find(self, selector): - """Given a jQuery selector, returns a list of element objects.""" - def gen(): - for found in self.pq(selector): - yield Element(found) - - return [g for g in gen()] - - def search(self, template): - """Searches the page for the 
given parse template.""" - return parse_search(template, self.html) - - def search_all(self, template): - """Searches the page (multiple times) for the given parse template.""" - return [r for r in findall(template, self.html)] - - @property - def markdown(self): - """Markdown representation of the page.""" - return html2text.handle(self.html) - @property def links(self): """All found links on page, in as–is form.""" @@ -122,6 +85,36 @@ class HTML: return set(g for g in gen()) + +class Element(BaseParser): + """An element of HTML.""" + def __init__(self, element): + self.element = element + + def __repr__(self): + attrs = [] + for attr in self.attrs: + attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) + + return "".format(repr(self.element.tag), ' '.join(attrs)) + + @property + def attrs(self): + """Returns a dictionary of the attributes of the element.""" + return {k: self.pq.attr[k] for k in self.element.keys()} + + +class HTML(BaseParser): + """An HTML document.""" + + def __init__(self, response): + self.html = response.text + self.url = response.url + self.skip_anchors = True + + def __repr__(self): + return "".format(repr(self.url)) + @property def base_url(self): """The base URL for the page.""" @@ -149,11 +142,6 @@ class HTML: return set(g for g in gen()) - @property - def pq(self): - """PyQuery representation of the page.""" - return PyQuery(self.html) - def _handle_response(response, **kwargs): """Requests HTTP Response handler. Attaches .html property to Response @@ -174,6 +162,7 @@ def user_agent(style=None): else: return useragent[style] + def get_session(mock_browser=True): """Returns a consumable session, for cookie persistience and connection pooling, amongst other things. @@ -191,4 +180,5 @@ def get_session(mock_browser=True): return session + session = get_session()