enhancements

Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
This commit is contained in:
2018-02-25 09:33:35 -05:00
parent 6b66b61108
commit 100a9d8a00
2 changed files with 54 additions and 57 deletions
+7
View File
@@ -61,6 +61,13 @@ Usage
>>> r.html.search('Python is a {} language')[0]
programming
More complex CSS Selector example:
>>> r = session.get('https://github.com/')
>>> sel = 'body > div.application-main > div.jumbotron.jumbotron-codelines > div > div > div.col-md-7.text-center.text-md-left > p'
>>> print(r.html.find(sel)[0].text)
GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers.
Installation
============
+47 -57
View File
@@ -1,11 +1,12 @@
from tempfile import TemporaryFile
from io import StringIO
import html2text
import requests
from pyquery import PyQuery
from fake_useragent import UserAgent
from lxml.etree import tostring
from lxml import etree
from lxml.html.soupparser import fromstring
from parse import search as parse_search
from parse import findall
@@ -16,27 +17,17 @@ useragent = UserAgent()
# xpath support next.
# parse support.
class Element:
"""An element of HTML."""
def __init__(self, element):
self.element = element
def __repr__(self):
attrs = []
for attr in self.attrs:
attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))
class BaseParser:
"""Parsing helpers shared by the Element and HTML classes."""
@property
def pq(self):
"""PyQuery representation of the element."""
return PyQuery(self.element)
"""PyQuery representation of the page."""
return PyQuery(self.html)
@property
def attrs(self):
"""Returns a dictionary of the attributes of the element."""
return {k: self.pq.attr[k] for k in self.element.keys()}
def lxml(self):
"""Etree representation of the page."""
return fromstring(self.html)
@property
def text(self):
@@ -56,7 +47,7 @@ class Element:
@property
def html(self):
"""HTML representation of the element."""
return tostring(self.element).decode('utf-8').strip()
return etree.tostring(self.element).decode('utf-8').strip()
def find(self, selector):
"""Given a jQuery selector, returns a list of element objects."""
@@ -66,6 +57,10 @@ class Element:
return [g for g in gen()]
def xpath(self, selector):
    """Evaluate an XPath selector and wrap each result in an ``Element``."""
    # NOTE(review): XPath expressions that select attributes or text can
    # yield plain strings instead of nodes; every result is wrapped
    # regardless -- confirm that is the intended behaviour.
    matches = self.lxml.xpath(selector)
    return [Element(match) for match in matches]
def search(self, template):
    """Run the given parse template against this object's HTML."""
    markup = self.html
    return parse_search(template, markup)
@@ -76,38 +71,6 @@ class Element:
"""
return [r for r in findall(template, self.html)]
class HTML:
"""An HTML document."""
def __init__(self, response):
self.html = response.text
self.url = response.url
self.skip_anchors = True
def __repr__(self):
return "<HTML url={}>".format(repr(self.url))
def find(self, selector):
"""Given a jQuery selector, returns a list of element objects."""
def gen():
for found in self.pq(selector):
yield Element(found)
return [g for g in gen()]
def search(self, template):
"""Searches the page for the given parse template."""
return parse_search(template, self.html)
def search_all(self, template):
"""Searches the page (multiple times) for the given parse template."""
return [r for r in findall(template, self.html)]
@property
def markdown(self):
    """Markdown rendering of this object's HTML."""
    # NOTE(review): this requires ``html2text`` to expose a ``handle``
    # callable. The html2text package documents ``HTML2Text().handle(...)``
    # and ``html2text.html2text(...)`` -- confirm what the name is bound to
    # in this module, otherwise this line raises AttributeError.
    markup = self.html
    return html2text.handle(markup)
@property
def links(self):
"""All found links on page, in as-is form."""
@@ -122,6 +85,36 @@ class HTML:
return set(g for g in gen())
class Element(BaseParser):
    """An element of HTML.

    Wraps a single parsed node; the parsing helpers are inherited from
    ``BaseParser``.
    """

    def __init__(self, element):
        """Store the wrapped node.

        :param element: the node to wrap; ``__repr__`` reads its ``tag`` and
            ``attrs`` reads its ``keys()``.
        """
        self.element = element

    def __repr__(self):
        """Return ``<Element 'tag' name='value' ...>`` for debugging."""
        # ``attrs`` is a property that builds a fresh dictionary on every
        # access, so read it exactly once instead of once per attribute.
        attrs = [
            '{}={}'.format(name, repr(value))
            for name, value in self.attrs.items()
        ]
        return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))

    @property
    def attrs(self):
        """Returns a dictionary of the attributes of the element."""
        keys = self.element.keys()
        if not keys:
            # Nothing to look up: do not touch ``self.pq`` at all, exactly
            # as a comprehension over an empty key list would not.
            return {}
        # Evaluate the ``pq`` property once rather than once per key.
        pq = self.pq
        return {k: pq.attr[k] for k in keys}
class HTML(BaseParser):
"""An HTML document."""
def __init__(self, response):
    """Record the text and URL of *response* and enable ``skip_anchors``."""
    # NOTE(review): ``html`` also appears to be defined as a read-only
    # property on BaseParser in this revision; if so, the assignment to
    # ``self.html`` below raises AttributeError -- confirm against the
    # full file.
    body, address = response.text, response.url
    self.html = body
    self.url = address
    self.skip_anchors = True
def __repr__(self):
    """Return the debug form ``<HTML url=...>`` using the repr of the URL."""
    shown_url = repr(self.url)
    return "<HTML url={}>".format(shown_url)
@property
def base_url(self):
"""The base URL for the page."""
@@ -149,11 +142,6 @@ class HTML:
return set(g for g in gen())
@property
def pq(self):
"""PyQuery representation of the page."""
return PyQuery(self.html)
def _handle_response(response, **kwargs):
"""Requests HTTP Response handler. Attaches .html property to Response
@@ -174,6 +162,7 @@ def user_agent(style=None):
else:
return useragent[style]
def get_session(mock_browser=True):
"""Returns a consumable session, for cookie persistence and connection
pooling, amongst other things.
@@ -191,4 +180,5 @@ def get_session(mock_browser=True):
return session
session = get_session()