mirror of
https://github.com/kennethreitz/requests-html.git
synced 2026-06-05 23:00:20 +00:00
+65
-37
@@ -17,18 +17,32 @@ useragent = UserAgent()
|
||||
# xpath support next.
|
||||
# parse support.
|
||||
|
||||
class BaseParser:
|
||||
"""docstring for BaseParser"""
|
||||
class Element:
|
||||
"""An element of HTML."""
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
|
||||
def __repr__(self):
|
||||
attrs = []
|
||||
for attr in self.attrs:
|
||||
attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
|
||||
|
||||
return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))
|
||||
|
||||
@property
|
||||
def pq(self):
|
||||
"""PyQuery representation of the page."""
|
||||
return PyQuery(self.html)
|
||||
"""PyQuery representation of the element."""
|
||||
return PyQuery(self.element)
|
||||
|
||||
@property
|
||||
def lxml(self):
|
||||
"""Etree representation of the page."""
|
||||
return fromstring(self.html)
|
||||
|
||||
@property
|
||||
def attrs(self):
|
||||
"""Returns a dictionary of the attributes of the element."""
|
||||
return {k: self.pq.attr[k] for k in self.element.keys()}
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
"""The text content of the element."""
|
||||
@@ -71,6 +85,38 @@ class BaseParser:
|
||||
"""
|
||||
return [r for r in findall(template, self.html)]
|
||||
|
||||
|
||||
class HTML:
|
||||
"""An HTML document."""
|
||||
def __init__(self, response):
|
||||
self.html = response.text
|
||||
self.url = response.url
|
||||
self.skip_anchors = True
|
||||
|
||||
def __repr__(self):
|
||||
return "<HTML url={}>".format(repr(self.url))
|
||||
|
||||
def find(self, selector):
|
||||
"""Given a jQuery selector, returns a list of element objects."""
|
||||
def gen():
|
||||
for found in self.pq(selector):
|
||||
yield Element(found)
|
||||
|
||||
return [g for g in gen()]
|
||||
|
||||
def search(self, template):
|
||||
"""Searches the page for the given parse template."""
|
||||
return parse_search(template, self.html)
|
||||
|
||||
def search_all(self, template):
|
||||
"""Searches the page (multiple times) for the given parse template."""
|
||||
return [r for r in findall(template, self.html)]
|
||||
|
||||
@property
|
||||
def markdown(self):
|
||||
"""Markdown representation of the page."""
|
||||
return html2text.handle(self.html)
|
||||
|
||||
@property
|
||||
def links(self):
|
||||
"""All found links on page, in as–is form."""
|
||||
@@ -85,36 +131,6 @@ class BaseParser:
|
||||
|
||||
return set(g for g in gen())
|
||||
|
||||
|
||||
class Element(BaseParser):
|
||||
"""An element of HTML."""
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
|
||||
def __repr__(self):
|
||||
attrs = []
|
||||
for attr in self.attrs:
|
||||
attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
|
||||
|
||||
return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))
|
||||
|
||||
@property
|
||||
def attrs(self):
|
||||
"""Returns a dictionary of the attributes of the element."""
|
||||
return {k: self.pq.attr[k] for k in self.element.keys()}
|
||||
|
||||
|
||||
class HTML(BaseParser):
|
||||
"""An HTML document."""
|
||||
|
||||
def __init__(self, response):
|
||||
self.html = response.text
|
||||
self.url = response.url
|
||||
self.skip_anchors = True
|
||||
|
||||
def __repr__(self):
|
||||
return "<HTML url={}>".format(repr(self.url))
|
||||
|
||||
@property
|
||||
def base_url(self):
|
||||
"""The base URL for the page."""
|
||||
@@ -142,6 +158,20 @@ class HTML(BaseParser):
|
||||
|
||||
return set(g for g in gen())
|
||||
|
||||
@property
|
||||
def pq(self):
|
||||
"""PyQuery representation of the page."""
|
||||
return PyQuery(self.html)
|
||||
|
||||
@property
|
||||
def lxml(self):
|
||||
"""Etree representation of the page."""
|
||||
return fromstring(self.html)
|
||||
|
||||
def xpath(self, selector):
|
||||
"""Given an XPath selector, returns a list of element objects."""
|
||||
return [Element(e) for e in self.lxml.xpath(selector)]
|
||||
|
||||
|
||||
def _handle_response(response, **kwargs):
|
||||
"""Requests HTTP Response handler. Attaches .html property to Response
|
||||
@@ -162,7 +192,6 @@ def user_agent(style=None):
|
||||
else:
|
||||
return useragent[style]
|
||||
|
||||
|
||||
def get_session(mock_browser=True):
|
||||
"""Returns a consumable session, for cookie persistience and connection
|
||||
pooling, amongst other things.
|
||||
@@ -180,5 +209,4 @@ def get_session(mock_browser=True):
|
||||
|
||||
return session
|
||||
|
||||
|
||||
session = get_session()
|
||||
|
||||
@@ -17,11 +17,11 @@ DESCRIPTION = 'HTML Parsing for Humans.'
|
||||
URL = 'https://github.com/requests/requests'
|
||||
EMAIL = 'me@kennethreitz.org'
|
||||
AUTHOR = 'Kenneth Reitz'
|
||||
VERSION = '0.1.0'
|
||||
VERSION = '0.1.1'
|
||||
|
||||
# What packages are required for this module to be executed?
|
||||
REQUIRED = [
|
||||
'requests', 'pyquery', 'html2text', 'fake-useragent', 'parse'
|
||||
'requests', 'pyquery', 'html2text', 'fake-useragent', 'parse', 'bs4'
|
||||
]
|
||||
|
||||
# The rest you shouldn't have to touch too much :)
|
||||
|
||||
Reference in New Issue
Block a user