diff --git a/requests_html.py b/requests_html.py
index 0988a78..5135697 100644
--- a/requests_html.py
+++ b/requests_html.py
@@ -17,18 +17,32 @@ useragent = UserAgent()
# xpath support next.
# parse support.
-class BaseParser:
- """docstring for BaseParser"""
+class Element:
+ """An element of HTML."""
+ def __init__(self, element):
+ self.element = element
+
+ def __repr__(self):
+ """Returns a tag-like summary: the element's tag followed by its attributes."""
+ attrs = self.attrs  # read the property once; every access rebuilds the dict
+ pairs = ['{}={}'.format(name, repr(attrs[name])) for name in attrs]
+
+ return "<Element {} {}>".format(repr(self.element.tag), ' '.join(pairs))
+
@property
def pq(self):
- """PyQuery representation of the page."""
- return PyQuery(self.html)
+ """PyQuery representation of the element."""
+ return PyQuery(self.element)
@property
def lxml(self):
- """Etree representation of the page."""
- return fromstring(self.html)
+ return self.element  # the wrapped node itself; Element.__init__ sets no self.html to re-parse
+ @property
+ def attrs(self):
+ """Returns a new dictionary of the element's attributes (name -> value, read through ``self.pq.attr``) on every access."""
+ return {k: self.pq.attr[k] for k in self.element.keys()}
+
@property
def text(self):
"""The text content of the element."""
@@ -71,6 +85,38 @@ class BaseParser:
"""
return [r for r in findall(template, self.html)]
+
+class HTML:
+ """An HTML document."""
+ def __init__(self, response):
+ self.html = response.text  # markup that pq, lxml, search, search_all and markdown all read
+ self.url = response.url  # shown by __repr__
+ self.skip_anchors = True  # NOTE(review): not read in the visible hunks; presumably used by link extraction (confirm)
+
+ def __repr__(self):
+ return "<HTML url={}>".format(repr(self.url))  # an empty template would silently drop the URL
+
+ def find(self, selector):
+ """Returns a list of Element objects, one per node matching the jQuery selector.
+
+ The selector is evaluated by ``self.pq``; every match is wrapped as-is.
+ """
+ matches = self.pq(selector)
+ return [Element(node) for node in matches]
+
+ def search(self, template):
+ """Searches the page HTML for the given parse template and returns the result of ``parse_search`` unchanged."""
+ return parse_search(template, self.html)
+
+ def search_all(self, template):
+ """Returns a list of every match of the given parse template in the page HTML."""
+ return list(findall(template, self.html))
+
+ @property
+ def markdown(self):
+ """Markdown representation of the page, via ``html2text.handle`` (assumes ``html2text`` is a converter instance rather than the module; confirm)."""
+ return html2text.handle(self.html)
+
@property
def links(self):
"""All found links on page, in as–is form."""
@@ -85,36 +131,6 @@ class BaseParser:
return set(g for g in gen())
-
-class Element(BaseParser):
- """An element of HTML."""
- def __init__(self, element):
- self.element = element
-
- def __repr__(self):
- attrs = []
- for attr in self.attrs:
- attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
-
- return "".format(repr(self.element.tag), ' '.join(attrs))
-
- @property
- def attrs(self):
- """Returns a dictionary of the attributes of the element."""
- return {k: self.pq.attr[k] for k in self.element.keys()}
-
-
-class HTML(BaseParser):
- """An HTML document."""
-
- def __init__(self, response):
- self.html = response.text
- self.url = response.url
- self.skip_anchors = True
-
- def __repr__(self):
- return "".format(repr(self.url))
-
@property
def base_url(self):
"""The base URL for the page."""
@@ -142,6 +158,20 @@ class HTML(BaseParser):
return set(g for g in gen())
+ @property
+ def pq(self):
+ """PyQuery representation of the page, built from ``self.html`` on each access."""
+ return PyQuery(self.html)
+
+ @property
+ def lxml(self):
+ """Etree representation of the page, built from ``self.html`` on each access."""
+ return fromstring(self.html)
+
+ def xpath(self, selector):
+ """Given an XPath selector, returns a list of Element objects, one per result of ``self.lxml.xpath``."""
+ return [Element(e) for e in self.lxml.xpath(selector)]  # NOTE(review): results are wrapped without a type check; confirm behaviour for selectors yielding text or attributes
+
def _handle_response(response, **kwargs):
"""Requests HTTP Response handler. Attaches .html property to Response
@@ -162,7 +192,6 @@ def user_agent(style=None):
else:
return useragent[style]
-
def get_session(mock_browser=True):
"""Returns a consumable session, for cookie persistience and connection
pooling, amongst other things.
@@ -180,5 +209,4 @@ def get_session(mock_browser=True):
return session
-
session = get_session()
diff --git a/setup.py b/setup.py
index 6a083b3..de535c8 100644
--- a/setup.py
+++ b/setup.py
@@ -17,11 +17,11 @@ DESCRIPTION = 'HTML Parsing for Humans.'
URL = 'https://github.com/requests/requests'
EMAIL = 'me@kennethreitz.org'
AUTHOR = 'Kenneth Reitz'
-VERSION = '0.1.0'
+VERSION = '0.1.1'
# What packages are required for this module to be executed?
REQUIRED = [
- 'requests', 'pyquery', 'html2text', 'fake-useragent', 'parse'
+ 'requests', 'pyquery', 'html2text', 'fake-useragent', 'parse', 'beautifulsoup4'
]
# The rest you shouldn't have to touch too much :)