Add Markdown export and full-text access to requests_html.

requests_html.py
  * Create one module-level html2text converter and expose it through new
    ``markdown`` properties on Element and HTML.
  * Add Element.full_text.
  * HTML.__repr__ reports the URL; HTML.find() accepts selector=None.
  * Keep the httpbin smoke test, but run it only when the file is executed
    as a script, so importing the module makes no network request.
setup.py
  * Declare the html2text dependency.

NOTE(review): blank lines and the indentation of context lines were
reconstructed from the hunk line counts, assuming 4-space indents; run
`git apply --check` against the target tree before applying.
NOTE(review): Element.full_text assumes self.element is an lxml.html
element (text_content() is not available on plain etree elements); confirm.

diff --git a/requests_html.py b/requests_html.py
--- a/requests_html.py
+++ b/requests_html.py
@@ -2,6 +2,12 @@ import requests
 from pyquery import PyQuery
 from lxml.etree import tostring
 
+from html2text import HTML2Text
+
+# Shared converter used by the ``markdown`` properties below.
+html2text = HTML2Text()
+
+# TODO: Markdown converter.
 
 class Element:
     """docstring for Element"""
@@ -29,6 +35,16 @@ class Element:
     def text(self):
         return self.pq.text()
 
+    @property
+    def full_text(self):
+        """Text of this element as given by lxml's ``text_content()``."""
+        return self.element.text_content()
+
+    @property
+    def markdown(self):
+        """``self.html`` converted to Markdown by the html2text converter."""
+        return html2text.handle(self.html)
+
     @property
     def html(self):
         return tostring(self.element).decode('utf-8').strip()
@@ -50,15 +66,21 @@ class HTML(object):
         self.skip_anchors = True
 
     def __repr__(self):
-        return repr(self.html)
+        return "<HTML url={}>".format(repr(self.url))
 
-    def find(self, selector):
+    def find(self, selector=None):
+        """Return a list of Elements, one per ``self.pq(selector)`` result."""
         def gen():
             for found in self.pq(selector):
                 yield Element(found)
 
         return [g for g in gen()]
 
+    @property
+    def markdown(self):
+        """``self.html`` converted to Markdown by the html2text converter."""
+        return html2text.handle(self.html)
+
     @property
     def links(self):
         def gen():
@@ -107,4 +129,9 @@ def handle_response(response, **kwargs):
 
 
 session = requests.Session()
-session.hooks = {'response': handle_response}
\ No newline at end of file
+session.hooks = {'response': handle_response}
+
+
+if __name__ == '__main__':
+    # Manual smoke test; performs a real HTTP request.
+    print(session.get('http://httpbin.org/').html.markdown)
diff --git a/setup.py b/setup.py
index 3c953f8..e5d709b 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@ VERSION = '0.0.1'
 
 # What packages are required for this module to be executed?
 REQUIRED = [
-    'requests', 'pyquery'
+    'requests', 'pyquery', 'html2text'
 ]
 
 # The rest you shouldn't have to touch too much :)