mirror of
https://github.com/kennethreitz/requests-html.git
synced 2026-06-05 23:00:20 +00:00
+21
-3
@@ -2,6 +2,10 @@ import requests
|
||||
from pyquery import PyQuery
|
||||
|
||||
from lxml.etree import tostring
|
||||
import html2text
|
||||
html2text = html2text.HTML2Text()
|
||||
|
||||
# TODO: Markdown converter.
|
||||
|
||||
class Element:
|
||||
"""docstring for Element"""
|
||||
@@ -29,6 +33,14 @@ class Element:
|
||||
def text(self):
|
||||
return self.pq.text()
|
||||
|
||||
@property
|
||||
def full_text(self):
|
||||
return self.pq.text_content()
|
||||
|
||||
@property
|
||||
def markdown(self):
|
||||
return html2text.handle(self.html)
|
||||
|
||||
@property
|
||||
def html(self):
|
||||
return tostring(self.element).decode('utf-8').strip()
|
||||
@@ -50,15 +62,19 @@ class HTML(object):
|
||||
self.skip_anchors = True
|
||||
|
||||
def __repr__(self):
|
||||
return repr(self.html)
|
||||
return repr("<HTML url={}>".format(repr(self.url)))
|
||||
|
||||
def find(self, selector):
|
||||
def find(self, selector=None):
|
||||
def gen():
|
||||
for found in self.pq(selector):
|
||||
yield Element(found)
|
||||
|
||||
return [g for g in gen()]
|
||||
|
||||
@property
|
||||
def markdown(self):
|
||||
return html2text.handle(self.html)
|
||||
|
||||
@property
|
||||
def links(self):
|
||||
def gen():
|
||||
@@ -107,4 +123,6 @@ def handle_response(response, **kwargs):
|
||||
|
||||
|
||||
session = requests.Session()
|
||||
session.hooks = {'response': handle_response}
|
||||
session.hooks = {'response': handle_response}
|
||||
|
||||
print(session.get('http://httpbin.org/').html.markdown)
|
||||
Reference in New Issue
Block a user