mirror of
https://github.com/kennethreitz/requests-html.git
synced 2026-06-05 14:50:20 +00:00
@@ -61,6 +61,13 @@ Usage
|
||||
>>> r.html.search('Python is a {} language')[0]
|
||||
programming
|
||||
|
||||
More complex CSS Selector example:
|
||||
|
||||
>>> r = session.get('https://github.com/')
|
||||
>>> sel = 'body > div.application-main > div.jumbotron.jumbotron-codelines > div > div > div.col-md-7.text-center.text-md-left > p'
|
||||
>>> print(r.html.find(sel)[0].text)
|
||||
GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers.
|
||||
|
||||
Installation
|
||||
============
|
||||
|
||||
|
||||
+47
-57
@@ -1,11 +1,12 @@
|
||||
from tempfile import TemporaryFile
|
||||
from io import StringIO
|
||||
|
||||
import html2text
|
||||
import requests
|
||||
from pyquery import PyQuery
|
||||
|
||||
from fake_useragent import UserAgent
|
||||
from lxml.etree import tostring
|
||||
from lxml import etree
|
||||
from lxml.html.soupparser import fromstring
|
||||
from parse import search as parse_search
|
||||
from parse import findall
|
||||
|
||||
@@ -16,27 +17,17 @@ useragent = UserAgent()
|
||||
# xpath support next.
|
||||
# parse support.
|
||||
|
||||
class Element:
|
||||
"""An element of HTML."""
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
|
||||
def __repr__(self):
|
||||
attrs = []
|
||||
for attr in self.attrs:
|
||||
attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
|
||||
|
||||
return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))
|
||||
|
||||
class BaseParser:
|
||||
"""docstring for BaseParser"""
|
||||
@property
|
||||
def pq(self):
|
||||
"""PyQuery representation of the element."""
|
||||
return PyQuery(self.element)
|
||||
"""PyQuery representation of the page."""
|
||||
return PyQuery(self.html)
|
||||
|
||||
@property
|
||||
def attrs(self):
|
||||
"""Returns a dictionary of the attributes of the element."""
|
||||
return {k: self.pq.attr[k] for k in self.element.keys()}
|
||||
def lxml(self):
|
||||
"""Etree representation of the page."""
|
||||
return fromstring(self.html)
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
@@ -56,7 +47,7 @@ class Element:
|
||||
@property
|
||||
def html(self):
|
||||
"""HTML representation of the element."""
|
||||
return tostring(self.element).decode('utf-8').strip()
|
||||
return etree.tostring(self.element).decode('utf-8').strip()
|
||||
|
||||
def find(self, selector):
|
||||
"""Given a jQuery selector, returns a list of element objects."""
|
||||
@@ -66,6 +57,10 @@ class Element:
|
||||
|
||||
return [g for g in gen()]
|
||||
|
||||
def xpath(self, selector):
|
||||
"""Given an XPath selector, returns a list of element objects."""
|
||||
return [Element(e) for e in self.lxml.xpath(selector)]
|
||||
|
||||
def search(self, template):
|
||||
"""Searches the element for the given parse template."""
|
||||
return parse_search(template, self.html)
|
||||
@@ -76,38 +71,6 @@ class Element:
|
||||
"""
|
||||
return [r for r in findall(template, self.html)]
|
||||
|
||||
|
||||
class HTML:
|
||||
"""An HTML document."""
|
||||
def __init__(self, response):
|
||||
self.html = response.text
|
||||
self.url = response.url
|
||||
self.skip_anchors = True
|
||||
|
||||
def __repr__(self):
|
||||
return "<HTML url={}>".format(repr(self.url))
|
||||
|
||||
def find(self, selector):
|
||||
"""Given a jQuery selector, returns a list of element objects."""
|
||||
def gen():
|
||||
for found in self.pq(selector):
|
||||
yield Element(found)
|
||||
|
||||
return [g for g in gen()]
|
||||
|
||||
def search(self, template):
|
||||
"""Searches the page for the given parse template."""
|
||||
return parse_search(template, self.html)
|
||||
|
||||
def search_all(self, template):
|
||||
"""Searches the page (multiple times) for the given parse template."""
|
||||
return [r for r in findall(template, self.html)]
|
||||
|
||||
@property
|
||||
def markdown(self):
|
||||
"""Markdown representation of the page."""
|
||||
return html2text.handle(self.html)
|
||||
|
||||
@property
|
||||
def links(self):
|
||||
"""All found links on page, in as–is form."""
|
||||
@@ -122,6 +85,36 @@ class HTML:
|
||||
|
||||
return set(g for g in gen())
|
||||
|
||||
|
||||
class Element(BaseParser):
|
||||
"""An element of HTML."""
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
|
||||
def __repr__(self):
|
||||
attrs = []
|
||||
for attr in self.attrs:
|
||||
attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
|
||||
|
||||
return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))
|
||||
|
||||
@property
|
||||
def attrs(self):
|
||||
"""Returns a dictionary of the attributes of the element."""
|
||||
return {k: self.pq.attr[k] for k in self.element.keys()}
|
||||
|
||||
|
||||
class HTML(BaseParser):
|
||||
"""An HTML document."""
|
||||
|
||||
def __init__(self, response):
|
||||
self.html = response.text
|
||||
self.url = response.url
|
||||
self.skip_anchors = True
|
||||
|
||||
def __repr__(self):
|
||||
return "<HTML url={}>".format(repr(self.url))
|
||||
|
||||
@property
|
||||
def base_url(self):
|
||||
"""The base URL for the page."""
|
||||
@@ -149,11 +142,6 @@ class HTML:
|
||||
|
||||
return set(g for g in gen())
|
||||
|
||||
@property
|
||||
def pq(self):
|
||||
"""PyQuery representation of the page."""
|
||||
return PyQuery(self.html)
|
||||
|
||||
|
||||
def _handle_response(response, **kwargs):
|
||||
"""Requests HTTP Response handler. Attaches .html property to Response
|
||||
@@ -174,6 +162,7 @@ def user_agent(style=None):
|
||||
else:
|
||||
return useragent[style]
|
||||
|
||||
|
||||
def get_session(mock_browser=True):
|
||||
"""Returns a consumable session, for cookie persistience and connection
|
||||
pooling, amongst other things.
|
||||
@@ -191,4 +180,5 @@ def get_session(mock_browser=True):
|
||||
|
||||
return session
|
||||
|
||||
|
||||
session = get_session()
|
||||
|
||||
Reference in New Issue
Block a user