diff --git a/_modules/requests_html.html b/_modules/requests_html.html index aeaf407..0eb042f 100644 --- a/_modules/requests_html.html +++ b/_modules/requests_html.html @@ -32,8 +32,10 @@
import sys
import asyncio
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import urlparse, urlunparse, urljoin
+from concurrent.futures import ThreadPoolExecutor
from concurrent.futures._base import TimeoutError
+from functools import partial
from typing import Set, Union, List, MutableMapping, Optional
import pyppeteer
@@ -41,9 +43,11 @@
from pyquery import PyQuery
from fake_useragent import UserAgent
+from lxml.html.clean import Cleaner
import lxml
from lxml import etree
from lxml.html import HtmlElement
+from lxml.html import tostring as lxml_html_tostring
from lxml.html.soupparser import fromstring as soup_parse
from parse import search as parse_search
from parse import findall, Result
@@ -53,6 +57,10 @@
DEFAULT_URL = 'https://example.org/'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
+cleaner = Cleaner()
+cleaner.javascript = True
+cleaner.style = True
+
useragent = None
# Typing.
@@ -124,7 +132,11 @@
return etree.tostring(self.element, encoding='unicode').strip()
@html.setter
- def html(self, html: bytes) -> None:
+ def html(self, html: str) -> None:
+ self._html = html.encode(self.encoding)
+
+ @raw_html.setter
+ def raw_html(self, html: bytes) -> None:
"""Property setter for self.html."""
self._html = html
@@ -225,11 +237,12 @@
else:
return url
- def find(self, selector: str = "*", containing: _Containing = None, first: bool = False, _encoding: str = None) -> _Find:
+ def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find:
"""Given a CSS Selector, returns a list of
:class:`Element <Element>` objects or a single one.
:param selector: CSS Selector to use.
+ :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
:param containing: If specified, only return elements that contain the provided text.
:param first: Whether or not to return just the first result.
:param _encoding: The encoding format.
@@ -269,13 +282,23 @@
elements.reverse()
+ # Sanitize the found HTML.
+ if clean:
+ elements_copy = elements.copy()
+ elements = []
+
+ for element in elements_copy:
+ element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
+ elements.append(element)
+
return _get_first_or_list(elements, first)
- def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath:
+ def xpath(self, selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) -> _XPath:
"""Given an XPath selector, returns a list of
:class:`Element <Element>` objects or a single one.
:param selector: XPath Selector to use.
+ :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
:param first: Whether or not to return just the first result.
:param _encoding: The encoding format.
@@ -297,6 +320,15 @@
for selection in selected
]
+ # Sanitize the found HTML.
+ if clean:
+ elements_copy = elements.copy()
+ elements = []
+
+ for element in elements_copy:
+ element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
+ elements.append(element)
+
return _get_first_or_list(elements, first)
def search(self, template: str) -> Result:
@@ -337,15 +369,20 @@
# Parse the link with stdlib.
parsed = urlparse(link)._asdict()
- # Appears to be a relative link:
+ # If link is relative, then join it with base_url.
if not parsed['netloc']:
- parsed['netloc'] = urlparse(self.base_url).netloc
+ return urljoin(self.base_url, link)
+
+ # Link is absolute; if it lacks a scheme, add one from base_url.
if not parsed['scheme']:
parsed['scheme'] = urlparse(self.base_url).scheme
- # Re-construct URL, with new data.
- parsed = (v for v in parsed.values())
- return urlunparse(parsed)
+ # Reconstruct the URL to incorporate the new scheme.
+ parsed = (v for v in parsed.values())
+ return urlunparse(parsed)
+
+ # Link is absolute and complete with scheme; nothing to be done here.
+ return link
@property
@@ -372,9 +409,15 @@
if result:
return result
- url = '/'.join(self.url.split('/')[:-1])
- if url.endswith('/'):
- url = url[:-1]
+ # Parse the url to separate out the path
+ parsed = urlparse(self.url)._asdict()
+
+ # Trim the path after its last '/' (kept, so urljoin treats it as a directory)
+ parsed['path'] = '/'.join(parsed['path'].split('/')[:-1]) + '/'
+
+ # Reconstruct the url with the modified path
+ parsed = (v for v in parsed.values())
+ url = urlunparse(parsed)
return url
@@ -389,12 +432,13 @@
__slots__ = [
'element', 'url', 'skip_anchors', 'default_encoding', '_encoding',
- '_encoding', '_html', '_lxml', '_pq', 'session'
+ '_html', '_lxml', '_pq', '_attrs', 'session'
]
def __init__(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None) -> None:
super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding)
self.element = element
+ self._attrs = None
def __repr__(self) -> str:
attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs]
@@ -405,14 +449,15 @@
"""Returns a dictionary of the attributes of the :class:`Element <Element>`
(`learn more <https://www.w3schools.com/tags/ref_attributes.asp>`_).
"""
- attrs = {k: v for k, v in self.element.items()}
+ if self._attrs is None:
+ self._attrs = {k: v for k, v in self.element.items()}
- # Split class and rel up, as there are ussually many of them:
- for attr in ['class', 'rel']:
- if attr in attrs:
- attrs[attr] = tuple(attrs[attr].split())
+ # Split class and rel up, as there are usually many of them:
+ for attr in ['class', 'rel']:
+ if attr in self._attrs:
+ self._attrs[attr] = tuple(self._attrs[attr].split())
You can also select only elements containing certian text:
+You can also select only elements containing certain text:
>>> r = session.get('http://python-requests.org/')
>>> r.html.find('a', containing='kenneth')
[<Element 'a' href='http://kennethreitz.com/pages/open-projects.html'>, <Element 'a' href='http://kennethreitz.org/'>, <Element 'a' href='https://twitter.com/kennethreitz' class=('twitter-follow-button',) data-show-count='false'>, <Element 'a' class=('reference', 'internal') href='dev/contributing/#kenneth-reitz-s-code-style'>]
@@ -248,7 +248,7 @@ once.
find(selector: str = '*', containing: Union[str, typing.List[str]] = None, first: bool = False, _encoding: str = None) → Union[typing.List[_ForwardRef('Element')], _ForwardRef('Element')]¶find(selector: str = '*', *, containing: Union[str, typing.List[str]] = None, clean: bool = False, first: bool = False, _encoding: str = None) → Union[typing.List[_ForwardRef('Element')], _ForwardRef('Element')]¶
Given a CSS Selector, returns a list of
Element objects or a single one.
| Parameters: |
xpath(selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) → Union[typing.List[str], typing.List[_ForwardRef('Element')], str, _ForwardRef('Element')]¶
|
|---|