import sys import asyncio from urllib.parse import urlparse, urlunparse, urljoin from concurrent.futures import ThreadPoolExecutor from concurrent.futures._base import TimeoutError from functools import partial from typing import Set, Union, List, MutableMapping, Optional import pyppeteer import requests import http.cookiejar from pyquery import PyQuery from fake_useragent import UserAgent from lxml.html.clean import Cleaner import lxml from lxml import etree from lxml.html import HtmlElement from lxml.html import tostring as lxml_html_tostring from lxml.html.soupparser import fromstring as soup_parse from parse import search as parse_search from parse import findall, Result from w3lib.encoding import html_to_unicode DEFAULT_ENCODING = 'utf-8' DEFAULT_URL = 'https://example.org/' DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8' DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older'] cleaner = Cleaner() cleaner.javascript = True cleaner.style = True useragent = None # Typing. _Find = Union[List['Element'], 'Element'] _XPath = Union[List[str], List['Element'], str, 'Element'] _Result = Union[List['Result'], 'Result'] _HTML = Union[str, bytes] _BaseHTML = str _UserAgent = str _DefaultEncoding = str _URL = str _RawHTML = bytes _Encoding = str _LXML = HtmlElement _Text = str _Search = Result _Containing = Union[str, List[str]] _Links = Set[str] _Attrs = MutableMapping _Next = Union['HTML', List[str]] _NextSymbol = List[str] # Sanity checking. try: assert sys.version_info.major == 3 assert sys.version_info.minor > 5 except AssertionError: raise RuntimeError('Requests-HTML requires Python 3.6+!') class MaxRetries(Exception): def __init__(self, message): self.message = message class BaseParser: """A basic HTML/Element Parser, for Humans. :param element: The element from which to base the parsing upon. :param default_encoding: Which encoding to default to. :param html: HTML from which to base the parsing upon (optional). :param url: The URL from which the HTML originated, used for ``absolute_links``. """ def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None: self.element = element self.url = url self.skip_anchors = True self.default_encoding = default_encoding self._encoding = None self._html = html.encode(DEFAULT_ENCODING) if isinstance(html, str) else html self._lxml = None self._pq = None @property def raw_html(self) -> _RawHTML: """Bytes representation of the HTML content. (`learn more `_). """ if self._html: return self._html else: return etree.tostring(self.element, encoding='unicode').strip().encode(self.encoding) @property def html(self) -> _BaseHTML: """Unicode representation of the HTML content (`learn more `_). """ if self._html: return self.raw_html.decode(self.encoding, errors='replace') else: return etree.tostring(self.element, encoding='unicode').strip() @html.setter def html(self, html: str) -> None: self._html = html.encode(self.encoding) @raw_html.setter def raw_html(self, html: bytes) -> None: """Property setter for self.html.""" self._html = html @property def encoding(self) -> _Encoding: """The encoding string to be used, extracted from the HTML and :class:`HTMLResponse ` headers. """ if self._encoding: return self._encoding # Scan meta tags for charset. if self._html: self._encoding = html_to_unicode(self.default_encoding, self._html)[0] # Fall back to requests' detected encoding if decode fails. try: self.raw_html.decode(self.encoding, errors='replace') except UnicodeDecodeError: self._encoding = self.default_encoding return self._encoding if self._encoding else self.default_encoding @encoding.setter def encoding(self, enc: str) -> None: """Property setter for self.encoding.""" self._encoding = enc @property def pq(self) -> PyQuery: """`PyQuery `_ representation of the :class:`Element ` or :class:`HTML `. """ if self._pq is None: self._pq = PyQuery(self.lxml) return self._pq @property def lxml(self) -> HtmlElement: """`lxml `_ representation of the :class:`Element ` or :class:`HTML `. """ if self._lxml is None: try: self._lxml = soup_parse(self.html, features='html.parser') except ValueError: self._lxml = lxml.html.fromstring(self.raw_html) return self._lxml @property def text(self) -> _Text: """The text content of the :class:`Element ` or :class:`HTML `. """ return self.pq.text() @property def full_text(self) -> _Text: """The full text content (including links) of the :class:`Element ` or :class:`HTML `. """ return self.lxml.text_content() def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find: """Given a CSS Selector, returns a list of :class:`Element ` objects or a single one. :param selector: CSS Selector to use. :param clean: Whether or not to sanitize the found HTML of ``