diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..53588d4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +docs/source/_templates/*.html linguist-vendored=false +tests/*.html linguist-vendored=false diff --git a/.gitignore b/.gitignore index 52793b4..3f3912c 100644 --- a/.gitignore +++ b/.gitignore @@ -103,4 +103,7 @@ venv.bak/ /site # mypy -.mypy_cache/ \ No newline at end of file +.mypy_cache/ + +# Visual Studio Code +.vscode \ No newline at end of file diff --git a/README.rst b/README.rst index 470d043..141abba 100644 --- a/README.rst +++ b/README.rst @@ -9,6 +9,8 @@ Requests-HTML: HTML Parsing for Humans™ This library intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. +If you're interested in financially supporting Kenneth Reitz open source, consider `visiting this link `_. Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job. + When using this library you automatically get: - **Full JavaScript support**! diff --git a/docs/source/index.rst b/docs/source/index.rst index 3ddc900..1549718 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -103,7 +103,19 @@ Render out an :class:`Element `'s HTML: >>> about.html '
  • \nAbout\n\n
  • ' +Grab an :class:`Element `'s root tag name: +.. code-block:: pycon + + >>> about.tag + 'li' + +Show the line number on which an :class:`Element `'s root tag is located: + +.. code-block:: pycon + + >>> about.lineno + 249 Select an :class:`Element ` list within an :class:`Element `: @@ -169,7 +181,7 @@ Let's grab some text that's rendered by JavaScript: Note, the first time you ever run the ``render()`` method, it will download Chromium into your home directory (e.g. ``~/.pyppeteer/``). This only happens -once. +once. You may also need to install a few `Linux packages `_ to get pyppeteer working. Pagination ========== diff --git a/requests_html.py b/requests_html.py index b65f014..230f079 100644 --- a/requests_html.py +++ b/requests_html.py @@ -102,7 +102,7 @@ class BaseParser: (`learn more `_). """ if self._html: - return self.raw_html.decode(self.encoding) + return self.raw_html.decode(self.encoding, errors='replace') else: return etree.tostring(self.element, encoding='unicode').strip() @@ -128,7 +128,7 @@ class BaseParser: self._encoding = html_to_unicode(self.default_encoding, self._html)[0] # Fall back to requests' detected encoding if decode fails. try: - self.raw_html.decode(self.encoding) + self.raw_html.decode(self.encoding, errors='replace') except UnicodeDecodeError: self._encoding = self.default_encoding @@ -146,7 +146,7 @@ class BaseParser: of the :class:`Element ` or :class:`HTML `. 
""" if self._pq is None: - self._pq = PyQuery(self.html) + self._pq = PyQuery(self.lxml) return self._pq @@ -159,7 +159,7 @@ class BaseParser: try: self._lxml = soup_parse(self.html, features='html.parser') except ValueError: - self._lxml = lxml.html.fromstring(self.html) + self._lxml = lxml.html.fromstring(self.raw_html) return self._lxml @@ -378,6 +378,8 @@ class Element(BaseParser): def __init__(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None) -> None: super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding) self.element = element + self.tag = element.tag + self.lineno = element.sourceline self._attrs = None def __repr__(self) -> str: @@ -408,7 +410,7 @@ class HTML(BaseParser): :param default_encoding: Which encoding to default to. """ - def __init__(self, *, session: Union['HTTPSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: + def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: # Convert incoming unicode HTML into bytes. if isinstance(html, str): @@ -527,9 +529,6 @@ class HTML(BaseParser): >>> r.html.render(script=script) {'width': 800, 'height': 600, 'deviceScaleFactor': 1} - Warning: If you use keep_page, you're responsable for closing each page, since - opening to many at scale may crach the browser. - Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). """ @@ -567,16 +566,18 @@ class HTML(BaseParser): page = None return content, result, page except TimeoutError: + await page.close() + page = None return None - self.session.browser # Automatycally create a event loop and browser + self.session.browser # Automatically create a event loop and browser content = None # Automatically set Reload to False, if example URL is being used. 
if self.url == DEFAULT_URL: reload = False - for i in range(retries): + for _ in range(retries): if not content: try: @@ -645,7 +646,7 @@ class HTMLSession(requests.Session): amongst other things. """ - def __init__(self, mock_browser=True, ignoreHTTPSErrors=False): + def __init__(self, mock_browser=True, verify=False): super(HTMLSession, self).__init__() # Mock a web browser's user agent. @@ -655,6 +656,8 @@ class HTMLSession(requests.Session): self.hooks = {'response': self._handle_response} self.ignoreHTTPSErrors = ignoreHTTPSErrors + self.__browser_args = browser_args + @staticmethod def _handle_response(response, **kwargs) -> HTMLResponse: """Requests HTTP Response handler. Attaches .html property to @@ -678,7 +681,7 @@ class HTMLSession(requests.Session): def browser(self): if not hasattr(self, "_browser"): self.loop = asyncio.get_event_loop() - self._browser = self.loop.run_until_complete(pyppeteer.launch(ignoreHTTPSErrors=self.ignoreHTTPSErrors, headless=True, args=['--no-sandbox'])) + self._browser = self.loop.run_until_complete(pyppeteer.launch(ignoreHTTPSErrors=self.verify, headless=True, args=['--no-sandbox'])) return self._browser def close(self): @@ -695,7 +698,7 @@ class AsyncHTMLSession(requests.Session): mock_browser: bool = True, *args, **kwargs): """ Set or create an event loop and a thread pool. - :param loop: Asyncio lopp to use. + :param loop: Asyncio loop to use. :param workers: Amount of threads to use for executing async calls. If not pass it will default to the number of processors on the machine, multiplied by 5. """