Merge remote-tracking branch 'remotes/upstream/master' into feature/optimization

# Conflicts: # requests_html.py
2026-06-05 23:00:20 +00:00 · 2018-02-28 22:38:47 +08:00
parent 83bf5ab8c0 ed880bdd62
commit acd513096c
6 changed files with 115 additions and 72 deletions
@@ -28,4 +28,4 @@ mypy = "*"

 [scripts]

-tests = "pytest"
+tests = "pytest -v -m ok"
@@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "ef6f9504ed9751cf2f4c5aef06e59838981c79d84fa1d36fb5ce258d8dba189f"
+            "sha256": "cf67076e9c185c3bc951910b2a44b8b548ce954e0e3ff2a5bef1942d13275e8e"
        },
        "host-environment-markers": {
            "implementation_name": "cpython",
@@ -127,15 +127,6 @@
            ],
            "version": "==0.0.10"
        },
-        "pyqt5": {
-            "hashes": [
-                "sha256:128285176240e990fce9c50293105ffd0d2884d8910bb338118f867b171ec6e8",
-                "sha256:dbd1777d8e7540a6e7350482f1d7c981a073ce1b7195ac2cd21c204b3a28df57",
-                "sha256:3563ac935fca8e8b1dbd4856d8eedc982b5de90c53f0280e8fca8060a262d4f4",
-                "sha256:2ce953cb849e5265b9d1abe075471148ad5fb6d7e6a9881f37dfe05590571d23"
-            ],
-            "version": "==5.10"
-        },
        "pyquery": {
            "hashes": [
                "sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3",
@@ -150,23 +141,6 @@
            ],
            "version": "==2.18.4"
        },
-        "sip": {
-            "hashes": [
-                "sha256:f31bb63e63a958f65887ae27f06e62af9f9cb818ba7456a99f78a5ec3082d3dd",
-                "sha256:776e169da554729f80337070348db49a6742d8aa317aec931a4d0f47b7ef535d",
-                "sha256:beac2bc1b9457a693fb3122c797cad5678a168ecff6ccbad4aa3a9f1ff1a2d86",
-                "sha256:1c5a1ad409e97833a4a873fae5bcd7a365651f7372806992d03891082821bc41",
-                "sha256:248ecca386d4832138f6a044dceb0bfc38fb8503b7ffbfeb474073f56930144b",
-                "sha256:ab338095e32ebb2047b6184f1383c667c47b9822d7320fdfb93870567a972343",
-                "sha256:ebea4619e9626e2eb197835049807c8173f11e2023b05140cbee4b274a91ef5e",
-                "sha256:2db24e65c99b7d20a67fa461f6bc2e15bddb6cd5fde52e37d6609566d79a69a1",
-                "sha256:3b45eecf6f68a29f5629dc064079e919987b030628bb6614da7f4eefedbe145e",
-                "sha256:2120e9d713120687558b6699cf5ff6a8f7b070776b19d6c7fc96fc64ea8ca056",
-                "sha256:18350ebf82beaef6a73d2c14320f19961242ed424670407df6fd5a9b65f0e7fc",
-                "sha256:92413edcb4fea75ebd1f8142c882dc5db398025eb8a0a273385838fd791de73c"
-            ],
-            "version": "==4.19.7"
-        },
        "six": {
            "hashes": [
                "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb",
@@ -177,6 +177,28 @@ You can also use this library without Requests:
    >>> html.links
    {'https://httpbin.org'}

+You can also render JavaScript pages without Requests:
+
+.. code-block:: pycon
+
+    # ^^ continuing from above ^^
+    >>> script = """
+            () => {
+                return {
+                    width: document.documentElement.clientWidth,
+                    height: document.documentElement.clientHeight,
+                    deviceScaleFactor: window.devicePixelRatio,
+                }
+            }
+        """
+    >>> val = html.render(script=script, reload=False)
+
+    >>> print(val)
+    {'width': 800, 'height': 600, 'deviceScaleFactor': 1}
+
+    >>> print(html.html)
+    <html><head></head><body><a href="https://httpbin.org"></a></body></html>
+

 API Documentation
 =================
@@ -2,7 +2,7 @@ import sys
 import asyncio
 from urllib.parse import urlparse, urlunparse
 from concurrent.futures._base import TimeoutError
-from typing import Set, Union, List, MutableMapping
+from typing import Set, Union, List, MutableMapping, Optional

 import pyppeteer
 import requests
@@ -16,7 +16,6 @@ from parse import search as parse_search
 from parse import findall, Result
 from w3lib.encoding import html_to_unicode

-
 DEFAULT_ENCODING = 'utf-8'
 DEFAULT_URL = 'https://example.org/'

@@ -45,7 +44,6 @@ try:
 except AssertionError:
    raise RuntimeError('Requests-HTML requires Python 3.6+!')

-
 class BaseParser:
    """A basic HTML/Element Parser, for Humans.

@@ -155,13 +153,7 @@ class BaseParser:
            for found in self.pq(selector)
        ]

-        if first:
-            try:
-                return elements[0]
-            except IndexError:
-                return None
-        else:
-            return elements
+        return _get_first_or_list(elements, first)

    def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath:
        """Given an XPath selector, returns a list of
@@ -189,13 +181,7 @@ class BaseParser:
            for selection in selected
        ]

-        if first:
-            try:
-                return elements[0]
-            except IndexError:
-                return None
-        else:
-            return elements
+        return _get_first_or_list(elements, first)

    def search(self, template: str) -> Result:
        """Searches the :class:`Element <Element>` for the given Parse template.
@@ -216,14 +202,14 @@ class BaseParser:
    @property
    def links(self) -> _Links:
        """All found links on page, in as–is form."""
+
        def gen():
            for link in self.find('a'):

                try:
                    href = link.attrs['href'].strip()
-                    if not(href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']:
-                        if href:
-                            yield href
+                    if href and not (href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']:
+                        yield href
                except KeyError:
                    pass

@@ -234,6 +220,7 @@ class BaseParser:
        """All found links on page, in absolute form
        (`learn more <https://www.navegabem.com/absolute-or-relative-links.html>`_).
        """
+
        def gen():
            for link in self.links:
                # Parse the link with stdlib.
@@ -263,12 +250,11 @@ class BaseParser:
        if base:
            return base.attrs['href'].strip()

-        else:
-            url = '/'.join(self.url.split('/')[:-1])
-            if url.endswith('/'):
-                url = url[:-1]
+        url = '/'.join(self.url.split('/')[:-1])
+        if url.endswith('/'):
+            url = url[:-1]

-            return url
+        return url


 class Element(BaseParser):
@@ -284,10 +270,7 @@ class Element(BaseParser):
        self.element = element

    def __repr__(self) -> str:
-        attrs = []
-        for attr in self.attrs:
-            attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
-
+        attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs]
        return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))

    @property
@@ -329,10 +312,16 @@ class HTML(BaseParser):
    def __repr__(self) -> str:
        return "<HTML url={}>".format(repr(self.url))

-    def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0):
+    def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0, reload: bool = True):
        """Reloads the response in Chromium, and replaces HTML content
        with an updated version, with JavaScript executed.

+        :param retries: The number of times to retry loading the page in Chromium.
+        :param script: JavaScript to execute upon page load (optional).
+        :param scrolldown: Integer, if provided, of how many times to page down.
+        :param sleep: Integer, if provided, of how long to sleep after initial render.
+        :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory.
+
        If ``scrolldown`` is specified, the page will scrolldown the specified
        number of times, after sleeping the specified amount of time
        (e.g. ``scrolldown=10, sleep=1``).
@@ -365,13 +354,16 @@ class HTML(BaseParser):
        Warning: the first time you run this method, it will download
        Chromium into your home directory (``~/.pyppeteer``).
        """
-        async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int):
+        async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, reload: bool = True, content: Optional[str]):
            try:
                browser = pyppeteer.launch(headless=True)
                page = await browser.newPage()

                # Load the given page (GET request, obviously.)
-                await page.goto(url)
+                if reload:
+                    await page.goto(url)
+                else:
+                    await page.setContent(content)

                result = None
                if script:
@@ -399,7 +391,7 @@ class HTML(BaseParser):
        for i in range(retries):
            if not content:
                try:
-                    content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, scrolldown=scrolldown))
+                    content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, content=self.html, reload=reload, scrolldown=scrolldown))
                except TimeoutError:
                    pass

@@ -419,10 +411,9 @@ class HTMLResponse(requests.Response):

    @property
    def html(self) -> HTML:
-        if self._html:
-            return self._html
+        if not self._html:
+            self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding)

-        self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding)
        return self._html

    @classmethod
@@ -437,10 +428,17 @@ def user_agent(style='chrome') -> _UserAgent:
    style. Defaults to a Chrome-style User-Agent.
    """

-    if not style:
-        return useragent.random
+    return useragent[style] if style else useragent.random
+
+
+def _get_first_or_list(l, first=True):
+    if first:
+        try:
+            return l[0]
+        except IndexError:
+            return None
    else:
-        return useragent[style]
+        return l


 class HTMLSession(requests.Session):
@@ -473,6 +471,5 @@ class HTMLSession(requests.Session):
        """
        # Convert Request object into HTTPRequest object.
        r = super(HTMLSession, self).request(*args, **kwargs)
-        html_r = HTMLResponse._from_response(r)

-        return html_r
+        return HTMLResponse._from_response(r)
@@ -17,7 +17,7 @@ DESCRIPTION = 'HTML Parsing for Humans.'
 URL = 'https://github.com/kennethreitz/requests-html'
 EMAIL = 'me@kennethreitz.org'
 AUTHOR = 'Kenneth Reitz'
-VERSION = '0.6.7'
+VERSION = '0.6.8'

 # What packages are required for this module to be executed?
 REQUIRED = [
@@ -1,5 +1,6 @@
 import os

+import pytest
 from requests_html import HTMLSession, HTML
 from requests_file import FileAdapter

@@ -14,11 +15,13 @@ def get():
    return session.get(url)


+@pytest.mark.ok
 def test_file_get():
    r = get()
    assert r.status_code == 200


+@pytest.mark.ok
 def test_css_selector():
    r = get()

@@ -32,6 +35,7 @@ def test_css_selector():
        assert menu_item in about.full_text.split('\n')


+@pytest.mark.ok
 def test_attrs():
    r = get()
    about = r.html.find('#about', first=True)
@@ -40,20 +44,23 @@ def test_attrs():
    assert len(about.attrs['class']) == 2


+@pytest.mark.ok
 def test_links():
    r = get()
    about = r.html.find('#about', first=True)

-    len(about.links) == 6
-    len(about.absolute_links) == 6
+    assert len(about.links) == 6
+    assert len(about.absolute_links) == 6


+@pytest.mark.ok
 def test_search():
    r = get()
    style = r.html.search('Python is a {} language')[0]
    assert style == 'programming'


+@pytest.mark.ok
 def test_xpath():
    r = get()
    html = r.html.xpath('/html', first=True)
@@ -63,6 +70,7 @@ def test_xpath():
    assert '#site-map' in a_hrefs


+@pytest.mark.ok
 def test_html_loading():
    doc = """<a href='https://httpbin.org'>"""
    html = HTML(html=doc)
@@ -72,6 +80,7 @@ def test_html_loading():
    assert isinstance(html.html, str)


+@pytest.mark.ok
 def test_anchor_links():
    r = get()
    r.html.skip_anchors = False
@@ -79,5 +88,46 @@ def test_anchor_links():
    assert '#site-map' in r.html.links


+@pytest.mark.render
+def test_render():
+    r = get()
+    script = """
+    () => {
+        return {
+            width: document.documentElement.clientWidth,
+            height: document.documentElement.clientHeight,
+            deviceScaleFactor: window.devicePixelRatio,
+        }
+    }
+    """
+    val = r.html.render(script=script)
+    for value in ('width', 'height', 'deviceScaleFactor'):
+        assert value in val
+
+    about = r.html.find('#about', first=True)
+    assert len(about.links) == 6
+
+
+@pytest.mark.render
+def test_bare_render():
+    doc = """<a href='https://httpbin.org'>"""
+    html = HTML(html=doc)
+    script = """
+        () => {
+            return {
+                width: document.documentElement.clientWidth,
+                height: document.documentElement.clientHeight,
+                deviceScaleFactor: window.devicePixelRatio,
+            }
+        }
+    """
+    val = html.render(script=script, reload=False)
+    for value in ('width', 'height', 'deviceScaleFactor'):
+        assert value in val
+
+    assert html.find('html')
+    assert 'https://httpbin.org' in html.links
+
+
 if __name__ == '__main__':
    test_xpath()