Merge branch 'master' into master

2026-06-05 23:00:20 +00:00 · 2018-09-18 02:51:54 -04:00
parent 50c9058d04 c47fd127b1
commit 625910e1a5
4 changed files with 27 additions and 8 deletions
@@ -0,0 +1,2 @@
+docs/source/_templates/*.html linguist-vendored=false
+tests/*.html linguist-vendored=false
@@ -9,6 +9,8 @@ Requests-HTML: HTML Parsing for Humans™
 This library intends to make parsing HTML (e.g. scraping the web) as
 simple and intuitive as possible.

+If you're interested in financially supporting Kenneth Reitz's open source work, consider `visiting this link <https://cash.me/$KennethReitz>`_. Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job.
+
 When using this library you automatically get:

 - **Full JavaScript support**!
@@ -103,7 +103,19 @@ Render out an :class:`Element <Element>`'s HTML:
    >>> about.html
    '<li aria-haspopup="true" class="tier-1 element-1 " id="about">\n<a class="" href="/about/" title="">About</a>\n<ul aria-hidden="true" class="subnav menu" role="menu">\n<li class="tier-2 element-1" role="treeitem"><a href="/about/apps/" title="">Applications</a></li>\n<li class="tier-2 element-2" role="treeitem"><a href="/about/quotes/" title="">Quotes</a></li>\n<li class="tier-2 element-3" role="treeitem"><a href="/about/gettingstarted/" title="">Getting Started</a></li>\n<li class="tier-2 element-4" role="treeitem"><a href="/about/help/" title="">Help</a></li>\n<li class="tier-2 element-5" role="treeitem"><a href="http://brochure.getpython.info/" title="">Python Brochure</a></li>\n</ul>\n</li>'

+Grab an :class:`Element <Element>`'s root tag name:

+.. code-block:: pycon
+
+    >>> about.tag
+    'li'
+
+Show the line number on which an :class:`Element <Element>`'s root tag is located:
+
+.. code-block:: pycon
+
+    >>> about.lineno
+    249

 Select an :class:`Element <Element>` list within an :class:`Element <Element>`:

@@ -169,7 +181,7 @@ Let's grab some text that's rendered by JavaScript:

 Note, the first time you ever run the ``render()`` method, it will download
 Chromium into your home directory (e.g. ``~/.pyppeteer/``). This only happens
-once.
+once. You may also need to install a few `Linux packages <https://github.com/miyakogi/pyppeteer/issues/60>`_ to get pyppeteer working.

 Pagination
 ==========
@@ -378,6 +378,8 @@ class Element(BaseParser):
    def __init__(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None) -> None:
        super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding)
        self.element = element
+        self.tag = element.tag
+        self.lineno = element.sourceline
        self._attrs = None

    def __repr__(self) -> str:
@@ -408,7 +410,7 @@ class HTML(BaseParser):
    :param default_encoding: Which encoding to default to.
    """

-    def __init__(self, *, session: Union['HTTPSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None:
+    def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None:

        # Convert incoming unicode HTML into bytes.
        if isinstance(html, str):
@@ -527,9 +529,6 @@ class HTML(BaseParser):
            >>> r.html.render(script=script)
            {'width': 800, 'height': 600, 'deviceScaleFactor': 1}

-        Warning: If you use keep_page, you're responsible for closing each page, since
-        opening to many at scale may crach the browser.
-
        Warning: the first time you run this method, it will download
        Chromium into your home directory (``~/.pyppeteer``).
        """
@@ -567,9 +566,11 @@ class HTML(BaseParser):
                    page = None
                return content, result, page
            except TimeoutError:
+                await page.close()
+                page = None
                return None

-        self.session.browser  # Automatycally create an event loop and browser
+        self.session.browser  # Automatically create an event loop and browser
        content = None

        # Automatically set Reload to False, if example URL is being used.
@@ -645,7 +646,7 @@ class HTMLSession(requests.Session):
    amongst other things.
    """

-    def __init__(self, mock_browser=True):
+    def __init__(self, mock_browser=True, browser_args=['--no-sandbox']):
        super(HTMLSession, self).__init__()

        # Mock a web browser's user agent.
@@ -654,6 +655,8 @@ class HTMLSession(requests.Session):

        self.hooks = {'response': self._handle_response}

+        self.__browser_args = browser_args
+
    @staticmethod
    def _handle_response(response, **kwargs) -> HTMLResponse:
        """Requests HTTP Response handler. Attaches .html property to
@@ -677,7 +680,7 @@ class HTMLSession(requests.Session):
    def browser(self):
        if not hasattr(self, "_browser"):
            self.loop = asyncio.get_event_loop()
-            self._browser = self.loop.run_until_complete(pyppeteer.launch(headless=True, args=['--no-sandbox']))
+            self._browser = self.loop.run_until_complete(pyppeteer.launch(headless=True, args=self.__browser_args))
        return self._browser

    def close(self):