From 81998d84c4df20df7cb9cfa64c2235d36ab81610 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Mon, 7 May 2018 01:32:59 +0800 Subject: [PATCH 01/14] Docs: Add note to install Linux packages I ran into `pyppeteer.errors.BrowserError: Failed to connect to browser port:` and after a bit of snooping found that some Linux packages needed to be installed on my machine for pyppeteer to run. Suggest to add a note to save others time! --- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 3ddc900..0106508 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -169,7 +169,7 @@ Let's grab some text that's rendered by JavaScript: Note, the first time you ever run the ``render()`` method, it will download Chromium into your home directory (e.g. ``~/.pyppeteer/``). This only happens -once. +once. You may also need to install a few `Linux packages `_ to get pyppeteer working. Pagination ========== From 956e60054c18f10ce5439a55eaff33c46de68afe Mon Sep 17 00:00:00 2001 From: carrionc Date: Wed, 30 May 2018 00:40:37 -0400 Subject: [PATCH 02/14] Multiple chromium tab fix Within the render function, the page is rendered through the _async_render function. This function will try to render content by first creating a page, and currently will only close said page if the content is generated. However, if at any point there's a timeout beforehand, the current page isn't closed, and instead _async_render will be called again [as per the # assigned to retries in render()] and end up leaving behind an unused page. This change will enable render to close the "failed" attempt BEFORE opening a new page to try again, and should fix the issue of massive cpu buildup with multiple chromium instances. Sorry if this is messy, it's my first time using git to make a change. 
--- requests_html.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..cd5f01f 100644 --- a/requests_html.py +++ b/requests_html.py @@ -567,6 +567,8 @@ class HTML(BaseParser): page = None return content, result, page except TimeoutError: + await page.close() + page = None return None self.session.browser # Automatycally create a event loop and browser From a1c5e6ac8b8f46f0c827792c0b0eb5cce45f191e Mon Sep 17 00:00:00 2001 From: "Robson D. Montenegro" Date: Sun, 3 Jun 2018 21:52:51 +0100 Subject: [PATCH 03/14] fix: typo --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..8470f40 100644 --- a/requests_html.py +++ b/requests_html.py @@ -528,7 +528,7 @@ class HTML(BaseParser): {'width': 800, 'height': 600, 'deviceScaleFactor': 1} Warning: If you use keep_page, you're responsable for closing each page, since - opening to many at scale may crach the browser. + opening to many at scale may crash the browser. Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). From 96dbba8fbd260ed2b3bdd4bd20dcc0db58115265 Mon Sep 17 00:00:00 2001 From: Martin Rotwang Date: Tue, 5 Jun 2018 12:39:46 +0200 Subject: [PATCH 04/14] Update requests_html.py e.g. to add a proxy setting usage: s=Session(browser_args=['--no-sandbox', '--proxy-server=127.0.0.1:9876']) @see: https://github.com/GoogleChrome/puppeteer/issues/336 --- requests_html.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..26a38f6 100644 --- a/requests_html.py +++ b/requests_html.py @@ -645,7 +645,7 @@ class HTMLSession(requests.Session): amongst other things. """ - def __init__(self, mock_browser=True): + def __init__(self, mock_browser=True, browser_args=['--no-sandbox']): super(HTMLSession, self).__init__() # Mock a web browser's user agent. 
@@ -654,6 +654,8 @@ class HTMLSession(requests.Session): self.hooks = {'response': self._handle_response} + self.__browser_args = browser_args + @staticmethod def _handle_response(response, **kwargs) -> HTMLResponse: """Requests HTTP Response handler. Attaches .html property to @@ -677,7 +679,7 @@ class HTMLSession(requests.Session): def browser(self): if not hasattr(self, "_browser"): self.loop = asyncio.get_event_loop() - self._browser = self.loop.run_until_complete(pyppeteer.launch(headless=True, args=['--no-sandbox'])) + self._browser = self.loop.run_until_complete(pyppeteer.launch(headless=True, args=self.__browser_args)) return self._browser def close(self): From 4db2931ddc78ec1ae75ab58f44941e3b6ec22ea1 Mon Sep 17 00:00:00 2001 From: Meet Mangukiya Date: Sun, 24 Jun 2018 22:23:47 +0530 Subject: [PATCH 05/14] requests_html.py: Typo HTTPSession -> HTMLSession --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..4f0944f 100644 --- a/requests_html.py +++ b/requests_html.py @@ -408,7 +408,7 @@ class HTML(BaseParser): :param default_encoding: Which encoding to default to. """ - def __init__(self, *, session: Union['HTTPSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: + def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: # Convert incoming unicode HTML into bytes. if isinstance(html, str): From 71e2571d3a34f9b3bd84448bea2dcdec0a8655c1 Mon Sep 17 00:00:00 2001 From: Timo Date: Mon, 25 Jun 2018 13:18:57 +0200 Subject: [PATCH 06/14] Fix minor typo lopp -> loop in init docstring of AsyncHTMLSession. 
--- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..6e29e20 100644 --- a/requests_html.py +++ b/requests_html.py @@ -694,7 +694,7 @@ class AsyncHTMLSession(requests.Session): mock_browser: bool = True, *args, **kwargs): """ Set or create an event loop and a thread pool. - :param loop: Asyncio lopp to use. + :param loop: Asyncio loop to use. :param workers: Amount of threads to use for executing async calls. If not pass it will default to the number of processors on the machine, multiplied by 5. """ From 6fef1d8583e211d4f96b45093b09c1f0840cdce3 Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Mon, 2 Jul 2018 21:44:00 +0800 Subject: [PATCH 07/14] new: dev: ignore html count --- .gitattributes | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..4aa3654 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +docs/source/_templates/hacks.html linguist-vendored=false +docs/source/_templates/sidebarintro.html linguist-vendored=false +docs/source/_templates/sidebarlogo.html linguist-vendored=false +tests/python.html linguist-vendored=false From 96973aaf4e8d02b2826edd2260c09941a3778990 Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Mon, 2 Jul 2018 21:49:28 +0800 Subject: [PATCH 08/14] chg: dev: generic html --- .gitattributes | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.gitattributes b/.gitattributes index 4aa3654..53588d4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,2 @@ -docs/source/_templates/hacks.html linguist-vendored=false -docs/source/_templates/sidebarintro.html linguist-vendored=false -docs/source/_templates/sidebarlogo.html linguist-vendored=false -tests/python.html linguist-vendored=false +docs/source/_templates/*.html linguist-vendored=false +tests/*.html linguist-vendored=false From 
116a4b08eb44214a96b80552fa523ca3bea4667d Mon Sep 17 00:00:00 2001 From: Li Yun <3425791734@qq.com> Date: Wed, 4 Jul 2018 11:20:08 +0800 Subject: [PATCH 09/14] Add "tag" attribute for Element object --- docs/source/index.rst | 5 +++++ requests_html.py | 1 + 2 files changed, 6 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 3ddc900..e6e2618 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -103,7 +103,12 @@ Render out an :class:`Element `'s HTML: >>> about.html '
  • \nAbout\n\n
  • ' +Grab an :class:`Element <Element>`'s root tag name: +.. code-block:: pycon + + >>> about.tag + 'li' Select an :class:`Element <Element>` list within an :class:`Element <Element>`: diff --git a/requests_html.py b/requests_html.py index ea8f3bd..134f432 100644 --- a/requests_html.py +++ b/requests_html.py @@ -378,6 +378,7 @@ class Element(BaseParser): def __init__(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None) -> None: super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding) self.element = element + self.tag = element.tag self._attrs = None def __repr__(self) -> str: From 1c21f636728aa2815c6fc2a598e2974a9c6026b9 Mon Sep 17 00:00:00 2001 From: Li Yun <3425791734@qq.com> Date: Wed, 4 Jul 2018 11:30:59 +0800 Subject: [PATCH 10/14] Add "lineno" attribute for Element object --- docs/source/index.rst | 7 +++++++ requests_html.py | 1 + 2 files changed, 8 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index e6e2618..1e891d7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -110,6 +110,13 @@ Grab an :class:`Element <Element>`'s root tag name: >>> about.tag 'li' +Show the line number where an :class:`Element <Element>`'s root tag is located: + +.. code-block:: pycon + + >>> about.lineno + 249 + Select an :class:`Element <Element>` list within an :class:`Element <Element>`: .. 
code-block:: pycon diff --git a/requests_html.py b/requests_html.py index 134f432..42c1f8d 100644 --- a/requests_html.py +++ b/requests_html.py @@ -379,6 +379,7 @@ class Element(BaseParser): super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding) self.element = element self.tag = element.tag + self.lineno = element.sourceline self._attrs = None def __repr__(self) -> str: From cb6e5fb55798762e352775eba14c0bd0cb7bf037 Mon Sep 17 00:00:00 2001 From: m9mhmdy <42245918+m9mhmdy@users.noreply.github.com> Date: Thu, 30 Aug 2018 16:25:25 +0200 Subject: [PATCH 11/14] Fix a small typo --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..0ab0a57 100644 --- a/requests_html.py +++ b/requests_html.py @@ -569,7 +569,7 @@ class HTML(BaseParser): except TimeoutError: return None - self.session.browser # Automatycally create a event loop and browser + self.session.browser # Automatically create an event loop and browser content = None # Automatically set Reload to False, if example URL is being used. From 6190a47eef5014e149acaefd0ed782a77abfffe6 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Mon, 17 Sep 2018 08:06:43 -0400 Subject: [PATCH 12/14] Update README.rst --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index 470d043..342dabf 100644 --- a/README.rst +++ b/README.rst @@ -9,6 +9,8 @@ Requests-HTML: HTML Parsing for Humans™ This library intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. +**If you're interested in financially supporting Kenneth Reitz open source, consider [visiting this link](https://cash.me/$KennethReitz). Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job.** + When using this library you automatically get: - **Full JavaScript support**! 
From d159d2045a35b86e93f011dacd67e031d9c26ed1 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Mon, 17 Sep 2018 08:07:22 -0400 Subject: [PATCH 13/14] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 342dabf..84eb9d5 100644 --- a/README.rst +++ b/README.rst @@ -9,7 +9,7 @@ Requests-HTML: HTML Parsing for Humans™ This library intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. -**If you're interested in financially supporting Kenneth Reitz open source, consider [visiting this link](https://cash.me/$KennethReitz). Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job.** +If you're interested in financially supporting Kenneth Reitz open source, consider `visiting this link <https://cash.me/$KennethReitz>_. Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job. When using this library you automatically get: From ee34f3f9eaaaba9f1a440c6612413cf8f220ecd1 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Mon, 17 Sep 2018 08:07:49 -0400 Subject: [PATCH 14/14] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 84eb9d5..141abba 100644 --- a/README.rst +++ b/README.rst @@ -9,7 +9,7 @@ Requests-HTML: HTML Parsing for Humans™ This library intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. -If you're interested in financially supporting Kenneth Reitz open source, consider `visiting this link <https://cash.me/$KennethReitz>_. Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job. +If you're interested in financially supporting Kenneth Reitz open source, consider `visiting this link <https://cash.me/$KennethReitz>`_. Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job. 
When using this library you automatically get: