From 132e5cb522db2fad73eb5f29e5f49481976dfc2f Mon Sep 17 00:00:00 2001 From: Sun Wei Date: Mon, 26 Mar 2018 17:57:50 +0800 Subject: [PATCH 01/20] Automatically? --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..0ab0a57 100644 --- a/requests_html.py +++ b/requests_html.py @@ -569,7 +569,7 @@ class HTML(BaseParser): except TimeoutError: return None - self.session.browser # Automatycally create a event loop and browser + self.session.browser # Automatically create a event loop and browser content = None # Automatically set Reload to False, if example URL is being used. From cb55034b4253044a8194d4b96b4047f71c467e0f Mon Sep 17 00:00:00 2001 From: Siddhesh Nachane Date: Sat, 31 Mar 2018 22:51:34 +0530 Subject: [PATCH 02/20] Made basic fixes 1. Corrected Comments and DocStrings Spell Errors. 2. Added .vscode folder to .gitignore 3. Replaced `i` with place holder `_` (as i is never used) --- .gitignore | 5 ++++- requests_html.py | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 52793b4..3f3912c 100644 --- a/.gitignore +++ b/.gitignore @@ -103,4 +103,7 @@ venv.bak/ /site # mypy -.mypy_cache/ \ No newline at end of file +.mypy_cache/ + +# Visual Studio Code +.vscode \ No newline at end of file diff --git a/requests_html.py b/requests_html.py index ea8f3bd..df612cc 100644 --- a/requests_html.py +++ b/requests_html.py @@ -569,14 +569,14 @@ class HTML(BaseParser): except TimeoutError: return None - self.session.browser # Automatycally create a event loop and browser + self.session.browser # Automatically create a event loop and browser content = None # Automatically set Reload to False, if example URL is being used. 
if self.url == DEFAULT_URL: reload = False - for i in range(retries): + for _ in range(retries): if not content: try: @@ -694,7 +694,7 @@ class AsyncHTMLSession(requests.Session): mock_browser: bool = True, *args, **kwargs): """ Set or create an event loop and a thread pool. - :param loop: Asyncio lopp to use. + :param loop: Asyncio loop to use. :param workers: Amount of threads to use for executing async calls. If not pass it will default to the number of processors on the machine, multiplied by 5. """ From c21f0784cd32fad3a18ec18d2a0705b9b010a2b0 Mon Sep 17 00:00:00 2001 From: Angus Dippenaar Date: Thu, 5 Apr 2018 13:47:39 +0200 Subject: [PATCH 03/20] Create LXML from raw_html Create LXML from `self.raw_html` instead of `self.html` to allow LXML to process plain XML pages as per beda42's findings in issue https://github.com/kennethreitz/requests-html/issues/145 I have tested this change with 200 sites and it seems to fix the issue. HTML pages seem to all be working as expected. I haven't run into an issue with any that I've tested. --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..0c3a989 100644 --- a/requests_html.py +++ b/requests_html.py @@ -159,7 +159,7 @@ class BaseParser: try: self._lxml = soup_parse(self.html, features='html.parser') except ValueError: - self._lxml = lxml.html.fromstring(self.html) + self._lxml = lxml.html.fromstring(self.raw_html) return self._lxml From 05ff6e87ca7206a241d3f1ba7ff49ebf7cda7217 Mon Sep 17 00:00:00 2001 From: Angus Dippenaar Date: Sat, 7 Apr 2018 17:15:51 +0200 Subject: [PATCH 04/20] Replace errors when decoding raw_html Some websites don't have valid bytes, even when the encoding is specified. I'm not 100% sure if replacing "bad" bytes is the correct way to fix the problem. It seems to fix the issues I've run into with some sites. 
--- requests_html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..6cdab39 100644 --- a/requests_html.py +++ b/requests_html.py @@ -102,7 +102,7 @@ class BaseParser: (`learn more `_). """ if self._html: - return self.raw_html.decode(self.encoding) + return self.raw_html.decode(self.encoding, errors='replace') else: return etree.tostring(self.element, encoding='unicode').strip() @@ -128,7 +128,7 @@ class BaseParser: self._encoding = html_to_unicode(self.default_encoding, self._html)[0] # Fall back to requests' detected encoding if decode fails. try: - self.raw_html.decode(self.encoding) + self.raw_html.decode(self.encoding, errors='replace') except UnicodeDecodeError: self._encoding = self.default_encoding From 50c9058d04e4b3a46074f4bcb22114f232451baf Mon Sep 17 00:00:00 2001 From: Shay Elmualem Date: Sat, 7 Apr 2018 22:18:46 +0300 Subject: [PATCH 05/20] Minor typo fix --- requests_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..90c26a0 100644 --- a/requests_html.py +++ b/requests_html.py @@ -527,7 +527,7 @@ class HTML(BaseParser): >>> r.html.render(script=script) {'width': 800, 'height': 600, 'deviceScaleFactor': 1} - Warning: If you use keep_page, you're responsable for closing each page, since + Warning: If you use keep_page, you're responsible for closing each page, since opening to many at scale may crach the browser. Warning: the first time you run this method, it will download @@ -569,7 +569,7 @@ class HTML(BaseParser): except TimeoutError: return None - self.session.browser # Automatycally create a event loop and browser + self.session.browser # Automatycally create an event loop and browser content = None # Automatically set Reload to False, if example URL is being used. 
@@ -694,7 +694,7 @@ class AsyncHTMLSession(requests.Session): mock_browser: bool = True, *args, **kwargs): """ Set or create an event loop and a thread pool. - :param loop: Asyncio lopp to use. + :param loop: Asyncio loop to use. :param workers: Amount of threads to use for executing async calls. If not pass it will default to the number of processors on the machine, multiplied by 5. """ From 2a7d08722d9d28c23efac63e6191385f093aa38c Mon Sep 17 00:00:00 2001 From: Angus Dippenaar Date: Sat, 14 Apr 2018 21:32:00 +0200 Subject: [PATCH 06/20] Initialize PyQuery with lxml PyQuery with XML sites also has the same issue that LXML does with unicode encoded strings because it uses LXML to parse the page. The fix has already been applied to LXML, so we can fix the issue with PyQuery by passing the already parsed LXML into PyQuery. --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index 0c3a989..4c8a59f 100644 --- a/requests_html.py +++ b/requests_html.py @@ -146,7 +146,7 @@ class BaseParser: of the :class:`Element ` or :class:`HTML `. """ if self._pq is None: - self._pq = PyQuery(self.html) + self._pq = PyQuery(self.lxml) return self._pq From 81998d84c4df20df7cb9cfa64c2235d36ab81610 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Mon, 7 May 2018 01:32:59 +0800 Subject: [PATCH 07/20] Docs: Add note to install Linux packages I ran into `pyppeteer.errors.BrowserError: Failed to connect to browser port:` and after a bit of snooping found that some Linux packages needed to be installed on my machine for pyppeteer to run. Suggest to add a note to save others time! 
--- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 3ddc900..0106508 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -169,7 +169,7 @@ Let's grab some text that's rendered by JavaScript: Note, the first time you ever run the ``render()`` method, it will download Chromium into your home directory (e.g. ``~/.pyppeteer/``). This only happens -once. +once. You may also need to install a few `Linux packages `_ to get pyppeteer working. Pagination ========== From 956e60054c18f10ce5439a55eaff33c46de68afe Mon Sep 17 00:00:00 2001 From: carrionc Date: Wed, 30 May 2018 00:40:37 -0400 Subject: [PATCH 08/20] Multiple chromium tab fix Within the render function, the page is rendered through the _async_render function. This function will try to render content by first creating a page, and currently will only close said page if the content is generated. However, if at any point there's a timeout beforehand, the current page isn't closed, and instead _async_render will be called again [as per the # assigned to retries in render()] and end up leaving behind an unused page. This change will enable render to close the "failed" attempt BEFORE opening a new page to try again, and should fix the issue of massive cpu buildup with multiple chromium instances. Sorry if this is messy, it's my first time using git to make a change. --- requests_html.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..cd5f01f 100644 --- a/requests_html.py +++ b/requests_html.py @@ -567,6 +567,8 @@ class HTML(BaseParser): page = None return content, result, page except TimeoutError: + await page.close() + page = None return None self.session.browser # Automatycally create a event loop and browser From a1c5e6ac8b8f46f0c827792c0b0eb5cce45f191e Mon Sep 17 00:00:00 2001 From: "Robson D. 
Montenegro" Date: Sun, 3 Jun 2018 21:52:51 +0100 Subject: [PATCH 09/20] fix: typo --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..8470f40 100644 --- a/requests_html.py +++ b/requests_html.py @@ -528,7 +528,7 @@ class HTML(BaseParser): {'width': 800, 'height': 600, 'deviceScaleFactor': 1} Warning: If you use keep_page, you're responsable for closing each page, since - opening to many at scale may crach the browser. + opening to many at scale may crash the browser. Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). From 96dbba8fbd260ed2b3bdd4bd20dcc0db58115265 Mon Sep 17 00:00:00 2001 From: Martin Rotwang Date: Tue, 5 Jun 2018 12:39:46 +0200 Subject: [PATCH 10/20] Update requests_html.py e.g. to add a proxy setting usage: s=Session(browser_args=['--no-sandbox', '--proxy-server=127.0.0.1:9876']) @see: https://github.com/GoogleChrome/puppeteer/issues/336 --- requests_html.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..26a38f6 100644 --- a/requests_html.py +++ b/requests_html.py @@ -645,7 +645,7 @@ class HTMLSession(requests.Session): amongst other things. """ - def __init__(self, mock_browser=True): + def __init__(self, mock_browser=True, browser_args=['--no-sandbox']): super(HTMLSession, self).__init__() # Mock a web browser's user agent. @@ -654,6 +654,8 @@ class HTMLSession(requests.Session): self.hooks = {'response': self._handle_response} + self.__browser_args = browser_args + @staticmethod def _handle_response(response, **kwargs) -> HTMLResponse: """Requests HTTP Response handler. 
Attaches .html property to @@ -677,7 +679,7 @@ class HTMLSession(requests.Session): def browser(self): if not hasattr(self, "_browser"): self.loop = asyncio.get_event_loop() - self._browser = self.loop.run_until_complete(pyppeteer.launch(headless=True, args=['--no-sandbox'])) + self._browser = self.loop.run_until_complete(pyppeteer.launch(headless=True, args=self.__browser_args)) return self._browser def close(self): From 4db2931ddc78ec1ae75ab58f44941e3b6ec22ea1 Mon Sep 17 00:00:00 2001 From: Meet Mangukiya Date: Sun, 24 Jun 2018 22:23:47 +0530 Subject: [PATCH 11/20] requests_html.py: Typo HTTPSession -> HTMLSession --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..4f0944f 100644 --- a/requests_html.py +++ b/requests_html.py @@ -408,7 +408,7 @@ class HTML(BaseParser): :param default_encoding: Which encoding to default to. """ - def __init__(self, *, session: Union['HTTPSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: + def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: # Convert incoming unicode HTML into bytes. if isinstance(html, str): From 71e2571d3a34f9b3bd84448bea2dcdec0a8655c1 Mon Sep 17 00:00:00 2001 From: Timo Date: Mon, 25 Jun 2018 13:18:57 +0200 Subject: [PATCH 12/20] Fix minor typo lopp -> loop in init docstring of AsyncHTMLSession. --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..6e29e20 100644 --- a/requests_html.py +++ b/requests_html.py @@ -694,7 +694,7 @@ class AsyncHTMLSession(requests.Session): mock_browser: bool = True, *args, **kwargs): """ Set or create an event loop and a thread pool. - :param loop: Asyncio lopp to use. + :param loop: Asyncio loop to use. 
:param workers: Amount of threads to use for executing async calls. If not pass it will default to the number of processors on the machine, multiplied by 5. """ From 6fef1d8583e211d4f96b45093b09c1f0840cdce3 Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Mon, 2 Jul 2018 21:44:00 +0800 Subject: [PATCH 13/20] new: dev: ignore html count --- .gitattributes | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..4aa3654 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +docs/source/_templates/hacks.html linguist-vendored=false +docs/source/_templates/sidebarintro.html linguist-vendored=false +docs/source/_templates/sidebarlogo.html linguist-vendored=false +tests/python.html linguist-vendored=false From 96973aaf4e8d02b2826edd2260c09941a3778990 Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Mon, 2 Jul 2018 21:49:28 +0800 Subject: [PATCH 14/20] chg: dev: generic html --- .gitattributes | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.gitattributes b/.gitattributes index 4aa3654..53588d4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,2 @@ -docs/source/_templates/hacks.html linguist-vendored=false -docs/source/_templates/sidebarintro.html linguist-vendored=false -docs/source/_templates/sidebarlogo.html linguist-vendored=false -tests/python.html linguist-vendored=false +docs/source/_templates/*.html linguist-vendored=false +tests/*.html linguist-vendored=false From 116a4b08eb44214a96b80552fa523ca3bea4667d Mon Sep 17 00:00:00 2001 From: Li Yun <3425791734@qq.com> Date: Wed, 4 Jul 2018 11:20:08 +0800 Subject: [PATCH 15/20] Add "tag" attribute for Element object --- docs/source/index.rst | 5 +++++ requests_html.py | 1 + 2 files changed, 6 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 3ddc900..e6e2618 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -103,7 +103,12 @@ Render out 
an :class:`Element `'s HTML: >>> about.html '
  • \nAbout\n\n
  • ' +Grab an :class:`Element `'s root tag name: +.. code-block:: pycon + + >>> about.tag + 'li' Select an :class:`Element ` list within an :class:`Element `: diff --git a/requests_html.py b/requests_html.py index ea8f3bd..134f432 100644 --- a/requests_html.py +++ b/requests_html.py @@ -378,6 +378,7 @@ class Element(BaseParser): def __init__(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None) -> None: super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding) self.element = element + self.tag = element.tag self._attrs = None def __repr__(self) -> str: From 1c21f636728aa2815c6fc2a598e2974a9c6026b9 Mon Sep 17 00:00:00 2001 From: Li Yun <3425791734@qq.com> Date: Wed, 4 Jul 2018 11:30:59 +0800 Subject: [PATCH 16/20] Add "lineno" attribute for Element object --- docs/source/index.rst | 7 +++++++ requests_html.py | 1 + 2 files changed, 8 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index e6e2618..1e891d7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -110,6 +110,13 @@ Grab an :class:`Element `'s root tag name: >>> about.tag 'li' +Show the line number where an :class:`Element `'s root tag is located: + +.. code-block:: pycon + + >>> about.lineno + 249 + Select an :class:`Element ` list within an :class:`Element `: ..
code-block:: pycon diff --git a/requests_html.py b/requests_html.py index 134f432..42c1f8d 100644 --- a/requests_html.py +++ b/requests_html.py @@ -379,6 +379,7 @@ class Element(BaseParser): super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding) self.element = element self.tag = element.tag + self.lineno = element.sourceline self._attrs = None def __repr__(self) -> str: From cb6e5fb55798762e352775eba14c0bd0cb7bf037 Mon Sep 17 00:00:00 2001 From: m9mhmdy <42245918+m9mhmdy@users.noreply.github.com> Date: Thu, 30 Aug 2018 16:25:25 +0200 Subject: [PATCH 17/20] Fix a small typo --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..0ab0a57 100644 --- a/requests_html.py +++ b/requests_html.py @@ -569,7 +569,7 @@ class HTML(BaseParser): except TimeoutError: return None - self.session.browser # Automatycally create a event loop and browser + self.session.browser # Automatically create a event loop and browser content = None # Automatically set Reload to False, if example URL is being used. From 6190a47eef5014e149acaefd0ed782a77abfffe6 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Mon, 17 Sep 2018 08:06:43 -0400 Subject: [PATCH 18/20] Update README.rst --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index 470d043..342dabf 100644 --- a/README.rst +++ b/README.rst @@ -9,6 +9,8 @@ Requests-HTML: HTML Parsing for Humans™ This library intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. +**If you're interested in financially supporting Kenneth Reitz open source, consider [visiting this link](https://cash.me/$KennethReitz). Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job.** + When using this library you automatically get: - **Full JavaScript support**! 
From d159d2045a35b86e93f011dacd67e031d9c26ed1 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Mon, 17 Sep 2018 08:07:22 -0400 Subject: [PATCH 19/20] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 342dabf..84eb9d5 100644 --- a/README.rst +++ b/README.rst @@ -9,7 +9,7 @@ Requests-HTML: HTML Parsing for Humans™ This library intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. -**If you're interested in financially supporting Kenneth Reitz open source, consider [visiting this link](https://cash.me/$KennethReitz). Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job.** +If you're interested in financially supporting Kenneth Reitz open source, consider `visiting this link _. Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job. When using this library you automatically get: From ee34f3f9eaaaba9f1a440c6612413cf8f220ecd1 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Mon, 17 Sep 2018 08:07:49 -0400 Subject: [PATCH 20/20] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 84eb9d5..141abba 100644 --- a/README.rst +++ b/README.rst @@ -9,7 +9,7 @@ Requests-HTML: HTML Parsing for Humans™ This library intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. -If you're interested in financially supporting Kenneth Reitz open source, consider `visiting this link _. Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job. +If you're interested in financially supporting Kenneth Reitz open source, consider `visiting this link `_. Your support helps tremendously with sustainability of motivation, as Open Source is no longer part of my day job. 
When using this library you automatically get: