From 21ce89ea4d384def792aad324f2170af0defeef1 Mon Sep 17 00:00:00 2001 From: Chyroc Date: Wed, 28 Feb 2018 20:08:54 +0800 Subject: [PATCH 1/8] clean and format code --- requests_html.py | 62 ++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/requests_html.py b/requests_html.py index ee2f6b2..0745edb 100644 --- a/requests_html.py +++ b/requests_html.py @@ -15,14 +15,12 @@ from parse import search as parse_search from parse import findall, Result from w3lib.encoding import html_to_unicode - DEFAULT_ENCODING = 'utf-8' DEFAULT_URL = 'https://example.org/' useragent = UserAgent() - class BaseParser: """A basic HTML/Element Parser, for Humans.""" @@ -114,13 +112,7 @@ class BaseParser: for found in self.pq(selector) ] - if first: - try: - return elements[0] - except IndexError: - return None - else: - return elements + return _get_first_or_list(elements, first) def xpath(self, selector: str, first: bool = False, _encoding: str = None): """Given an XPath selector, returns a list of :class:`Element ` objects. 
@@ -129,13 +121,8 @@ class BaseParser: If ``first`` is ``True``, only returns the first :class:`Element ` found.""" c = [Element(element=e, url=self.url, default_encoding=_encoding or self.encoding) for e in self.lxml.xpath(selector)] - if first: - try: - return c[0] - except IndexError: - return None - else: - return c + + return _get_first_or_list(c, first) def search(self, template: str) -> Result: """Searches the :class:`Element ` for the given parse template.""" @@ -150,14 +137,14 @@ class BaseParser: @property def links(self) -> Set[str]: """All found links on page, in as–is form.""" + def gen(): for link in self.find('a'): try: href = link.attrs['href'].strip() - if not(href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']: - if href: - yield href + if href and not (href.startswith('#') and self.skip_anchors and href in ['javascript:;']): + yield href except KeyError: pass @@ -168,6 +155,7 @@ class BaseParser: """All found links on page, in absolute form (`learn more `_). """ + def gen(): for link in self.links: # Parse the link with stdlib. @@ -197,12 +185,11 @@ class BaseParser: if base: return base.attrs['href'].strip() - else: - url = '/'.join(self.url.split('/')[:-1]) - if url.endswith('/'): - url = url[:-1] + url = '/'.join(self.url.split('/')[:-1]) + if url.endswith('/'): + url = url[:-1] - return url + return url class Element(BaseParser): @@ -213,10 +200,7 @@ class Element(BaseParser): self.element = element def __repr__(self) -> str: - attrs = [] - for attr in self.attrs: - attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) - + attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs] return "".format(repr(self.element.tag), ' '.join(attrs)) @property @@ -289,6 +273,7 @@ class HTML(BaseParser): Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). 
""" + async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int): try: browser = pyppeteer.launch(headless=True) @@ -344,10 +329,9 @@ class HTMLResponse(requests.Response): @property def html(self) -> HTML: - if self._html: - return self._html + if not self._html: + self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding) - self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding) return self._html @classmethod @@ -362,10 +346,17 @@ def user_agent(style='chrome') -> str: style. Defaults to a Chrome-style User-Agent. """ - if not style: - return useragent.random + return useragent[style] if style else useragent.random + + +def _get_first_or_list(l, first=True): + if first: + try: + return l[0] + except IndexError: + return None else: - return useragent[style] + return l class HTMLSession(requests.Session): @@ -395,6 +386,5 @@ class HTMLSession(requests.Session): def request(self, *args, **kwargs) -> HTMLResponse: # Convert Request object into HTTPRequest object. 
r = super(HTMLSession, self).request(*args, **kwargs) - html_r = HTMLResponse._from_response(r) - return html_r + return HTMLResponse._from_response(r) From 2504efb35fa7c6262de5dc8d75d46d7e10e848d2 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 08:17:46 -0500 Subject: [PATCH 2/8] support render of non-loaded websites Signed-off-by: Kenneth Reitz --- requests_html.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/requests_html.py b/requests_html.py index 0be722c..082ca5f 100644 --- a/requests_html.py +++ b/requests_html.py @@ -2,7 +2,7 @@ import sys import asyncio from urllib.parse import urlparse, urlunparse from concurrent.futures._base import TimeoutError -from typing import Set, Union, List, MutableMapping +from typing import Set, Union, List, MutableMapping, Optional import pyppeteer import requests @@ -325,7 +325,7 @@ class HTML(BaseParser): def __repr__(self) -> str: return "".format(repr(self.url)) - def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0): + def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0, reload: bool = True): """Reloads the response in Chromium, and replaces HTML content with an updated version, with JavaScript executed. @@ -361,13 +361,16 @@ class HTML(BaseParser): Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). """ - async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int): + async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, reload: bool = True, content: Optional[str]): try: browser = pyppeteer.launch(headless=True) page = await browser.newPage() # Load the given page (GET request, obviously.) 
- await page.goto(url) + if reload: + await page.goto(url) + else: + await page.setContent(content) result = None if script: @@ -395,7 +398,7 @@ class HTML(BaseParser): for i in range(retries): if not content: try: - content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, scrolldown=scrolldown)) + content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, content=self.html, reload=reload, scrolldown=scrolldown)) except TimeoutError: pass From e9c162f5f76c87452fb84a833287b313ae7932a4 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 08:20:34 -0500 Subject: [PATCH 3/8] next version Signed-off-by: Kenneth Reitz --- requests_html.py | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index 082ca5f..7311667 100644 --- a/requests_html.py +++ b/requests_html.py @@ -329,6 +329,12 @@ class HTML(BaseParser): """Reloads the response in Chromium, and replaces HTML content with an updated version, with JavaScript executed. + :param retries: The number of times to retry loading the page in Chromium. + :param script: JavaScript to execute upon page load (optional). + :param scrolldown: Integer, if provided, of how many times to page down. + :param sleep: Integer, if provided, of how long to sleep after initial render. + :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory. + If ``scrolldown`` is specified, the page will scrolldown the specified number of times, after sleeping the specified amount of time (e.g. ``scrolldown=10, sleep=1``). diff --git a/setup.py b/setup.py index 873dabe..b28f984 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ DESCRIPTION = 'HTML Parsing for Humans.' 
URL = 'https://github.com/kennethreitz/requests-html' EMAIL = 'me@kennethreitz.org' AUTHOR = 'Kenneth Reitz' -VERSION = '0.6.7' +VERSION = '0.6.8' # What packages are required for this module to be executed? REQUIRED = [ From 565cd496c2a37973182ab01575e991a58ca8c209 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 08:39:58 -0500 Subject: [PATCH 4/8] bare render tests Signed-off-by: Kenneth Reitz --- tests/test_requests_html.py | 43 +++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 5780bde..dbf02d8 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -44,8 +44,8 @@ def test_links(): r = get() about = r.html.find('#about', first=True) - len(about.links) == 6 - len(about.absolute_links) == 6 + assert len(about.links) == 6 + assert len(about.absolute_links) == 6 def test_search(): @@ -79,5 +79,44 @@ def test_anchor_links(): assert '#site-map' in r.html.links +def test_render(): + r = get() + script = """ + () => { + return { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + deviceScaleFactor: window.devicePixelRatio, + } + } + """ + val = r.html.render(script=script) + for value in ('width', 'height', 'deviceScaleFactor'): + assert value in val + + about = r.html.find('#about', first=True) + assert len(about.links) == 6 + + +def test_bare_render(): + doc = """""" + html = HTML(html=doc) + script = """ + () => { + return { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + deviceScaleFactor: window.devicePixelRatio, + } + } + """ + val = html.render(script=script, reload=False) + for value in ('width', 'height', 'deviceScaleFactor'): + assert value in val + + assert html.find('html') + assert 'https://httpbin.org' in html.links + + if __name__ == '__main__': test_xpath() From f2a99240655c18a1b719f75c2642fdb7615d2ff6 Mon 
Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 08:43:42 -0500 Subject: [PATCH 5/8] render without requests Signed-off-by: Kenneth Reitz --- docs/source/index.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index feb45aa..86182c7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -177,6 +177,27 @@ You can also use this library without Requests: >>> html.links {'https://httpbin.org'} +You can also render JavaScript pages without Requests: + +.. code-block:: pycon + + # ^^ proceeding from above ^^ + >>> script = """ + () => { + return { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + deviceScaleFactor: window.devicePixelRatio, + } + } + """ + >>> val = html.render(script=script, reload=False) + + >>> print(val) + {'width': 800, 'height': 600, 'deviceScaleFactor': 1} + + >>> print(html.html) + API Documentation ================= From dcb828f3a341a656f69aa54169c874da8cbc6e5c Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 08:44:24 -0500 Subject: [PATCH 6/8] fix Signed-off-by: Kenneth Reitz --- docs/source/index.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 86182c7..40b8905 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -183,13 +183,13 @@ You can also render JavaScript pages without Requests: # ^^ proceeding from above ^^ >>> script = """ - () => { - return { - width: document.documentElement.clientWidth, - height: document.documentElement.clientHeight, - deviceScaleFactor: window.devicePixelRatio, + () => { + return { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + deviceScaleFactor: window.devicePixelRatio, + } } - } """ >>> val = html.render(script=script, reload=False) From e531e5cab2105805ec8328a76dab4de2e4e109dd Mon Sep 17 00:00:00 2001 
From: Kenneth Reitz Date: Wed, 28 Feb 2018 08:49:38 -0500 Subject: [PATCH 7/8] updated tests for travis Signed-off-by: Kenneth Reitz --- Pipfile | 2 +- Pipfile.lock | 28 +--------------------------- tests/test_requests_html.py | 11 +++++++++++ 3 files changed, 13 insertions(+), 28 deletions(-) diff --git a/Pipfile b/Pipfile index c0dda7f..62550b6 100644 --- a/Pipfile +++ b/Pipfile @@ -28,4 +28,4 @@ mypy = "*" [scripts] -tests = "pytest" +tests = "pytest -v -m ok" diff --git a/Pipfile.lock b/Pipfile.lock index 03398aa..55c5b45 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "ef6f9504ed9751cf2f4c5aef06e59838981c79d84fa1d36fb5ce258d8dba189f" + "sha256": "cf67076e9c185c3bc951910b2a44b8b548ce954e0e3ff2a5bef1942d13275e8e" }, "host-environment-markers": { "implementation_name": "cpython", @@ -127,15 +127,6 @@ ], "version": "==0.0.10" }, - "pyqt5": { - "hashes": [ - "sha256:128285176240e990fce9c50293105ffd0d2884d8910bb338118f867b171ec6e8", - "sha256:dbd1777d8e7540a6e7350482f1d7c981a073ce1b7195ac2cd21c204b3a28df57", - "sha256:3563ac935fca8e8b1dbd4856d8eedc982b5de90c53f0280e8fca8060a262d4f4", - "sha256:2ce953cb849e5265b9d1abe075471148ad5fb6d7e6a9881f37dfe05590571d23" - ], - "version": "==5.10" - }, "pyquery": { "hashes": [ "sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3", @@ -150,23 +141,6 @@ ], "version": "==2.18.4" }, - "sip": { - "hashes": [ - "sha256:f31bb63e63a958f65887ae27f06e62af9f9cb818ba7456a99f78a5ec3082d3dd", - "sha256:776e169da554729f80337070348db49a6742d8aa317aec931a4d0f47b7ef535d", - "sha256:beac2bc1b9457a693fb3122c797cad5678a168ecff6ccbad4aa3a9f1ff1a2d86", - "sha256:1c5a1ad409e97833a4a873fae5bcd7a365651f7372806992d03891082821bc41", - "sha256:248ecca386d4832138f6a044dceb0bfc38fb8503b7ffbfeb474073f56930144b", - "sha256:ab338095e32ebb2047b6184f1383c667c47b9822d7320fdfb93870567a972343", - "sha256:ebea4619e9626e2eb197835049807c8173f11e2023b05140cbee4b274a91ef5e", - 
"sha256:2db24e65c99b7d20a67fa461f6bc2e15bddb6cd5fde52e37d6609566d79a69a1", - "sha256:3b45eecf6f68a29f5629dc064079e919987b030628bb6614da7f4eefedbe145e", - "sha256:2120e9d713120687558b6699cf5ff6a8f7b070776b19d6c7fc96fc64ea8ca056", - "sha256:18350ebf82beaef6a73d2c14320f19961242ed424670407df6fd5a9b65f0e7fc", - "sha256:92413edcb4fea75ebd1f8142c882dc5db398025eb8a0a273385838fd791de73c" - ], - "version": "==4.19.7" - }, "six": { "hashes": [ "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index dbf02d8..3dbcd2e 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -1,5 +1,6 @@ import os +import pytest from requests_html import HTMLSession, HTML from requests_file import FileAdapter @@ -14,11 +15,13 @@ def get(): return session.get(url) +@pytest.mark.ok def test_file_get(): r = get() assert r.status_code == 200 +@pytest.mark.ok def test_css_selector(): r = get() @@ -32,6 +35,7 @@ def test_css_selector(): assert menu_item in about.full_text.split('\n') +@pytest.mark.ok def test_attrs(): r = get() about = r.html.find('#about', first=True) @@ -40,6 +44,7 @@ def test_attrs(): assert len(about.attrs['class']) == 2 +@pytest.mark.ok def test_links(): r = get() about = r.html.find('#about', first=True) @@ -48,12 +53,14 @@ def test_links(): assert len(about.absolute_links) == 6 +@pytest.mark.ok def test_search(): r = get() style = r.html.search('Python is a {} language')[0] assert style == 'programming' +@pytest.mark.ok def test_xpath(): r = get() html = r.html.xpath('/html', first=True) @@ -63,6 +70,7 @@ def test_xpath(): assert '#site-map' in a_hrefs +@pytest.mark.ok def test_html_loading(): doc = """""" html = HTML(html=doc) @@ -72,6 +80,7 @@ def test_html_loading(): assert isinstance(html.html, str) +@pytest.mark.ok def test_anchor_links(): r = get() r.html.skip_anchors = False @@ -79,6 +88,7 @@ def test_anchor_links(): assert '#site-map' in r.html.links 
+@pytest.mark.render def test_render(): r = get() script = """ @@ -98,6 +108,7 @@ def test_render(): assert len(about.links) == 6 +@pytest.mark.render def test_bare_render(): doc = """""" html = HTML(html=doc) From 0b8fe5f6c68a55e7747c1575e79c95a8d3424729 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 08:53:45 -0500 Subject: [PATCH 8/8] working Signed-off-by: Kenneth Reitz --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 40b8905..ce87fe4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -199,6 +199,7 @@ You can also render JavaScript pages without Requests: >>> print(html.html) + API Documentation =================