diff --git a/Pipfile b/Pipfile index c0dda7f..62550b6 100644 --- a/Pipfile +++ b/Pipfile @@ -28,4 +28,4 @@ mypy = "*" [scripts] -tests = "pytest" +tests = "pytest -v -m ok" diff --git a/Pipfile.lock b/Pipfile.lock index 03398aa..55c5b45 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "ef6f9504ed9751cf2f4c5aef06e59838981c79d84fa1d36fb5ce258d8dba189f" + "sha256": "cf67076e9c185c3bc951910b2a44b8b548ce954e0e3ff2a5bef1942d13275e8e" }, "host-environment-markers": { "implementation_name": "cpython", @@ -127,15 +127,6 @@ ], "version": "==0.0.10" }, - "pyqt5": { - "hashes": [ - "sha256:128285176240e990fce9c50293105ffd0d2884d8910bb338118f867b171ec6e8", - "sha256:dbd1777d8e7540a6e7350482f1d7c981a073ce1b7195ac2cd21c204b3a28df57", - "sha256:3563ac935fca8e8b1dbd4856d8eedc982b5de90c53f0280e8fca8060a262d4f4", - "sha256:2ce953cb849e5265b9d1abe075471148ad5fb6d7e6a9881f37dfe05590571d23" - ], - "version": "==5.10" - }, "pyquery": { "hashes": [ "sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3", @@ -150,23 +141,6 @@ ], "version": "==2.18.4" }, - "sip": { - "hashes": [ - "sha256:f31bb63e63a958f65887ae27f06e62af9f9cb818ba7456a99f78a5ec3082d3dd", - "sha256:776e169da554729f80337070348db49a6742d8aa317aec931a4d0f47b7ef535d", - "sha256:beac2bc1b9457a693fb3122c797cad5678a168ecff6ccbad4aa3a9f1ff1a2d86", - "sha256:1c5a1ad409e97833a4a873fae5bcd7a365651f7372806992d03891082821bc41", - "sha256:248ecca386d4832138f6a044dceb0bfc38fb8503b7ffbfeb474073f56930144b", - "sha256:ab338095e32ebb2047b6184f1383c667c47b9822d7320fdfb93870567a972343", - "sha256:ebea4619e9626e2eb197835049807c8173f11e2023b05140cbee4b274a91ef5e", - "sha256:2db24e65c99b7d20a67fa461f6bc2e15bddb6cd5fde52e37d6609566d79a69a1", - "sha256:3b45eecf6f68a29f5629dc064079e919987b030628bb6614da7f4eefedbe145e", - "sha256:2120e9d713120687558b6699cf5ff6a8f7b070776b19d6c7fc96fc64ea8ca056", - "sha256:18350ebf82beaef6a73d2c14320f19961242ed424670407df6fd5a9b65f0e7fc", - 
"sha256:92413edcb4fea75ebd1f8142c882dc5db398025eb8a0a273385838fd791de73c" - ], - "version": "==4.19.7" - }, "six": { "hashes": [ "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", diff --git a/docs/source/index.rst b/docs/source/index.rst index feb45aa..ce87fe4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -177,6 +177,28 @@ You can also use this library without Requests: >>> html.links {'https://httpbin.org'} +You can also render JavaScript pages without Requests: + +.. code-block:: pycon + + # ^^ proceeding from above ^^ + >>> script = """ + () => { + return { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + deviceScaleFactor: window.devicePixelRatio, + } + } + """ + >>> val = html.render(script=script, reload=False) + + >>> print(val) + {'width': 800, 'height': 600, 'deviceScaleFactor': 1} + + >>> print(html.html) + + API Documentation ================= diff --git a/requests_html.py b/requests_html.py index a90750a..7ab12e0 100644 --- a/requests_html.py +++ b/requests_html.py @@ -2,7 +2,7 @@ import sys import asyncio from urllib.parse import urlparse, urlunparse from concurrent.futures._base import TimeoutError -from typing import Set, Union, List, MutableMapping +from typing import Set, Union, List, MutableMapping, Optional import pyppeteer import requests @@ -16,7 +16,6 @@ from parse import search as parse_search from parse import findall, Result from w3lib.encoding import html_to_unicode - DEFAULT_ENCODING = 'utf-8' DEFAULT_URL = 'https://example.org/' @@ -45,7 +44,6 @@ try: except AssertionError: raise RuntimeError('Requests-HTML requires Python 3.6+!') - class BaseParser: """A basic HTML/Element Parser, for Humans. 
@@ -155,13 +153,7 @@ class BaseParser: for found in self.pq(selector) ] - if first: - try: - return elements[0] - except IndexError: - return None - else: - return elements + return _get_first_or_list(elements, first) def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath: """Given an XPath selector, returns a list of @@ -189,13 +181,7 @@ class BaseParser: for selection in selected ] - if first: - try: - return elements[0] - except IndexError: - return None - else: - return elements + return _get_first_or_list(elements, first) def search(self, template: str) -> Result: """Searches the :class:`Element <Element>` for the given Parse template. @@ -216,14 +202,14 @@ class BaseParser: @property def links(self) -> _Links: """All found links on page, in as–is form.""" + def gen(): for link in self.find('a'): try: href = link.attrs['href'].strip() - if not(href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']: - if href: - yield href + if href and not (href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']: + yield href except KeyError: pass @@ -234,6 +220,7 @@ class BaseParser: """All found links on page, in absolute form (`learn more `_). """ + def gen(): for link in self.links: # Parse the link with stdlib. 
@@ -263,12 +250,11 @@ class BaseParser: if base: return base.attrs['href'].strip() - else: - url = '/'.join(self.url.split('/')[:-1]) - if url.endswith('/'): - url = url[:-1] + url = '/'.join(self.url.split('/')[:-1]) + if url.endswith('/'): + url = url[:-1] - return url + return url class Element(BaseParser): @@ -284,10 +270,7 @@ class Element(BaseParser): self.element = element def __repr__(self) -> str: - attrs = [] - for attr in self.attrs: - attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) - + attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs] return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs)) @property @@ -329,10 +312,16 @@ class HTML(BaseParser): def __repr__(self) -> str: return "<HTML url={}>".format(repr(self.url)) - def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0): + def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0, reload: bool = True): """Reloads the response in Chromium, and replaces HTML content with an updated version, with JavaScript executed. + :param retries: The number of times to retry loading the page in Chromium. + :param script: JavaScript to execute upon page load (optional). + :param scrolldown: Integer, if provided, of how many times to page down. + :param sleep: Integer, if provided, of how long to sleep after initial render. + :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory. + If ``scrolldown`` is specified, the page will scrolldown the specified number of times, after sleeping the specified amount of time (e.g. ``scrolldown=10, sleep=1``). @@ -365,13 +354,16 @@ class HTML(BaseParser): Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). 
""" - async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int): + async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, reload: bool = True, content: Optional[str]): try: browser = pyppeteer.launch(headless=True) page = await browser.newPage() # Load the given page (GET request, obviously.) - await page.goto(url) + if reload: + await page.goto(url) + else: + await page.setContent(content) result = None if script: @@ -399,7 +391,7 @@ class HTML(BaseParser): for i in range(retries): if not content: try: - content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, scrolldown=scrolldown)) + content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, content=self.html, reload=reload, scrolldown=scrolldown)) except TimeoutError: pass @@ -419,10 +411,9 @@ class HTMLResponse(requests.Response): @property def html(self) -> HTML: - if self._html: - return self._html + if not self._html: + self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding) - self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding) return self._html @classmethod @@ -437,10 +428,17 @@ def user_agent(style='chrome') -> _UserAgent: style. Defaults to a Chrome-style User-Agent. """ - if not style: - return useragent.random + return useragent[style] if style else useragent.random + + +def _get_first_or_list(l, first=True): + if first: + try: + return l[0] + except IndexError: + return None else: - return useragent[style] + return l class HTMLSession(requests.Session): @@ -473,6 +471,5 @@ class HTMLSession(requests.Session): """ # Convert Request object into HTTPRequest object. 
r = super(HTMLSession, self).request(*args, **kwargs) - html_r = HTMLResponse._from_response(r) - return html_r + return HTMLResponse._from_response(r) diff --git a/setup.py b/setup.py index 873dabe..b28f984 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ DESCRIPTION = 'HTML Parsing for Humans.' URL = 'https://github.com/kennethreitz/requests-html' EMAIL = 'me@kennethreitz.org' AUTHOR = 'Kenneth Reitz' -VERSION = '0.6.7' +VERSION = '0.6.8' # What packages are required for this module to be executed? REQUIRED = [ diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 5780bde..3dbcd2e 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -1,5 +1,6 @@ import os +import pytest from requests_html import HTMLSession, HTML from requests_file import FileAdapter @@ -14,11 +15,13 @@ def get(): return session.get(url) +@pytest.mark.ok def test_file_get(): r = get() assert r.status_code == 200 +@pytest.mark.ok def test_css_selector(): r = get() @@ -32,6 +35,7 @@ def test_css_selector(): assert menu_item in about.full_text.split('\n') +@pytest.mark.ok def test_attrs(): r = get() about = r.html.find('#about', first=True) @@ -40,20 +44,23 @@ def test_attrs(): assert len(about.attrs['class']) == 2 +@pytest.mark.ok def test_links(): r = get() about = r.html.find('#about', first=True) - len(about.links) == 6 - len(about.absolute_links) == 6 + assert len(about.links) == 6 + assert len(about.absolute_links) == 6 +@pytest.mark.ok def test_search(): r = get() style = r.html.search('Python is a {} language')[0] assert style == 'programming' +@pytest.mark.ok def test_xpath(): r = get() html = r.html.xpath('/html', first=True) @@ -63,6 +70,7 @@ def test_xpath(): assert '#site-map' in a_hrefs +@pytest.mark.ok def test_html_loading(): doc = """""" html = HTML(html=doc) @@ -72,6 +80,7 @@ def test_html_loading(): assert isinstance(html.html, str) +@pytest.mark.ok def test_anchor_links(): r = get() r.html.skip_anchors = False @@ -79,5 +88,46 
@@ def test_anchor_links(): assert '#site-map' in r.html.links +@pytest.mark.render +def test_render(): + r = get() + script = """ + () => { + return { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + deviceScaleFactor: window.devicePixelRatio, + } + } + """ + val = r.html.render(script=script) + for value in ('width', 'height', 'deviceScaleFactor'): + assert value in val + + about = r.html.find('#about', first=True) + assert len(about.links) == 6 + + +@pytest.mark.render +def test_bare_render(): + doc = """<a href='https://httpbin.org'>""" + html = HTML(html=doc) + script = """ + () => { + return { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + deviceScaleFactor: window.devicePixelRatio, + } + } + """ + val = html.render(script=script, reload=False) + for value in ('width', 'height', 'deviceScaleFactor'): + assert value in val + + assert html.find('html') + assert 'https://httpbin.org' in html.links + + if __name__ == '__main__': test_xpath()