diff --git a/Pipfile b/Pipfile
index c0dda7f..62550b6 100644
--- a/Pipfile
+++ b/Pipfile
@@ -28,4 +28,4 @@ mypy = "*"
[scripts]
-tests = "pytest"
+tests = "pytest -v -m ok"
diff --git a/Pipfile.lock b/Pipfile.lock
index 03398aa..55c5b45 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "ef6f9504ed9751cf2f4c5aef06e59838981c79d84fa1d36fb5ce258d8dba189f"
+ "sha256": "cf67076e9c185c3bc951910b2a44b8b548ce954e0e3ff2a5bef1942d13275e8e"
},
"host-environment-markers": {
"implementation_name": "cpython",
@@ -127,15 +127,6 @@
],
"version": "==0.0.10"
},
- "pyqt5": {
- "hashes": [
- "sha256:128285176240e990fce9c50293105ffd0d2884d8910bb338118f867b171ec6e8",
- "sha256:dbd1777d8e7540a6e7350482f1d7c981a073ce1b7195ac2cd21c204b3a28df57",
- "sha256:3563ac935fca8e8b1dbd4856d8eedc982b5de90c53f0280e8fca8060a262d4f4",
- "sha256:2ce953cb849e5265b9d1abe075471148ad5fb6d7e6a9881f37dfe05590571d23"
- ],
- "version": "==5.10"
- },
"pyquery": {
"hashes": [
"sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3",
@@ -150,23 +141,6 @@
],
"version": "==2.18.4"
},
- "sip": {
- "hashes": [
- "sha256:f31bb63e63a958f65887ae27f06e62af9f9cb818ba7456a99f78a5ec3082d3dd",
- "sha256:776e169da554729f80337070348db49a6742d8aa317aec931a4d0f47b7ef535d",
- "sha256:beac2bc1b9457a693fb3122c797cad5678a168ecff6ccbad4aa3a9f1ff1a2d86",
- "sha256:1c5a1ad409e97833a4a873fae5bcd7a365651f7372806992d03891082821bc41",
- "sha256:248ecca386d4832138f6a044dceb0bfc38fb8503b7ffbfeb474073f56930144b",
- "sha256:ab338095e32ebb2047b6184f1383c667c47b9822d7320fdfb93870567a972343",
- "sha256:ebea4619e9626e2eb197835049807c8173f11e2023b05140cbee4b274a91ef5e",
- "sha256:2db24e65c99b7d20a67fa461f6bc2e15bddb6cd5fde52e37d6609566d79a69a1",
- "sha256:3b45eecf6f68a29f5629dc064079e919987b030628bb6614da7f4eefedbe145e",
- "sha256:2120e9d713120687558b6699cf5ff6a8f7b070776b19d6c7fc96fc64ea8ca056",
- "sha256:18350ebf82beaef6a73d2c14320f19961242ed424670407df6fd5a9b65f0e7fc",
- "sha256:92413edcb4fea75ebd1f8142c882dc5db398025eb8a0a273385838fd791de73c"
- ],
- "version": "==4.19.7"
- },
"six": {
"hashes": [
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb",
diff --git a/docs/source/index.rst b/docs/source/index.rst
index feb45aa..ce87fe4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -177,6 +177,28 @@ You can also use this library without Requests:
>>> html.links
{'https://httpbin.org'}
+You can also render JavaScript pages without Requests:
+
+.. code-block:: pycon
+
+ # ^^ proceeding from above ^^
+ >>> script = """
+ () => {
+ return {
+ width: document.documentElement.clientWidth,
+ height: document.documentElement.clientHeight,
+ deviceScaleFactor: window.devicePixelRatio,
+ }
+ }
+ """
+ >>> val = html.render(script=script, reload=False)
+
+ >>> print(val)
+ {'width': 800, 'height': 600, 'deviceScaleFactor': 1}
+
+ >>> print(html.html)
+
+
API Documentation
=================
diff --git a/requests_html.py b/requests_html.py
index a90750a..7ab12e0 100644
--- a/requests_html.py
+++ b/requests_html.py
@@ -2,7 +2,7 @@ import sys
import asyncio
from urllib.parse import urlparse, urlunparse
from concurrent.futures._base import TimeoutError
-from typing import Set, Union, List, MutableMapping
+from typing import Set, Union, List, MutableMapping, Optional
import pyppeteer
import requests
@@ -16,7 +16,6 @@ from parse import search as parse_search
from parse import findall, Result
from w3lib.encoding import html_to_unicode
-
DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
@@ -45,7 +44,6 @@ try:
except AssertionError:
raise RuntimeError('Requests-HTML requires Python 3.6+!')
-
class BaseParser:
"""A basic HTML/Element Parser, for Humans.
@@ -155,13 +153,7 @@ class BaseParser:
for found in self.pq(selector)
]
- if first:
- try:
- return elements[0]
- except IndexError:
- return None
- else:
- return elements
+ return _get_first_or_list(elements, first)
def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath:
"""Given an XPath selector, returns a list of
@@ -189,13 +181,7 @@ class BaseParser:
for selection in selected
]
- if first:
- try:
- return elements[0]
- except IndexError:
- return None
- else:
- return elements
+ return _get_first_or_list(elements, first)
def search(self, template: str) -> Result:
"""Searches the :class:`Element ` for the given Parse template.
@@ -216,14 +202,14 @@ class BaseParser:
@property
def links(self) -> _Links:
"""All found links on page, in as–is form."""
+
def gen():
for link in self.find('a'):
try:
href = link.attrs['href'].strip()
- if not(href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']:
- if href:
- yield href
+ if href and not (href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']:
+ yield href
except KeyError:
pass
@@ -234,6 +220,7 @@ class BaseParser:
"""All found links on page, in absolute form
(`learn more `_).
"""
+
def gen():
for link in self.links:
# Parse the link with stdlib.
@@ -263,12 +250,11 @@ class BaseParser:
if base:
return base.attrs['href'].strip()
- else:
- url = '/'.join(self.url.split('/')[:-1])
- if url.endswith('/'):
- url = url[:-1]
+ url = '/'.join(self.url.split('/')[:-1])
+ if url.endswith('/'):
+ url = url[:-1]
- return url
+ return url
class Element(BaseParser):
@@ -284,10 +270,7 @@ class Element(BaseParser):
self.element = element
def __repr__(self) -> str:
- attrs = []
- for attr in self.attrs:
- attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
-
+ attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs]
return "".format(repr(self.element.tag), ' '.join(attrs))
@property
@@ -329,10 +312,16 @@ class HTML(BaseParser):
def __repr__(self) -> str:
return "".format(repr(self.url))
- def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0):
+ def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0, reload: bool = True):
"""Reloads the response in Chromium, and replaces HTML content
with an updated version, with JavaScript executed.
+ :param retries: The number of times to retry loading the page in Chromium.
+ :param script: JavaScript to execute upon page load (optional).
+ :param scrolldown: Integer, if provided, of how many times to page down.
+ :param sleep: Integer, if provided, of how long to sleep after initial render.
+ :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory.
+
If ``scrolldown`` is specified, the page will scrolldown the specified
number of times, after sleeping the specified amount of time
(e.g. ``scrolldown=10, sleep=1``).
@@ -365,13 +354,16 @@ class HTML(BaseParser):
Warning: the first time you run this method, it will download
Chromium into your home directory (``~/.pyppeteer``).
"""
- async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int):
+ async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, reload: bool = True, content: Optional[str]):
try:
browser = pyppeteer.launch(headless=True)
page = await browser.newPage()
# Load the given page (GET request, obviously.)
- await page.goto(url)
+ if reload:
+ await page.goto(url)
+ else:
+ await page.setContent(content)
result = None
if script:
@@ -399,7 +391,7 @@ class HTML(BaseParser):
for i in range(retries):
if not content:
try:
- content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, scrolldown=scrolldown))
+ content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, content=self.html, reload=reload, scrolldown=scrolldown))
except TimeoutError:
pass
@@ -419,10 +411,9 @@ class HTMLResponse(requests.Response):
@property
def html(self) -> HTML:
- if self._html:
- return self._html
+ if not self._html:
+ self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding)
- self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding)
return self._html
@classmethod
@@ -437,10 +428,17 @@ def user_agent(style='chrome') -> _UserAgent:
style. Defaults to a Chrome-style User-Agent.
"""
- if not style:
- return useragent.random
+ return useragent[style] if style else useragent.random
+
+
+def _get_first_or_list(l, first=True):
+ if first:
+ try:
+ return l[0]
+ except IndexError:
+ return None
else:
- return useragent[style]
+ return l
class HTMLSession(requests.Session):
@@ -473,6 +471,5 @@ class HTMLSession(requests.Session):
"""
# Convert Request object into HTTPRequest object.
r = super(HTMLSession, self).request(*args, **kwargs)
- html_r = HTMLResponse._from_response(r)
- return html_r
+ return HTMLResponse._from_response(r)
diff --git a/setup.py b/setup.py
index 873dabe..b28f984 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@ DESCRIPTION = 'HTML Parsing for Humans.'
URL = 'https://github.com/kennethreitz/requests-html'
EMAIL = 'me@kennethreitz.org'
AUTHOR = 'Kenneth Reitz'
-VERSION = '0.6.7'
+VERSION = '0.6.8'
# What packages are required for this module to be executed?
REQUIRED = [
diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py
index 5780bde..3dbcd2e 100644
--- a/tests/test_requests_html.py
+++ b/tests/test_requests_html.py
@@ -1,5 +1,6 @@
import os
+import pytest
from requests_html import HTMLSession, HTML
from requests_file import FileAdapter
@@ -14,11 +15,13 @@ def get():
return session.get(url)
+@pytest.mark.ok
def test_file_get():
r = get()
assert r.status_code == 200
+@pytest.mark.ok
def test_css_selector():
r = get()
@@ -32,6 +35,7 @@ def test_css_selector():
assert menu_item in about.full_text.split('\n')
+@pytest.mark.ok
def test_attrs():
r = get()
about = r.html.find('#about', first=True)
@@ -40,20 +44,23 @@ def test_attrs():
assert len(about.attrs['class']) == 2
+@pytest.mark.ok
def test_links():
r = get()
about = r.html.find('#about', first=True)
- len(about.links) == 6
- len(about.absolute_links) == 6
+ assert len(about.links) == 6
+ assert len(about.absolute_links) == 6
+@pytest.mark.ok
def test_search():
r = get()
style = r.html.search('Python is a {} language')[0]
assert style == 'programming'
+@pytest.mark.ok
def test_xpath():
r = get()
html = r.html.xpath('/html', first=True)
@@ -63,6 +70,7 @@ def test_xpath():
assert '#site-map' in a_hrefs
+@pytest.mark.ok
def test_html_loading():
doc = """"""
html = HTML(html=doc)
@@ -72,6 +80,7 @@ def test_html_loading():
assert isinstance(html.html, str)
+@pytest.mark.ok
def test_anchor_links():
r = get()
r.html.skip_anchors = False
@@ -79,5 +88,46 @@ def test_anchor_links():
assert '#site-map' in r.html.links
+@pytest.mark.render
+def test_render():
+ r = get()
+ script = """
+ () => {
+ return {
+ width: document.documentElement.clientWidth,
+ height: document.documentElement.clientHeight,
+ deviceScaleFactor: window.devicePixelRatio,
+ }
+ }
+ """
+ val = r.html.render(script=script)
+ for value in ('width', 'height', 'deviceScaleFactor'):
+ assert value in val
+
+ about = r.html.find('#about', first=True)
+ assert len(about.links) == 6
+
+
+@pytest.mark.render
+def test_bare_render():
+ doc = """"""
+ html = HTML(html=doc)
+ script = """
+ () => {
+ return {
+ width: document.documentElement.clientWidth,
+ height: document.documentElement.clientHeight,
+ deviceScaleFactor: window.devicePixelRatio,
+ }
+ }
+ """
+ val = html.render(script=script, reload=False)
+ for value in ('width', 'height', 'deviceScaleFactor'):
+ assert value in val
+
+ assert html.find('html')
+ assert 'https://httpbin.org' in html.links
+
+
if __name__ == '__main__':
test_xpath()