mirror of
https://github.com/kennethreitz/requests-html.git
synced 2026-06-05 23:00:20 +00:00
Merge remote-tracking branch 'remotes/upstream/master' into feature/optimization
# Conflicts: # requests_html.py
This commit is contained in:
@@ -28,4 +28,4 @@ mypy = "*"
|
||||
|
||||
[scripts]
|
||||
|
||||
tests = "pytest"
|
||||
tests = "pytest -v -m ok"
|
||||
|
||||
Generated
+1
-27
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "ef6f9504ed9751cf2f4c5aef06e59838981c79d84fa1d36fb5ce258d8dba189f"
|
||||
"sha256": "cf67076e9c185c3bc951910b2a44b8b548ce954e0e3ff2a5bef1942d13275e8e"
|
||||
},
|
||||
"host-environment-markers": {
|
||||
"implementation_name": "cpython",
|
||||
@@ -127,15 +127,6 @@
|
||||
],
|
||||
"version": "==0.0.10"
|
||||
},
|
||||
"pyqt5": {
|
||||
"hashes": [
|
||||
"sha256:128285176240e990fce9c50293105ffd0d2884d8910bb338118f867b171ec6e8",
|
||||
"sha256:dbd1777d8e7540a6e7350482f1d7c981a073ce1b7195ac2cd21c204b3a28df57",
|
||||
"sha256:3563ac935fca8e8b1dbd4856d8eedc982b5de90c53f0280e8fca8060a262d4f4",
|
||||
"sha256:2ce953cb849e5265b9d1abe075471148ad5fb6d7e6a9881f37dfe05590571d23"
|
||||
],
|
||||
"version": "==5.10"
|
||||
},
|
||||
"pyquery": {
|
||||
"hashes": [
|
||||
"sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3",
|
||||
@@ -150,23 +141,6 @@
|
||||
],
|
||||
"version": "==2.18.4"
|
||||
},
|
||||
"sip": {
|
||||
"hashes": [
|
||||
"sha256:f31bb63e63a958f65887ae27f06e62af9f9cb818ba7456a99f78a5ec3082d3dd",
|
||||
"sha256:776e169da554729f80337070348db49a6742d8aa317aec931a4d0f47b7ef535d",
|
||||
"sha256:beac2bc1b9457a693fb3122c797cad5678a168ecff6ccbad4aa3a9f1ff1a2d86",
|
||||
"sha256:1c5a1ad409e97833a4a873fae5bcd7a365651f7372806992d03891082821bc41",
|
||||
"sha256:248ecca386d4832138f6a044dceb0bfc38fb8503b7ffbfeb474073f56930144b",
|
||||
"sha256:ab338095e32ebb2047b6184f1383c667c47b9822d7320fdfb93870567a972343",
|
||||
"sha256:ebea4619e9626e2eb197835049807c8173f11e2023b05140cbee4b274a91ef5e",
|
||||
"sha256:2db24e65c99b7d20a67fa461f6bc2e15bddb6cd5fde52e37d6609566d79a69a1",
|
||||
"sha256:3b45eecf6f68a29f5629dc064079e919987b030628bb6614da7f4eefedbe145e",
|
||||
"sha256:2120e9d713120687558b6699cf5ff6a8f7b070776b19d6c7fc96fc64ea8ca056",
|
||||
"sha256:18350ebf82beaef6a73d2c14320f19961242ed424670407df6fd5a9b65f0e7fc",
|
||||
"sha256:92413edcb4fea75ebd1f8142c882dc5db398025eb8a0a273385838fd791de73c"
|
||||
],
|
||||
"version": "==4.19.7"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb",
|
||||
|
||||
@@ -177,6 +177,28 @@ You can also use this library without Requests:
|
||||
>>> html.links
|
||||
{'https://httpbin.org'}
|
||||
|
||||
You can also render JavaScript pages without Requests:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# ^^ proceeding from above ^^
|
||||
>>> script = """
|
||||
() => {
|
||||
return {
|
||||
width: document.documentElement.clientWidth,
|
||||
height: document.documentElement.clientHeight,
|
||||
deviceScaleFactor: window.devicePixelRatio,
|
||||
}
|
||||
}
|
||||
"""
|
||||
>>> val = html.render(script=script, reload=False)
|
||||
|
||||
>>> print(val)
|
||||
{'width': 800, 'height': 600, 'deviceScaleFactor': 1}
|
||||
|
||||
>>> print(html.html)
|
||||
<html><head></head><body><a href="https://httpbin.org"></a></body></html>
|
||||
|
||||
|
||||
API Documentation
|
||||
=================
|
||||
|
||||
+38
-41
@@ -2,7 +2,7 @@ import sys
|
||||
import asyncio
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from concurrent.futures._base import TimeoutError
|
||||
from typing import Set, Union, List, MutableMapping
|
||||
from typing import Set, Union, List, MutableMapping, Optional
|
||||
|
||||
import pyppeteer
|
||||
import requests
|
||||
@@ -16,7 +16,6 @@ from parse import search as parse_search
|
||||
from parse import findall, Result
|
||||
from w3lib.encoding import html_to_unicode
|
||||
|
||||
|
||||
DEFAULT_ENCODING = 'utf-8'
|
||||
DEFAULT_URL = 'https://example.org/'
|
||||
|
||||
@@ -45,7 +44,6 @@ try:
|
||||
except AssertionError:
|
||||
raise RuntimeError('Requests-HTML requires Python 3.6+!')
|
||||
|
||||
|
||||
class BaseParser:
|
||||
"""A basic HTML/Element Parser, for Humans.
|
||||
|
||||
@@ -155,13 +153,7 @@ class BaseParser:
|
||||
for found in self.pq(selector)
|
||||
]
|
||||
|
||||
if first:
|
||||
try:
|
||||
return elements[0]
|
||||
except IndexError:
|
||||
return None
|
||||
else:
|
||||
return elements
|
||||
return _get_first_or_list(elements, first)
|
||||
|
||||
def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath:
|
||||
"""Given an XPath selector, returns a list of
|
||||
@@ -189,13 +181,7 @@ class BaseParser:
|
||||
for selection in selected
|
||||
]
|
||||
|
||||
if first:
|
||||
try:
|
||||
return elements[0]
|
||||
except IndexError:
|
||||
return None
|
||||
else:
|
||||
return elements
|
||||
return _get_first_or_list(c, first)
|
||||
|
||||
def search(self, template: str) -> Result:
|
||||
"""Searches the :class:`Element <Element>` for the given Parse template.
|
||||
@@ -216,14 +202,14 @@ class BaseParser:
|
||||
@property
|
||||
def links(self) -> _Links:
|
||||
"""All found links on page, in as-is form."""
|
||||
|
||||
def gen():
|
||||
for link in self.find('a'):
|
||||
|
||||
try:
|
||||
href = link.attrs['href'].strip()
|
||||
if not(href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']:
|
||||
if href:
|
||||
yield href
|
||||
if href and not (href.startswith('#') and self.skip_anchors and href in ['javascript:;']):
|
||||
yield href
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
@@ -234,6 +220,7 @@ class BaseParser:
|
||||
"""All found links on page, in absolute form
|
||||
(`learn more <https://www.navegabem.com/absolute-or-relative-links.html>`_).
|
||||
"""
|
||||
|
||||
def gen():
|
||||
for link in self.links:
|
||||
# Parse the link with stdlib.
|
||||
@@ -263,12 +250,11 @@ class BaseParser:
|
||||
if base:
|
||||
return base.attrs['href'].strip()
|
||||
|
||||
else:
|
||||
url = '/'.join(self.url.split('/')[:-1])
|
||||
if url.endswith('/'):
|
||||
url = url[:-1]
|
||||
url = '/'.join(self.url.split('/')[:-1])
|
||||
if url.endswith('/'):
|
||||
url = url[:-1]
|
||||
|
||||
return url
|
||||
return url
|
||||
|
||||
|
||||
class Element(BaseParser):
|
||||
@@ -284,10 +270,7 @@ class Element(BaseParser):
|
||||
self.element = element
|
||||
|
||||
def __repr__(self) -> str:
|
||||
attrs = []
|
||||
for attr in self.attrs:
|
||||
attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
|
||||
|
||||
attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs]
|
||||
return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))
|
||||
|
||||
@property
|
||||
@@ -329,10 +312,16 @@ class HTML(BaseParser):
|
||||
def __repr__(self) -> str:
|
||||
return "<HTML url={}>".format(repr(self.url))
|
||||
|
||||
def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0):
|
||||
def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0, reload: bool = True):
|
||||
"""Reloads the response in Chromium, and replaces HTML content
|
||||
with an updated version, with JavaScript executed.
|
||||
|
||||
:param retries: The number of times to retry loading the page in Chromium.
|
||||
:param script: JavaScript to execute upon page load (optional).
|
||||
:param scrolldown: Integer, if provided, of how many times to page down.
|
||||
:param sleep: Integer, if provided, of how long to sleep after initial render.
|
||||
:param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory.
|
||||
|
||||
If ``scrolldown`` is specified, the page will scrolldown the specified
|
||||
number of times, after sleeping the specified amount of time
|
||||
(e.g. ``scrolldown=10, sleep=1``).
|
||||
@@ -365,13 +354,16 @@ class HTML(BaseParser):
|
||||
Warning: the first time you run this method, it will download
|
||||
Chromium into your home directory (``~/.pyppeteer``).
|
||||
"""
|
||||
async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int):
|
||||
async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, reload: bool = True, content: Optional[str]):
|
||||
try:
|
||||
browser = pyppeteer.launch(headless=True)
|
||||
page = await browser.newPage()
|
||||
|
||||
# Load the given page (GET request, obviously.)
|
||||
await page.goto(url)
|
||||
if reload:
|
||||
await page.goto(url)
|
||||
else:
|
||||
await page.setContent(content)
|
||||
|
||||
result = None
|
||||
if script:
|
||||
@@ -399,7 +391,7 @@ class HTML(BaseParser):
|
||||
for i in range(retries):
|
||||
if not content:
|
||||
try:
|
||||
content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, scrolldown=scrolldown))
|
||||
content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, content=self.html, reload=reload, scrolldown=scrolldown))
|
||||
except TimeoutError:
|
||||
pass
|
||||
|
||||
@@ -419,10 +411,9 @@ class HTMLResponse(requests.Response):
|
||||
|
||||
@property
|
||||
def html(self) -> HTML:
|
||||
if self._html:
|
||||
return self._html
|
||||
if not self._html:
|
||||
self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding)
|
||||
|
||||
self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding)
|
||||
return self._html
|
||||
|
||||
@classmethod
|
||||
@@ -437,10 +428,17 @@ def user_agent(style='chrome') -> _UserAgent:
|
||||
style. Defaults to a Chrome-style User-Agent.
|
||||
"""
|
||||
|
||||
if not style:
|
||||
return useragent.random
|
||||
return useragent[style] if style else useragent.random
|
||||
|
||||
|
||||
def _get_first_or_list(l, first=True):
|
||||
if first:
|
||||
try:
|
||||
return l[0]
|
||||
except IndexError:
|
||||
return None
|
||||
else:
|
||||
return useragent[style]
|
||||
return l
|
||||
|
||||
|
||||
class HTMLSession(requests.Session):
|
||||
@@ -473,6 +471,5 @@ class HTMLSession(requests.Session):
|
||||
"""
|
||||
# Convert the Response object into an HTMLResponse object.
|
||||
r = super(HTMLSession, self).request(*args, **kwargs)
|
||||
html_r = HTMLResponse._from_response(r)
|
||||
|
||||
return html_r
|
||||
return HTMLResponse._from_response(r)
|
||||
|
||||
@@ -17,7 +17,7 @@ DESCRIPTION = 'HTML Parsing for Humans.'
|
||||
URL = 'https://github.com/kennethreitz/requests-html'
|
||||
EMAIL = 'me@kennethreitz.org'
|
||||
AUTHOR = 'Kenneth Reitz'
|
||||
VERSION = '0.6.7'
|
||||
VERSION = '0.6.8'
|
||||
|
||||
# What packages are required for this module to be executed?
|
||||
REQUIRED = [
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from requests_html import HTMLSession, HTML
|
||||
from requests_file import FileAdapter
|
||||
|
||||
@@ -14,11 +15,13 @@ def get():
|
||||
return session.get(url)
|
||||
|
||||
|
||||
@pytest.mark.ok
|
||||
def test_file_get():
|
||||
r = get()
|
||||
assert r.status_code == 200
|
||||
|
||||
|
||||
@pytest.mark.ok
|
||||
def test_css_selector():
|
||||
r = get()
|
||||
|
||||
@@ -32,6 +35,7 @@ def test_css_selector():
|
||||
assert menu_item in about.full_text.split('\n')
|
||||
|
||||
|
||||
@pytest.mark.ok
|
||||
def test_attrs():
|
||||
r = get()
|
||||
about = r.html.find('#about', first=True)
|
||||
@@ -40,20 +44,23 @@ def test_attrs():
|
||||
assert len(about.attrs['class']) == 2
|
||||
|
||||
|
||||
@pytest.mark.ok
|
||||
def test_links():
|
||||
r = get()
|
||||
about = r.html.find('#about', first=True)
|
||||
|
||||
len(about.links) == 6
|
||||
len(about.absolute_links) == 6
|
||||
assert len(about.links) == 6
|
||||
assert len(about.absolute_links) == 6
|
||||
|
||||
|
||||
@pytest.mark.ok
|
||||
def test_search():
|
||||
r = get()
|
||||
style = r.html.search('Python is a {} language')[0]
|
||||
assert style == 'programming'
|
||||
|
||||
|
||||
@pytest.mark.ok
|
||||
def test_xpath():
|
||||
r = get()
|
||||
html = r.html.xpath('/html', first=True)
|
||||
@@ -63,6 +70,7 @@ def test_xpath():
|
||||
assert '#site-map' in a_hrefs
|
||||
|
||||
|
||||
@pytest.mark.ok
|
||||
def test_html_loading():
|
||||
doc = """<a href='https://httpbin.org'>"""
|
||||
html = HTML(html=doc)
|
||||
@@ -72,6 +80,7 @@ def test_html_loading():
|
||||
assert isinstance(html.html, str)
|
||||
|
||||
|
||||
@pytest.mark.ok
|
||||
def test_anchor_links():
|
||||
r = get()
|
||||
r.html.skip_anchors = False
|
||||
@@ -79,5 +88,46 @@ def test_anchor_links():
|
||||
assert '#site-map' in r.html.links
|
||||
|
||||
|
||||
@pytest.mark.render
|
||||
def test_render():
|
||||
r = get()
|
||||
script = """
|
||||
() => {
|
||||
return {
|
||||
width: document.documentElement.clientWidth,
|
||||
height: document.documentElement.clientHeight,
|
||||
deviceScaleFactor: window.devicePixelRatio,
|
||||
}
|
||||
}
|
||||
"""
|
||||
val = r.html.render(script=script)
|
||||
for value in ('width', 'height', 'deviceScaleFactor'):
|
||||
assert value in val
|
||||
|
||||
about = r.html.find('#about', first=True)
|
||||
assert len(about.links) == 6
|
||||
|
||||
|
||||
@pytest.mark.render
|
||||
def test_bare_render():
|
||||
doc = """<a href='https://httpbin.org'>"""
|
||||
html = HTML(html=doc)
|
||||
script = """
|
||||
() => {
|
||||
return {
|
||||
width: document.documentElement.clientWidth,
|
||||
height: document.documentElement.clientHeight,
|
||||
deviceScaleFactor: window.devicePixelRatio,
|
||||
}
|
||||
}
|
||||
"""
|
||||
val = html.render(script=script, reload=False)
|
||||
for value in ('width', 'height', 'deviceScaleFactor'):
|
||||
assert value in val
|
||||
|
||||
assert html.find('html')
|
||||
assert 'https://httpbin.org' in html.links
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_xpath()
|
||||
|
||||
Reference in New Issue
Block a user