Merge remote-tracking branch 'remotes/upstream/master' into feature/optimization

# Conflicts:
#	requests_html.py
This commit is contained in:
sudoz
2018-02-28 22:38:47 +08:00
6 changed files with 115 additions and 72 deletions
+1 -1
View File
@@ -28,4 +28,4 @@ mypy = "*"
[scripts]
tests = "pytest"
tests = "pytest -v -m ok"
Generated
+1 -27
View File
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "ef6f9504ed9751cf2f4c5aef06e59838981c79d84fa1d36fb5ce258d8dba189f"
"sha256": "cf67076e9c185c3bc951910b2a44b8b548ce954e0e3ff2a5bef1942d13275e8e"
},
"host-environment-markers": {
"implementation_name": "cpython",
@@ -127,15 +127,6 @@
],
"version": "==0.0.10"
},
"pyqt5": {
"hashes": [
"sha256:128285176240e990fce9c50293105ffd0d2884d8910bb338118f867b171ec6e8",
"sha256:dbd1777d8e7540a6e7350482f1d7c981a073ce1b7195ac2cd21c204b3a28df57",
"sha256:3563ac935fca8e8b1dbd4856d8eedc982b5de90c53f0280e8fca8060a262d4f4",
"sha256:2ce953cb849e5265b9d1abe075471148ad5fb6d7e6a9881f37dfe05590571d23"
],
"version": "==5.10"
},
"pyquery": {
"hashes": [
"sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3",
@@ -150,23 +141,6 @@
],
"version": "==2.18.4"
},
"sip": {
"hashes": [
"sha256:f31bb63e63a958f65887ae27f06e62af9f9cb818ba7456a99f78a5ec3082d3dd",
"sha256:776e169da554729f80337070348db49a6742d8aa317aec931a4d0f47b7ef535d",
"sha256:beac2bc1b9457a693fb3122c797cad5678a168ecff6ccbad4aa3a9f1ff1a2d86",
"sha256:1c5a1ad409e97833a4a873fae5bcd7a365651f7372806992d03891082821bc41",
"sha256:248ecca386d4832138f6a044dceb0bfc38fb8503b7ffbfeb474073f56930144b",
"sha256:ab338095e32ebb2047b6184f1383c667c47b9822d7320fdfb93870567a972343",
"sha256:ebea4619e9626e2eb197835049807c8173f11e2023b05140cbee4b274a91ef5e",
"sha256:2db24e65c99b7d20a67fa461f6bc2e15bddb6cd5fde52e37d6609566d79a69a1",
"sha256:3b45eecf6f68a29f5629dc064079e919987b030628bb6614da7f4eefedbe145e",
"sha256:2120e9d713120687558b6699cf5ff6a8f7b070776b19d6c7fc96fc64ea8ca056",
"sha256:18350ebf82beaef6a73d2c14320f19961242ed424670407df6fd5a9b65f0e7fc",
"sha256:92413edcb4fea75ebd1f8142c882dc5db398025eb8a0a273385838fd791de73c"
],
"version": "==4.19.7"
},
"six": {
"hashes": [
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb",
+22
View File
@@ -177,6 +177,28 @@ You can also use this library without Requests:
>>> html.links
{'https://httpbin.org'}
You can also render JavaScript pages without Requests:
.. code-block:: pycon
# ^^ proceeding from above ^^
>>> script = """
() => {
return {
width: document.documentElement.clientWidth,
height: document.documentElement.clientHeight,
deviceScaleFactor: window.devicePixelRatio,
}
}
"""
>>> val = html.render(script=script, reload=False)
>>> print(val)
{'width': 800, 'height': 600, 'deviceScaleFactor': 1}
>>> print(html.html)
<html><head></head><body><a href="https://httpbin.org"></a></body></html>
API Documentation
=================
+38 -41
View File
@@ -2,7 +2,7 @@ import sys
import asyncio
from urllib.parse import urlparse, urlunparse
from concurrent.futures._base import TimeoutError
from typing import Set, Union, List, MutableMapping
from typing import Set, Union, List, MutableMapping, Optional
import pyppeteer
import requests
@@ -16,7 +16,6 @@ from parse import search as parse_search
from parse import findall, Result
from w3lib.encoding import html_to_unicode
DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
@@ -45,7 +44,6 @@ try:
except AssertionError:
raise RuntimeError('Requests-HTML requires Python 3.6+!')
class BaseParser:
"""A basic HTML/Element Parser, for Humans.
@@ -155,13 +153,7 @@ class BaseParser:
for found in self.pq(selector)
]
if first:
try:
return elements[0]
except IndexError:
return None
else:
return elements
return _get_first_or_list(elements, first)
def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath:
"""Given an XPath selector, returns a list of
@@ -189,13 +181,7 @@ class BaseParser:
for selection in selected
]
if first:
try:
return elements[0]
except IndexError:
return None
else:
return elements
return _get_first_or_list(c, first)
def search(self, template: str) -> Result:
"""Searches the :class:`Element <Element>` for the given Parse template.
@@ -216,14 +202,14 @@ class BaseParser:
@property
def links(self) -> _Links:
"""All found links on page, in as-is form."""
def gen():
for link in self.find('a'):
try:
href = link.attrs['href'].strip()
if not(href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']:
if href:
yield href
if href and not (href.startswith('#') and self.skip_anchors and href in ['javascript:;']):
yield href
except KeyError:
pass
@@ -234,6 +220,7 @@ class BaseParser:
"""All found links on page, in absolute form
(`learn more <https://www.navegabem.com/absolute-or-relative-links.html>`_).
"""
def gen():
for link in self.links:
# Parse the link with stdlib.
@@ -263,12 +250,11 @@ class BaseParser:
if base:
return base.attrs['href'].strip()
else:
url = '/'.join(self.url.split('/')[:-1])
if url.endswith('/'):
url = url[:-1]
url = '/'.join(self.url.split('/')[:-1])
if url.endswith('/'):
url = url[:-1]
return url
return url
class Element(BaseParser):
@@ -284,10 +270,7 @@ class Element(BaseParser):
self.element = element
def __repr__(self) -> str:
attrs = []
for attr in self.attrs:
attrs.append('{}={}'.format(attr, repr(self.attrs[attr])))
attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs]
return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))
@property
@@ -329,10 +312,16 @@ class HTML(BaseParser):
def __repr__(self) -> str:
return "<HTML url={}>".format(repr(self.url))
def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0):
def render(self, retries: int = 8, script: str = None, scrolldown=False, sleep: int = 0, reload: bool = True):
"""Reloads the response in Chromium, and replaces HTML content
with an updated version, with JavaScript executed.
:param retries: The number of times to retry loading the page in Chromium.
:param script: JavaScript to execute upon page load (optional).
:param scrolldown: Integer, if provided, of how many times to page down.
:param sleep: Integer, if provided, of how long to sleep after initial render.
:param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory.
If ``scrolldown`` is specified, the page will scrolldown the specified
number of times, after sleeping the specified amount of time
(e.g. ``scrolldown=10, sleep=1``).
@@ -365,13 +354,16 @@ class HTML(BaseParser):
Warning: the first time you run this method, it will download
Chromium into your home directory (``~/.pyppeteer``).
"""
async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int):
async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, reload: bool = True, content: Optional[str]):
try:
browser = pyppeteer.launch(headless=True)
page = await browser.newPage()
# Load the given page (GET request, obviously.)
await page.goto(url)
if reload:
await page.goto(url)
else:
await page.setContent(content)
result = None
if script:
@@ -399,7 +391,7 @@ class HTML(BaseParser):
for i in range(retries):
if not content:
try:
content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, scrolldown=scrolldown))
content, result = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, content=self.html, reload=reload, scrolldown=scrolldown))
except TimeoutError:
pass
@@ -419,10 +411,9 @@ class HTMLResponse(requests.Response):
@property
def html(self) -> HTML:
if self._html:
return self._html
if not self._html:
self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding)
self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding)
return self._html
@classmethod
@@ -437,10 +428,17 @@ def user_agent(style='chrome') -> _UserAgent:
style. Defaults to a Chrome-style User-Agent.
"""
if not style:
return useragent.random
return useragent[style] if style else useragent.random
def _get_first_or_list(l, first=True):
if first:
try:
return l[0]
except IndexError:
return None
else:
return useragent[style]
return l
class HTMLSession(requests.Session):
@@ -473,6 +471,5 @@ class HTMLSession(requests.Session):
"""
# Convert Request object into HTTPRequest object.
r = super(HTMLSession, self).request(*args, **kwargs)
html_r = HTMLResponse._from_response(r)
return html_r
return HTMLResponse._from_response(r)
+1 -1
View File
@@ -17,7 +17,7 @@ DESCRIPTION = 'HTML Parsing for Humans.'
URL = 'https://github.com/kennethreitz/requests-html'
EMAIL = 'me@kennethreitz.org'
AUTHOR = 'Kenneth Reitz'
VERSION = '0.6.7'
VERSION = '0.6.8'
# What packages are required for this module to be executed?
REQUIRED = [
+52 -2
View File
@@ -1,5 +1,6 @@
import os
import pytest
from requests_html import HTMLSession, HTML
from requests_file import FileAdapter
@@ -14,11 +15,13 @@ def get():
return session.get(url)
@pytest.mark.ok
def test_file_get():
r = get()
assert r.status_code == 200
@pytest.mark.ok
def test_css_selector():
r = get()
@@ -32,6 +35,7 @@ def test_css_selector():
assert menu_item in about.full_text.split('\n')
@pytest.mark.ok
def test_attrs():
r = get()
about = r.html.find('#about', first=True)
@@ -40,20 +44,23 @@ def test_attrs():
assert len(about.attrs['class']) == 2
@pytest.mark.ok
def test_links():
r = get()
about = r.html.find('#about', first=True)
len(about.links) == 6
len(about.absolute_links) == 6
assert len(about.links) == 6
assert len(about.absolute_links) == 6
@pytest.mark.ok
def test_search():
r = get()
style = r.html.search('Python is a {} language')[0]
assert style == 'programming'
@pytest.mark.ok
def test_xpath():
r = get()
html = r.html.xpath('/html', first=True)
@@ -63,6 +70,7 @@ def test_xpath():
assert '#site-map' in a_hrefs
@pytest.mark.ok
def test_html_loading():
doc = """<a href='https://httpbin.org'>"""
html = HTML(html=doc)
@@ -72,6 +80,7 @@ def test_html_loading():
assert isinstance(html.html, str)
@pytest.mark.ok
def test_anchor_links():
r = get()
r.html.skip_anchors = False
@@ -79,5 +88,46 @@ def test_anchor_links():
assert '#site-map' in r.html.links
@pytest.mark.render
def test_render():
r = get()
script = """
() => {
return {
width: document.documentElement.clientWidth,
height: document.documentElement.clientHeight,
deviceScaleFactor: window.devicePixelRatio,
}
}
"""
val = r.html.render(script=script)
for value in ('width', 'height', 'deviceScaleFactor'):
assert value in val
about = r.html.find('#about', first=True)
assert len(about.links) == 6
@pytest.mark.render
def test_bare_render():
doc = """<a href='https://httpbin.org'>"""
html = HTML(html=doc)
script = """
() => {
return {
width: document.documentElement.clientWidth,
height: document.documentElement.clientHeight,
deviceScaleFactor: window.devicePixelRatio,
}
}
"""
val = html.render(script=script, reload=False)
for value in ('width', 'height', 'deviceScaleFactor'):
assert value in val
assert html.find('html')
assert 'https://httpbin.org' in html.links
if __name__ == '__main__':
test_xpath()