Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
This commit is contained in:
2018-02-27 11:59:29 -05:00
parent c198b8c147
commit c11cd6cda7
3 changed files with 111 additions and 34 deletions
+1
View File
@@ -24,6 +24,7 @@ requests-file = "*"
pytest = "*"
"e1839a8" = {path = ".", editable = true}
sphinx = "*"
mypy = "*"
[scripts]
Generated
+79 -1
View File
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "9af93ac7145d6f8f0f24b28c59064699450344bdd45f18d3c4383647a1a08f03"
"sha256": "ef6f9504ed9751cf2f4c5aef06e59838981c79d84fa1d36fb5ce258d8dba189f"
},
"host-environment-markers": {
"implementation_name": "cpython",
@@ -345,6 +345,13 @@
],
"version": "==1.0"
},
"mypy": {
"hashes": [
"sha256:aa668809ae0dbec5e9feb8929f4b5e1f9318a0a397447fa2f38c382a2ed6a036",
"sha256:bd0c9a2fcf0c4f7a54a2b625f466fcc000d415f371298d96fa5d2acc69074aca"
],
"version": "==0.560"
},
"packaging": {
"hashes": [
"sha256:99276dc6e3a7851f32027a68f1095cd3f77c148091b092ea867a351811cfe388",
@@ -371,6 +378,20 @@
],
"version": "==0.6.0"
},
"psutil": {
"hashes": [
"sha256:82a06785db8eeb637b349006cc28a92e40cd190fefae9875246d18d0de7ccac8",
"sha256:4152ae231709e3e8b80e26b6da20dc965a1a589959c48af1ed024eca6473f60d",
"sha256:230eeb3aeb077814f3a2cd036ddb6e0f571960d327298cc914c02385c3e02a63",
"sha256:a3286556d4d2f341108db65d8e20d0cd3fcb9a91741cb5eb496832d7daf2a97c",
"sha256:94d4e63189f2593960e73acaaf96be235dd8a455fe2bcb37d8ad6f0e87f61556",
"sha256:c91eee73eea00df5e62c741b380b7e5b6fdd553891bee5669817a3a38d036f13",
"sha256:779ec7e7621758ca11a8d99a1064996454b3570154277cc21342a01148a49c28",
"sha256:8a15d773203a1277e57b1d11a7ccdf70804744ef4a9518a87ab8436995c31a4b",
"sha256:e2467e9312c2fa191687b89ff4bc2ad8843be4af6fb4dc95a7cc5f7d7a327b18"
],
"version": "==5.4.3"
},
"py": {
"hashes": [
"sha256:8cca5c229d225f8c1e3085be4fcf306090b00850fefad892f9d96c7b6e2f310f",
@@ -378,6 +399,13 @@
],
"version": "==1.5.2"
},
"pyee": {
"hashes": [
"sha256:47f8fa96d6dee61c82001831e1fbba55f3f808003a322d0e6653aa01c59f6b9e",
"sha256:4ec22817297b7024f89721cc34f790ee2767c5b5ca44284c565ee643abafbe32"
],
"version": "==5.0.0"
},
"pygments": {
"hashes": [
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
@@ -397,6 +425,12 @@
],
"version": "==2.2.0"
},
"pyppeteer": {
"hashes": [
"sha256:596929fb7d052048679081d3dc2a998cf065e936a752c7ba2392445d6e0e9706"
],
"version": "==0.0.10"
},
"pyquery": {
"hashes": [
"sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3",
@@ -488,6 +522,29 @@
],
"version": "==1.9.1"
},
"typed-ast": {
"hashes": [
"sha256:0948004fa228ae071054f5208840a1e88747a357ec1101c17217bfe99b299d58",
"sha256:25d8feefe27eb0303b73545416b13d108c6067b846b543738a25ff304824ed9a",
"sha256:c05b41bc1deade9f90ddc5d988fe506208019ebba9f2578c622516fd201f5863",
"sha256:519425deca5c2b2bdac49f77b2c5625781abbaf9a809d727d3a5596b30bb4ded",
"sha256:6de012d2b166fe7a4cdf505eee3aaa12192f7ba365beeefaca4ec10e31241a85",
"sha256:79b91ebe5a28d349b6d0d323023350133e927b4de5b651a8aa2db69c761420c6",
"sha256:a8034021801bc0440f2e027c354b4eafd95891b573e12ff0418dec385c76785c",
"sha256:f19f2a4f547505fe9072e15f6f4ae714af51b5a681a97f187971f50c283193b6",
"sha256:c9b060bd1e5a26ab6e8267fd46fc9e02b54eb15fffb16d112d4c7b1c12987559",
"sha256:2e214b72168ea0275efd6c884b114ab42e316de3ffa125b267e732ed2abda892",
"sha256:bc978ac17468fe868ee589c795d06777f75496b1ed576d308002c8a5756fb9ea",
"sha256:edb04bdd45bfd76c8292c4d9654568efaedf76fe78eb246dde69bdb13b2dad87",
"sha256:668d0cec391d9aed1c6a388b0d5b97cd22e6073eaa5fbaa6d2946603b4871efe",
"sha256:29464a177d56e4e055b5f7b629935af7f49c196be47528cc94e0a7bf83fbc2b9",
"sha256:8550177fa5d4c1f09b5e5f524411c44633c80ec69b24e0e98906dd761941ca46",
"sha256:3e0d5e48e3a23e9a4d1a9f698e32a542a4a288c871d33ed8df1b092a40f3a0f9",
"sha256:68ba70684990f59497680ff90d18e756a47bf4863c604098f10de9716b2c0bdd",
"sha256:57fe287f0cdd9ceaf69e7b71a2e94a24b5d268b35df251a88fef5cc241bf73aa"
],
"version": "==1.1.0"
},
"urllib3": {
"hashes": [
"sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
@@ -501,6 +558,27 @@
"sha256:55994787e93b411c2d659068b51b9998d9d0c05e0df188e6daf8f45836e1ea38"
],
"version": "==1.19.0"
},
"websockets": {
"hashes": [
"sha256:f5192da704535a7cbf76d6e99c1ec4af7e8d1288252bf5a2385d414509ded0cf",
"sha256:0c31bc832d529dc7583d324eb6c836a4f362032a1902723c112cf57883488d8c",
"sha256:da7610a017f5343fdf765f4e0eb6fd0dfd08264ca1565212b110836d9367fc9c",
"sha256:fd81af8cf3e69f9a97f3a6c0623a0527de0f922c2df725f00cd7646d478af632",
"sha256:3d425ae081fb4ba1eef9ecf30472ffd79f8e868297ccc7a47993c96dbf2a819c",
"sha256:ebdd4f18fe7e3bea9bd3bf446b0f4117739478caa2c76e4f0fb72cc45b03cbd7",
"sha256:3859ca16c229ddb0fa21c5090e4efcb037c08ce69b0c1dfed6122c3f98cd0c22",
"sha256:d1a0572b6edb22c9208e3e5381064e09d287d2a915f90233fef994ee7a14a935",
"sha256:80188abdadd23edaaea05ce761dc9a2e1df31a74a0533967f0dcd9560c85add0",
"sha256:fecf51c13195c416c22422353b306dddb9c752e4b80b21e0fa1fccbe38246677",
"sha256:367ff945bc0950ad9634591e2afe50bf2222bc4fad1088a386c4bb700888026e",
"sha256:6df87698022aef2596bffdfecc96d656db59c8d719708c8a471daa815ee61656",
"sha256:341824d8c9ad53fc43cca3fa9407f294125fa258592f7676640396501448e57e",
"sha256:64896a6b3368c959b8096b655e46f03dfa65b96745249f374bd6a35705cc3489",
"sha256:1f3e5a52cab6daa3d432c7b0de0a14109be39d2bfaad033ee5de4a3d3e11dcdf",
"sha256:da4d4fbe059b0453e726d6d993760065d69b823a27efc3040402a6fcfe6a1ed9"
],
"version": "==4.0.1"
}
}
}
+31 -33
View File
@@ -1,7 +1,7 @@
import asyncio
from urllib.parse import urlparse, urlunparse
from concurrent.futures._base import TimeoutError
from typing import List
from typing import Set
import pyppeteer
import requests
@@ -16,43 +16,16 @@ from parse import findall
from w3lib.encoding import html_to_unicode
DEFAULT_ENCODING = 'utf-8'
useragent = UserAgent()
class HTMLResponse(requests.Response):
    """An HTML-enabled :class:`Response <Response>` object.

    Same as Requests :class:`Response <Response>` object, but with an
    intelligent ``.html`` property added.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Cache slot for the parsed document; filled on first ``.html`` access.
        self._html = None  # type: HTML

    @property
    def html(self) -> 'HTML':
        """The response content wrapped in an ``HTML`` object.

        The object is built from ``self.url``, ``self.text`` and
        ``self.encoding`` the first time the property is read, then cached
        on the instance and returned as-is on later reads.
        """
        # Compare against None explicitly so the cached object is reused
        # regardless of how it evaluates in a boolean context.
        if self._html is None:
            self._html = HTML(url=self.url, html=self.text, default_encoding=self.encoding)
        return self._html

    @classmethod
    def _from_response(cls, response) -> 'HTMLResponse':
        """Build an instance of ``cls`` carrying the state of ``response``.

        A fresh instance is created and every entry of
        ``response.__dict__`` is copied onto it (shallow copy: attribute
        values are shared with ``response``, not duplicated).
        """
        html_r = cls()
        html_r.__dict__.update(response.__dict__)
        return html_r
class BaseParser:
"""A basic HTML/Element Parser, for Humans."""
def __init__(self, *, element, default_encoding: str = None, html: str = None, url: str):
def __init__(self, *, element, default_encoding: str = None, html: str = None, url: str) -> None:
self.element = element
self.url = url
self.skip_anchors = True
@@ -69,7 +42,7 @@ class BaseParser:
return etree.tostring(self.element, encoding='unicode').strip()
@html.setter
def set_html(self, html):
def set_html(self, html: str) -> None:
"""Property setter for self.html."""
self._html = html
@@ -148,7 +121,7 @@ class BaseParser:
return [r for r in findall(template, self.html)]
@property
def links(self) -> List[str]:
def links(self) -> Set[str]:
"""All found links on page, in asis form."""
def gen():
for link in self.find('a'):
@@ -164,7 +137,7 @@ class BaseParser:
return set(g for g in gen())
@property
def absolute_links(self) -> List[str]:
def absolute_links(self) -> Set[str]:
"""All found links on page, in absolute form."""
def gen():
for link in self.links:
@@ -275,6 +248,31 @@ class HTML(BaseParser):
return self
class HTMLResponse(requests.Response):
    """An HTML-enabled :class:`Response <Response>` object.

    Same as Requests :class:`Response <Response>` object, but with an
    intelligent ``.html`` property added.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Cache slot for the parsed document; filled on first ``.html`` access.
        self._html = None  # type: HTML

    @property
    def html(self) -> HTML:
        """The response content wrapped in an ``HTML`` object.

        The object is built from ``self.url``, ``self.text`` and
        ``self.encoding`` the first time the property is read, then cached
        on the instance and returned as-is on later reads.
        """
        # Compare against None explicitly so the cached object is reused
        # regardless of how it evaluates in a boolean context.
        if self._html is None:
            self._html = HTML(url=self.url, html=self.text, default_encoding=self.encoding)
        return self._html

    @classmethod
    def _from_response(cls, response) -> 'HTMLResponse':
        """Build an instance of ``cls`` carrying the state of ``response``.

        A fresh instance is created and every entry of
        ``response.__dict__`` is copied onto it (shallow copy: attribute
        values are shared with ``response``, not duplicated).
        """
        html_r = cls()
        html_r.__dict__.update(response.__dict__)
        return html_r
def user_agent(style='chrome') -> str:
"""Returns a random user-agent, if not requested one of a specific
style. Defaults to a Chrome-style User-Agent.
@@ -301,7 +299,7 @@ class HTMLSession(requests.Session):
self.hooks = {'response': self._handle_response}
@staticmethod
def _handle_response(response, **kwargs) -> requests.Response:
def _handle_response(response, **kwargs) -> HTMLResponse:
"""Requests HTTP Response handler. Attaches .html property to Response
objects.
"""