prototype

Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
This commit is contained in:
2018-02-24 15:56:40 -05:00
commit 7fb775bf2b
3 changed files with 215 additions and 0 deletions
+15
View File
@@ -0,0 +1,15 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"
[packages]
requests = "*"
pyquery = "*"
[dev-packages]
Generated
+114
View File
@@ -0,0 +1,114 @@
{
"_meta": {
"hash": {
"sha256": "bf10659b65fbdc452338842d0ef489386058d58982dfe4c2f4ef6cca8f91b171"
},
"host-environment-markers": {
"implementation_name": "cpython",
"implementation_version": "3.6.4",
"os_name": "posix",
"platform_machine": "x86_64",
"platform_python_implementation": "CPython",
"platform_release": "17.4.0",
"platform_system": "Darwin",
"platform_version": "Darwin Kernel Version 17.4.0: Sun Dec 17 09:19:54 PST 2017; root:xnu-4570.41.2~1/RELEASE_X86_64",
"python_full_version": "3.6.4",
"python_version": "3.6",
"sys_platform": "darwin"
},
"pipfile-spec": 6,
"requires": {},
"sources": [
{
"name": "pypi",
"url": "https://pypi.python.org/simple",
"verify_ssl": true
}
]
},
"default": {
"certifi": {
"hashes": [
"sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296",
"sha256:edbc3f203427eef571f79a7692bb160a2b0f7ccaa31953e99bd17e307cf63f7d"
],
"version": "==2018.1.18"
},
"chardet": {
"hashes": [
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691",
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"
],
"version": "==3.0.4"
},
"cssselect": {
"hashes": [
"sha256:3b5103e8789da9e936a68d993b70df732d06b8bb9a337a05ed4eb52c17ef7206",
"sha256:066d8bc5229af09617e24b3ca4d52f1f9092d9e061931f4184cd572885c23204"
],
"version": "==1.0.3"
},
"idna": {
"hashes": [
"sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4",
"sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f"
],
"version": "==2.6"
},
"lxml": {
"hashes": [
"sha256:41f59cbdab232f11680d5d4dec9f2e6782fd24d78e37ee833447702e34e675f4",
"sha256:e7e41d383f19bab9d57f5f3b18d158655bcd682e7e723f441b9e183e1e35a6b5",
"sha256:155521c337acecf8202091cff85bb9f709f238130ebadf04280fb1db11f5ad8b",
"sha256:d2c985d2460b81c6ca5feb8b86f1bc594ad59405d0bdf68626b85852b701553c",
"sha256:950e63387514aa1b881eba5ac6cb2ec51a118b3dafe99dd80ca19d8fb0142f30",
"sha256:470d7ce41e8047208ba1a376560bad17f1468df1f3097bc83902b26cfafdbb0c",
"sha256:e608839a5ee2180164424ccf279c8e2d9bbe8816d002c58fd97d6b621ba4aa94",
"sha256:87a66bcadac270fc010cb029022a93fc722bf1204a8b03e782d4c790f0edf7ca",
"sha256:2dedfeeecc2d5a939cf622602f5a1ce443ca82407f386880f739f1a9f08053ad",
"sha256:ba05732e4bcf59e948f61588851dcf620fd60d5bbd9d704203e5f59bbaa60219",
"sha256:2190266059fec3c5a55f9d6c30532c64c6d414d3228909c0af573fe4907e78d1",
"sha256:dd291debfaa535d9cb6cee8d7aca2328775e037d02d13f1634e57f49bc302cc4",
"sha256:29a36e354c39b2e24bc4ee103de53417ebb80f976a6ab9e8d093d559e2ac03e1",
"sha256:e37427d5a27eefbcfc48847e0b37f348113fac7280bc857421db39ffc6372570",
"sha256:b106d4d2383382399ad82108fd187e92f40b1c90f55c2d36bbcb1c44bcf940fc",
"sha256:0ee07da52d240f1dc3c83eef5cd5f1b7f018226c1121f2a54d446645779a6d17",
"sha256:3b33549fb8f91b38a7500078242b03cca513f3412a2cdae722e89bf83f95971d",
"sha256:4c12e90886d9c53ab434c8d0cebea122321cce19614c3c6b6d1a7700d7cc6212",
"sha256:79322000279cda10b53c374d53ca632ead3bc51c6aebf8e62c8fa93a4d08b750",
"sha256:6cba398eb37e0631e60e0e080c101cfe91769b2c8267105b64b4625e2581ea21",
"sha256:49a655956f8de69e1258bc0fcfc43eb3bd1e038655784d77d1869b4b81444e37",
"sha256:af8a5373241d09b8fc53e0490e1719ce5dc90a21b19db89b6596c1adcdd52270",
"sha256:e6b6698415c7e8d227a47a3b1038e1b37c2b438a1b48c2db7ad9e74ddbcd1149",
"sha256:155c916cf2645b4a8f2bd5d09065e92d1b67b8d464bdc001e0b524af84bedf6f",
"sha256:fa7320679ced5e25b20203d157280680fc84eb783b6cc650cb0c98e1858b7dd3",
"sha256:4187c4b0cefc3353181db048c51f42c489d9ac51e40b86c4851dc0671372971d",
"sha256:d5d29663e979e83b3fc361e97200f959cddb3a14797391d15273d84a5a8ae44b",
"sha256:940caef1ec7c78e0c34b0f6b94fe42d0f2022915ffc78643d28538a5cfd0f40e"
],
"version": "==4.1.1"
},
"pyquery": {
"hashes": [
"sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3",
"sha256:4771db76bd14352eba006463656aef990a0147a0eeaf094725097acfa90442bf"
],
"version": "==1.4.0"
},
"requests": {
"hashes": [
"sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b",
"sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e"
],
"version": "==2.18.4"
},
"urllib3": {
"hashes": [
"sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
"sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
],
"version": "==1.22"
}
},
"develop": {}
}
+86
View File
@@ -0,0 +1,86 @@
from urllib.parse import urljoin

import requests
from pyquery import PyQuery
class Element:
    """Thin wrapper around a single parsed node.

    The wrapped ``element`` must expose ``.tag`` and ``.keys()`` and be
    accepted by ``PyQuery(...)`` (presumably an lxml element, as produced
    by iterating a PyQuery selection -- confirm against the caller).
    """

    def __init__(self, element):
        self.element = element

    def __repr__(self):
        # ``attrs`` is a property that rebuilds its dict on every access, so
        # take one snapshot instead of re-reading it for each attribute.
        attrs = self.attrs
        rendered = [
            '{}={}'.format(name, repr(value)) for name, value in attrs.items()
        ]
        return "<Element {} {}>".format(
            repr(self.element.tag), ' '.join(rendered)
        )

    @property
    def pq(self):
        """A new PyQuery object wrapping the underlying element."""
        return PyQuery(self.element)

    @property
    def attrs(self):
        """Dict mapping each attribute name of the element to its value."""
        # Build the PyQuery wrapper once rather than once per attribute.
        pq = self.pq
        return {k: pq.attr[k] for k in self.element.keys()}

    @property
    def text(self):
        """Text of the element, as returned by PyQuery's ``text()``."""
        return self.pq.text()
class HTML(object):
    """Queryable view of a response body.

    Built from a response object exposing ``.text`` (the markup) and
    ``.url`` (the address the markup was fetched from).
    """

    def __init__(self, response):
        self.html = response.text
        self.url = response.url
        # When true, ``links`` omits same-page anchors (hrefs starting
        # with '#'); when false, every href is reported.
        self.skip_anchors = True

    def __repr__(self):
        return repr(self.html)

    def find(self, selector):
        """Return a list of ``Element`` objects matching ``selector``."""
        return [Element(found) for found in self.pq(selector)]

    @property
    def links(self):
        """List of ``href`` values of the ``<a>`` tags, in document order.

        Anchors without an ``href`` attribute are skipped; hrefs starting
        with '#' are skipped only while ``skip_anchors`` is true.
        """
        hrefs = []
        for link in self.find('a'):
            # ``<a name="...">`` style anchors carry no href at all.
            href = link.attrs.get('href')
            if href is None:
                continue
            if self.skip_anchors and href.startswith('#'):
                continue
            hrefs.append(href)
        return hrefs

    @property
    def base_url(self):
        """The page URL with its last '/'-separated segment removed."""
        return '/'.join(self.url.split('/')[:-1])

    @property
    def absolute_links(self):
        """List of every entry of ``links`` resolved against the page URL.

        Links that are already absolute are returned unchanged; relative
        ones (``page.html``, ``/root``, ``../up``) are resolved with
        ``urllib.parse.urljoin``.
        """
        return [urljoin(self.url, link) for link in self.links]

    @property
    def pq(self):
        """A new PyQuery object parsed from the stored markup."""
        return PyQuery(self.html)
def handle_response(response, **kwargs):
    """Attach an ``HTML`` wrapper to ``response`` as ``response.html``.

    Extra keyword arguments are accepted and ignored. The same response
    object is returned after the attribute has been set.
    """
    parsed = HTML(response)
    response.html = parsed
    return response
# Module-level session; its hooks dict is replaced so that 'response' maps
# to handle_response (which sets ``.html`` on the response it is given).
session = requests.Session()
session.hooks = {'response': handle_response}
# Prototype demo, executed whenever this module runs or is imported:
# fetch the PyQuery documentation page and print its absolute links.
r = session.get('https://pythonhosted.org/pyquery/')
print(r.html.absolute_links)