pyppeteer

Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
This commit is contained in:
2018-02-27 09:58:25 -05:00
parent f026f821e3
commit 3f58d2bfdc
4 changed files with 53 additions and 34 deletions
+1
View File
@@ -14,6 +14,7 @@ parse = "*"
"bs4" = "*"
"pyqt5" = "*"
"w3lib" = "*"
pyppeteer = "*"
[dev-packages]
Generated
+35 -1
View File
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "f6a1a62089049c03d073d0255f1547245bfb4277c62c8df273afc3fcc4f5e127"
"sha256": "9af93ac7145d6f8f0f24b28c59064699450344bdd45f18d3c4383647a1a08f03"
},
"host-environment-markers": {
"implementation_name": "cpython",
@@ -114,6 +114,19 @@
],
"version": "==1.8.2"
},
"pyee": {
"hashes": [
"sha256:47f8fa96d6dee61c82001831e1fbba55f3f808003a322d0e6653aa01c59f6b9e",
"sha256:4ec22817297b7024f89721cc34f790ee2767c5b5ca44284c565ee643abafbe32"
],
"version": "==5.0.0"
},
"pyppeteer": {
"hashes": [
"sha256:596929fb7d052048679081d3dc2a998cf065e936a752c7ba2392445d6e0e9706"
],
"version": "==0.0.10"
},
"pyqt5": {
"hashes": [
"sha256:128285176240e990fce9c50293105ffd0d2884d8910bb338118f867b171ec6e8",
@@ -174,6 +187,27 @@
"sha256:55994787e93b411c2d659068b51b9998d9d0c05e0df188e6daf8f45836e1ea38"
],
"version": "==1.19.0"
},
"websockets": {
"hashes": [
"sha256:f5192da704535a7cbf76d6e99c1ec4af7e8d1288252bf5a2385d414509ded0cf",
"sha256:0c31bc832d529dc7583d324eb6c836a4f362032a1902723c112cf57883488d8c",
"sha256:da7610a017f5343fdf765f4e0eb6fd0dfd08264ca1565212b110836d9367fc9c",
"sha256:fd81af8cf3e69f9a97f3a6c0623a0527de0f922c2df725f00cd7646d478af632",
"sha256:3d425ae081fb4ba1eef9ecf30472ffd79f8e868297ccc7a47993c96dbf2a819c",
"sha256:ebdd4f18fe7e3bea9bd3bf446b0f4117739478caa2c76e4f0fb72cc45b03cbd7",
"sha256:3859ca16c229ddb0fa21c5090e4efcb037c08ce69b0c1dfed6122c3f98cd0c22",
"sha256:d1a0572b6edb22c9208e3e5381064e09d287d2a915f90233fef994ee7a14a935",
"sha256:80188abdadd23edaaea05ce761dc9a2e1df31a74a0533967f0dcd9560c85add0",
"sha256:fecf51c13195c416c22422353b306dddb9c752e4b80b21e0fa1fccbe38246677",
"sha256:367ff945bc0950ad9634591e2afe50bf2222bc4fad1088a386c4bb700888026e",
"sha256:6df87698022aef2596bffdfecc96d656db59c8d719708c8a471daa815ee61656",
"sha256:341824d8c9ad53fc43cca3fa9407f294125fa258592f7676640396501448e57e",
"sha256:64896a6b3368c959b8096b655e46f03dfa65b96745249f374bd6a35705cc3489",
"sha256:1f3e5a52cab6daa3d432c7b0de0a14109be39d2bfaad033ee5de4a3d3e11dcdf",
"sha256:da4d4fbe059b0453e726d6d993760065d69b823a27efc3040402a6fcfe6a1ed9"
],
"version": "==4.0.1"
}
},
"develop": {
+16 -29
View File
@@ -1,5 +1,7 @@
import asyncio
from urllib.parse import urlparse, urlunparse
import pyppeteer
import requests
from pyquery import PyQuery
@@ -10,11 +12,7 @@ from parse import search as parse_search
from parse import findall
from w3lib.encoding import html_to_unicode
try:
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
except ImportError:
pass
DEFAULT_ENCODING = 'utf-8'
@@ -154,7 +152,8 @@ class BaseParser:
try:
href = link.attrs['href'].strip()
if not href.startswith('#') and self.skip_anchors and href not in ['javascript:;']:
yield href
if href:
yield href
except KeyError:
pass
@@ -285,7 +284,7 @@ class HTMLSession(requests.Session):
class BrowserHTMLSession(HTMLSession):
"""A web-browser interpreted session (for JavaScript), powered by
PyQt5's QWebEngineView."""
`PyPpeteer <https://pypi.python.org/pypi/pyppeteer>`_."""
def __init__(self, *args, **kwargs):
super(BrowserHTMLSession, self).__init__(*args, **kwargs)
@@ -294,7 +293,7 @@ class BrowserHTMLSession(HTMLSession):
# Convert Request object into HTTPRequest object.
r = super(BrowserHTMLSession, self).request(*args, **kwargs)
r._content = self.render(r.text).encode(DEFAULT_ENCODING)
r._content = self.render(r.url).encode(DEFAULT_ENCODING)
r.encoding = DEFAULT_ENCODING
return r
@@ -303,30 +302,18 @@ class BrowserHTMLSession(HTMLSession):
def render(source_url):
"""Fully render HTML, JavaScript and all."""
if 'QApplication' not in globals():
raise RuntimeError('PyQt5 must be installed.')
async def _async_render(url):
browser = pyppeteer.launch()
page = await browser.newPage()
await page.goto(url)
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication([])
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
# self.load(QUrl(url))
self.app.exec_()
content = await page.content()
return content
def _loadFinished(self, result):
# This is an async call, you need to wait for this
# to be called before closing the app
self.page().toHtml(self._callable)
loop = asyncio.get_event_loop()
content = loop.run_until_complete(_async_render(source_url))
def _callable(self, data):
self.html = data
# Data has been stored, it's safe to quit the app
self.app.quit()
return Render(source_url).html
return content
# Backwards compatibility.
+1 -4
View File
@@ -21,7 +21,7 @@ VERSION = '0.3.5'
# What packages are required for this module to be executed?
REQUIRED = [
'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4', 'w3lib'
'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4', 'w3lib', 'pyppeteer'
]
# The rest you shouldn't have to touch too much :)
@@ -79,9 +79,6 @@ setup(
author_email=EMAIL,
url=URL,
python_requires='>=3.5.0',
extras_require={
'browser': ['PyQt5'],
},
# If your package is a single module, use this instead of 'packages':
py_modules=['requests_html'],