diff --git a/Pipfile b/Pipfile index f88338d..63f66f9 100644 --- a/Pipfile +++ b/Pipfile @@ -14,6 +14,7 @@ parse = "*" "bs4" = "*" "pyqt5" = "*" "w3lib" = "*" +pyppeteer = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 0ee7d2b..a35c154 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "f6a1a62089049c03d073d0255f1547245bfb4277c62c8df273afc3fcc4f5e127" + "sha256": "9af93ac7145d6f8f0f24b28c59064699450344bdd45f18d3c4383647a1a08f03" }, "host-environment-markers": { "implementation_name": "cpython", @@ -114,6 +114,19 @@ ], "version": "==1.8.2" }, + "pyee": { + "hashes": [ + "sha256:47f8fa96d6dee61c82001831e1fbba55f3f808003a322d0e6653aa01c59f6b9e", + "sha256:4ec22817297b7024f89721cc34f790ee2767c5b5ca44284c565ee643abafbe32" + ], + "version": "==5.0.0" + }, + "pyppeteer": { + "hashes": [ + "sha256:596929fb7d052048679081d3dc2a998cf065e936a752c7ba2392445d6e0e9706" + ], + "version": "==0.0.10" + }, "pyqt5": { "hashes": [ "sha256:128285176240e990fce9c50293105ffd0d2884d8910bb338118f867b171ec6e8", @@ -174,6 +187,27 @@ "sha256:55994787e93b411c2d659068b51b9998d9d0c05e0df188e6daf8f45836e1ea38" ], "version": "==1.19.0" + }, + "websockets": { + "hashes": [ + "sha256:f5192da704535a7cbf76d6e99c1ec4af7e8d1288252bf5a2385d414509ded0cf", + "sha256:0c31bc832d529dc7583d324eb6c836a4f362032a1902723c112cf57883488d8c", + "sha256:da7610a017f5343fdf765f4e0eb6fd0dfd08264ca1565212b110836d9367fc9c", + "sha256:fd81af8cf3e69f9a97f3a6c0623a0527de0f922c2df725f00cd7646d478af632", + "sha256:3d425ae081fb4ba1eef9ecf30472ffd79f8e868297ccc7a47993c96dbf2a819c", + "sha256:ebdd4f18fe7e3bea9bd3bf446b0f4117739478caa2c76e4f0fb72cc45b03cbd7", + "sha256:3859ca16c229ddb0fa21c5090e4efcb037c08ce69b0c1dfed6122c3f98cd0c22", + "sha256:d1a0572b6edb22c9208e3e5381064e09d287d2a915f90233fef994ee7a14a935", + "sha256:80188abdadd23edaaea05ce761dc9a2e1df31a74a0533967f0dcd9560c85add0", + "sha256:fecf51c13195c416c22422353b306dddb9c752e4b80b21e0fa1fccbe38246677", + "sha256:367ff945bc0950ad9634591e2afe50bf2222bc4fad1088a386c4bb700888026e", + "sha256:6df87698022aef2596bffdfecc96d656db59c8d719708c8a471daa815ee61656", + "sha256:341824d8c9ad53fc43cca3fa9407f294125fa258592f7676640396501448e57e", + "sha256:64896a6b3368c959b8096b655e46f03dfa65b96745249f374bd6a35705cc3489", + "sha256:1f3e5a52cab6daa3d432c7b0de0a14109be39d2bfaad033ee5de4a3d3e11dcdf", + "sha256:da4d4fbe059b0453e726d6d993760065d69b823a27efc3040402a6fcfe6a1ed9" + ], + "version": "==4.0.1" } }, "develop": { diff --git a/requests_html.py b/requests_html.py index 9e5ca14..552560d 100644 --- a/requests_html.py +++ b/requests_html.py @@ -1,5 +1,7 @@ +import asyncio from urllib.parse import urlparse, urlunparse +import pyppeteer import requests from pyquery import PyQuery @@ -10,11 +12,7 @@ from parse import search as parse_search from parse import findall from w3lib.encoding import html_to_unicode -try: - from PyQt5.QtWidgets import QApplication - from PyQt5.QtWebEngineWidgets import QWebEngineView -except ImportError: - pass + DEFAULT_ENCODING = 'utf-8' @@ -154,7 +152,8 @@ class BaseParser: try: href = link.attrs['href'].strip() if not href.startswith('#') and self.skip_anchors and href not in ['javascript:;']: - yield href + if href: + yield href except KeyError: pass @@ -285,7 +284,7 @@ class HTMLSession(requests.Session): class BrowserHTMLSession(HTMLSession): """A web-browser interpreted session (for JavaScript), powered by - PyQt5's QWebEngineView.""" + `PyPpeteer `_.""" def __init__(self, *args, **kwargs): super(BrowserHTMLSession, self).__init__(*args, **kwargs) @@ -294,7 +293,7 @@ class BrowserHTMLSession(HTMLSession): # Convert Request object into HTTPRequest object. r = super(BrowserHTMLSession, self).request(*args, **kwargs) - r._content = self.render(r.text).encode(DEFAULT_ENCODING) + r._content = self.render(r.url).encode(DEFAULT_ENCODING) r.encoding = DEFAULT_ENCODING return r @@ -303,30 +302,18 @@ class BrowserHTMLSession(HTMLSession): def render(source_url): """Fully render HTML, JavaScript and all.""" - if 'QApplication' not in globals(): - raise RuntimeError('PyQt5 must be installed.') + async def _async_render(url): + browser = pyppeteer.launch() + page = await browser.newPage() + await page.goto(url) - class Render(QWebEngineView): - def __init__(self, html): - self.html = None - self.app = QApplication([]) - QWebEngineView.__init__(self) - self.loadFinished.connect(self._loadFinished) - self.setHtml(html) - # self.load(QUrl(url)) - self.app.exec_() + content = await page.content() + return content - def _loadFinished(self, result): - # This is an async call, you need to wait for this - # to be called before closing the app - self.page().toHtml(self._callable) + loop = asyncio.get_event_loop() + content = loop.run_until_complete(_async_render(source_url)) - def _callable(self, data): - self.html = data - # Data has been stored, it's safe to quit the app - self.app.quit() - - return Render(source_url).html + return content # Backwards compatiblity. diff --git a/setup.py b/setup.py index 41e6ddd..5c3ca53 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ VERSION = '0.3.5' # What packages are required for this module to be executed? REQUIRED = [ - 'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4', 'w3lib' + 'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4', 'w3lib', 'pyppeteer' ] # The rest you shouldn't have to touch too much :) @@ -79,9 +79,6 @@ setup( author_email=EMAIL, url=URL, python_requires='>=3.5.0', - extras_require={ - 'browser': ['PyQt5'], - }, # If your package is a single module, use this instead of 'packages': py_modules=['requests_html'],