From 26e429f4ca9039c3c1af44f2a88bfb094c3a7a0b Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Mon, 26 Feb 2018 11:48:57 -0500 Subject: [PATCH] extras, browser! Signed-off-by: Kenneth Reitz --- Pipfile | 2 ++ requests_html.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++- setup.py | 3 +++ 3 files changed, 61 insertions(+), 1 deletion(-) diff --git a/Pipfile b/Pipfile index 7af9e51..c0e534d 100644 --- a/Pipfile +++ b/Pipfile @@ -12,6 +12,8 @@ pyquery = "*" fake-useragent = "*" parse = "*" "bs4" = "*" +"Ghost.py" = {git = "git://github.com/carrerasrodrigo/Ghost.py.git", editable = true} +dukpy = "*" [dev-packages] diff --git a/requests_html.py b/requests_html.py index d8429bf..60154e1 100644 --- a/requests_html.py +++ b/requests_html.py @@ -1,3 +1,4 @@ +import sys from urllib.parse import urlparse, urlunparse import requests @@ -9,6 +10,13 @@ from lxml.html.soupparser import fromstring from parse import search as parse_search from parse import findall +try: + from PyQt5.QtWidgets import QApplication + from PyQt5.QtWebEngineWidgets import QWebEngineView +except ImportError: + pass + + DEFAULT_ENCODING = 'utf-8' useragent = UserAgent() @@ -222,6 +230,7 @@ def user_agent(style=None): return useragent[style] + class Session(requests.Session): """A consumable session, for cookie persistience and connection pooling, amongst other things. 
@@ -249,5 +258,78 @@ class Session(requests.Session):
         return response
 
 
+class BrowserSession(Session):
+    """A web-browser interpreted session (for JavaScript).
+
+    Every response body is passed through a Qt WebEngine view so that
+    markup produced by scripts ends up in ``r.text`` and ``r.html``.
+    Rendering needs PyQt5, which setup.py exposes as the ``browser`` extra.
+    """
+
+    def request(self, *args, **kwargs):
+        """Send the request, then swap the body for its rendered form.
+
+        Arguments are forwarded untouched to the parent ``request``.  The
+        returned response has ``_content``, ``encoding`` and ``html``
+        rebuilt from the rendered markup.
+
+        Raises RuntimeError (via ``render``) when PyQt5 is unavailable.
+        """
+        r = super().request(*args, **kwargs)
+
+        # Encode and declare the body with one codec so that ``r.text``
+        # decodes the rendered markup exactly as it was produced.
+        r._content = self.render(r.text).encode(DEFAULT_ENCODING)
+        r.encoding = DEFAULT_ENCODING
+
+        r.html = HTML(url=r.url, html=r.text, default_encoding=r.encoding)
+
+        return r
+
+    @staticmethod
+    def render(source_url):
+        """Fully render HTML, JavaScript and all.
+
+        Despite its name, ``source_url`` is the HTML source text itself
+        (``request`` passes ``r.text``); it is loaded with ``setHtml``
+        rather than fetched.  The name is kept for keyword callers.
+
+        Blocks inside a Qt event loop until the page has reported
+        ``loadFinished`` and its serialised markup has been delivered,
+        then returns that markup.
+
+        Raises RuntimeError if PyQt5 could not be imported.
+        """
+        if 'QApplication' not in globals():
+            raise RuntimeError('PyQt5 must be installed.')
+
+        # Declared here, not at module level: the base class only exists
+        # when the optional PyQt5 import at the top of the file succeeded.
+        class Render(QWebEngineView):
+            def __init__(self, html):
+                # Qt allows one application object per process and needs
+                # it before any widget is built, so reuse a live one.
+                app = QApplication.instance() or QApplication([])
+                super().__init__()
+                self.app = app
+                self.html = None
+                self.loadFinished.connect(self._loadFinished)
+                self.setHtml(html)
+                # Spin the event loop until _callable() stops it.
+                self.app.exec_()
+
+            def _loadFinished(self, result):
+                # toHtml() is asynchronous: the markup reaches the
+                # callback later, so the loop has to keep running.
+                self.page().toHtml(self._callable)
+
+            def _callable(self, data):
+                self.html = data
+                # The markup is stored; leaving the loop is now safe.
+                self.app.quit()
+
+        return Render(source_url).html
+
+
 # Backwards compatiblity.
-session = Session()
+session = Session()
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d7f6271..6cd25ec 100644
--- a/setup.py
+++ b/setup.py
@@ -79,6 +79,9 @@ setup(
     author_email=EMAIL,
     url=URL,
     python_requires='>=3.5.0',
+    extras_require={
+        'browser': ['PyQt5'],
+    },
     # If your package is a single module, use this instead of 'packages':
     py_modules=['requests_html'],
 