From a2cc6bfa55e931f68525256ba91fe8930e188241 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Tue, 20 Mar 2018 19:50:04 -0400 Subject: [PATCH] Update HTML.render to use session.browser and close pages automatically --- requests_html.py | 18 ++++++++++++------ tests/test_requests_html.py | 23 +++++++++++++++++++++++ 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/requests_html.py b/requests_html.py index f6ab3f5..0807881 100644 --- a/requests_html.py +++ b/requests_html.py @@ -480,7 +480,7 @@ class HTML(BaseParser): def add_next_symbol(self, next_symbol): self.next_symbol.append(next_symbol) - def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0): + def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False): """Reloads the response in Chromium, and replaces HTML content with an updated version, with JavaScript executed. @@ -490,6 +490,7 @@ class HTML(BaseParser): :param scrolldown: Integer, if provided, of how many times to page down. :param sleep: Integer, if provided, of how many long to sleep after initial render. :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory. + :param keep_page: If ``True`` will allow you to interact with the browser page through ``r.html.page``. If ``scrolldown`` is specified, the page will scrolldown the specified number of times, after sleeping the specified amount of time @@ -520,13 +521,15 @@ class HTML(BaseParser): >>> r.html.render(script=script) {'width': 800, 'height': 600, 'deviceScaleFactor': 1} + Warning: If you use keep_page, you're responsible for closing each page, since + opening too many at scale may crash the browser. 
+ Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). """ - async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, wait: float, reload, content: Optional[str], timeout: Union[float, int]): + async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, wait: float, reload, content: Optional[str], timeout: Union[float, int], keep_page: bool): try: - browser = await pyppeteer.launch(headless=True, args=['--no-sandbox']) - page = await browser.newPage() + page = await self.session.browser.newPage() # Wait before rendering the page, to prevent timeouts. await asyncio.sleep(wait) @@ -553,11 +556,14 @@ class HTML(BaseParser): # Return the content of the page, JavaScript evaluated. content = await page.content() + if not keep_page: + await page.close() + page = None return content, result, page except TimeoutError: return None - loop = asyncio.get_event_loop() + self.session.browser # Automatically create an event loop and browser content = None # Automatically set Reload to False, if example URL is being used. 
@@ -568,7 +574,7 @@ class HTML(BaseParser): if not content: try: - content, result, page = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout)) + content, result, page = self.session.loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page)) except TypeError: pass else: diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 54d2ff9..bf0070d 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -4,6 +4,7 @@ from functools import partial import pytest import psutil from pyppeteer.browser import Browser +from pyppeteer.page import Page from requests_html import HTMLSession, AsyncHTMLSession, HTML from requests_file import FileAdapter @@ -247,5 +248,27 @@ def test_browser_session(): assert count_chromium_process() == 0 +@pytest.mark.ok +def test_browser_process(): + for _ in range(3): + r = get() + r.html.render() + + assert r.html.page == None + + assert count_chromium_process() == 2 + + +@pytest.mark.ok +def test_browser_pages(): + for _ in range(3): + r = get() + r.html.render(keep_page=True) + + assert isinstance(r.html.page, Page) + + assert count_chromium_process() == 5 # 2 processes for Chromium and 1 for each page + + if __name__ == '__main__': test_containing()