From be5f7c2ba8cae16f7b68c413f0375571ec0ed26f Mon Sep 17 00:00:00 2001 From: "M.Michel" Date: Sun, 21 Apr 2019 20:15:18 +0200 Subject: [PATCH 1/3] add: send session.cookies to render --- requests_html.py | 68 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/requests_html.py b/requests_html.py index a6c80e8..b2ee804 100644 --- a/requests_html.py +++ b/requests_html.py @@ -499,7 +499,7 @@ class HTML(BaseParser): def add_next_symbol(self, next_symbol): self.next_symbol.append(next_symbol) - async def _async_render(self, *, url: str, script: str = None, scrolldown, sleep: int, wait: float, reload, content: Optional[str], timeout: Union[float, int], keep_page: bool): + async def _async_render(self, *, url: str, script: str = None, scrolldown, sleep: int, wait: float, reload, content: Optional[str], timeout: Union[float, int], keep_page: bool, cookies: list = [{}]): """ Handle page creation and js rendering. Internal use for render/arender methods. """ try: page = await self.browser.newPage() @@ -507,6 +507,10 @@ class HTML(BaseParser): # Wait before rendering the page, to prevent timeouts. await asyncio.sleep(wait) + if cookies: + for cookie in cookies: + await page.setCookie(cookie) + # Load the given page (GET request, obviously.) if reload: await page.goto(url, options={'timeout': int(timeout * 1000)}) @@ -538,7 +542,60 @@ class HTML(BaseParser): page = None return None - def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False): + def _convert_cookie_to_render(self, session_cookie): + """ + Convert HTMLSession.cookies[] for browser.newPage().setCookie + """ + # | setCookie(self, *cookies:dict) -> None + # | Set cookies. + # | + # | ``cookies`` should be dictionaries which contain these fields: + # | + # | * ``name`` (str): **required** + # | * ``value`` (str): **required** + # | * ``url`` (str) + # | * ``domain`` (str) + # | * ``path`` (str) + # | * ``expires`` (number): Unix time in seconds + # | * ``httpOnly`` (bool) + # | * ``secure`` (bool) + # | * ``sameSite`` (str): ``'Strict'`` or ``'Lax'`` + cookie_render = {} + def __convert(cookie, key): + try: + v = eval ("cookie."+key) + if not v: kv = '' + else: kv = {key: v} + except: + kv = '' + return kv + + keys = [ + 'name', + 'value', + 'url', + 'domain', + 'path', + 'sameSite', + 'expires', + 'httpOnly', + 'secure', + ] + for key in keys: + cookie_render.update(__convert(session_cookie, key)) + return cookie_render + + def _convert_cookies_to_render(self): + """ + Convert HTMLSession.cookies for browser.newPage().setCookie + Return a list of dict + """ + cookies_render = [] + for cookie in self.session.cookies: + cookies_render.append(self._convert_cookie_to_render(cookie)) + return cookies_render + + def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False, cookies: list = [{}], send_cookies_session: bool = False): """Reloads the response in Chromium, and replaces HTML content with an updated version, with JavaScript executed. @@ -550,6 +607,9 @@ class HTML(BaseParser): :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory. :param keep_page: If ``True`` will allow you to interact with the browser page through ``r.html.page``. + :param send_cookies_session: If ``True`` send ``HTMLSession.cookies`` convert. + :param cookies: If not ``empty`` send ``cookies``. + If ``scrolldown`` is specified, the page will scrolldown the specified number of times, after sleeping the specified amount of time (e.g. ``scrolldown=10, sleep=1``). @@ -590,12 +650,14 @@ class HTML(BaseParser): if self.url == DEFAULT_URL: reload = False + if send_cookies_session: + cookies = self._convert_cookies_to_render() for i in range(retries): if not content: try: - content, result, page = self.session.loop.run_until_complete(self._async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page)) + content, result, page = self.session.loop.run_until_complete(self._async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page, cookies=cookies)) except TypeError: pass else: From aff9336c7b57be048348247b5aeae5c5f451b828 Mon Sep 17 00:00:00 2001 From: "M.Michel" Date: Mon, 22 Apr 2019 09:33:13 +0200 Subject: [PATCH 2/3] mod: explicit type cookiejar of cookie --- requests_html.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/requests_html.py b/requests_html.py index b2ee804..6029b03 100644 --- a/requests_html.py +++ b/requests_html.py @@ -8,6 +8,7 @@ from typing import Set, Union, List, MutableMapping, Optional import pyppeteer import requests +import http.cookiejar from pyquery import PyQuery from fake_useragent import UserAgent @@ -542,9 +543,9 @@ class HTML(BaseParser): page = None return None - def _convert_cookie_to_render(self, session_cookie): + def _convert_cookiejar_to_render(self, session_cookiejar): """ - Convert HTMLSession.cookies[] for browser.newPage().setCookie + Convert HTMLSession.cookies:cookiejar[] for browser.newPage().setCookie """ # | setCookie(self, *cookies:dict) -> None # | Set cookies. @@ -561,9 +562,9 @@ class HTML(BaseParser): # | * ``secure`` (bool) # | * ``sameSite`` (str): ``'Strict'`` or ``'Lax'`` cookie_render = {} - def __convert(cookie, key): + def __convert(cookiejar, key): try: - v = eval ("cookie."+key) + v = eval ("cookiejar."+key) if not v: kv = '' else: kv = {key: v} except: @@ -582,17 +583,18 @@ class HTML(BaseParser): 'secure', ] for key in keys: - cookie_render.update(__convert(session_cookie, key)) + cookie_render.update(__convert(session_cookiejar, key)) return cookie_render - def _convert_cookies_to_render(self): + def _convert_cookiesjar_to_render(self): """ Convert HTMLSession.cookies for browser.newPage().setCookie Return a list of dict """ cookies_render = [] - for cookie in self.session.cookies: - cookies_render.append(self._convert_cookie_to_render(cookie)) + if isinstance(self.session.cookies, http.cookiejar.CookieJar): + for cookie in self.session.cookies: + cookies_render.append(self._convert_cookiejar_to_render(cookie)) return cookies_render def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False, cookies: list = [{}], send_cookies_session: bool = False): @@ -651,7 +653,7 @@ class HTML(BaseParser): reload = False if send_cookies_session: - cookies = self._convert_cookies_to_render() + cookies = self._convert_cookiesjar_to_render() for i in range(retries): if not content: From 8753f7972c3021e7d4263392dbf5d88698289ae3 Mon Sep 17 00:00:00 2001 From: "M.Michel" Date: Mon, 22 Apr 2019 13:31:25 +0200 Subject: [PATCH 3/3] fix: cookie = {} --- requests_html.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/requests_html.py b/requests_html.py index 6029b03..be5a5a4 100644 --- a/requests_html.py +++ b/requests_html.py @@ -510,7 +510,8 @@ class HTML(BaseParser): if cookies: for cookie in cookies: - await page.setCookie(cookie) + if cookie: + await page.setCookie(cookie) # Load the given page (GET request, obviously.) if reload: @@ -673,7 +674,7 @@ class HTML(BaseParser): self.page = page return result - async def arender(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False): + async def arender(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False, cookies: list = [{}], send_cookies_session: bool = False): """ Async version of render. Takes same parameters. """ self.browser = await self.session.browser @@ -683,11 +684,14 @@ class HTML(BaseParser): if self.url == DEFAULT_URL: reload = False + if send_cookies_session: + cookies = self._convert_cookiesjar_to_render() + for _ in range(retries): if not content: try: - content, result, page = await self._async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page) + content, result, page = await self._async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page, cookies=cookies) except TypeError: pass else: