From 32b941d339cc1f79377df56034424fe1c4c773b9 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 21 Mar 2018 07:48:15 -0400 Subject: [PATCH] updates --- _modules/requests_html.html | 174 ++++++++++++++++++++++-------------- genindex.html | 13 --- index.html | 27 ++---- objects.inv | Bin 631 -> 621 bytes searchindex.js | 2 +- 5 files changed, 116 insertions(+), 100 deletions(-) diff --git a/_modules/requests_html.html b/_modules/requests_html.html index 0eb042f..18011c8 100644 --- a/_modules/requests_html.html +++ b/_modules/requests_html.html @@ -56,6 +56,7 @@ DEFAULT_ENCODING = 'utf-8' DEFAULT_URL = 'https://example.org/' DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8' +DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older'] cleaner = Cleaner() cleaner.javascript = True @@ -81,6 +82,7 @@ _Links = Set[str] _Attrs = MutableMapping _Next = Union['HTML', List[str]] +_NextSymbol = List[str] # Sanity checking. try: @@ -90,6 +92,12 @@ raise RuntimeError('Requests-HTML requires Python 3.6+!') +class MaxRetries(Exception): + + def __init__(self, message): + self.message = message + + class BaseParser: """A basic HTML/Element Parser, for Humans. @@ -100,9 +108,8 @@ """ - def __init__(self, *, element, session: 'HTTPSession' = None, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None: + def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None: self.element = element - self.session = session or HTMLSession() self.url = url self.skip_anchors = True self.default_encoding = default_encoding @@ -151,6 +158,12 @@ # Scan meta tags for charset. if self._html: self._encoding = html_to_unicode(self.default_encoding, self._html)[0] + # Fall back to requests' detected encoding if decode fails. + try: + self.raw_html.decode(self.encoding) + except UnicodeDecodeError: + self._encoding = self.default_encoding + return self._encoding if self._encoding else self.default_encoding @@ -196,47 +209,6 @@ """ return self.lxml.text_content() - def next(self, fetch: bool = False) -> _Next: - """Attempts to find the next page, if there is one. If ``fetch`` - is ``True`` (default), returns :class:`HTML <HTML>` object of - next page. If ``fetch`` is ``False``, simply returns the next URL. - - """ - - def get_next(): - candidates = self.find('a', containing=('next', 'more', 'older')) - - for candidate in candidates: - if candidate.attrs.get('href'): - # Support 'next' rel (e.g. reddit). - if 'next' in candidate.attrs.get('rel', []): - return candidate.attrs['href'] - - # Support 'next' in classnames. - for _class in candidate.attrs.get('class', []): - if 'next' in _class: - return candidate.attrs['href'] - - if 'page' in candidate.attrs['href']: - return candidate.attrs['href'] - - try: - # Resort to the last candidate. - return candidates[-1].attrs['href'] - except IndexError: - return None - - next = get_next() - if next: - url = self._make_absolute(next) - else: - return None - - if fetch: - return self.session.get(url) - else: - return url - def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find: """Given a CSS Selector, returns a list of :class:`Element <Element>` objects or a single one. @@ -356,7 +328,7 @@ try: href = link.attrs['href'].strip() - if href and not (href.startswith('#') and self.skip_anchors) and not href.startswith('javascript:'): + if href and not (href.startswith('#') and self.skip_anchors) and not href.startswith(('javascript:', 'mailto:')): yield href except KeyError: pass @@ -405,7 +377,7 @@ # Support for <base> tag. base = self.find('base', first=True) if base: - result = base.attrs['href'].strip() + result = base.attrs.get('href', '').strip() if result: return result @@ -413,7 +385,7 @@ parsed = urlparse(self.url)._asdict() # Remove any part of the path after the last '/' - path = '/'.join(parsed['path'].split('/')[:-1]) + parsed['path'] = '/'.join(parsed['path'].split('/')[:-1]) + '/' # Reconstruct the url with the modified path parsed = (v for v in parsed.values()) @@ -468,7 +440,7 @@ :param default_encoding: Which encoding to default to. """ - def __init__(self, *, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: + def __init__(self, *, session: Union['HTTPSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: # Convert incoming unicode HTML into bytes. if isinstance(html, str): @@ -481,11 +453,54 @@ url=url, default_encoding=default_encoding ) + self.session = session or HTMLSession() self.page = None + self.next_symbol = DEFAULT_NEXT_SYMBOL def __repr__(self) -> str: return f"<HTML url={self.url!r}>" + def _next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next: + """Attempts to find the next page, if there is one. If ``fetch`` + is ``True`` (default), returns :class:`HTML <HTML>` object of + next page. If ``fetch`` is ``False``, simply returns the next URL. + + """ + + def get_next(): + candidates = self.find('a', containing=next_symbol) + + for candidate in candidates: + if candidate.attrs.get('href'): + # Support 'next' rel (e.g. reddit). + if 'next' in candidate.attrs.get('rel', []): + return candidate.attrs['href'] + + # Support 'next' in classnames. + for _class in candidate.attrs.get('class', []): + if 'next' in _class: + return candidate.attrs['href'] + + if 'page' in candidate.attrs['href']: + return candidate.attrs['href'] + + try: + # Resort to the last candidate. + return candidates[-1].attrs['href'] + except IndexError: + return None + + __next = get_next() + if __next: + url = self._make_absolute(__next) + else: + return None + + if fetch: + return self.session.get(url) + else: + return url + def __iter__(self): next = self @@ -493,14 +508,17 @@ while True: yield next try: - next = next.next(fetch=True).html + next = next._next(fetch=True, next_symbol=self.next_symbol).html except AttributeError: break def __next__(self): - return self.next(fetch=True).html + return self._next(fetch=True, next_symbol=self.next_symbol).html -
[docs] def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0): + def add_next_symbol(self, next_symbol): + self.next_symbol.append(next_symbol) + +
[docs] def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False): """Reloads the response in Chromium, and replaces HTML content with an updated version, with JavaScript executed. @@ -510,6 +528,7 @@ :param scrolldown: Integer, if provided, of how many times to page down. :param sleep: Integer, if provided, of how many long to sleep after initial render. :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory. + :param keep_page: If ``True`` will allow you to interact with the browser page through ``r.html.page``. If ``scrolldown`` is specified, the page will scrolldown the specified number of times, after sleeping the specified amount of time @@ -540,13 +559,15 @@ >>> r.html.render(script=script) {'width': 800, 'height': 600, 'deviceScaleFactor': 1} + Warning: If you use keep_page, you're responsable for closing each page, since + opening to many at scale may crach the browser. + Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). """ - async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, wait: float, reload, content: Optional[str], timeout: Union[float, int]): + async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, wait: float, reload, content: Optional[str], timeout: Union[float, int], keep_page: bool): try: - browser = pyppeteer.launch(headless=True, args=['--no-sandbox']) - page = await browser.newPage() + page = await self.session.browser.newPage() # Wait before rendering the page, to prevent timeouts. await asyncio.sleep(wait) @@ -573,11 +594,14 @@ # Return the content of the page, JavaScript evaluated. content = await page.content() + if not keep_page: + await page.close() + page = None return content, result, page except TimeoutError: return None - loop = asyncio.get_event_loop() + self.session.browser # Automatycally create a event loop and browser content = None # Automatically set Reload to False, if example URL is being used. @@ -588,9 +612,14 @@ if not content: try: - content, result, page = loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout)) - except TimeoutError: + content, result, page = self.session.loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page)) + except TypeError: pass + else: + break + + if not content: + raise MaxRetries("Unable to render the page. Try increasing timeout") html = HTML(url=self.url, html=content.encode(DEFAULT_ENCODING), default_encoding=DEFAULT_ENCODING) self.__dict__.update(html.__dict__) @@ -603,20 +632,21 @@ Effectively the same, but with an intelligent ``.html`` property added. """ - def __init__(self) -> None: + def __init__(self, session: Union['HTMLSession', 'AsyncHTMLSession']) -> None: super(HTMLResponse, self).__init__() self._html = None # type: HTML + self.session = session @property def html(self) -> HTML: if not self._html: - self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding) + self._html = HTML(session=self.session, url=self.url, html=self.content, default_encoding=self.encoding) return self._html @classmethod - def _from_response(cls, response): - html_r = cls() + def _from_response(cls, response, session: Union['HTMLSession', 'AsyncHTMLSession']): + html_r = cls(session=session) html_r.__dict__.update(response.__dict__) return html_r @@ -672,8 +702,21 @@ """ # Convert Request object into HTTPRequest object. r = super(HTMLSession, self).request(*args, **kwargs) -
- return HTMLResponse._from_response(r) + + return HTMLResponse._from_response(r, self) + + @property + def browser(self): + if not hasattr(self, "_browser"): + self.loop = asyncio.get_event_loop() + self._browser = self.loop.run_until_complete(pyppeteer.launch(headless=True, args=['--no-sandbox'])) + return self._browser + +
[docs] def close(self): + """ If a browser was created close it first. """ + if hasattr(self, "_browser"): + self.loop.run_until_complete(self._browser.close())
+ super().close() class AsyncHTMLSession(requests.Session): @@ -693,16 +736,15 @@ if mock_browser: self.headers['User-Agent'] = user_agent() - self.hooks["response"].append(self.response_hook) + self.hooks['response'].append(self.response_hook) self.loop = loop or asyncio.get_event_loop() self.thread_pool = ThreadPoolExecutor(max_workers=workers) - @staticmethod - def response_hook(response, **kwargs) -> HTMLResponse: + def response_hook(self, response, **kwargs) -> HTMLResponse: """ Change response enconding and replace it by a HTMLResponse. """ response.encoding = DEFAULT_ENCODING - return HTMLResponse._from_response(response) + return HTMLResponse._from_response(response, self) def request(self, *args, **kwargs): """ Partial original request func and run it in a thread. """ diff --git a/genindex.html b/genindex.html index f20b8cf..807070b 100644 --- a/genindex.html +++ b/genindex.html @@ -44,7 +44,6 @@ | H | L | M - | N | O | P | R @@ -200,18 +199,6 @@ -

N

- - -
-

O

    diff --git a/index.html b/index.html index 7934e14..836c4ea 100644 --- a/index.html +++ b/index.html @@ -210,7 +210,7 @@ once.

    These classes are the main interface to requests-html:

    -class requests_html.HTML(*, url: str = 'https://example.org/', html: Union[str, bytes], default_encoding: str = 'utf-8') → None[source]
    +class requests_html.HTML(*, session: Union[_ForwardRef('HTTPSession'), _ForwardRef('AsyncHTMLSession')] = None, url: str = 'https://example.org/', html: Union[str, bytes], default_encoding: str = 'utf-8') → None[source]

    An HTML document, ready for parsing.

    @@ -306,14 +306,6 @@ for more details.

    Element or HTML.

    -
    -
    -next(fetch: bool = False) → Union[_ForwardRef('HTML'), typing.List[str]]
    -

    Attempts to find the next page, if there is one. If fetch -is True (default), returns HTML object of -next page. If fetch is False, simply returns the next URL.

    -
    -
    pq
    @@ -330,7 +322,7 @@ of the
    -render(retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0)[source]
    +render(retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False)[source]

    Reloads the response in Chromium, and replaces HTML content with an updated version, with JavaScript executed.

    @@ -344,6 +336,7 @@ with an updated version, with JavaScript executed.

  • scrolldown – Integer, if provided, of how many times to page down.
  • sleep – Integer, if provided, of how many long to sleep after initial render.
  • reload – If False, content will not be loaded from the browser, but will be provided from memory.
  • +
  • keep_page – If True will allow you to interact with the browser page through r.html.page.
  • @@ -372,6 +365,8 @@ runtime. Example:

    {'width': 800, 'height': 600, 'deviceScaleFactor': 1} +

    Warning: If you use keep_page, you’re responsable for closing each page, since +opening to many at scale may crach the browser.

    Warning: the first time you run this method, it will download Chromium into your home directory (~/.pyppeteer).

    @@ -546,14 +541,6 @@ for more details.

    Element or HTML.

    -
    -
    -next(fetch: bool = False) → Union[_ForwardRef('HTML'), typing.List[str]]
    -

    Attempts to find the next page, if there is one. If fetch -is True (default), returns HTML object of -next page. If fetch is False, simply returns the next URL.

    -
    -
    pq
    @@ -654,8 +641,8 @@ style. Defaults to a Chrome-style User-Agent.

    amongst other things.

    -close()
    -

    Closes all adapters and as such the session

    +close()[source] +

    If a browser was created close it first.

    diff --git a/objects.inv b/objects.inv index bcfdbe15cf8b3e031136b64f4456be981187511a..00083a244f1d71162f3bf31b55b39a3b664d5928 100644 GIT binary patch delta 471 zcmV;|0Vw|W1nmTneNIh}+aM5z@A(x}?V&x`>Z-TgD{ZB&6m@$?*i2%y7}x=Joqu0G zoUN->r7$@q!tXOPFuVpMJV)@s^C`@lx$}OuGHPI7nKBZVJ`aDak;^83Y3S+@380aK z@W3N#5)5XvQkfG8J|`Pn zE0yCy>voph%*Bb&Q0x8gho4t_GmCFAX5mTRDkRBRg=}+GMM#z^X!AxZ6mdD5om8ks z8x@A+p-MuMU}FPXUAKXM+ALQ(2CkSMa3n^S^ zZT>a@f>a1&;6Wg{k_m*Qku#*M9L$SCYw{bGdk7)Dg*-8yjfOZ0>q7b==1n_woQSCP z#EKn)69{}&5Q9V_h%iK@B^Qxs(zggJc~cCBc2VG|L2g(;gmhwlzrdT5h&6az%m z=i9>3BD;^XNY45EiYj}T9nNVEhJE|;VK?VH#k&IBxw|~586N6KPc3CVoBko{SQS6k N=Z7@^-5-8gx*1a7-!A|F delta 482 zcmV<80UiGB1os4xeNM}6+b|GC_x=hN?V?>sXwaSa1yY~^jHY{n62}%1ie!dlBL98) zkW&>w0du^vF}i1F__!j4@EpMh&!;eJ=Fa=s%BX>TWy(lc`W*gPk;^83DRgy;1klJq zc;FE=2?jG-smzH4ACs!gF@(v=tEs3p53mGM&C9wrgY>1*3$dhG2iZ11?wM2!#VaRI zjt7C`)ZT05xX`+tC2xVniO|sN{qKjLS35e3Z?SXXNscTe$&Q6=^I}CvMl5LaMk^F? zI-C7hs5bKzq|teWA=$2fl8_|W*kY?uH&B~P3rO1>1b;q)_sW_ko32bhYU_biu9u)8 z5ZMz1GRTD#F0?NGHUWZE2xH(uAi0wXgrt!(q^caui$ZJiOPPBJA-%XfF`tcwI0)-P z`XSa$J9V6hsP)8-9fA`Gd{q#WL?VbVM5QGck!aGl3@dq44u^JsQQ)abZkRxXbjZKJ zo0Et=cxX9yqX*>Tw0>h8qZ#w&eNQDti7+4JUW)-V>+aYkT94l!`S0$pOxiJvHi}