diff --git a/_modules/requests_html.html b/_modules/requests_html.html index 0eb042f..18011c8 100644 --- a/_modules/requests_html.html +++ b/_modules/requests_html.html @@ -56,6 +56,7 @@ DEFAULT_ENCODING = 'utf-8' DEFAULT_URL = 'https://example.org/' DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8' +DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older'] cleaner = Cleaner() cleaner.javascript = True @@ -81,6 +82,7 @@ _Links = Set[str] _Attrs = MutableMapping _Next = Union['HTML', List[str]] +_NextSymbol = List[str] # Sanity checking. try: @@ -90,6 +92,12 @@ raise RuntimeError('Requests-HTML requires Python 3.6+!') +class MaxRetries(Exception): + + def __init__(self, message): + self.message = message + + class BaseParser: """A basic HTML/Element Parser, for Humans. @@ -100,9 +108,8 @@ """ - def __init__(self, *, element, session: 'HTTPSession' = None, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None: + def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None: self.element = element - self.session = session or HTMLSession() self.url = url self.skip_anchors = True self.default_encoding = default_encoding @@ -151,6 +158,12 @@ # Scan meta tags for charset. if self._html: self._encoding = html_to_unicode(self.default_encoding, self._html)[0] + # Fall back to requests' detected encoding if decode fails. + try: + self.raw_html.decode(self.encoding) + except UnicodeDecodeError: + self._encoding = self.default_encoding + return self._encoding if self._encoding else self.default_encoding @@ -196,47 +209,6 @@ """ return self.lxml.text_content() - def next(self, fetch: bool = False) -> _Next: - """Attempts to find the next page, if there is one. If ``fetch`` - is ``True`` (default), returns :class:`HTML <HTML>` object of - next page. If ``fetch`` is ``False``, simply returns the next URL. 
- - """ - - def get_next(): - candidates = self.find('a', containing=('next', 'more', 'older')) - - for candidate in candidates: - if candidate.attrs.get('href'): - # Support 'next' rel (e.g. reddit). - if 'next' in candidate.attrs.get('rel', []): - return candidate.attrs['href'] - - # Support 'next' in classnames. - for _class in candidate.attrs.get('class', []): - if 'next' in _class: - return candidate.attrs['href'] - - if 'page' in candidate.attrs['href']: - return candidate.attrs['href'] - - try: - # Resort to the last candidate. - return candidates[-1].attrs['href'] - except IndexError: - return None - - next = get_next() - if next: - url = self._make_absolute(next) - else: - return None - - if fetch: - return self.session.get(url) - else: - return url - def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find: """Given a CSS Selector, returns a list of :class:`Element <Element>` objects or a single one. @@ -356,7 +328,7 @@ try: href = link.attrs['href'].strip() - if href and not (href.startswith('#') and self.skip_anchors) and not href.startswith('javascript:'): + if href and not (href.startswith('#') and self.skip_anchors) and not href.startswith(('javascript:', 'mailto:')): yield href except KeyError: pass @@ -405,7 +377,7 @@ # Support for <base> tag. base = self.find('base', first=True) if base: - result = base.attrs['href'].strip() + result = base.attrs.get('href', '').strip() if result: return result @@ -413,7 +385,7 @@ parsed = urlparse(self.url)._asdict() # Remove any part of the path after the last '/' - path = '/'.join(parsed['path'].split('/')[:-1]) + parsed['path'] = '/'.join(parsed['path'].split('/')[:-1]) + '/' # Reconstruct the url with the modified path parsed = (v for v in parsed.values()) @@ -468,7 +440,7 @@ :param default_encoding: Which encoding to default to. 
""" - def __init__(self, *, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: + def __init__(self, *, session: Union['HTTPSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: # Convert incoming unicode HTML into bytes. if isinstance(html, str): @@ -481,11 +453,54 @@ url=url, default_encoding=default_encoding ) + self.session = session or HTMLSession() self.page = None + self.next_symbol = DEFAULT_NEXT_SYMBOL def __repr__(self) -> str: return f"<HTML url={self.url!r}>" + def _next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next: + """Attempts to find the next page, if there is one. If ``fetch`` + is ``True`` (default), returns :class:`HTML <HTML>` object of + next page. If ``fetch`` is ``False``, simply returns the next URL. + + """ + + def get_next(): + candidates = self.find('a', containing=next_symbol) + + for candidate in candidates: + if candidate.attrs.get('href'): + # Support 'next' rel (e.g. reddit). + if 'next' in candidate.attrs.get('rel', []): + return candidate.attrs['href'] + + # Support 'next' in classnames. + for _class in candidate.attrs.get('class', []): + if 'next' in _class: + return candidate.attrs['href'] + + if 'page' in candidate.attrs['href']: + return candidate.attrs['href'] + + try: + # Resort to the last candidate. + return candidates[-1].attrs['href'] + except IndexError: + return None + + __next = get_next() + if __next: + url = self._make_absolute(__next) + else: + return None + + if fetch: + return self.session.get(url) + else: + return url + def __iter__(self): next = self @@ -493,14 +508,17 @@ while True: yield next try: - next = next.next(fetch=True).html + next = next._next(fetch=True, next_symbol=self.next_symbol).html except AttributeError: break def __next__(self): - return self.next(fetch=True).html + return self._next(fetch=True, next_symbol=self.next_symbol).html -
| - |
These classes are the main interface to requests_html: class requests_html.HTML(*, session: Union[_ForwardRef('HTTPSession'), _ForwardRef('AsyncHTMLSession')] = None, url: str = 'https://example.org/', html: Union[str, bytes], default_encoding: str = 'utf-8') → None [source]
|