diff --git a/requests_html.py b/requests_html.py
index ceaf54c..6203ba4 100644
--- a/requests_html.py
+++ b/requests_html.py
@@ -24,6 +24,7 @@ from w3lib.encoding import html_to_unicode
 DEFAULT_ENCODING = 'utf-8'
 DEFAULT_URL = 'https://example.org/'
 DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
+DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older']
 
 cleaner = Cleaner()
 cleaner.javascript = True
@@ -49,6 +50,7 @@ _Containing = Union[str, List[str]]
 _Links = Set[str]
 _Attrs = MutableMapping
 _Next = Union['HTML', List[str]]
+_NextSymbol = List[str]
 
 # Sanity checking.
 try:
@@ -164,7 +166,7 @@ class BaseParser:
         """
         return self.lxml.text_content()
 
-    def next(self, fetch: bool = False) -> _Next:
+    def next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next:
         """Attempts to find the next page, if there is one. If ``fetch``
         is ``True`` (default), returns :class:`HTML ` object of
         next page. If ``fetch`` is ``False``, simply returns the next URL.
@@ -172,7 +174,7 @@ class BaseParser:
         """
 
         def get_next():
-            candidates = self.find('a', containing=('next', 'more', 'older'))
+            candidates = self.find('a', containing=next_symbol)
 
             for candidate in candidates:
                 if candidate.attrs.get('href'):
@@ -450,6 +452,7 @@ class HTML(BaseParser):
             default_encoding=default_encoding
         )
         self.page = None
+        self.next_symbol = list(DEFAULT_NEXT_SYMBOL)  # copy: add_next_symbol() must not mutate the shared default
 
     def __repr__(self) -> str:
         return f""
@@ -461,12 +464,16 @@ class HTML(BaseParser):
         while True:
             yield next
             try:
-                next = next.next(fetch=True).html
+                next = next.next(fetch=True, next_symbol=self.next_symbol).html
             except AttributeError:
                 break
 
     def __next__(self):
-        return self.next(fetch=True).html
+        return self.next(fetch=True, next_symbol=self.next_symbol).html
+
+    def add_next_symbol(self, next_symbol):
+        """Append ``next_symbol`` to this instance's keywords passed to ``next()``."""
+        self.next_symbol.append(next_symbol)
 
     def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0):
         """Reloads the response in Chromium, and replaces HTML content