Add add_next_symbol() method, making it possible to append a word to the default next-page symbols

This commit is contained in:
bonfy
2018-03-13 11:11:07 +08:00
parent 32c1c983f9
commit 76f2f6434c
+10 -4
View File
@@ -24,6 +24,7 @@ from w3lib.encoding import html_to_unicode
DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older']
cleaner = Cleaner()
cleaner.javascript = True
@@ -49,6 +50,7 @@ _Containing = Union[str, List[str]]
_Links = Set[str]
_Attrs = MutableMapping
_Next = Union['HTML', List[str]]
_NextSymbol = List[str]
# Sanity checking.
try:
@@ -164,7 +166,7 @@ class BaseParser:
"""
return self.lxml.text_content()
def next(self, fetch: bool = False) -> _Next:
def next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next:
"""Attempts to find the next page, if there is one. If ``fetch``
is ``True`` (default), returns :class:`HTML <HTML>` object of
next page. If ``fetch`` is ``False``, simply returns the next URL.
@@ -172,7 +174,7 @@ class BaseParser:
"""
def get_next():
candidates = self.find('a', containing=('next', 'more', 'older'))
candidates = self.find('a', containing=next_symbol)
for candidate in candidates:
if candidate.attrs.get('href'):
@@ -450,6 +452,7 @@ class HTML(BaseParser):
default_encoding=default_encoding
)
self.page = None
self.next_symbol = DEFAULT_NEXT_SYMBOL
def __repr__(self) -> str:
return f"<HTML url={self.url!r}>"
@@ -461,12 +464,15 @@ class HTML(BaseParser):
while True:
yield next
try:
next = next.next(fetch=True).html
next = next.next(fetch=True, next_symbol=self.next_symbol).html
except AttributeError:
break
def __next__(self):
return self.next(fetch=True).html
return self.next(fetch=True, next_symbol=self.next_symbol).html
def add_next_symbol(self, next_symbol):
    """Register an additional link-text keyword used to locate the next page.

    The keyword is added to this object's ``next_symbol`` list, which is
    passed as the ``containing`` filter when searching ``<a>`` elements
    for a "next page" link.

    :param next_symbol: The word to add (e.g. ``'weiter'``).
    """
    # ``self.next_symbol`` initially aliases the module-level
    # DEFAULT_NEXT_SYMBOL list.  Appending in place would mutate that shared
    # list and leak the new word into every other instance (and into the
    # default argument of ``next``), so build a fresh list and rebind instead.
    self.next_symbol = [*self.next_symbol, next_symbol]
def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0):
"""Reloads the response in Chromium, and replaces HTML content