mirror of
https://github.com/kennethreitz/requests-html.git
synced 2026-06-05 14:50:20 +00:00
Move next method form BaseParser to HTML class
This commit is contained in:
+46
-46
@@ -70,9 +70,8 @@ class BaseParser:
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *, element, session: 'HTTPSession' = None, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
|
||||
def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
|
||||
self.element = element
|
||||
self.session = session or HTMLSession()
|
||||
self.url = url
|
||||
self.skip_anchors = True
|
||||
self.default_encoding = default_encoding
|
||||
@@ -166,47 +165,6 @@ class BaseParser:
|
||||
"""
|
||||
return self.lxml.text_content()
|
||||
|
||||
def next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next:
|
||||
"""Attempts to find the next page, if there is one. If ``fetch``
|
||||
is ``True`` (default), returns :class:`HTML <HTML>` object of
|
||||
next page. If ``fetch`` is ``False``, simply returns the next URL.
|
||||
|
||||
"""
|
||||
|
||||
def get_next():
|
||||
candidates = self.find('a', containing=next_symbol)
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.attrs.get('href'):
|
||||
# Support 'next' rel (e.g. reddit).
|
||||
if 'next' in candidate.attrs.get('rel', []):
|
||||
return candidate.attrs['href']
|
||||
|
||||
# Support 'next' in classnames.
|
||||
for _class in candidate.attrs.get('class', []):
|
||||
if 'next' in _class:
|
||||
return candidate.attrs['href']
|
||||
|
||||
if 'page' in candidate.attrs['href']:
|
||||
return candidate.attrs['href']
|
||||
|
||||
try:
|
||||
# Resort to the last candidate.
|
||||
return candidates[-1].attrs['href']
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
next = get_next()
|
||||
if next:
|
||||
url = self._make_absolute(next)
|
||||
else:
|
||||
return None
|
||||
|
||||
if fetch:
|
||||
return self.session.get(url)
|
||||
else:
|
||||
return url
|
||||
|
||||
def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find:
|
||||
"""Given a CSS Selector, returns a list of
|
||||
:class:`Element <Element>` objects or a single one.
|
||||
@@ -438,7 +396,7 @@ class HTML(BaseParser):
|
||||
:param default_encoding: Which encoding to default to.
|
||||
"""
|
||||
|
||||
def __init__(self, *, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None:
|
||||
def __init__(self, *, session: 'HTTPSession' = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None:
|
||||
|
||||
# Convert incoming unicode HTML into bytes.
|
||||
if isinstance(html, str):
|
||||
@@ -451,12 +409,54 @@ class HTML(BaseParser):
|
||||
url=url,
|
||||
default_encoding=default_encoding
|
||||
)
|
||||
self.session = session or HTMLSession()
|
||||
self.page = None
|
||||
self.next_symbol = DEFAULT_NEXT_SYMBOL
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<HTML url={self.url!r}>"
|
||||
|
||||
def _next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next:
|
||||
"""Attempts to find the next page, if there is one. If ``fetch``
|
||||
is ``True`` (default), returns :class:`HTML <HTML>` object of
|
||||
next page. If ``fetch`` is ``False``, simply returns the next URL.
|
||||
|
||||
"""
|
||||
|
||||
def get_next():
|
||||
candidates = self.find('a', containing=next_symbol)
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.attrs.get('href'):
|
||||
# Support 'next' rel (e.g. reddit).
|
||||
if 'next' in candidate.attrs.get('rel', []):
|
||||
return candidate.attrs['href']
|
||||
|
||||
# Support 'next' in classnames.
|
||||
for _class in candidate.attrs.get('class', []):
|
||||
if 'next' in _class:
|
||||
return candidate.attrs['href']
|
||||
|
||||
if 'page' in candidate.attrs['href']:
|
||||
return candidate.attrs['href']
|
||||
|
||||
try:
|
||||
# Resort to the last candidate.
|
||||
return candidates[-1].attrs['href']
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
__next = get_next()
|
||||
if __next:
|
||||
url = self._make_absolute(__next)
|
||||
else:
|
||||
return None
|
||||
|
||||
if fetch:
|
||||
return self.session.get(url)
|
||||
else:
|
||||
return url
|
||||
|
||||
def __iter__(self):
|
||||
|
||||
next = self
|
||||
@@ -464,12 +464,12 @@ class HTML(BaseParser):
|
||||
while True:
|
||||
yield next
|
||||
try:
|
||||
next = next.next(fetch=True, next_symbol=self.next_symbol).html
|
||||
next = next._next(fetch=True, next_symbol=self.next_symbol).html
|
||||
except AttributeError:
|
||||
break
|
||||
|
||||
def __next__(self):
|
||||
return self.next(fetch=True, next_symbol=self.next_symbol).html
|
||||
return self._next(fetch=True, next_symbol=self.next_symbol).html
|
||||
|
||||
def add_next_symbol(self, next_symbol):
|
||||
self.next_symbol.append(next_symbol)
|
||||
|
||||
Reference in New Issue
Block a user