diff --git a/requests_html.py b/requests_html.py
index c451be8..29b9683 100644
--- a/requests_html.py
+++ b/requests_html.py
@@ -10,8 +10,10 @@ from pyquery import PyQuery
from pyquery.pyquery import fromstring
from fake_useragent import UserAgent
+import lxml
from lxml import etree
from lxml.html import HtmlElement
+from lxml.html.soupparser import fromstring as soup_parse
from parse import search as parse_search
from parse import findall, Result
from w3lib.encoding import html_to_unicode
@@ -57,8 +59,14 @@ class BaseParser:
"""
- def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
+ # Fixed attribute set for parser objects; each name is listed exactly once.
+ __slots__ = [
+     'element', 'url', 'skip_anchors', 'default_encoding', '_encoding',
+     '_html', '_lxml', '_pq', 'session'
+ ]
+
+ def __init__(self, *, element, session: 'HTMLSession' = None, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
self.element = element
+ self.session = session or HTMLSession()
self.url = url
self.skip_anchors = True
self.default_encoding = default_encoding
@@ -127,7 +135,10 @@ class BaseParser:
:class:`Element ` or :class:`HTML `.
"""
if self._lxml is None:
- self._lxml = fromstring(self.html, parser='soup')[0]
+ try:
+ self._lxml = soup_parse(self.html, features='html.parser')
+ except ValueError:
+ self._lxml = lxml.html.fromstring(self.html)
return self._lxml
@@ -145,11 +156,51 @@ class BaseParser:
"""
return self.lxml.text_content()
- def find(self, selector: str, first: bool = False, _encoding: str = None) -> _Find:
+ def next(self, fetch=True):
+     """Attempts to find the next page, if there is one.
+
+     :param fetch: If ``True`` (default), return the result of
+         ``self.session.get`` for the next page's URL; if ``False``,
+         return only that absolute URL.
+     :returns: The fetched result or the URL as described above, or
+         ``None`` when no usable next-page link is found.
+     """
+
+     def get_next():
+         candidates = self.find('a', containing=('next', 'more', 'older'))
+
+         for candidate in candidates:
+             href = candidate.attrs.get('href')
+             if not href:
+                 continue
+
+             # Support 'next' rel (e.g. reddit).
+             if 'next' in candidate.attrs.get('rel', []):
+                 return href
+
+             # Support 'next' in classnames.
+             if any('next' in _class for _class in candidate.attrs.get('class', [])):
+                 return href
+
+             if 'page' in href:
+                 return href
+
+         # Resort to the last candidate. ``.get`` is used because that
+         # anchor may have no href at all, which must not raise KeyError.
+         if candidates:
+             return candidates[-1].attrs.get('href')
+         return None
+
+     # Named ``next_href`` so the builtin ``next`` is not shadowed.
+     next_href = get_next()
+     if not next_href:
+         return None
+
+     url = self._make_absolute(next_href)
+     return self.session.get(url) if fetch else url
+
+
+ def find(self, selector: str = "*", containing: Optional[str] = None, first: bool = False, _encoding: str = None) -> _Find:
"""Given a CSS Selector, returns a list of
:class:`Element ` objects or a single one.
:param selector: CSS Selector to use.
+ :param containing: If specified, only return elements that contain the provided text.
:param first: Whether or not to return just the first result.
:param _encoding: The encoding format.
@@ -174,6 +225,16 @@ class BaseParser:
for found in self.pq(selector)
]
+ if containing:
+     # A bare string would otherwise be iterated character by character,
+     # matching almost every element; treat it as a single needle.
+     if isinstance(containing, str):
+         containing = [containing]
+
+     # Case-insensitive match: keep elements whose text includes any needle.
+     needles = [c.lower() for c in containing]
+     matched = []
+     for element in elements:
+         text = element.full_text.lower()
+         if any(needle in text for needle in needles):
+             matched.append(element)
+
+     # Matches are returned in reverse order, as before.
+     matched.reverse()
+     elements = matched
+
return _get_first_or_list(elements, first)
def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath:
@@ -236,6 +297,23 @@ class BaseParser:
return set(gen())
+ def _make_absolute(self, link):
+     """Makes a given link absolute.
+
+     A missing network location and/or scheme is filled in from
+     ``self.base_url``; the other parsed components of ``link`` are
+     passed through unchanged.
+
+     :param link: The link (URL string) to make absolute.
+     :returns: The URL re-assembled with ``urlunparse``.
+     """
+
+     # Parse the link with stdlib.
+     parsed = urlparse(link)
+
+     # Appears to be a relative link: consult base_url once, and only
+     # when a component is actually missing.
+     if not (parsed.netloc and parsed.scheme):
+         base = urlparse(self.base_url)
+         parsed = parsed._replace(
+             scheme=parsed.scheme or base.scheme,
+             netloc=parsed.netloc or base.netloc,
+         )
+
+     # Re-construct URL, with new data.
+     return urlunparse(parsed)
+
+
@property
def absolute_links(self) -> _Links:
"""All found links on page, in absolute form
@@ -244,20 +322,7 @@ class BaseParser:
def gen():
for link in self.links:
- # Parse the link with stdlib.
- parsed = urlparse(link)._asdict()
-
- # Appears to be a relative link:
- if not parsed['netloc']:
- parsed['netloc'] = urlparse(self.base_url).netloc
- if not parsed['scheme']:
- parsed['scheme'] = urlparse(self.base_url).scheme
-
- # Re-construct URL, with new data.
- parsed = (v for v in parsed.values())
- href = urlunparse(parsed)
-
- yield href
+ yield self._make_absolute(link)
return set(gen())
@@ -269,7 +334,9 @@ class BaseParser:
# Support for tag.
base = self.find('base', first=True)
if base:
- return base.attrs['href'].strip()
+ # A <base> tag may carry no href (e.g. only ``target``); use an empty
+ # default so the URL-derived fallback below is used instead of raising.
+ result = base.attrs.get('href', '').strip()
+ if result:
+     return result
url = '/'.join(self.url.split('/')[:-1])
if url.endswith('/'):
@@ -286,6 +353,8 @@ class Element(BaseParser):
:param default_encoding: Which encoding to default to.
"""
+ # Every slot is already declared on BaseParser and inherited from it;
+ # re-listing the same names here would only create duplicate descriptors.
+ __slots__ = ()
+
def __init__(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None) -> None:
super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding)
self.element = element
@@ -301,8 +370,8 @@ class Element(BaseParser):
"""
attrs = {k: v for k, v in self.element.items()}
- # Split class up, as there are ussually many of them:
- for attr in ['class']:
+ # Split class and rel up, as there are usually many of them:
+ for attr in ['class', 'rel']:
if attr in attrs:
attrs[attr] = tuple(attrs[attr].split())
@@ -334,6 +403,17 @@ class HTML(BaseParser):
def __repr__(self) -> str:
return f""
+ def __iter__(self):
+     """Yield this page, then each page reached by following :meth:`next`.
+
+     Iteration stops when looking up ``.html`` on the fetched result
+     raises ``AttributeError`` (for example when ``next`` returned ``None``).
+     """
+
+     current = self
+
+     while True:
+         yield current
+         try:
+             response = current.next(fetch=True)
+             current = response.html
+         except AttributeError:
+             return
+
+
def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0):
"""Reloads the response in Chromium, and replaces HTML content
with an updated version, with JavaScript executed.