diff --git a/requests_html.py b/requests_html.py index 7311667..3b2794c 100644 --- a/requests_html.py +++ b/requests_html.py @@ -16,7 +16,6 @@ from parse import search as parse_search from parse import findall, Result from w3lib.encoding import html_to_unicode - DEFAULT_ENCODING = 'utf-8' DEFAULT_URL = 'https://example.org/' @@ -45,7 +44,6 @@ try: except AssertionError: raise RuntimeError('Requests-HTML requires Python 3.6+!') - class BaseParser: """A basic HTML/Element Parser, for Humans. @@ -151,13 +149,7 @@ class BaseParser: for found in self.pq(selector) ] - if first: - try: - return elements[0] - except IndexError: - return None - else: - return elements + return _get_first_or_list(elements, first) def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath: """Given an XPath selector, returns a list of @@ -185,13 +177,7 @@ class BaseParser: element = str(selection) c.append(element) - if first: - try: - return c[0] - except IndexError: - return None - else: - return c + return _get_first_or_list(c, first) def search(self, template: str) -> Result: """Searches the :class:`Element ` for the given Parse template. @@ -212,14 +198,14 @@ class BaseParser: @property def links(self) -> _Links: """All found links on page, in as–is form.""" + def gen(): for link in self.find('a'): try: href = link.attrs['href'].strip() - if not(href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']: - if href: - yield href + if href and not (href.startswith('#') and self.skip_anchors and href in ['javascript:;']): + yield href except KeyError: pass @@ -230,6 +216,7 @@ class BaseParser: """All found links on page, in absolute form (`learn more `_). """ + def gen(): for link in self.links: # Parse the link with stdlib. @@ -259,12 +246,11 @@ class BaseParser: if base: return base.attrs['href'].strip() - else: - url = '/'.join(self.url.split('/')[:-1]) - if url.endswith('/'): - url = url[:-1] + url = '/'.join(self.url.split('/')[:-1]) + if url.endswith('/'): + url = url[:-1] - return url + return url class Element(BaseParser): @@ -280,10 +266,7 @@ class Element(BaseParser): self.element = element def __repr__(self) -> str: - attrs = [] - for attr in self.attrs: - attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) - + attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs] return "".format(repr(self.element.tag), ' '.join(attrs)) @property @@ -424,10 +407,9 @@ class HTMLResponse(requests.Response): @property def html(self) -> HTML: - if self._html: - return self._html + if not self._html: + self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding) - self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding) return self._html @classmethod @@ -442,10 +424,17 @@ def user_agent(style='chrome') -> _UserAgent: style. Defaults to a Chrome-style User-Agent. """ - if not style: - return useragent.random + return useragent[style] if style else useragent.random + + +def _get_first_or_list(l, first=True): + if first: + try: + return l[0] + except IndexError: + return None else: - return useragent[style] + return l class HTMLSession(requests.Session): @@ -478,6 +467,5 @@ class HTMLSession(requests.Session): """ # Convert Request object into HTTPRequest object. r = super(HTMLSession, self).request(*args, **kwargs) - html_r = HTMLResponse._from_response(r) - return html_r + return HTMLResponse._from_response(r)