From 21ce89ea4d384def792aad324f2170af0defeef1 Mon Sep 17 00:00:00 2001 From: Chyroc Date: Wed, 28 Feb 2018 20:08:54 +0800 Subject: [PATCH] clean and format code --- requests_html.py | 62 ++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/requests_html.py b/requests_html.py index ee2f6b2..0745edb 100644 --- a/requests_html.py +++ b/requests_html.py @@ -15,14 +15,12 @@ from parse import search as parse_search from parse import findall, Result from w3lib.encoding import html_to_unicode - DEFAULT_ENCODING = 'utf-8' DEFAULT_URL = 'https://example.org/' useragent = UserAgent() - class BaseParser: """A basic HTML/Element Parser, for Humans.""" @@ -114,13 +112,7 @@ class BaseParser: for found in self.pq(selector) ] - if first: - try: - return elements[0] - except IndexError: - return None - else: - return elements + return _get_first_or_list(elements, first) def xpath(self, selector: str, first: bool = False, _encoding: str = None): """Given an XPath selector, returns a list of :class:`Element ` objects. @@ -129,13 +121,8 @@ class BaseParser: If ``first`` is ``True``, only returns the first :class:`Element ` found.""" c = [Element(element=e, url=self.url, default_encoding=_encoding or self.encoding) for e in self.lxml.xpath(selector)] - if first: - try: - return c[0] - except IndexError: - return None - else: - return c + + return _get_first_or_list(c, first) def search(self, template: str) -> Result: """Searches the :class:`Element ` for the given parse template.""" @@ -150,14 +137,14 @@ class BaseParser: @property def links(self) -> Set[str]: """All found links on page, in as–is form.""" + def gen(): for link in self.find('a'): try: href = link.attrs['href'].strip() - if not(href.startswith('#') and self.skip_anchors) and href not in ['javascript:;']: - if href: - yield href + if href and not (href.startswith('#') and self.skip_anchors and href in ['javascript:;']): + yield href except KeyError: pass @@ -168,6 +155,7 @@ class BaseParser: """All found links on page, in absolute form (`learn more `_). """ + def gen(): for link in self.links: # Parse the link with stdlib. @@ -197,12 +185,11 @@ class BaseParser: if base: return base.attrs['href'].strip() - else: - url = '/'.join(self.url.split('/')[:-1]) - if url.endswith('/'): - url = url[:-1] + url = '/'.join(self.url.split('/')[:-1]) + if url.endswith('/'): + url = url[:-1] - return url + return url class Element(BaseParser): @@ -213,10 +200,7 @@ class Element(BaseParser): self.element = element def __repr__(self) -> str: - attrs = [] - for attr in self.attrs: - attrs.append('{}={}'.format(attr, repr(self.attrs[attr]))) - + attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs] return "".format(repr(self.element.tag), ' '.join(attrs)) @property @@ -289,6 +273,7 @@ class HTML(BaseParser): Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). """ + async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int): try: browser = pyppeteer.launch(headless=True) @@ -344,10 +329,9 @@ class HTMLResponse(requests.Response): @property def html(self) -> HTML: - if self._html: - return self._html + if not self._html: + self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding) - self._html = HTML(url=self.url, html=self.content, default_encoding=self.encoding) return self._html @classmethod @@ -362,10 +346,17 @@ def user_agent(style='chrome') -> str: style. Defaults to a Chrome-style User-Agent. """ - if not style: - return useragent.random + return useragent[style] if style else useragent.random + + +def _get_first_or_list(l, first=True): + if first: + try: + return l[0] + except IndexError: + return None else: - return useragent[style] + return l class HTMLSession(requests.Session): @@ -395,6 +386,5 @@ class HTMLSession(requests.Session): def request(self, *args, **kwargs) -> HTMLResponse: # Convert Request object into HTTPRequest object. r = super(HTMLSession, self).request(*args, **kwargs) - html_r = HTMLResponse._from_response(r) - return html_r + return HTMLResponse._from_response(r)