From b0bd9783923e702ef964c35e0366e2ed2924e067 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Wed, 28 Feb 2018 19:27:41 -0500 Subject: [PATCH 1/3] Don't initialize the UserAgent object until one is requested This avoids hitting external servers to get a plausible user agent string when the module is imported. When the first user agent string is requested the object is initialized as usual. --- requests_html.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index d6ddecd..7964625 100644 --- a/requests_html.py +++ b/requests_html.py @@ -19,7 +19,7 @@ from w3lib.encoding import html_to_unicode DEFAULT_ENCODING = 'utf-8' DEFAULT_URL = 'https://example.org/' -useragent = UserAgent() +useragent = None # Typing. _Find = Union[List['Element'], 'Element'] @@ -431,6 +431,9 @@ def user_agent(style='chrome') -> _UserAgent: """Returns a random user-agent, if not requested one of a specific style. Defaults to a Chrome-style User-Agent. """ + global useragent + if not useragent: + useragent = UserAgent() return useragent[style] if style else useragent.random From 9ed0bac87b1edaed3da31374679f3714269787ac Mon Sep 17 00:00:00 2001 From: alxia Date: Wed, 28 Feb 2018 22:45:10 -0500 Subject: [PATCH 2/3] UserWarning fix fixed UserWarning by passing default **bsargs to lxml --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index 7964625..82a72c2 100644 --- a/requests_html.py +++ b/requests_html.py @@ -117,7 +117,7 @@ class BaseParser: """`lxml `_ representation of the :class:`Element ` or :class:`HTML `. """ - return soup_parse(self.html) + return soup_parse(self.html, features='html.parser') @property def text(self) -> _Text: From 64e67d46ff802766c5ef531aa0cb99dc9db9de4b Mon Sep 17 00:00:00 2001 From: camper42 Date: Thu, 1 Mar 2018 20:34:56 +0800 Subject: [PATCH 3/3] ignore javascript links ignore anything startswith `javascript:` in href attr --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index 7964625..bd41ba8 100644 --- a/requests_html.py +++ b/requests_html.py @@ -212,7 +212,7 @@ class BaseParser: try: href = link.attrs['href'].strip() - if href and not (href.startswith('#') and self.skip_anchors and href in ['javascript:;']): + if href and not (href.startswith('#') and self.skip_anchors) and not href.startswith('javascript:'): yield href except KeyError: pass