From 9022c0bea96435a19a2c395db7fff50296e6b794 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Mon, 26 Feb 2018 08:53:59 -0500 Subject: [PATCH] better absolute link handling Signed-off-by: Kenneth Reitz --- requests_html.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/requests_html.py b/requests_html.py index 1c58930..59b9aeb 100644 --- a/requests_html.py +++ b/requests_html.py @@ -1,3 +1,5 @@ +from urllib.parse import urlparse, urlunparse + import requests from pyquery import PyQuery @@ -8,6 +10,7 @@ from parse import search as parse_search from parse import findall + useragent = UserAgent() @@ -88,6 +91,7 @@ class BaseParser: """All found links on page, in as–is form.""" def gen(): for link in self.find('a'): + try: href = link.attrs['href'] if not href.startswith('#') and self.skip_anchors and href not in ['javascript:;']: @@ -102,14 +106,18 @@ class BaseParser: """All found links on page, in absolute form.""" def gen(): for link in self.links: - # Appears to not be an absolute link. - if ':' not in link: - if link.startswith('/'): - href = '{}{}'.format(self.base_url, link) - else: - href = '{}/{}'.format(self.base_url, link) - else: - href = link + # Parse the link with stdlib. + parsed = urlparse(link)._asdict() + + # Appears to be a relative link: + if not parsed['netloc']: + parsed['netloc'] = urlparse(self.base_url).netloc + if not parsed['scheme']: + parsed['scheme'] = urlparse(self.base_url).scheme + + # Re-construct URL, with new data. + parsed = (v for v in parsed.values()) + href = urlunparse(parsed) yield href