diff --git a/requests_html.py b/requests_html.py index 1c58930..59b9aeb 100644 --- a/requests_html.py +++ b/requests_html.py @@ -1,3 +1,5 @@ +from urllib.parse import urlparse, urlunparse + import requests from pyquery import PyQuery @@ -8,6 +10,7 @@ from parse import search as parse_search from parse import findall + useragent = UserAgent() @@ -88,6 +91,7 @@ class BaseParser: """All found links on page, in as–is form.""" def gen(): for link in self.find('a'): + try: href = link.attrs['href'] if not href.startswith('#') and self.skip_anchors and href not in ['javascript:;']: @@ -102,14 +106,18 @@ class BaseParser: """All found links on page, in absolute form.""" def gen(): for link in self.links: - # Appears to not be an absolute link. - if ':' not in link: - if link.startswith('/'): - href = '{}{}'.format(self.base_url, link) - else: - href = '{}/{}'.format(self.base_url, link) - else: - href = link + # Parse the link with stdlib. + parsed = urlparse(link)._asdict() + + # Appears to be a relative link: + if not parsed['netloc']: + parsed['netloc'] = urlparse(self.base_url).netloc + if not parsed['scheme']: + parsed['scheme'] = urlparse(self.base_url).scheme + + # Re-construct URL, with new data. + parsed = (v for v in parsed.values()) + href = urlunparse(parsed) yield href