From 9022c0bea96435a19a2c395db7fff50296e6b794 Mon Sep 17 00:00:00 2001
From: Kenneth Reitz <me@kennethreitz.org>
Date: Mon, 26 Feb 2018 08:53:59 -0500
Subject: [PATCH] better absolute link handling

Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
---
 requests_html.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/requests_html.py b/requests_html.py
index 1c58930..59b9aeb 100644
--- a/requests_html.py
+++ b/requests_html.py
@@ -1,3 +1,5 @@
+from urllib.parse import urlparse, urlunparse
+
 import requests
 from pyquery import PyQuery
 
@@ -8,6 +10,7 @@ from parse import search as parse_search
 from parse import findall
 
 
+
 useragent = UserAgent()
 
 
@@ -88,6 +91,7 @@ class BaseParser:
         """All found links on page, in as–is form."""
         def gen():
             for link in self.find('a'):
+
                 try:
                     href = link.attrs['href']
                     if not href.startswith('#') and self.skip_anchors and href not in ['javascript:;']:
@@ -102,14 +106,18 @@ class BaseParser:
         """All found links on page, in absolute form."""
         def gen():
             for link in self.links:
-                # Appears to not be an absolute link.
-                if ':' not in link:
-                    if link.startswith('/'):
-                        href = '{}{}'.format(self.base_url, link)
-                    else:
-                        href = '{}/{}'.format(self.base_url, link)
-                else:
-                    href = link
+                # Parse the link with stdlib.
+                parsed = urlparse(link)._asdict()
+
+                # Relative or scheme-less link: fill the missing parts from base_url.
+                if not parsed['netloc']:
+                    parsed['netloc'] = urlparse(self.base_url).netloc
+                if not parsed['scheme']:
+                    parsed['scheme'] = urlparse(self.base_url).scheme
+
+                # Re-construct URL, with new data.
+                parsed = (v for v in parsed.values())
+                href = urlunparse(parsed)
 
                 yield href