Better absolute link handling: fill in missing scheme/netloc via urllib.parse

Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
2026-06-05 23:00:20 +00:00 · 2018-02-26 08:53:59 -05:00
parent 4b26b44c88
commit 9022c0bea9
1 changed file with 16 additions and 8 deletions
@@ -1,3 +1,5 @@
+from urllib.parse import urlparse, urlunparse
+
 import requests
 from pyquery import PyQuery

@@ -8,6 +10,7 @@ from parse import search as parse_search
 from parse import findall


+
 useragent = UserAgent()


@@ -88,6 +91,7 @@ class BaseParser:
        """All found links on page, in as–is form."""
        def gen():
            for link in self.find('a'):
+
                try:
                    href = link.attrs['href']
                    if not href.startswith('#') and self.skip_anchors and href not in ['javascript:;']:
@@ -102,14 +106,18 @@ class BaseParser:
        """All found links on page, in absolute form."""
        def gen():
            for link in self.links:
-                # Appears to not be an absolute link.
-                if ':' not in link:
-                    if link.startswith('/'):
-                        href = '{}{}'.format(self.base_url, link)
-                    else:
-                        href = '{}/{}'.format(self.base_url, link)
-                else:
-                    href = link
+                # Parse the link with stdlib.
+                parsed = urlparse(link)._asdict()
+
+                # Appears to be a relative link:
+                if not parsed['netloc']:
+                    parsed['netloc'] = urlparse(self.base_url).netloc
+                if not parsed['scheme']:
+                    parsed['scheme'] = urlparse(self.base_url).scheme
+
+                # Re-construct URL, with new data.
+                parsed = (v for v in parsed.values())
+                href = urlunparse(parsed)

                yield href