Merge pull request #111 from andrewsg/make-absolute

Add tests for absolute_links (_make_absolute and base_url) and make them pass
2026-06-05 14:50:20 +00:00 · 2018-03-06 19:15:49 -05:00
parent 89c001a02e 14da46f03d
commit 0ab4a53ca9
2 changed files with 45 additions and 9 deletions
@@ -1,6 +1,6 @@
 import sys
 import asyncio
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import urlparse, urlunparse, urljoin
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures._base import TimeoutError
 from functools import partial
@@ -307,15 +307,20 @@ class BaseParser:
        # Parse the link with stdlib.
        parsed = urlparse(link)._asdict()

-        # Appears to be a relative link:
+        # If link is relative, then join it with base_url.
        if not parsed['netloc']:
-            parsed['netloc'] = urlparse(self.base_url).netloc
+            return urljoin(self.base_url, link)
+
+        # Link is absolute; if it lacks a scheme, add one from base_url.
        if not parsed['scheme']:
            parsed['scheme'] = urlparse(self.base_url).scheme

-        # Re-construct URL, with new data.
-        parsed = (v for v in parsed.values())
-        return urlunparse(parsed)
+            # Reconstruct the URL to incorporate the new scheme.
+            parsed = (v for v in parsed.values())
+            return urlunparse(parsed)
+
+        # Link is absolute and complete with scheme; nothing to be done here.
+        return link


    @property
@@ -342,9 +347,15 @@ class BaseParser:
            if result:
                return result

-        url = '/'.join(self.url.split('/')[:-1])
-        if url.endswith('/'):
-            url = url[:-1]
+        # Split the page url into components so only the path is touched.
+        parsed = urlparse(self.url)._asdict()
+
+        # Keep the path up to and including its last '/', so that urljoin treats it as a directory.
+        parsed['path'] = '/'.join(parsed['path'].split('/')[:-1]) + '/'
+
+        # Reconstruct the url with the trimmed path (other components are unchanged).
+        parsed = (v for v in parsed.values())
+        url = urlunparse(parsed)

        return url

@@ -137,6 +137,31 @@ def test_anchor_links():
    assert '#site-map' in r.html.links


+@pytest.mark.ok
+@pytest.mark.parametrize('url,link,expected', [
+    ('http://example.com/', 'test.html', 'http://example.com/test.html'),
+    ('http://example.com', 'test.html', 'http://example.com/test.html'),
+    ('http://example.com/foo/', 'test.html', 'http://example.com/foo/test.html'),
+    ('http://example.com/foo/bar', 'test.html', 'http://example.com/foo/test.html'),
+    ('http://example.com/foo/', '/test.html', 'http://example.com/test.html'),
+    ('http://example.com/', 'http://xkcd.com/about/', 'http://xkcd.com/about/'),
+    ('http://example.com/', '//xkcd.com/about/', 'http://xkcd.com/about/'),
+])
+def test_absolute_links(url, link, expected):
+    head_template = """<head><base href='{}'></head>"""
+    body_template = """<body><a href='{}'>Next</a></body>"""
+
+    # Case 1: no `<base>` tag, so `url` is passed as the document's own url.
+    html = HTML(html=body_template.format(link), url=url)
+    assert html.absolute_links.pop() == expected
+
+    # Case 2: `url` is the `<base href>`; the document url differs, yet the result must match.
+    html = HTML(
+        html=head_template.format(url) + body_template.format(link),
+        url='http://example.com/foobar/')
+    assert html.absolute_links.pop() == expected
+
+
@pytest.mark.render
 def test_render():
    r = get()