Merge pull request #111 from andrewsg/make-absolute

Add tests for absolute_links (_make_absolute and base_url) and make them pass
This commit is contained in:
2018-03-06 19:15:49 -05:00
committed by GitHub
2 changed files with 45 additions and 9 deletions
+20 -9
View File
@@ -1,6 +1,6 @@
import sys
import asyncio
from urllib.parse import urlparse, urlunparse
from urllib.parse import urlparse, urlunparse, urljoin
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures._base import TimeoutError
from functools import partial
@@ -307,15 +307,20 @@ class BaseParser:
# Parse the link with stdlib.
parsed = urlparse(link)._asdict()
# Appears to be a relative link:
# If link is relative, then join it with base_url.
if not parsed['netloc']:
parsed['netloc'] = urlparse(self.base_url).netloc
return urljoin(self.base_url, link)
# Link is absolute; if it lacks a scheme, add one from base_url.
if not parsed['scheme']:
parsed['scheme'] = urlparse(self.base_url).scheme
# Re-construct URL, with new data.
parsed = (v for v in parsed.values())
return urlunparse(parsed)
# Reconstruct the URL to incorporate the new scheme.
parsed = (v for v in parsed.values())
return urlunparse(parsed)
# Link is absolute and complete with scheme; nothing to be done here.
return link
@property
@@ -342,9 +347,15 @@ class BaseParser:
if result:
return result
url = '/'.join(self.url.split('/')[:-1])
if url.endswith('/'):
url = url[:-1]
# Parse the url to separate out the path
parsed = urlparse(self.url)._asdict()
# Remove any part of the path after the last '/'
path = '/'.join(parsed['path'].split('/')[:-1])
# Reconstruct the url with the modified path
parsed = (v for v in parsed.values())
url = urlunparse(parsed)
return url
+25
View File
@@ -137,6 +137,31 @@ def test_anchor_links():
assert '#site-map' in r.html.links
@pytest.mark.ok
@pytest.mark.parametrize('url,link,expected', [
    ('http://example.com/', 'test.html', 'http://example.com/test.html'),
    ('http://example.com', 'test.html', 'http://example.com/test.html'),
    ('http://example.com/foo/', 'test.html', 'http://example.com/foo/test.html'),
    ('http://example.com/foo/bar', 'test.html', 'http://example.com/foo/test.html'),
    ('http://example.com/foo/', '/test.html', 'http://example.com/test.html'),
    ('http://example.com/', 'http://xkcd.com/about/', 'http://xkcd.com/about/'),
    ('http://example.com/', '//xkcd.com/about/', 'http://xkcd.com/about/'),
])
def test_absolute_links(url, link, expected):
    """Check that ``absolute_links`` resolves ``link`` to ``expected``.

    Every parametrized case is exercised twice: once with ``url`` passed
    directly as the document url, and once with ``url`` supplied through a
    ``<base>`` tag while the document url is a different address.
    """
    anchor_markup = """<body><a href='{}'>Next</a></body>""".format(link)
    base_markup = """<head><base href='{}'></head>""".format(url)

    # First pass: no `<base>` tag, so the document url itself is the base.
    document = HTML(html=anchor_markup, url=url)
    assert document.absolute_links.pop() == expected

    # Second pass: the `<base>` tag carries `url`; the document url is
    # something else and should not influence the result.
    document = HTML(
        html=base_markup + anchor_markup,
        url='http://example.com/foobar/',
    )
    assert document.absolute_links.pop() == expected
@pytest.mark.render
def test_render():
r = get()