mirror of
https://github.com/kennethreitz/requests-html.git
synced 2026-06-05 14:50:20 +00:00
Merge pull request #111 from andrewsg/make-absolute
Add tests for absolute_links (_make_absolute and base_url) and make them pass
This commit is contained in:
+20
-9
@@ -1,6 +1,6 @@
|
||||
import sys
|
||||
import asyncio
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from urllib.parse import urlparse, urlunparse, urljoin
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from concurrent.futures._base import TimeoutError
|
||||
from functools import partial
|
||||
@@ -307,15 +307,20 @@ class BaseParser:
|
||||
# Parse the link with stdlib.
|
||||
parsed = urlparse(link)._asdict()
|
||||
|
||||
# Appears to be a relative link:
|
||||
# If link is relative, then join it with base_url.
|
||||
if not parsed['netloc']:
|
||||
parsed['netloc'] = urlparse(self.base_url).netloc
|
||||
return urljoin(self.base_url, link)
|
||||
|
||||
# Link is absolute; if it lacks a scheme, add one from base_url.
|
||||
if not parsed['scheme']:
|
||||
parsed['scheme'] = urlparse(self.base_url).scheme
|
||||
|
||||
# Re-construct URL, with new data.
|
||||
parsed = (v for v in parsed.values())
|
||||
return urlunparse(parsed)
|
||||
# Reconstruct the URL to incorporate the new scheme.
|
||||
parsed = (v for v in parsed.values())
|
||||
return urlunparse(parsed)
|
||||
|
||||
# Link is absolute and complete with scheme; nothing to be done here.
|
||||
return link
|
||||
|
||||
|
||||
@property
|
||||
@@ -342,9 +347,15 @@ class BaseParser:
|
||||
if result:
|
||||
return result
|
||||
|
||||
url = '/'.join(self.url.split('/')[:-1])
|
||||
if url.endswith('/'):
|
||||
url = url[:-1]
|
||||
# Parse the url to separate out the path
|
||||
parsed = urlparse(self.url)._asdict()
|
||||
|
||||
# Remove any part of the path after the last '/'
|
||||
path = '/'.join(parsed['path'].split('/')[:-1])
|
||||
|
||||
# Reconstruct the url with the modified path
|
||||
parsed = (v for v in parsed.values())
|
||||
url = urlunparse(parsed)
|
||||
|
||||
return url
|
||||
|
||||
|
||||
@@ -137,6 +137,31 @@ def test_anchor_links():
|
||||
assert '#site-map' in r.html.links
|
||||
|
||||
|
||||
@pytest.mark.ok
@pytest.mark.parametrize('url,link,expected', [
    ('http://example.com/', 'test.html', 'http://example.com/test.html'),
    ('http://example.com', 'test.html', 'http://example.com/test.html'),
    ('http://example.com/foo/', 'test.html', 'http://example.com/foo/test.html'),
    ('http://example.com/foo/bar', 'test.html', 'http://example.com/foo/test.html'),
    ('http://example.com/foo/', '/test.html', 'http://example.com/test.html'),
    ('http://example.com/', 'http://xkcd.com/about/', 'http://xkcd.com/about/'),
    ('http://example.com/', '//xkcd.com/about/', 'http://xkcd.com/about/'),
])
def test_absolute_links(url, link, expected):
    """Check that the single anchor in a document resolves to ``expected``.

    Every parametrized case is exercised twice: first with ``url`` passed
    as the document URL, then with ``url`` supplied through a ``<base>``
    tag while the document URL is a different address.
    """
    anchor_markup = "<body><a href='{}'>Next</a></body>".format(link)
    base_markup = "<head><base href='{}'></head>".format(url)

    # Without a `<base>` tag, the document URL is the base.
    document = HTML(html=anchor_markup, url=url)
    resolved = document.absolute_links
    assert resolved.pop() == expected

    # With a `<base>` tag, the document URL is some other address.
    document = HTML(
        html=base_markup + anchor_markup,
        url='http://example.com/foobar/',
    )
    resolved = document.absolute_links
    assert resolved.pop() == expected
|
||||
|
||||
|
||||
@pytest.mark.render
|
||||
def test_render():
|
||||
r = get()
|
||||
|
||||
Reference in New Issue
Block a user