From 14da46f03d193012c750fd5e6b180900d1a91f3e Mon Sep 17 00:00:00 2001 From: Andrew Gorcester Date: Tue, 6 Mar 2018 16:06:23 -0800 Subject: [PATCH 1/3] Add tests for ._make_absolute() and make them pass. --- requests_html.py | 29 ++++++++++++++++++++--------- tests/test_requests_html.py | 25 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/requests_html.py b/requests_html.py index 9c14ff5..fee56bb 100644 --- a/requests_html.py +++ b/requests_html.py @@ -1,6 +1,6 @@ import sys import asyncio -from urllib.parse import urlparse, urlunparse +from urllib.parse import urlparse, urlunparse, urljoin from concurrent.futures import ThreadPoolExecutor from concurrent.futures._base import TimeoutError from functools import partial @@ -307,15 +307,20 @@ class BaseParser: # Parse the link with stdlib. parsed = urlparse(link)._asdict() - # Appears to be a relative link: + # If link is relative, then join it with base_url. if not parsed['netloc']: - parsed['netloc'] = urlparse(self.base_url).netloc + return urljoin(self.base_url, link) + + # Link is absolute; if it lacks a scheme, add one from base_url. if not parsed['scheme']: parsed['scheme'] = urlparse(self.base_url).scheme - # Re-construct URL, with new data. - parsed = (v for v in parsed.values()) - return urlunparse(parsed) + # Reconstruct the URL to incorporate the new scheme. + parsed = (v for v in parsed.values()) + return urlunparse(parsed) + + # Link is absolute and complete with scheme; nothing to be done here. + return link @property @@ -342,9 +347,15 @@ class BaseParser: if result: return result - url = '/'.join(self.url.split('/')[:-1]) - if url.endswith('/'): - url = url[:-1] + # Parse the url to separate out the path + parsed = urlparse(self.url)._asdict() + + # Remove any part of the path after the last '/' + path = '/'.join(parsed['path'].split('/')[:-1]) + + # Reconstruct the url with the modified path + parsed = (v for v in parsed.values()) + url = urlunparse(parsed) return url diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 5f35aa7..a2cd4e9 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -137,6 +137,31 @@ def test_anchor_links(): assert '#site-map' in r.html.links +@pytest.mark.ok +@pytest.mark.parametrize('url,link,expected', [ + ('http://example.com/', 'test.html', 'http://example.com/test.html'), + ('http://example.com', 'test.html', 'http://example.com/test.html'), + ('http://example.com/foo/', 'test.html', 'http://example.com/foo/test.html'), + ('http://example.com/foo/bar', 'test.html', 'http://example.com/foo/test.html'), + ('http://example.com/foo/', '/test.html', 'http://example.com/test.html'), + ('http://example.com/', 'http://xkcd.com/about/', 'http://xkcd.com/about/'), + ('http://example.com/', '//xkcd.com/about/', 'http://xkcd.com/about/'), +]) +def test_absolute_links(url, link, expected): + head_template = """""" + body_template = """Next""" + + # Test without `` tag (url is base) + html = HTML(html=body_template.format(link), url=url) + assert html.absolute_links.pop() == expected + + # Test with `` tag (url is other) + html = HTML( + html=head_template.format(url) + body_template.format(link), + url='http://example.com/foobar/') + assert html.absolute_links.pop() == expected + + @pytest.mark.render def test_render(): r = get() From 016054fc8323257916943548d60eb366a60420e9 Mon Sep 17 00:00:00 2001 From: John Ludwig Date: Wed, 7 Mar 2018 18:46:03 -0600 Subject: [PATCH 2/3] Change website to Amazon Smile --- tests/test_internet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_internet.py b/tests/test_internet.py index 05cfff5..19527bb 100644 --- a/tests/test_internet.py +++ b/tests/test_internet.py @@ -6,7 +6,7 @@ def test_pagination(): pages = ( 'https://xkcd.com/1957/', 'https://reddit.com/', - 'https://pornhub.com/', + 'https://smile.amazon.com/', 'https://theverge.com/archives' ) From fcaac85af9e6387a91efb6c4f79a62f0fc9c8bf1 Mon Sep 17 00:00:00 2001 From: bkcsfi Date: Thu, 8 Mar 2018 12:33:06 -0500 Subject: [PATCH 3/3] fix typo in index.rst --- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index ef794b3..4f51fe7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -144,7 +144,7 @@ XPath is also supported (`learn more >> r.html.xpath('a') [] -You can also select only elements containing certian text: +You can also select only elements containing certain text: .. code-block:: pycon