Better absolute link handling: parse links with urllib.parse and fill in a missing scheme/netloc from the base URL

Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
This commit is contained in:
2018-02-26 08:53:59 -05:00
parent 4b26b44c88
commit 9022c0bea9
+16 -8
View File
@@ -1,3 +1,5 @@
from urllib.parse import urlparse, urlunparse
import requests
from pyquery import PyQuery
@@ -8,6 +10,7 @@ from parse import search as parse_search
from parse import findall
useragent = UserAgent()
@@ -88,6 +91,7 @@ class BaseParser:
"""All found links on page, in as-is form."""
def gen():
for link in self.find('a'):
try:
href = link.attrs['href']
if not href.startswith('#') and self.skip_anchors and href not in ['javascript:;']:
@@ -102,14 +106,18 @@ class BaseParser:
"""All found links on page, in absolute form."""
def gen():
for link in self.links:
# Appears to not be an absolute link.
if ':' not in link:
if link.startswith('/'):
href = '{}{}'.format(self.base_url, link)
else:
href = '{}/{}'.format(self.base_url, link)
else:
href = link
# Parse the link with stdlib.
parsed = urlparse(link)._asdict()
# Appears to be a relative link:
if not parsed['netloc']:
parsed['netloc'] = urlparse(self.base_url).netloc
if not parsed['scheme']:
parsed['scheme'] = urlparse(self.base_url).scheme
# Re-construct URL, with new data.
parsed = (v for v in parsed.values())
href = urlunparse(parsed)
yield href