mirror of
https://github.com/kennethreitz/requests-html.git
synced 2026-06-05 23:00:20 +00:00
better absolute link handling
Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
This commit is contained in:
+16
-8
@@ -1,3 +1,5 @@
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
import requests
|
||||
from pyquery import PyQuery
|
||||
|
||||
@@ -8,6 +10,7 @@ from parse import search as parse_search
|
||||
from parse import findall
|
||||
|
||||
|
||||
|
||||
useragent = UserAgent()
|
||||
|
||||
|
||||
@@ -88,6 +91,7 @@ class BaseParser:
|
||||
"""All found links on page, in as–is form."""
|
||||
def gen():
|
||||
for link in self.find('a'):
|
||||
|
||||
try:
|
||||
href = link.attrs['href']
|
||||
if not href.startswith('#') and self.skip_anchors and href not in ['javascript:;']:
|
||||
@@ -102,14 +106,18 @@ class BaseParser:
|
||||
"""All found links on page, in absolute form."""
|
||||
def gen():
|
||||
for link in self.links:
|
||||
# Appears to not be an absolute link.
|
||||
if ':' not in link:
|
||||
if link.startswith('/'):
|
||||
href = '{}{}'.format(self.base_url, link)
|
||||
else:
|
||||
href = '{}/{}'.format(self.base_url, link)
|
||||
else:
|
||||
href = link
|
||||
# Parse the link with stdlib.
|
||||
parsed = urlparse(link)._asdict()
|
||||
|
||||
# Appears to be a relative link:
|
||||
if not parsed['netloc']:
|
||||
parsed['netloc'] = urlparse(self.base_url).netloc
|
||||
if not parsed['scheme']:
|
||||
parsed['scheme'] = urlparse(self.base_url).scheme
|
||||
|
||||
# Re-construct URL, with new data.
|
||||
parsed = (v for v in parsed.values())
|
||||
href = urlunparse(parsed)
|
||||
|
||||
yield href
|
||||
|
||||
|
||||
Reference in New Issue
Block a user