Fix bugs related to links

* #121 Fix KeyError raised when a <base> tag has no 'href' attribute
* #124 Exclude 'mailto:' links from the extracted links
This commit is contained in:
Frost Ming
2018-03-11 16:26:58 +08:00
parent c0680d3f41
commit af97ddd5f1
+3 -3
View File
@@ -294,7 +294,7 @@ class BaseParser:
try:
href = link.attrs['href'].strip()
if href and not (href.startswith('#') and self.skip_anchors) and not href.startswith('javascript:'):
if href and not (href.startswith('#') and self.skip_anchors) and not href.startswith('javascript:') and not href.startswith('mailto:'):
yield href
except KeyError:
pass
@@ -343,7 +343,7 @@ class BaseParser:
# Support for <base> tag.
base = self.find('base', first=True)
if base:
result = base.attrs['href'].strip()
result = base.attrs.get('href', '').strip()
if result:
return result
@@ -351,7 +351,7 @@ class BaseParser:
parsed = urlparse(self.url)._asdict()
# Remove any part of the path after the last '/'
path = '/'.join(parsed['path'].split('/')[:-1])
parsed['path'] = '/'.join(parsed['path'].split('/')[:-1]) + '/'
# Reconstruct the url with the modified path
parsed = (v for v in parsed.values())