From af97ddd5f18e75f1277b0d6d79ba0f780de9b9dd Mon Sep 17 00:00:00 2001
From: Frost Ming
Date: Sun, 11 Mar 2018 16:26:58 +0800
Subject: [PATCH] Fix bugs related to links

* #121 KeyError of special base tag
* #124 Remove 'mailto:' links out from links
---
 requests_html.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/requests_html.py b/requests_html.py
index fee56bb..2277ce6 100644
--- a/requests_html.py
+++ b/requests_html.py
@@ -294,7 +294,7 @@ class BaseParser:
 
                 try:
                     href = link.attrs['href'].strip()
-                    if href and not (href.startswith('#') and self.skip_anchors) and not href.startswith('javascript:'):
+                    if href and not (href.startswith('#') and self.skip_anchors) and not href.startswith('javascript:') and not href.startswith('mailto:'):
                         yield href
                 except KeyError:
                     pass
@@ -343,7 +343,7 @@ class BaseParser:
         # Support for <base> tag.
         base = self.find('base', first=True)
         if base:
-            result = base.attrs['href'].strip()
+            result = base.attrs.get('href', '').strip()
             if result:
                 return result
 
@@ -351,7 +351,7 @@ class BaseParser:
         parsed = urlparse(self.url)._asdict()
 
         # Remove any part of the path after the last '/'
-        path = '/'.join(parsed['path'].split('/')[:-1])
+        parsed['path'] = '/'.join(parsed['path'].split('/')[:-1]) + '/'
 
         # Reconstruct the url with the modified path
         parsed = (v for v in parsed.values())