From 05ff6e87ca7206a241d3f1ba7ff49ebf7cda7217 Mon Sep 17 00:00:00 2001 From: Angus Dippenaar Date: Sat, 7 Apr 2018 17:15:51 +0200 Subject: [PATCH] Replace errors when decoding raw_html Some websites serve bytes that are invalid for their encoding, even when the encoding is specified. I'm not 100% sure that replacing "bad" bytes is the correct way to fix the problem, but it seems to fix the issues I've run into with some sites. --- requests_html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..6cdab39 100644 --- a/requests_html.py +++ b/requests_html.py @@ -102,7 +102,7 @@ class BaseParser: (`learn more `_). """ if self._html: - return self.raw_html.decode(self.encoding) + return self.raw_html.decode(self.encoding, errors='replace') else: return etree.tostring(self.element, encoding='unicode').strip() @@ -128,7 +128,7 @@ class BaseParser: self._encoding = html_to_unicode(self.default_encoding, self._html)[0] # Fall back to requests' detected encoding if decode fails. try: - self.raw_html.decode(self.encoding) + self.raw_html.decode(self.encoding, errors='replace') except UnicodeDecodeError: self._encoding = self.default_encoding