Replace errors when decoding raw_html

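Some websites serve bytes that are not valid in their encoding, even when the encoding is specified explicitly. I'm not 100% sure that replacing the "bad" bytes is the correct way to fix the problem, but it does fix the issues I've run into with some sites. Note that with errors='replace' the decode call no longer raises UnicodeDecodeError on undecodable bytes, so the existing `except UnicodeDecodeError` fallback to `default_encoding` should no longer trigger for such input.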
2026-06-05 23:00:20 +00:00 · 2018-04-07 17:15:51 +02:00
parent c59480bf15
commit 05ff6e87ca
1 changed files with 2 additions and 2 deletions
@@ -102,7 +102,7 @@ class BaseParser:
        (`learn more <http://www.diveintopython3.net/strings.html>`_).
        """
        if self._html:
-            return self.raw_html.decode(self.encoding)
+            return self.raw_html.decode(self.encoding, errors='replace')
        else:
            return etree.tostring(self.element, encoding='unicode').strip()

@@ -128,7 +128,7 @@ class BaseParser:
            self._encoding = html_to_unicode(self.default_encoding, self._html)[0]
            # Fall back to requests' detected encoding if decode fails.
            try:
-                self.raw_html.decode(self.encoding)
+                self.raw_html.decode(self.encoding, errors='replace')
            except UnicodeDecodeError:
                self._encoding = self.default_encoding