Replace errors when decoding raw_html

Some websites serve bytes that are not valid in their encoding, even when the encoding is explicitly specified. I'm not 100% sure that replacing the "bad" bytes is the correct way to fix the problem, but it does fix the issues I've run into with some sites.
This commit is contained in:
Angus Dippenaar
2018-04-07 17:15:51 +02:00
committed by GitHub
parent c59480bf15
commit 05ff6e87ca
+2 -2
View File
@@ -102,7 +102,7 @@ class BaseParser:
(`learn more <http://www.diveintopython3.net/strings.html>`_).
"""
if self._html:
return self.raw_html.decode(self.encoding)
return self.raw_html.decode(self.encoding, errors='replace')
else:
return etree.tostring(self.element, encoding='unicode').strip()
@@ -128,7 +128,7 @@ class BaseParser:
self._encoding = html_to_unicode(self.default_encoding, self._html)[0]
# Fall back to requests' detected encoding if decode fails.
try:
self.raw_html.decode(self.encoding)
self.raw_html.decode(self.encoding, errors='replace')
except UnicodeDecodeError:
self._encoding = self.default_encoding