mirror of
https://github.com/kennethreitz/requests-html.git
synced 2026-06-05 23:00:20 +00:00
Replace errors when decoding raw_html
Some websites serve bytes that are not valid in their declared encoding, so strictly decoding raw_html raises UnicodeDecodeError. I'm not 100% sure that replacing the "bad" bytes is the correct way to fix the problem, but it seems to fix the issues I've run into with some sites.
This commit is contained in:
+2
-2
@@ -102,7 +102,7 @@ class BaseParser:
|
||||
(`learn more <http://www.diveintopython3.net/strings.html>`_).
|
||||
"""
|
||||
if self._html:
|
||||
return self.raw_html.decode(self.encoding)
|
||||
return self.raw_html.decode(self.encoding, errors='replace')
|
||||
else:
|
||||
return etree.tostring(self.element, encoding='unicode').strip()
|
||||
|
||||
@@ -128,7 +128,7 @@ class BaseParser:
|
||||
self._encoding = html_to_unicode(self.default_encoding, self._html)[0]
|
||||
# Fall back to requests' detected encoding if decode fails.
|
||||
try:
|
||||
self.raw_html.decode(self.encoding)
|
||||
self.raw_html.decode(self.encoding, errors='replace')
|
||||
except UnicodeDecodeError:
|
||||
self._encoding = self.default_encoding
|
||||
|
||||
|
||||
Reference in New Issue
Block a user