diff --git a/requests/utils.py b/requests/utils.py
index 37aa19e4..ac5f59d8 100644
--- a/requests/utils.py
+++ b/requests/utils.py
@@ -265,8 +265,12 @@ def get_encodings_from_content(content):
     """
 
     charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
 
-    return charset_re.findall(content)
+    return (charset_re.findall(content) +
+            pragma_re.findall(content) +
+            xml_re.findall(content))
 
 
 def get_encoding_from_headers(headers):
diff --git a/test_requests.py b/test_requests.py
index b6e46591..e62d9237 100755
--- a/test_requests.py
+++ b/test_requests.py
@@ -639,6 +639,50 @@ class RequestsTestCase(unittest.TestCase):
         self.assertEqual(r.url, url)
 
 
+class TestContentEncodingDetection(unittest.TestCase):
+
+    def test_none(self):
+        encodings = requests.utils.get_encodings_from_content('')
+        self.assertEqual(len(encodings), 0)
+
+    def test_html_charset(self):
+        """HTML5 meta charset attribute"""
+        content = '<meta charset="UTF-8">'
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(len(encodings), 1)
+        self.assertEqual(encodings[0], 'UTF-8')
+
+    def test_html4_pragma(self):
+        """HTML4 pragma directive"""
+        content = '<meta http-equiv="Content-type" content="text/html;charset=UTF-8">'
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(len(encodings), 1)
+        self.assertEqual(encodings[0], 'UTF-8')
+
+    def test_xhtml_pragma(self):
+        """XHTML 1.x served with text/html MIME type"""
+        content = '<meta http-equiv="Content-type" content="text/html;charset=UTF-8" />'
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(len(encodings), 1)
+        self.assertEqual(encodings[0], 'UTF-8')
+
+    def test_xml(self):
+        """XHTML 1.x served as XML"""
+        content = '<?xml version="1.0" encoding="UTF-8"?>'
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(len(encodings), 1)
+        self.assertEqual(encodings[0], 'UTF-8')
+
+    def test_precedence(self):
+        content = '''
+        <?xml version="1.0" encoding="XML"?>
+        <meta charset="HTML5">
+        <meta http-equiv="Content-type" content="text/html;charset=HTML4" />
+        '''.strip()
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(encodings, ['HTML5', 'HTML4', 'XML'])
+
+
 class TestCaseInsensitiveDict(unittest.TestCase):
 
     def test_mapping_init(self):