mirror of
https://github.com/kennethreitz/requests.git
synced 2026-06-05 22:50:18 +00:00
Merge pull request #1575 from jparise/content-encoding
Improved content encoding detection.
This commit is contained in:
+5
-1
@@ -265,8 +265,12 @@ def get_encodings_from_content(content):
|
||||
"""
|
||||
|
||||
charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
|
||||
pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
|
||||
xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
|
||||
|
||||
return charset_re.findall(content)
|
||||
return (charset_re.findall(content) +
|
||||
pragma_re.findall(content) +
|
||||
xml_re.findall(content))
|
||||
|
||||
|
||||
def get_encoding_from_headers(headers):
|
||||
|
||||
@@ -639,6 +639,50 @@ class RequestsTestCase(unittest.TestCase):
|
||||
self.assertEqual(r.url, url)
|
||||
|
||||
|
||||
class TestContentEncodingDetection(unittest.TestCase):
    """Exercise ``requests.utils.get_encodings_from_content`` on small snippets."""

    def _assert_sole_encoding(self, markup, expected):
        """Assert that ``markup`` yields exactly one encoding, equal to ``expected``."""
        found = requests.utils.get_encodings_from_content(markup)
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0], expected)

    def test_none(self):
        """Empty content yields no encodings."""
        found = requests.utils.get_encodings_from_content('')
        self.assertEqual(len(found), 0)

    def test_html_charset(self):
        """HTML5 meta charset attribute"""
        self._assert_sole_encoding('<meta charset="UTF-8">', 'UTF-8')

    def test_html4_pragma(self):
        """HTML4 pragma directive"""
        self._assert_sole_encoding(
            '<meta http-equiv="Content-type" content="text/html;charset=UTF-8">',
            'UTF-8',
        )

    def test_xhtml_pragma(self):
        """XHTML 1.x served with text/html MIME type"""
        self._assert_sole_encoding(
            '<meta http-equiv="Content-type" content="text/html;charset=UTF-8" />',
            'UTF-8',
        )

    def test_xml(self):
        """XHTML 1.x served as XML"""
        self._assert_sole_encoding(
            '<?xml version="1.0" encoding="UTF-8"?>',
            'UTF-8',
        )

    def test_precedence(self):
        """Encodings from <meta> tags are listed before the XML declaration's.

        The XML declaration comes first in the content, yet its encoding is
        expected last in the returned list.
        """
        # .strip() drops the leading newline so the content starts at '<?xml'.
        markup = '''
<?xml version="1.0" encoding="XML"?>
<meta charset="HTML5">
<meta http-equiv="Content-type" content="text/html;charset=HTML4" />
'''.strip()
        found = requests.utils.get_encodings_from_content(markup)
        self.assertEqual(found, ['HTML5', 'HTML4', 'XML'])
|
||||
|
||||
|
||||
class TestCaseInsensitiveDict(unittest.TestCase):
|
||||
|
||||
def test_mapping_init(self):
|
||||
|
||||
Reference in New Issue
Block a user