Use a JSON-specific encoding detection when no encoding has been specified.

JSON *must* be encoded using UTF-8, UTF-16 or UTF-32 (see the [RFC][1]); detect the encoding based on the fact that JSON always starts with 2 ASCII characters.

[1]: http://tools.ietf.org/html/rfc4627#section-3
This commit is contained in:
Martijn Pieters
2012-10-25 17:43:52 +02:00
parent 6e0ad1eca5
commit 4decc7986e
3 changed files with 102 additions and 1 deletions
+14 -1
View File
@@ -31,7 +31,8 @@ from .exceptions import (
from .utils import (
get_encoding_from_headers, stream_untransfer, guess_filename, requote_uri,
stream_decode_response_unicode, get_netrc_auth, get_environ_proxies,
to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices)
to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices,
guess_json_utf)
from .compat import (
cookielib, urlparse, urlunparse, urljoin, urlsplit, urlencode, str, bytes,
StringIO, is_py2, chardet, json, builtin_str)
@@ -842,6 +843,18 @@ class Response(object):
@property
def json(self):
"""Returns the json-encoded content of a response, if any."""
if not self.encoding and len(self.content) > 3:
# No encoding set. JSON RFC 4627 section 3 states we should expect
# UTF-8, -16 or -32. Detect which one to use; If the detection or
# decoding fails, fall back to `self.text` (using chardet to make
# a best guess).
encoding = guess_json_utf(self.content)
if encoding is not None:
try:
return json.loads(self.content.decode(encoding))
except (ValueError, UnicodeDecodeError):
pass
try:
return json.loads(self.text or self.content)
except ValueError:
+35
View File
@@ -579,3 +579,38 @@ def parse_header_links(value):
links.append(link)
return links
# Null bytes; no need to recreate these on each call to guess_json_utf
_null = '\x00'.encode('ascii')  # encoding to ASCII for Python 3
_null2 = _null * 2
_null3 = _null * 3


def guess_json_utf(data):
    """Guess the UTF encoding of a JSON byte string.

    Only the first four bytes of ``data`` are inspected. A byte order
    mark (BOM) is checked first; otherwise the encoding is derived from
    the number and position of null bytes in that sample.

    :param data: the raw (undecoded) bytes of a JSON document.
    :returns: one of ``'utf-8'``, ``'utf-8-sig'``, ``'utf-16'``,
        ``'utf-16-be'``, ``'utf-16-le'``, ``'utf-32'``, ``'utf-32-be'``
        or ``'utf-32-le'``; ``None`` when the null-byte pattern does not
        match any of these encodings.
    """
    # JSON always starts with two ASCII characters, so detection is as
    # easy as counting the nulls and from their location and count
    # determine the encoding. Also detect a BOM, if present.
    sample = data[:4]

    # The UTF-32 BOMs must be tested before the UTF-16 ones: the UTF-32-LE
    # BOM starts with the same two bytes as the UTF-16-LE BOM.
    # NOTE: codecs.BOM32_BE is a legacy alias for the 2-byte UTF-16 BE BOM
    # and can never equal a 4-byte sample; BOM_UTF32_BE is the right name.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'  # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'  # BOM included

    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:  # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
+53
View File
@@ -0,0 +1,53 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import unittest
import random
# Path hack.
sys.path.insert(0, os.path.abspath('..'))
import requests.utils
from requests.compat import is_py3, bytes
class GuessJSONUTFTests(unittest.TestCase):
    """Tests for the encoding detection in requests.utils.guess_json_utf."""

    # Every encoding name guess_json_utf is expected to be able to return.
    codecs = (
        'utf-8', 'utf-8-sig',
        'utf-16', 'utf-16-le', 'utf-16-be',
        'utf-32', 'utf-32-le', 'utf-32-be'
    )

    def test_guess_encoding(self):
        # Throw 4-character ASCII strings (encoded to a UTF encoding)
        # at the guess routine; it should correctly guess all codecs.
        #
        # Look the Python 2 builtin up by name instead of going through
        # `__builtins__`: that attribute is a module in `__main__` but a
        # dict in imported modules, so attribute access on it breaks when
        # this file is loaded by a test runner. The conditional expression
        # never evaluates `unichr` on Python 3, where it does not exist.
        to_unichr = chr if is_py3 else unichr  # noqa: F821
        guess = requests.utils.guess_json_utf
        for c in range(33, 127):  # printable only
            sample = to_unichr(c) * 4
            for codec in self.codecs:
                res = guess(sample.encode(codec))
                self.assertEqual(res, codec)

    def test_smoke_encoding(self):
        # Throw random 4-byte strings at the guess function.
        # Any guess for a UTF encoding is verified, a decode exception
        # is a test failure.
        #
        # Build a single byte from an int: bytes([c]) on Python 3, the
        # plain builtin chr() on Python 2 (where str is the byte type).
        to_byte = (lambda c: bytes([c])) if is_py3 else chr
        guess = requests.utils.guess_json_utf
        for _attempt in range(1000):
            sample = bytes().join(
                [to_byte(random.randrange(256)) for _ in range(4)])
            res = guess(sample)
            if res is not None and res != 'utf-8':
                # This should decode without errors if this is *really*
                # something in this encoding. Skip UTF-8, it is more
                # picky about valid data.
                sample.decode(res)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()