mirror of
https://github.com/kennethreitz/requests.git
synced 2026-06-05 22:50:18 +00:00
Merge pull request #909 from mjpieters/issue765-json-encodings
Use a JSON-specific encoding detection when no encoding has been specified
This commit is contained in:
@@ -115,3 +115,4 @@ Patches and Suggestions
|
||||
- Rhys Elsmore
|
||||
- André Graf (dergraf)
|
||||
- Stephen Zhuang (everbird)
|
||||
- Martijn Pieters
|
||||
|
||||
+14
-1
@@ -31,7 +31,8 @@ from .exceptions import (
|
||||
from .utils import (
|
||||
get_encoding_from_headers, stream_untransfer, guess_filename, requote_uri,
|
||||
stream_decode_response_unicode, get_netrc_auth, get_environ_proxies,
|
||||
to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices)
|
||||
to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices,
|
||||
guess_json_utf)
|
||||
from .compat import (
|
||||
cookielib, urlparse, urlunparse, urljoin, urlsplit, urlencode, str, bytes,
|
||||
StringIO, is_py2, chardet, json, builtin_str, urldefrag)
|
||||
@@ -844,6 +845,18 @@ class Response(object):
|
||||
@property
|
||||
def json(self):
|
||||
"""Returns the json-encoded content of a response, if any."""
|
||||
|
||||
if not self.encoding and len(self.content) > 3:
|
||||
# No encoding set. JSON RFC 4627 section 3 states we should expect
|
||||
# UTF-8, -16 or -32. Detect which one to use; If the detection or
|
||||
# decoding fails, fall back to `self.text` (using chardet to make
|
||||
# a best guess).
|
||||
encoding = guess_json_utf(self.content)
|
||||
if encoding is not None:
|
||||
try:
|
||||
return json.loads(self.content.decode(encoding))
|
||||
except (ValueError, UnicodeDecodeError):
|
||||
pass
|
||||
try:
|
||||
return json.loads(self.text or self.content)
|
||||
except ValueError:
|
||||
|
||||
@@ -579,3 +579,38 @@ def parse_header_links(value):
|
||||
links.append(link)
|
||||
|
||||
return links
|
||||
|
||||
|
||||
# Null bytes; no need to recreate these on each call to guess_json_utf
_null = '\x00'.encode('ascii')  # encoding to ASCII for Python 3
_null2 = _null * 2
_null3 = _null * 3


def guess_json_utf(data):
    """Guess the UTF codec of a JSON byte string from its first four bytes.

    :param data: the raw (undecoded) bytes of a JSON document.
    :returns: one of ``'utf-8'``, ``'utf-8-sig'``, ``'utf-16'``,
        ``'utf-16-le'``, ``'utf-16-be'``, ``'utf-32'``, ``'utf-32-le'``,
        ``'utf-32-be'``, or ``None`` when the null-byte pattern matches
        none of them.
    """
    # JSON always starts with two ASCII characters, so detection is as
    # easy as counting the nulls and from their location and count
    # determine the encoding. Also detect a BOM, if present.
    sample = data[:4]

    # The UTF-32 BOMs must be tested before the UTF-16 ones: the UTF-32 LE
    # BOM starts with the UTF-16 LE BOM. Note that ``codecs.BOM32_BE`` is a
    # legacy alias for the *UTF-16* BOM, so the explicit UTF-32 names are
    # used here.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included

    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import codecs
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
import random
|
||||
|
||||
# Path hack.
|
||||
sys.path.insert(0, os.path.abspath('..'))
|
||||
import requests.utils
|
||||
from requests.compat import is_py3, bytes
|
||||
|
||||
|
||||
# Python 2/3 shims used by the tests below: ``unichr`` turns a code point
# into a text character, ``byteschr`` turns an integer into a one-byte string.
if not is_py3:
    # Python 2 already has a builtin ``unichr`` and ``chr`` returns bytes.
    byteschr = chr
else:
    unichr = chr
    byteschr = lambda c: bytes([c])
|
||||
|
||||
|
||||
class GuessJSONUTFTests(unittest.TestCase):
    """Tests for the JSON UTF encoding guessing code."""

    # Every codec name that guess_json_utf can return.
    codecs = (
        'utf-8', 'utf-8-sig',
        'utf-16', 'utf-16-le', 'utf-16-be',
        'utf-32', 'utf-32-le', 'utf-32-be'
    )

    def test_guess_encoding(self):
        """Each codec is recognised from four encoded ASCII characters."""
        # Throw 4-character ASCII strings (encoded to a UTF encoding)
        # at the guess routine; it should correctly guess all codecs.
        guess = requests.utils.guess_json_utf
        for c in range(33, 127):  # printable only
            sample = unichr(c) * 4
            for codec in self.codecs:
                res = guess(sample.encode(codec))
                self.assertEqual(res, codec)

    def test_smoke_encoding(self):
        """Random 4-byte input never yields an unexplained decode failure."""
        # Throw random 4-byte strings at the guess function.
        # Any guess for a UTF encoding is verified, a decode exception
        # is a test failure.
        guess = requests.utils.guess_json_utf
        for i in range(1000):
            sample = bytes().join(
                [byteschr(random.randrange(256)) for _ in range(4)])
            res = guess(sample)
            if res is not None:
                # This should decode without errors if this is *really*
                # something in this encoding. However, UTF-8 is a lot
                # more picky, so we expect errors there. UTF-16 surrogate
                # pairs also fail
                try:
                    sample.decode(res)
                except UnicodeDecodeError as e:
                    self.assertEqual(e.args[0].replace('-', '').lower(),
                                     res.replace('-', '').lower())
                    if res == 'utf-8':
                        # Membership must be tested explicitly: passing the
                        # tuple as the second argument of assertTrue only
                        # sets the failure message and checks nothing.
                        # 'unexpected end of data' covers a sample that ends
                        # in the middle of a multi-byte sequence.
                        self.assertTrue(e.args[-1] in (
                            'invalid continuation byte',
                            'invalid start byte',
                            'unexpected end of data'), e.args[-1])
                        continue
                    if res == 'utf-16':
                        self.assertEqual(e.args[-1], 'unexpected end of data')
                        self.assertTrue(sample[:2] in (
                            codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE))
                        # the second two bytes are in the range \ud800-\udfff
                        # if someone wants to add tests for that as well. I don't
                        # see the need; we are not testing UTF decoding here.
                        continue
                    # Any other decode failure means the guess was wrong.
                    raise
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user