diff --git a/AUTHORS.rst b/AUTHORS.rst index 9a7f25d5..3c074d1b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -115,3 +115,4 @@ Patches and Suggestions - Rhys Elsmore - André Graf (dergraf) - Stephen Zhuang (everbird) +- Martijn Pieters diff --git a/requests/models.py b/requests/models.py index a87a988b..f3d7d768 100644 --- a/requests/models.py +++ b/requests/models.py @@ -31,7 +31,8 @@ from .exceptions import ( from .utils import ( get_encoding_from_headers, stream_untransfer, guess_filename, requote_uri, stream_decode_response_unicode, get_netrc_auth, get_environ_proxies, - to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices) + to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices, + guess_json_utf) from .compat import ( cookielib, urlparse, urlunparse, urljoin, urlsplit, urlencode, str, bytes, StringIO, is_py2, chardet, json, builtin_str, urldefrag) @@ -844,6 +845,18 @@ class Response(object): @property def json(self): """Returns the json-encoded content of a response, if any.""" + + if not self.encoding and len(self.content) > 3: + # No encoding set. JSON RFC 4627 section 3 states we should expect + # UTF-8, -16 or -32. Detect which one to use; If the detection or + # decoding fails, fall back to `self.text` (using chardet to make + # a best guess). 
# Null bytes; no need to recreate these on each call to guess_json_utf
_null = '\x00'.encode('ascii')  # encoding to ASCII for Python 3
_null2 = _null * 2
_null3 = _null * 3


def guess_json_utf(data):
    """Guess which UTF codec a JSON byte string is encoded with.

    Only the first four bytes of ``data`` are inspected. A byte order mark
    is honoured when present; otherwise the position and number of null
    bytes decide, relying on JSON text starting with two ASCII characters
    (RFC 4627, section 3).

    :param data: the raw (undecoded) JSON payload as a byte string.
    :returns: a codec name -- one of ``'utf-8'``, ``'utf-8-sig'``,
        ``'utf-16'``, ``'utf-16-be'``, ``'utf-16-le'``, ``'utf-32'``,
        ``'utf-32-be'``, ``'utf-32-le'`` -- or ``None`` when the null
        pattern does not match any of them.
    """
    sample = data[:4]

    # The UTF-32 BOMs must be tested before the UTF-16 ones: the UTF-32-LE
    # BOM (FF FE 00 00) begins with the UTF-16-LE BOM (FF FE).
    # NOTE: codecs.BOM32_BE is an alias of the two-byte BOM_UTF16_BE, so the
    # four-byte big-endian mark has to be spelled BOM_UTF32_BE.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included

    # No BOM: count the nulls and look at where they sit.
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
import codecs
import os
import random
import sys
import unittest

# Path hack.
sys.path.insert(0, os.path.abspath('..'))
import requests.utils
from requests.compat import is_py3, bytes


# Python 2/3 shims: build a one-character text string / one-byte byte string
# from an integer.
if is_py3:
    unichr = chr
    byteschr = lambda c: bytes([c])
else:
    byteschr = chr


class GuessJSONUTFTests(unittest.TestCase):
    """Tests for the JSON UTF encoding guessing code."""

    codecs = (
        'utf-8', 'utf-8-sig',
        'utf-16', 'utf-16-le', 'utf-16-be',
        'utf-32', 'utf-32-le', 'utf-32-be'
    )

    def test_guess_encoding(self):
        """Every supported codec is recognised for 4-character ASCII input."""
        # Throw 4-character ASCII strings (encoded to a UTF encoding)
        # at the guess routine; it should correctly guess all codecs.
        guess = requests.utils.guess_json_utf
        for c in range(33, 127):  # printable only
            sample = unichr(c) * 4
            for codec in self.codecs:
                res = guess(sample.encode(codec))
                self.assertEqual(res, codec)

    def test_smoke_encoding(self):
        """Random 4-byte input never yields a guess that fails unexpectedly."""
        # Throw random 4-byte strings at the guess function.
        # Any guess for a UTF encoding is verified, a decode exception
        # is a test failure.
        guess = requests.utils.guess_json_utf
        for i in range(1000):
            sample = bytes().join(
                [byteschr(random.randrange(256)) for _ in range(4)])
            res = guess(sample)
            if res is not None:
                # This should decode without errors if this is *really*
                # something in this encoding. However, UTF-8 is a lot
                # more picky, so we expect errors there. UTF-16 surrogate
                # pairs also fail
                try:
                    sample.decode(res)
                except UnicodeDecodeError as e:
                    self.assertEqual(e.args[0].replace('-', '').lower(),
                                     res.replace('-', '').lower())
                    if res == 'utf-8':
                        # A real membership check: assertTrue(x, seq) would
                        # treat the tuple as the failure message and always
                        # pass. A sample cut off mid-sequence is reported
                        # as 'unexpected end of data', so accept that too.
                        # The inputs are random, so report the sample.
                        self.assertTrue(
                            e.args[-1] in (
                                'invalid continuation byte',
                                'invalid start byte',
                                'unexpected end of data'),
                            'unexpected UTF-8 decode error %r for sample %r'
                            % (e.args[-1], sample))
                        continue
                    if res == 'utf-16':
                        self.assertEqual(e.args[-1], 'unexpected end of data')
                        self.assertTrue(sample[:2] in (
                            codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE))
                        # the second two bytes are in the range \ud800-\udfff
                        # if someone wants to add tests for that as well. I don't
                        # see the need; we are not testing UTF decoding here.
                        continue
                    raise


if __name__ == '__main__':
    unittest.main()