From 4decc7986e32bb8f3511df3dd0c9b1c1d57453c1 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Thu, 25 Oct 2012 17:43:52 +0200 Subject: [PATCH 1/6] Use a JSON-specific encoding detection when no encoding has been specified. JSON *must* be encoded using UTF-8, UTF-16 or UTF-32 (see the [RFC][1]); detect the encoding based on the fact that JSON always starts with 2 ASCII characters. [1]: http://tools.ietf.org/html/rfc4627#section-3 --- requests/models.py | 15 ++++++++++++- requests/utils.py | 35 ++++++++++++++++++++++++++++++ tests/test_utils.py | 53 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 tests/test_utils.py diff --git a/requests/models.py b/requests/models.py index c19d3cca..06c8a71a 100644 --- a/requests/models.py +++ b/requests/models.py @@ -31,7 +31,8 @@ from .exceptions import ( from .utils import ( get_encoding_from_headers, stream_untransfer, guess_filename, requote_uri, stream_decode_response_unicode, get_netrc_auth, get_environ_proxies, - to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices) + to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices, + guess_json_utf) from .compat import ( cookielib, urlparse, urlunparse, urljoin, urlsplit, urlencode, str, bytes, StringIO, is_py2, chardet, json, builtin_str) @@ -842,6 +843,18 @@ class Response(object): @property def json(self): """Returns the json-encoded content of a response, if any.""" + + if not self.encoding and len(self.content) > 3: + # No encoding set. JSON RFC 4627 section 3 states we should expect + # UTF-8, -16 or -32. Detect which one to use; If the detection or + # decoding fails, fall back to `self.text` (using chardet to make + # a best guess). 
+ encoding = guess_json_utf(self.content) + if encoding is not None: + try: + return json.loads(self.content.decode(encoding)) + except (ValueError, UnicodeDecodeError): + pass try: return json.loads(self.text or self.content) except ValueError: diff --git a/requests/utils.py b/requests/utils.py index 7e9f6315..ec9f4d2c 100644 --- a/requests/utils.py +++ b/requests/utils.py @@ -579,3 +579,38 @@ def parse_header_links(value): links.append(link) return links + + +# Null bytes; no need to recreate these on each call to guess_json_utf +_null = '\x00'.encode('ascii') # encoding to ASCII for Python 3 +_null2 = _null * 2 +_null3 = _null * 3 + + +def guess_json_utf(data): + # JSON always starts with two ASCII characters, so detection is as + # easy as counting the nulls and from their location and count + # determine the encoding. Also detect a BOM, if present. + sample = data[:4] + if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): + return 'utf-32' # BOM included + if sample[:3] == codecs.BOM_UTF8: + return 'utf-8-sig' # BOM included, MS style (discouraged) + if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): + return 'utf-16' # BOM included + nullcount = sample.count(_null) + if nullcount == 0: + return 'utf-8' + if nullcount == 2: + if sample[::2] == _null2: # 1st and 3rd are null + return 'utf-16-be' + if sample[1::2] == _null2: # 2nd and 4th are null + return 'utf-16-le' + # Did not detect 2 valid UTF-16 ascii-range characters + if nullcount == 3: + if sample[:3] == _null3: + return 'utf-32-be' + if sample[1:] == _null3: + return 'utf-32-le' + # Did not detect a valid UTF-32 ascii-range character + return None diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..27fa18e4 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys +import unittest +import random + +# Path hack. 
+sys.path.insert(0, os.path.abspath('..')) +import requests.utils +from requests.compat import is_py3, bytes + + +class GuessJSONUTFTests(unittest.TestCase): + """Smoke test for https functionality.""" + + codecs = ( + 'utf-8', 'utf-8-sig', + 'utf-16', 'utf-16-le', 'utf-16-be', + 'utf-32', 'utf-32-le', 'utf-32-be' + ) + + def test_guess_encoding(self): + # Throw 4-character ASCII strings (encoded to a UTF encoding) + # at the guess routine; it should correctly guess all codecs. + unichr = chr if is_py3 else __builtins__.unichr + guess = requests.utils.guess_json_utf + for c in range(33, 127): # printable only + sample = unichr(c) * 4 + for codec in self.codecs: + res = guess(sample.encode(codec)) + self.assertEqual(res, codec) + + def test_smoke_encoding(self): + # Throw random 4-byte strings at the guess function. + # Any guess for a UTF encoding is verified, a decode exception + # is a test failure. + chr = (lambda c: bytes([c])) if is_py3 else __builtins__.chr + guess = requests.utils.guess_json_utf + for i in range(1000): + sample = bytes().join( + [chr(random.randrange(256)) for _ in range(4)]) + res = guess(sample) + if res is not None and res != 'utf-8': + # This should decode without errors if this is *really* + # something in this encoding. Skip UTF-8, it is more + # picky about valid data. + sample.decode(res) + + +if __name__ == '__main__': + unittest.main() From 9832bd89172fb1200e8fdfb2e1cc334ad0fcf332 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Thu, 25 Oct 2012 17:56:19 +0200 Subject: [PATCH 2/6] Correct a c&p mistake: set a correct docstring for the unit test class. 
--- tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 27fa18e4..dabb579e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -13,7 +13,7 @@ from requests.compat import is_py3, bytes class GuessJSONUTFTests(unittest.TestCase): - """Smoke test for https functionality.""" + """Tests for the JSON UTF encoding guessing code.""" codecs = ( 'utf-8', 'utf-8-sig', From 25f2806f23d2a88920a3558f8bc4f798a115d014 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Thu, 25 Oct 2012 18:12:55 +0200 Subject: [PATCH 3/6] Cheek: insert myself into AUTHORS.rst. --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 9a7f25d5..3c074d1b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -115,3 +115,4 @@ Patches and Suggestions - Rhys Elsmore - André Graf (dergraf) - Stephen Zhuang (everbird) +- Martijn Pieters From a4be9a2578dcb8f0862cc80f1e37243edfc04bd7 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Thu, 25 Oct 2012 18:22:07 +0200 Subject: [PATCH 4/6] Redefine the `unichr` and bytes-variant of `chr` at module level. Needed to appease Travis; its Python 2.6 and 2.7 builds are weird and the `__builtins__` dict is not following CPython conventions. --- tests/test_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index dabb579e..1122fb37 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -12,6 +12,11 @@ import requests.utils from requests.compat import is_py3, bytes +if is_py3: + unichr = chr + chr = lambda c: bytes([c]) + + class GuessJSONUTFTests(unittest.TestCase): """Tests for the JSON UTF encoding guessing code.""" @@ -24,7 +29,6 @@ class GuessJSONUTFTests(unittest.TestCase): def test_guess_encoding(self): # Throw 4-character ASCII strings (encoded to a UTF encoding) # at the guess routine; it should correctly guess all codecs. 
- unichr = chr if is_py3 else __builtins__.unichr guess = requests.utils.guess_json_utf for c in range(33, 127): # printable only sample = unichr(c) * 4 @@ -36,7 +40,6 @@ class GuessJSONUTFTests(unittest.TestCase): # Throw random 4-byte strings at the guess function. # Any guess for a UTF encoding is verified, a decode exception # is a test failure. - chr = (lambda c: bytes([c])) if is_py3 else __builtins__.chr guess = requests.utils.guess_json_utf for i in range(1000): sample = bytes().join( From be01a35ef12c7e71c0e71c4e37d1f1a392a66fd8 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Thu, 25 Oct 2012 18:27:21 +0200 Subject: [PATCH 5/6] Better not call it `chr`, rename to `byteschr`. --- tests/test_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 1122fb37..c0560ec1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -14,7 +14,9 @@ from requests.compat import is_py3, bytes if is_py3: unichr = chr - chr = lambda c: bytes([c]) + byteschr = lambda c: bytes([c]) +else: + byteschr = chr class GuessJSONUTFTests(unittest.TestCase): @@ -43,7 +45,7 @@ class GuessJSONUTFTests(unittest.TestCase): guess = requests.utils.guess_json_utf for i in range(1000): sample = bytes().join( - [chr(random.randrange(256)) for _ in range(4)]) + [byteschr(random.randrange(256)) for _ in range(4)]) res = guess(sample) if res is not None and res != 'utf-8': # This should decode without errors if this is *really* From e26ccb34eb1c0d3948bfd9e50ffe333605ae554d Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Fri, 26 Oct 2012 12:15:27 +0200 Subject: [PATCH 6/6] Fix the smoke test in the face of UTF-16 surrogate pairs. If the random data starts with a UTF-16 BOM *and* the next two bytes are for a character in the `\ud800`-`\udfff` range decoding would fail. Small chance, but still possible. Extend it to check the UTF-8 error as well. 
The goal is to test that the guesser was *mostly* correct and, in the cases where it wasn't, to verify that the failure was to be expected. Most of all that the function doesn't buckle under wildly unexpected data. --- tests/test_utils.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index c0560ec1..5cd0684e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import codecs import os import sys import unittest @@ -47,12 +48,30 @@ class GuessJSONUTFTests(unittest.TestCase): sample = bytes().join( [byteschr(random.randrange(256)) for _ in range(4)]) res = guess(sample) - if res is not None and res != 'utf-8': + if res is not None: # This should decode without errors if this is *really* - # something in this encoding. Skip UTF-8, it is more - # picky about valid data. - sample.decode(res) - + # something in this encoding. However, UTF-8 is a lot + # more picky, so we expect errors there. UTF-16 surrogate + # pairs also fail + try: + sample.decode(res) + except UnicodeDecodeError as e: + self.assertEqual(e.args[0].replace('-', '').lower(), + res.replace('-', '').lower()) + if res == 'utf-8': + self.assertTrue(e.args[-1], ( + 'invalid continuation byte', + 'invalid start byte')) + continue + if res == 'utf-16': + self.assertEqual(e.args[-1], 'unexpected end of data') + self.assertTrue(sample[:2] in ( + codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)) + # the second two bytes are in the range \ud800-\udfff + # if someone wants to add tests for that as well. I don't + # see the need; we are not testing UTF decoding here. + continue + raise if __name__ == '__main__': unittest.main()