From 329efcae6e93e992582f0f3e01f4a7ed31b54979 Mon Sep 17 00:00:00 2001 From: Matt Giuca Date: Tue, 14 Feb 2012 11:47:14 +1100 Subject: [PATCH 1/7] Added test case for quoted and illegal characters in the URL. This is a test case for issue #369. It passes in Python 2, but fails in Python 3 (though the test suite doesn't actually work in Python 3, it can be shown to fail with some modification). --- test_requests.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test_requests.py b/test_requests.py index eda9f791..4f9275c8 100644 --- a/test_requests.py +++ b/test_requests.py @@ -117,6 +117,28 @@ class RequestsTestSuite(TestSetup, unittest.TestCase): requests.get(url=httpbin('get'), headers=heads) + def test_session_with_escaped_url(self): + # Test a URL that contains percent-escaped characters + # This URL should not be modified (double-escaped) + # Tests: + # - Quoted illegal characters ("%20" (' '), "%3C" ('<'), "%3E" ('>')) + # - Quoted reserved characters ("%25" ('%'), "%23" ('#'), "%2F" ('/')) + # - Quoted non-ASCII characters ("%C3%98", "%C3%A5") + path_fully_escaped = '%3Ca%25b%23c%2Fd%3E/%C3%98%20%C3%A5' + url = httpbin('get/' + path_fully_escaped) + response = get(url) + self.assertEqual(response.url, httpbin('get/' + path_fully_escaped)) + + # Test that illegal characters in a path get properly percent-escaped + # Tests: + # - Bare illegal characters (space, '<') + # - Bare non-ASCII characters ('\u00d8') + path = u' Date: Tue, 14 Feb 2012 11:50:02 +1100 Subject: [PATCH 2/7] Fixed URI re-encoding on Python 3 (Issue #369). Request.full_url now performs requoting of the path (like it does in Python 2). Request.path_url no longer quotes the already-quoted path (double quoting). Fixed utils.requote_path so it works properly in Python 3. --- requests/models.py | 5 +---- requests/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/requests/models.py b/requests/models.py index b72ac64c..dcc51b09 100644 --- a/requests/models.py +++ b/requests/models.py @@ -324,7 +324,7 @@ class Request(object): if isinstance(path, str): path = path.encode('utf-8') - path = requote_path(path) + path = requote_path(path) url = (urlunparse([ scheme, netloc, path, params, query, fragment ])) @@ -352,9 +352,6 @@ class Request(object): if not path: path = '/' - if is_py3: - path = quote(path.encode('utf-8')) - url.append(path) query = p.query diff --git a/requests/utils.py b/requests/utils.py index 0f23a527..a773f109 100644 --- a/requests/utils.py +++ b/requests/utils.py @@ -403,6 +403,6 @@ def requote_path(path): This function passes the given path through an unquote/quote cycle to ensure that it is fully and consistently quoted. """ - parts = path.split(b"/") - parts = (quote(unquote(part), safe=b"") for part in parts) - return b"/".join(parts) + parts = path.split("/") + parts = (quote(unquote(part), safe="") for part in parts) + return "/".join(parts) From 08bc1198d3ed312ab5ce60c29d8447dd5d3f90ea Mon Sep 17 00:00:00 2001 From: Matt Giuca Date: Tue, 14 Feb 2012 12:15:32 +1100 Subject: [PATCH 3/7] Added test case for Issue #369. Tests that legal reserved and unreserved characters in the path are not percent-encoded. Currently fails. --- test_requests.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test_requests.py b/test_requests.py index 4f9275c8..1168d3bd 100644 --- a/test_requests.py +++ b/test_requests.py @@ -138,6 +138,20 @@ class RequestsTestSuite(TestSetup, unittest.TestCase): response = get(url) self.assertEqual(response.url, httpbin('get/' + path_fully_escaped)) + # Test that reserved characters in a path do not get percent-escaped + # Tests: + # - All reserved characters (RFC 3986), except '?', '#', '[' and ']', + # which are not allowed in the path, and ';' which delimits + # parameters. + # All such characters must be allowed bare in path, and must not be + # encoded. + # - Special unreserved characters (RFC 3986), which should not be + # encoded (even though it wouldn't hurt). + path_reserved = '!$&\'()*+,/:=@-._~' + url = httpbin('get/' + path_reserved) + response = get(url) + self.assertEqual(response.url, httpbin('get/' + path_reserved)) + def test_user_agent_transfers(self): """Issue XX""" From 75bd9d0e9446dd9819142848fb26c6b8ae8c3532 Mon Sep 17 00:00:00 2001 From: Matt Giuca Date: Tue, 14 Feb 2012 12:20:48 +1100 Subject: [PATCH 4/7] Added test case testing URI normalisation. Tests that percent-encoded unreserved characters get de-encoded. --- test_requests.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_requests.py b/test_requests.py index 1168d3bd..00cc3c10 100644 --- a/test_requests.py +++ b/test_requests.py @@ -152,6 +152,14 @@ class RequestsTestSuite(TestSetup, unittest.TestCase): response = get(url) self.assertEqual(response.url, httpbin('get/' + path_reserved)) + # Test that percent-encoded unreserved characters in a path get + # normalised to their un-encoded forms. + path_unreserved = 'ABCDwxyz1234-._~' + path_unreserved_escaped = '%41%42%43%44%77%78%79%7A%31%32%33%34%2D%2E%5F%7E' + url = httpbin('get/' + path_unreserved_escaped) + response = get(url) + self.assertEqual(response.url, httpbin('get/' + path_unreserved)) + def test_user_agent_transfers(self): """Issue XX""" From c0763bb8d5b3f4ac74a1d77d064474e4823e2246 Mon Sep 17 00:00:00 2001 From: Matt Giuca Date: Tue, 14 Feb 2012 12:49:31 +1100 Subject: [PATCH 5/7] Changed test case test_path_is_not_double_encoded. Uses the space character instead of ~ as a test (since ~ is unreserved, it shouldn't really be encoded at all). --- test_requests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_requests.py b/test_requests.py index 00cc3c10..2783a23b 100644 --- a/test_requests.py +++ b/test_requests.py @@ -74,9 +74,9 @@ class RequestsTestSuite(TestSetup, unittest.TestCase): def test_path_is_not_double_encoded(self): - request = requests.Request("http://0.0.0.0/get/~test") + request = requests.Request("http://0.0.0.0/get/test case") - assert request.path_url == "/get/%7Etest" + assert request.path_url == "/get/test%20case" def test_HTTP_200_OK_GET(self): r = get(httpbin('/get')) From fcac1c3746b72777b8b25f78b9abff0d3e283389 Mon Sep 17 00:00:00 2001 From: Matt Giuca Date: Tue, 14 Feb 2012 12:51:03 +1100 Subject: [PATCH 6/7] Fixed URI encoding of reserved characters (Issue #369). Previously, util.requote_path would unquote and requote all characters, causing reserved characters to become encoded (changing the semantics of the URI). Now, it has special code for unquoting just the unreserved characters, then quotes only illegal characters. This ensures that illegal characters are fixed, and URIs are normalised, but reserved characters do not erroneously become quoted. Test case test_session_with_escaped_url now passes. --- requests/utils.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/requests/utils.py b/requests/utils.py index a773f109..f4f98c45 100644 --- a/requests/utils.py +++ b/requests/utils.py @@ -396,6 +396,28 @@ def stream_decompress(iterator, mode='gzip'): if rv: yield rv +# The unreserved URI characters (RFC 3986) +UNRESERVED_SET = frozenset( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + + "0123456789-._~") + +def unquote_unreserved(uri): + """Un-escape any percent-escape sequences in a URI that are unreserved + characters. + This leaves all reserved, illegal and non-ASCII bytes encoded. + """ + parts = uri.split('%') + for i in range(1, len(parts)): + h = parts[i][0:2] + if len(h) == 2: + c = chr(int(h, 16)) + if c in UNRESERVED_SET: + parts[i] = c + parts[i][2:] + else: + parts[i] = '%' + parts[i] + else: + parts[i] = '%' + parts[i] + return ''.join(parts) def requote_path(path): """Re-quote the given URL path component. @@ -404,5 +426,9 @@ def requote_path(path): ensure that it is fully and consistently quoted. """ parts = path.split("/") - parts = (quote(unquote(part), safe="") for part in parts) + # Unquote only the unreserved characters + # Then quote only illegal characters (do not quote reserved, unreserved, + # or '%') + parts = (quote(unquote_unreserved(part), safe="!#$%&'()*+,/:;=?@[]~") + for part in parts) return "/".join(parts) From 1ffce4f7dcb646c4c4b357c214aa9ad77e69f645 Mon Sep 17 00:00:00 2001 From: Matt Giuca Date: Tue, 14 Feb 2012 12:57:49 +1100 Subject: [PATCH 7/7] Simplify requote_path. It no longer needs to split on '/' since '/' will not be encoded. --- requests/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/requests/utils.py b/requests/utils.py index f4f98c45..abbc752a 100644 --- a/requests/utils.py +++ b/requests/utils.py @@ -425,10 +425,8 @@ def requote_path(path): This function passes the given path through an unquote/quote cycle to ensure that it is fully and consistently quoted. """ - parts = path.split("/") # Unquote only the unreserved characters # Then quote only illegal characters (do not quote reserved, unreserved, # or '%') - parts = (quote(unquote_unreserved(part), safe="!#$%&'()*+,/:;=?@[]~") - for part in parts) + return quote(unquote_unreserved(path), safe="!#$%&'()*+,/:;=?@[]~") return "/".join(parts)