From 02031e3e14eecab831c7aeb62befbe9ada3ad641 Mon Sep 17 00:00:00 2001 From: Ian Epperson Date: Fri, 30 Jan 2015 19:22:19 -0800 Subject: [PATCH 1/8] Test to show bug when delimiter is split between reads --- tests/test_requests.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_requests.py b/tests/test_requests.py index 4d91dd20..d56e365a 100755 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1296,6 +1296,46 @@ class TestRequests: assert r.request.url == pr.request.url assert r.request.headers == pr.request.headers + def test_response_lines(self): + """ + iter_lines should be able to handle data dribbling in which might + not be lined up ideally. + """ + mock_chunks = [ + 'This \r\n', + '', + 'is\r', + '\n', + 'a', + ' ', + '', + '', + 'test.', + '\r', + '\n', + 'end.', + ] + + def mock_iter_content(*args, **kwargs): + '''Fake difficult data.''' + for chunk in mock_chunks: + yield chunk + + r = requests.Response() + r._content_consumed = True + r.iter_content = mock_iter_content + + assert list(r.iter_lines(delimiter='\r\n')) == \ + ''.join(mock_chunks).split('\r\n') + + # This test can't pass because '\n' by itself is a single line-end, but + # '\r\n' is also a single line-end + assert not (list(r.iter_lines()) == ''.join(mock_chunks).splitlines()) + + # However, this should pass if everything is '\r' + mock_chunks = [chunk.replace('\n', '\r') for chunk in mock_chunks] + assert list(r.iter_lines()) == ''.join(mock_chunks).splitlines() + def test_prepared_request_is_pickleable(self, httpbin): p = requests.Request('GET', httpbin('get')).prepare() From 9174925916b797e30d1f402ced4d835149af9efe Mon Sep 17 00:00:00 2001 From: Ian Epperson Date: Fri, 30 Jan 2015 19:23:04 -0800 Subject: [PATCH 2/8] Fix bug when delimiter is split between responses --- requests/models.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/requests/models.py b/requests/models.py index ac348e9d..b4cdead4 100644 --- a/requests/models.py +++ b/requests/models.py @@ -776,23 +776,36 @@ class Response(object): .. note:: This method is not reentrant safe. """ - pending = None - for chunk in self.iter_content(chunk_size=chunk_size, decode_unicode=decode_unicode): + for chunk in self.iter_content(chunk_size=chunk_size, + decode_unicode=decode_unicode): + # Skip any null responses + if not chunk: + continue + + # Consume any pending data if pending is not None: chunk = pending + chunk + pending = None + # Either split on a line, or split on a specified delimiter if delimiter: lines = chunk.split(delimiter) else: lines = chunk.splitlines() - if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]: + # The split(delimiter) will always end with whatever remains past + # the delimiter ('' if nothing more). However splitlines() will + # not end with a '' if the final text is a line delimiter. + + # Therefore, if we're in delimiter mode, always pop the final + # item to prepend to the next chunk. However, only do this for + # non-delimiter mode if the chunk does not match the end of the + # last line. + if delimiter or (lines[-1] and lines[-1][-1] == chunk[-1]): pending = lines.pop() - else: - pending = None for line in lines: yield line From 9881be25f3d88e8cfcb1dd006cc7eeca37c08c83 Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Sun, 4 Dec 2016 23:06:03 +0000 Subject: [PATCH 3/8] Review markups for @Lukasa --- requests/models.py | 27 +++++++++++++++++---------- tests/test_requests.py | 21 ++++++++++++--------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/requests/models.py b/requests/models.py index b4cdead4..fe554ffc 100644 --- a/requests/models.py +++ b/requests/models.py @@ -780,7 +780,6 @@ class Response(object): for chunk in self.iter_content(chunk_size=chunk_size, decode_unicode=decode_unicode): - # Skip any null responses if not chunk: continue @@ -796,15 +795,23 @@ class Response(object): else: lines = chunk.splitlines() - # The split(delimiter) will always end with whatever remains past - # the delimiter ('' if nothing more). However splitlines() will - # not end with a '' if the final text is a line delimiter. - - # Therefore, if we're in delimiter mode, always pop the final - # item to prepend to the next chunk. However, only do this for - # non-delimiter mode if the chunk does not match the end of the - # last line. - if delimiter or (lines[-1] and lines[-1][-1] == chunk[-1]): + # Calling `.split(delimiter)` will always end with whatever text + # remains beyond the delimiter, or '' if the delimiter is the end + # of the text. On the other hand, `.splitlines()` doesn't include + # a '' if the text ends in a line delimiter. + # + # For example: + # + # 'abc\ndef\n'.split('\n') ~> ['abc', 'def', ''] + # 'abc\ndef\n'.splitlines() ~> ['abc', 'def'] + # + # So if we have a specified delimiter, we always pop the final + # item and prepend it to the next chunk. + # + # If we're using `splitlines()`, we only do this if the chunk + # ended midway through a line. + incomplete_line = (lines[-1] and lines[-1].endswith(chunk[-1])) + if delimiter or incomplete_line: pending = lines.pop() for line in lines: diff --git a/tests/test_requests.py b/tests/test_requests.py index d56e365a..76a2d45a 100755 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1296,10 +1296,11 @@ class TestRequests: assert r.request.url == pr.request.url assert r.request.headers == pr.request.headers + def test_response_lines(self): """ - iter_lines should be able to handle data dribbling in which might - not be lined up ideally. + iter_lines should be able to handle data dribbling in which delimiters + might not be lined up ideally. """ mock_chunks = [ 'This \r\n', @@ -1315,6 +1316,7 @@ class TestRequests: '\n', 'end.', ] + mock_data = ''.join(mock_chunks) def mock_iter_content(*args, **kwargs): '''Fake difficult data.''' @@ -1325,16 +1327,17 @@ class TestRequests: r._content_consumed = True r.iter_content = mock_iter_content - assert list(r.iter_lines(delimiter='\r\n')) == \ - ''.join(mock_chunks).split('\r\n') + assert list(r.iter_lines(delimiter='\r\n')) == mock_data.split('\r\n') - # This test can't pass because '\n' by itself is a single line-end, but - # '\r\n' is also a single line-end - assert not (list(r.iter_lines()) == ''.join(mock_chunks).splitlines()) + # Because '\n' is a single line-end, when `iter_lines()` receives + # the chunks containing a single '\n', it emits '' as a line -- whereas + # `.splitlines()` combines with the '\r' and splits on `\r\n`. + assert list(r.iter_lines()) != mock_data.splitlines() - # However, this should pass if everything is '\r' + # If we change all the line breaks to `\r`, we should be okay. mock_chunks = [chunk.replace('\n', '\r') for chunk in mock_chunks] - assert list(r.iter_lines()) == ''.join(mock_chunks).splitlines() + mock_data = ''.join(mock_chunks) + assert list(r.iter_lines()) == mock_data.splitlines() def test_prepared_request_is_pickleable(self, httpbin): p = requests.Request('GET', httpbin('get')).prepare() From 0380ac5893ffeb52e65ded5e22c785dd4f659496 Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Tue, 14 Mar 2017 19:26:55 +0100 Subject: [PATCH 4/8] add some parametrized tests for iter_lines() Write a list of different chunk splits and their expected results to test against, using ianepperson's breakdown as specification: https://github.com/kennethreitz/requests/pull/2431#issuecomment-72333964 --- tests/test_requests.py | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/tests/test_requests.py b/tests/test_requests.py index 76a2d45a..96769830 100755 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1318,10 +1318,7 @@ class TestRequests: ] mock_data = ''.join(mock_chunks) - def mock_iter_content(*args, **kwargs): - '''Fake difficult data.''' - for chunk in mock_chunks: - yield chunk + mock_iter_content = lambda *args, **kwargs: (e for e in mock_chunks) r = requests.Response() r._content_consumed = True @@ -1332,13 +1329,45 @@ class TestRequests: # Because '\n' is a single line-end, when `iter_lines()` receives # the chunks containing a single '\n', it emits '' as a line -- whereas # `.splitlines()` combines with the '\r' and splits on `\r\n`. - assert list(r.iter_lines()) != mock_data.splitlines() - + result = list(r.iter_lines()) + assert result != mock_data.splitlines() + assert result[2] == '' + assert result[4] == '' # If we change all the line breaks to `\r`, we should be okay. mock_chunks = [chunk.replace('\n', '\r') for chunk in mock_chunks] mock_data = ''.join(mock_chunks) assert list(r.iter_lines()) == mock_data.splitlines() + + @pytest.mark.parametrize( + 'content, expected_no_delimiter, expected_delimiter', ( + ([''], [], []), + (['line\n'], ['line'], ['line\n']), + (['line', '\n'], ['line'], ['line\n']), + (['line\r\n'], ['line'], ['line', '']), + (['line', '\r\n'], ['line'], ['line', '']), + (['a\r', '\nb\r'], ['a', '', 'b'], ['a', 'b\r']), + (['a\n', '\nb'], ['a', '', 'b'], ['a\n\nb']), + (['a\r\n','\rb\n'], ['a', '', 'b'], ['a', '\rb\n']), + (['a\nb', 'c'], ['a', 'bc'], ['a\nbc']), + (['a\n', '\rb', '\r\nc'], ['a', '', 'b', 'c'], ['a\n\rb', 'c']) + )) + def test_response_lines_parametrized(self, content, expected_no_delimiter, expected_delimiter): + """ + Test a lot of potential chunk splits to ensure consistency of + iter_lines(delimiter=x), as well as the legacy behavior of + iter_lines() without delimiter + https://github.com/kennethreitz/requests/pull/2431#issuecomment-72333964 + """ + mock_chunks = content + mock_iter_content = lambda *args, **kwargs: (e for e in mock_chunks) + + r = requests.Response() + r._content_consumed = True + r.iter_content = mock_iter_content + assert list(r.iter_lines()) == expected_no_delimiter + assert list(r.iter_lines(delimiter='\r\n')) == expected_delimiter + def test_prepared_request_is_pickleable(self, httpbin): p = requests.Request('GET', httpbin('get')).prepare() From 5a8bc193844b1d46d89784c11e62b3a4882afa25 Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Wed, 15 Mar 2017 00:52:32 +0100 Subject: [PATCH 5/8] add more tests for iter_lines() check the case of an empty chunk somewhere in the stream --- tests/test_requests.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_requests.py b/tests/test_requests.py index 96769830..ac4bc2d5 100755 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1345,12 +1345,15 @@ class TestRequests: (['line\n'], ['line'], ['line\n']), (['line', '\n'], ['line'], ['line\n']), (['line\r\n'], ['line'], ['line', '']), + # Empty chunk in the end of stream, same behavior as the previous + (['line\r\n', ''], ['line'], ['line', '']), (['line', '\r\n'], ['line'], ['line', '']), (['a\r', '\nb\r'], ['a', '', 'b'], ['a', 'b\r']), (['a\n', '\nb'], ['a', '', 'b'], ['a\n\nb']), (['a\r\n','\rb\n'], ['a', '', 'b'], ['a', '\rb\n']), (['a\nb', 'c'], ['a', 'bc'], ['a\nbc']), - (['a\n', '\rb', '\r\nc'], ['a', '', 'b', 'c'], ['a\n\rb', 'c']) + (['a\n', '\rb', '\r\nc'], ['a', '', 'b', 'c'], ['a\n\rb', 'c']), + (['a\r\nb', '', 'c'], ['a', 'bc'], ['a', 'bc']) # Empty chunk with pending data )) def test_response_lines_parametrized(self, content, expected_no_delimiter, expected_delimiter): """ From cc2ac23c0d0253f10060c9bce3826b12122953ff Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Wed, 15 Mar 2017 00:54:04 +0100 Subject: [PATCH 6/8] remove useless brackets in iter_lines boolean condition --- requests/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests/models.py b/requests/models.py index fe554ffc..a3958f73 100644 --- a/requests/models.py +++ b/requests/models.py @@ -810,7 +810,7 @@ class Response(object): # # If we're using `splitlines()`, we only do this if the chunk # ended midway through a line. - incomplete_line = (lines[-1] and lines[-1].endswith(chunk[-1])) + incomplete_line = lines[-1] and lines[-1].endswith(chunk[-1]) if delimiter or incomplete_line: pending = lines.pop() From 052595ffbfd0464a5272a33927cd183c357beab0 Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Wed, 15 Mar 2017 12:16:32 +0100 Subject: [PATCH 7/8] add explanatory comment about skipping null chunks in iter_lines --- requests/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/requests/models.py b/requests/models.py index a3958f73..375aae7a 100644 --- a/requests/models.py +++ b/requests/models.py @@ -780,7 +780,10 @@ class Response(object): for chunk in self.iter_content(chunk_size=chunk_size, decode_unicode=decode_unicode): - # Skip any null responses + # Skip any null responses: if there is pending data it is necessarily an + # incomplete chunk, so if we don't have more data we don't want to bother + # trying to get it. Unconsumed pending data will be yielded anyway in the + # end of the loop if the stream ends. if not chunk: continue From d491e9f9b28815343d4114be51832960907b12d8 Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Wed, 15 Mar 2017 22:27:36 +0100 Subject: [PATCH 8/8] use [-1] instead of endswith() to work with bytes or string Also add a parametrize on decode_unicode for iter_lines() test to check with bytestrings and str content --- requests/models.py | 2 +- tests/test_requests.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/requests/models.py b/requests/models.py index 375aae7a..46a809da 100644 --- a/requests/models.py +++ b/requests/models.py @@ -813,7 +813,7 @@ class Response(object): # # If we're using `splitlines()`, we only do this if the chunk # ended midway through a line. - incomplete_line = lines[-1] and lines[-1].endswith(chunk[-1]) + incomplete_line = lines[-1] and lines[-1][-1] == chunk[-1] if delimiter or incomplete_line: pending = lines.pop() diff --git a/tests/test_requests.py b/tests/test_requests.py index ac4bc2d5..8f68e0c4 100755 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1816,11 +1816,12 @@ class TestRequests: prep = r.prepare() assert 'stuff=elixr' == prep.body - def test_response_iter_lines(self, httpbin): + @pytest.mark.parametrize('decode_unicode', (True, False)) + def test_response_iter_lines(self, httpbin, decode_unicode): r = requests.get(httpbin('stream/4'), stream=True) assert r.status_code == 200 - - it = r.iter_lines() + r.encoding = 'utf-8' + it = r.iter_lines(decode_unicode=decode_unicode) next(it) assert len(list(it)) == 3