diff --git a/requests/models.py b/requests/models.py index ac348e9d..46a809da 100644 --- a/requests/models.py +++ b/requests/models.py @@ -776,23 +776,46 @@ class Response(object): .. note:: This method is not reentrant safe. """ - pending = None - for chunk in self.iter_content(chunk_size=chunk_size, decode_unicode=decode_unicode): + for chunk in self.iter_content(chunk_size=chunk_size, + decode_unicode=decode_unicode): + # Skip any empty chunks: if there is pending data it is necessarily an + # incomplete chunk, so if we don't have more data we don't want to bother + # trying to get it. Unconsumed pending data will be yielded anyway at the + # end of the loop when the stream ends. + if not chunk: + continue + # Consume any pending data if pending is not None: chunk = pending + chunk + pending = None + # Either split on a line, or split on a specified delimiter if delimiter: lines = chunk.split(delimiter) else: lines = chunk.splitlines() - if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]: + # Calling `.split(delimiter)` will always end with whatever text + # remains beyond the delimiter, or '' if the delimiter is the end + # of the text. On the other hand, `.splitlines()` doesn't include + # a '' if the text ends in a line delimiter. + # + # For example: + # + # 'abc\ndef\n'.split('\n') ~> ['abc', 'def', ''] + # 'abc\ndef\n'.splitlines() ~> ['abc', 'def'] + # + # So if we have a specified delimiter, we always pop the final + # item and prepend it to the next chunk. + # + # If we're using `splitlines()`, we only do this if the chunk + # ended midway through a line. 
+ incomplete_line = lines[-1] and lines[-1][-1] == chunk[-1] + if delimiter or incomplete_line: pending = lines.pop() - else: - pending = None for line in lines: yield line diff --git a/tests/test_requests.py b/tests/test_requests.py index 4d91dd20..8f68e0c4 100755 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1296,6 +1296,81 @@ class TestRequests: assert r.request.url == pr.request.url assert r.request.headers == pr.request.headers + + def test_response_lines(self): + """ + iter_lines should be able to handle data dribbling in which delimiters + might not be lined up ideally. + """ + mock_chunks = [ + 'This \r\n', + '', + 'is\r', + '\n', + 'a', + ' ', + '', + '', + 'test.', + '\r', + '\n', + 'end.', + ] + mock_data = ''.join(mock_chunks) + + mock_iter_content = lambda *args, **kwargs: (e for e in mock_chunks) + + r = requests.Response() + r._content_consumed = True + r.iter_content = mock_iter_content + + assert list(r.iter_lines(delimiter='\r\n')) == mock_data.split('\r\n') + + # Because '\n' is a single line-end, when `iter_lines()` receives + # the chunks containing a single '\n', it emits '' as a line -- whereas + # `.splitlines()` combines with the '\r' and splits on `\r\n`. + result = list(r.iter_lines()) + assert result != mock_data.splitlines() + assert result[2] == '' + assert result[4] == '' + # If we change all the line breaks to `\r`, we should be okay. 
+ mock_chunks = [chunk.replace('\n', '\r') for chunk in mock_chunks] + mock_data = ''.join(mock_chunks) + assert list(r.iter_lines()) == mock_data.splitlines() + + + @pytest.mark.parametrize( + 'content, expected_no_delimiter, expected_delimiter', ( + ([''], [], []), + (['line\n'], ['line'], ['line\n']), + (['line', '\n'], ['line'], ['line\n']), + (['line\r\n'], ['line'], ['line', '']), + # Empty chunk at the end of the stream: same behavior as the previous case + (['line\r\n', ''], ['line'], ['line', '']), + (['line', '\r\n'], ['line'], ['line', '']), + (['a\r', '\nb\r'], ['a', '', 'b'], ['a', 'b\r']), + (['a\n', '\nb'], ['a', '', 'b'], ['a\n\nb']), + (['a\r\n','\rb\n'], ['a', '', 'b'], ['a', '\rb\n']), + (['a\nb', 'c'], ['a', 'bc'], ['a\nbc']), + (['a\n', '\rb', '\r\nc'], ['a', '', 'b', 'c'], ['a\n\rb', 'c']), + (['a\r\nb', '', 'c'], ['a', 'bc'], ['a', 'bc']) # Empty chunk with pending data + )) + def test_response_lines_parametrized(self, content, expected_no_delimiter, expected_delimiter): + """ + Test a lot of potential chunk splits to ensure consistency of + iter_lines(delimiter=x), as well as the legacy behavior of + iter_lines() without a delimiter. See: + https://github.com/kennethreitz/requests/pull/2431#issuecomment-72333964 + """ + mock_chunks = content + mock_iter_content = lambda *args, **kwargs: (e for e in mock_chunks) + + r = requests.Response() + r._content_consumed = True + r.iter_content = mock_iter_content + assert list(r.iter_lines()) == expected_no_delimiter + assert list(r.iter_lines(delimiter='\r\n')) == expected_delimiter + def test_prepared_request_is_pickleable(self, httpbin): p = requests.Request('GET', httpbin('get')).prepare() @@ -1741,11 +1816,12 @@ class TestRequests: prep = r.prepare() assert 'stuff=elixr' == prep.body - def test_response_iter_lines(self, httpbin): + @pytest.mark.parametrize('decode_unicode', (True, False)) + def test_response_iter_lines(self, httpbin, decode_unicode): r = requests.get(httpbin('stream/4'), stream=True) assert 
r.status_code == 200 - - it = r.iter_lines() + r.encoding = 'utf-8' + it = r.iter_lines(decode_unicode=decode_unicode) next(it) assert len(list(it)) == 3