Merge pull request #3923 from vbarbaresi/3.0.0-iter_lines

Rebase #3745 and add some tests
2026-06-05 22:50:18 +00:00 · 2017-03-16 09:53:03 +00:00
parent 84dc6b66da d491e9f9b2
commit 73456b0048
2 changed files with 107 additions and 8 deletions
@@ -776,23 +776,46 @@ class Response(object):

        .. note:: This method is not reentrant safe.
        """
-
        pending = None

-        for chunk in self.iter_content(chunk_size=chunk_size, decode_unicode=decode_unicode):
+        for chunk in self.iter_content(chunk_size=chunk_size,
+                                       decode_unicode=decode_unicode):
+            # Skip any null responses: if there is pending data it is necessarily an
+            # incomplete chunk, so if we don't have more data we don't want to bother
+            # trying to get it. Unconsumed pending data will be yielded anyway at the
+            # end of the loop if the stream ends.
+            if not chunk:
+                continue

+            # Consume any pending data
            if pending is not None:
                chunk = pending + chunk
+                pending = None

+            # Either split on a line, or split on a specified delimiter
            if delimiter:
                lines = chunk.split(delimiter)
            else:
                lines = chunk.splitlines()

-            if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
+            # `.split(delimiter)` always returns a list whose last item is
+            # whatever text follows the final delimiter, or '' if the text
+            # ends with the delimiter.  `.splitlines()`, on the other hand,
+            # omits that trailing '' when the text ends in a line delimiter.
+            #
+            # For example:
+            #
+            #     'abc\ndef\n'.split('\n')  ~> ['abc', 'def', '']
+            #     'abc\ndef\n'.splitlines() ~> ['abc', 'def']
+            #
+            # So if we have a specified delimiter, we always pop the final
+            # item and prepend it to the next chunk.
+            #
+            # If we're using `splitlines()`, we only do this if the chunk
+            # ended midway through a line.
+            incomplete_line = lines[-1] and lines[-1][-1] == chunk[-1]
+            if delimiter or incomplete_line:
                pending = lines.pop()
-            else:
-                pending = None

            for line in lines:
                yield line
@@ -1296,6 +1296,81 @@ class TestRequests:
        assert r.request.url == pr.request.url
        assert r.request.headers == pr.request.headers

+
+    def test_response_lines(self):
+        """
+        iter_lines should be able to handle data dribbling in, where the
+        delimiters might not be lined up ideally with the chunk boundaries.
+        """
+        mock_chunks = [
+            'This \r\n',
+            '',
+            'is\r',
+            '\n',
+            'a',
+            ' ',
+            '',
+            '',
+            'test.',
+            '\r',
+            '\n',
+            'end.',
+        ]
+        mock_data = ''.join(mock_chunks)
+
+        mock_iter_content = lambda *args, **kwargs: (e for e in mock_chunks)
+
+        r = requests.Response()
+        r._content_consumed = True
+        r.iter_content = mock_iter_content
+
+        assert list(r.iter_lines(delimiter='\r\n')) == mock_data.split('\r\n')
+
+        # Because '\n' is a single line-end, when `iter_lines()` receives
+        # the chunks containing a single '\n', it emits '' as a line -- whereas
+        # `.splitlines()` combines with the '\r' and splits on `\r\n`.
+        result = list(r.iter_lines())
+        assert result != mock_data.splitlines()
+        assert result[2] == ''
+        assert result[4] == ''
+        # If we change all the line breaks to `\r`, we should be okay.
+        mock_chunks = [chunk.replace('\n', '\r') for chunk in mock_chunks]
+        mock_data = ''.join(mock_chunks)
+        assert list(r.iter_lines()) == mock_data.splitlines()
+
+
+    @pytest.mark.parametrize(
+        'content, expected_no_delimiter, expected_delimiter', (
+            ([''], [], []),
+            (['line\n'], ['line'], ['line\n']),
+            (['line', '\n'], ['line'], ['line\n']),
+            (['line\r\n'], ['line'], ['line', '']),
+            # Empty chunk at the end of the stream: same behavior as the previous case
+            (['line\r\n', ''], ['line'], ['line', '']),
+            (['line', '\r\n'], ['line'], ['line', '']),
+            (['a\r', '\nb\r'], ['a', '', 'b'], ['a', 'b\r']),
+            (['a\n', '\nb'], ['a', '', 'b'], ['a\n\nb']),
+            (['a\r\n','\rb\n'], ['a', '', 'b'], ['a', '\rb\n']),
+            (['a\nb', 'c'], ['a', 'bc'], ['a\nbc']),
+            (['a\n', '\rb', '\r\nc'], ['a', '', 'b', 'c'], ['a\n\rb', 'c']),
+            (['a\r\nb', '', 'c'], ['a', 'bc'], ['a', 'bc'])  # Empty chunk with pending data
+        ))
+    def test_response_lines_parametrized(self, content, expected_no_delimiter, expected_delimiter):
+        """
+        Test a lot of potential chunk splits to ensure consistency of
+        iter_lines(delimiter=x), as well as the legacy behavior of
+        iter_lines() without delimiter
+        https://github.com/kennethreitz/requests/pull/2431#issuecomment-72333964
+        """
+        mock_chunks = content
+        mock_iter_content = lambda *args, **kwargs: (e for e in mock_chunks)
+
+        r = requests.Response()
+        r._content_consumed = True
+        r.iter_content = mock_iter_content
+        assert list(r.iter_lines()) == expected_no_delimiter
+        assert list(r.iter_lines(delimiter='\r\n')) == expected_delimiter
+
    def test_prepared_request_is_pickleable(self, httpbin):
        p = requests.Request('GET', httpbin('get')).prepare()

@@ -1741,11 +1816,12 @@ class TestRequests:
        prep = r.prepare()
        assert 'stuff=elixr' == prep.body

-    def test_response_iter_lines(self, httpbin):
+    @pytest.mark.parametrize('decode_unicode', (True, False))
+    def test_response_iter_lines(self, httpbin, decode_unicode):
        r = requests.get(httpbin('stream/4'), stream=True)
        assert r.status_code == 200
-
-        it = r.iter_lines()
+        r.encoding = 'utf-8'
+        it = r.iter_lines(decode_unicode=decode_unicode)
        next(it)
        assert len(list(it)) == 3