class AsyncHTMLSession(requests.Session):
    """ An async consumable session.

    Every request is executed by the blocking ``requests.Session.request``
    inside a thread pool and handed back as an awaitable tied to the
    session's asyncio event loop.
    """

    def __init__(self, loop=None, workers=None,
                 mock_browser: bool = True, *args, **kwargs):
        """ Set or create an event loop and a thread pool.

        :param loop: Asyncio loop to use. When ``None``, the loop returned by
            ``asyncio.get_event_loop()`` is used.
        :param workers: Amount of threads to use for executing async calls.
            If not passed, ``ThreadPoolExecutor`` applies its own default
            (derived from the CPU count; the exact formula depends on the
            Python version).
        :param mock_browser: When true, send a web browser's ``User-Agent``
            header with every request.
        """
        super().__init__(*args, **kwargs)

        # Mock a web browser's user agent.
        if mock_browser:
            self.headers['User-Agent'] = user_agent()

        # Registered as a response hook so each response produced by the
        # session is converted before it reaches the caller.
        self.hooks["response"].append(self.response_hook)

        # Explicit None check: only a missing loop triggers the lookup.
        self.loop = asyncio.get_event_loop() if loop is None else loop
        self.thread_pool = ThreadPoolExecutor(max_workers=workers)

    @staticmethod
    def response_hook(response, **kwargs) -> HTMLResponse:
        """ Fill in a missing response encoding and return an HTMLResponse.

        An encoding that is already set on the response is kept;
        ``DEFAULT_ENCODING`` is applied only when there is none, so a charset
        declared by the server is not overwritten.

        :param response: The response handed to the hook.
        :param kwargs: Extra hook arguments; accepted and ignored.
        """
        if not response.encoding:
            response.encoding = DEFAULT_ENCODING
        return HTMLResponse._from_response(response)

    def request(self, *args, **kwargs):
        """ Partial original request func and run it in a thread.

        :returns: The future produced by ``loop.run_in_executor``; awaiting
            it yields the result of the underlying blocking request.
        """
        # run_in_executor only forwards positional arguments, so the call
        # (including keyword arguments) is frozen into a partial first.
        func = partial(super().request, *args, **kwargs)
        return self.loop.run_in_executor(self.thread_pool, func)
@pytest.mark.ok
def test_file_get():
    """ The synchronous helper returns a response with status 200. """
    response = get()
    assert response.status_code == 200


@pytest.mark.ok
@pytest.mark.asyncio
async def test_async_file_get(async_get):
    """ Awaiting the async helper returns a response with status 200. """
    response = await async_get()
    assert response.status_code == 200


@pytest.mark.ok
@pytest.mark.asyncio
async def test_async_links(async_get):
    """ The '#about' element of the async response exposes six links. """
    response = await async_get()
    about_section = response.html.find('#about', first=True)

    expected_count = 6
    assert len(about_section.links) == expected_count
    assert len(about_section.absolute_links) == expected_count