From 23d81af0efffbdda5ea003ea89a6c7b97921ee10 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Sun, 4 Mar 2018 16:41:57 -0400 Subject: [PATCH 1/3] Add AsyncHTMLSession --- Pipfile | 1 + requests_html.py | 18 ++++++++++++++++++ tests/test_requests_html.py | 21 ++++++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/Pipfile b/Pipfile index 62550b6..76e7c31 100644 --- a/Pipfile +++ b/Pipfile @@ -24,6 +24,7 @@ pytest = "*" "e1839a8" = {path = ".", editable = true} sphinx = "*" mypy = "*" +pytest-asyncio = "*" [scripts] diff --git a/requests_html.py b/requests_html.py index 0238863..c7f6e11 100644 --- a/requests_html.py +++ b/requests_html.py @@ -1,7 +1,9 @@ import sys import asyncio from urllib.parse import urlparse, urlunparse +from concurrent.futures import ThreadPoolExecutor from concurrent.futures._base import TimeoutError +from functools import partial from typing import Set, Union, List, MutableMapping, Optional import pyppeteer @@ -599,3 +601,19 @@ class HTMLSession(requests.Session): r = super(HTMLSession, self).request(*args, **kwargs) return HTMLResponse._from_response(r) + + +class AsyncHTMLSession(requests.Session): + """ """ + + def __init__(self, *args, **kwargs): + """ Create loop and thread pool. """ + self.loop = asyncio.get_event_loop() + self.thread_pool = ThreadPoolExecutor() + + super().__init__(*args, **kwargs) + + def request(self, *args, **kwargs): + """ Partial original request func and run it in a thread. 
""" + func = partial(super().request, *args, **kwargs) + return self.loop.run_in_executor(self.thread_pool, func) diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 40cc7a3..3b70215 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -1,7 +1,7 @@ import os import pytest -from requests_html import HTMLSession, HTML +from requests_html import HTMLSession, AsyncHTMLSession, HTML from requests_file import FileAdapter session = HTMLSession() @@ -15,12 +15,31 @@ def get(): return session.get(url) +@pytest.fixture +def async_get(event_loop): + """ AsyncSession cannot be created global since it will create + a different loop from pytest-asyncio. """ + async_session = AsyncHTMLSession() + async_session.mount('file://', FileAdapter()) + path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html')) + url = 'file://{}'.format(path) + + return async_session.get(url) + + @pytest.mark.ok def test_file_get(): r = get() assert r.status_code == 200 +@pytest.mark.ok +@pytest.mark.asyncio +async def test_async_file_get(async_get): + r = await async_get + assert r.status_code == 200 + + @pytest.mark.ok def test_class_seperation(): r = get() From ea05c69fe567fac1d49d12ed5468000966f2ffc4 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Mon, 5 Mar 2018 15:10:18 -0400 Subject: [PATCH 2/3] Add HTMLResponse hook and mock_browser param --- requests_html.py | 15 +++++++++++++-- tests/test_requests_html.py | 16 ++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/requests_html.py b/requests_html.py index c7f6e11..6630a2a 100644 --- a/requests_html.py +++ b/requests_html.py @@ -606,12 +606,23 @@ class HTMLSession(requests.Session): class AsyncHTMLSession(requests.Session): """ """ - def __init__(self, *args, **kwargs): + def __init__(self, mock_browser: bool = True, *args, **kwargs): """ Create loop and thread pool. 
""" + super().__init__(*args, **kwargs) + + if mock_browser: + self.headers['User-Agent'] = user_agent() + + self.hooks["response"].append(self.response_hook) + self.loop = asyncio.get_event_loop() self.thread_pool = ThreadPoolExecutor() - super().__init__(*args, **kwargs) + @staticmethod + def response_hook(response, **kwargs) -> HTMLResponse: + """ Change response encoding and replace it with an HTMLResponse. """ + response.encoding = DEFAULT_ENCODING + return HTMLResponse._from_response(response) def request(self, *args, **kwargs): """ Partial original request func and run it in a thread. """ diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 3b70215..5f35aa7 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -1,4 +1,5 @@ import os +from functools import partial import pytest from requests_html import HTMLSession, AsyncHTMLSession, HTML @@ -24,7 +25,7 @@ def async_get(event_loop): path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html')) url = 'file://{}'.format(path) - return async_session.get(url) + return partial(async_session.get, url) @pytest.mark.ok @@ -36,7 +37,7 @@ def test_file_get(): @pytest.mark.ok @pytest.mark.asyncio async def test_async_file_get(async_get): - r = await async_get + r = await async_get() assert r.status_code == 200 @@ -72,6 +73,7 @@ def test_containing(): for e in python: assert 'python' in e.full_text.lower() + @pytest.mark.ok def test_attrs(): r = get() @@ -90,6 +92,16 @@ def test_links(): assert len(about.absolute_links) == 6 +@pytest.mark.ok +@pytest.mark.asyncio +async def test_async_links(async_get): + r = await async_get() + about = r.html.find('#about', first=True) + + assert len(about.links) == 6 + assert len(about.absolute_links) == 6 + + @pytest.mark.ok def test_search(): r = get() From c7ba3c17cf6fd4676779261acd9c9175cde17d62 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Mon, 5 Mar 2018 15:52:51 -0400 Subject: [PATCH 3/3] Add loop and
workers params to AsyncHTMLSession --- requests_html.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/requests_html.py b/requests_html.py index 6630a2a..368a271 100644 --- a/requests_html.py +++ b/requests_html.py @@ -604,19 +604,26 @@ class HTMLSession(requests.Session): class AsyncHTMLSession(requests.Session): - """ """ + """ An async consumable session. """ - def __init__(self, mock_browser: bool = True, *args, **kwargs): - """ Create loop and thread pool. """ + def __init__(self, loop=None, workers=None, + mock_browser: bool = True, *args, **kwargs): + """ Set or create an event loop and a thread pool. + + :param loop: Asyncio loop to use. + :param workers: Number of threads to use for executing async calls. + If not passed, it will default to the number of processors on the + machine, multiplied by 5. """ super().__init__(*args, **kwargs) + # Mock a web browser's user agent. if mock_browser: self.headers['User-Agent'] = user_agent() self.hooks["response"].append(self.response_hook) - self.loop = asyncio.get_event_loop() - self.thread_pool = ThreadPoolExecutor() + self.loop = loop or asyncio.get_event_loop() + self.thread_pool = ThreadPoolExecutor(max_workers=workers) @staticmethod def response_hook(response, **kwargs) -> HTMLResponse: