From 495b0c3bfd2fd928449b3986ffab8c80f9b9750e Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 06:49:21 -0500 Subject: [PATCH 1/4] xpath improvements Signed-off-by: Kenneth Reitz --- requests_html.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/requests_html.py b/requests_html.py index ee2f6b2..5081a01 100644 --- a/requests_html.py +++ b/requests_html.py @@ -123,12 +123,27 @@ class BaseParser: return elements def xpath(self, selector: str, first: bool = False, _encoding: str = None): - """Given an XPath selector, returns a list of :class:`Element <Element>` objects. + """Given an XPath selector, returns a list of + :class:`Element <Element>` objects. - See W3School's `XPath Examples <https://www.w3schools.com/xml/xpath_examples.asp>`_ for more details. + If a sub-selector is specified (e.g. ``//a/@href``), a simple + list of results is returned. + + See W3School's `XPath Examples + <https://www.w3schools.com/xml/xpath_examples.asp>`_ + for more details. + + If ``first`` is ``True``, only returns the first + :class:`Element <Element>` found. + """ + selected = self.lxml.xpath(selector) + try: + c = [Element(element=e, url=self.url, default_encoding=_encoding or self.encoding) for e in selected] + # Sanity check. + [e.keys for e in c] + except AttributeError: + c = selected - If ``first`` is ``True``, only returns the first :class:`Element <Element>` found.""" - c = [Element(element=e, url=self.url, default_encoding=_encoding or self.encoding) for e in self.lxml.xpath(selector)] if first: try: return c[0] From 26c9f089602894d56c9a4841ac58d11036bd6615 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 06:51:23 -0500 Subject: [PATCH 2/4] cleanup Signed-off-by: Kenneth Reitz --- requests_html.py | 1 - 1 file changed, 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index 5081a01..08d2370 100644 --- a/requests_html.py +++ b/requests_html.py @@ -105,7 +105,6 @@ class BaseParser: See W3School's `CSS Selectors Reference <https://www.w3schools.com/cssref/css_selectors.asp>`_ for more details. 
- If ``first`` is ``True``, only returns the first :class:`Element <Element>` found.""" encoding = _encoding or self.encoding From 95886bdd259c57934cf24b2df03a39c5fd87e5e8 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 06:59:14 -0500 Subject: [PATCH 3/4] clean up Signed-off-by: Kenneth Reitz --- requests_html.py | 13 +++++++------ tests/test_requests_html.py | 5 ++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/requests_html.py b/requests_html.py index 08d2370..d4ba93f 100644 --- a/requests_html.py +++ b/requests_html.py @@ -136,12 +136,13 @@ class BaseParser: :class:`Element <Element>` found. """ selected = self.lxml.xpath(selector) - try: - c = [Element(element=e, url=self.url, default_encoding=_encoding or self.encoding) for e in selected] - # Sanity check. - [e.keys for e in c] - except AttributeError: - c = selected + c = [] + for selection in selected: + if not isinstance(selection, etree._ElementUnicodeResult): + element = Element(element=selection, url=self.url, default_encoding=_encoding or self.encoding) + else: + element = selection + c.append(element) if first: try: diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index c2c9ad7..8a3c871 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -59,6 +59,9 @@ def test_xpath(): html = r.html.xpath('/html', first=True) assert 'no-js' in html.attrs['class'] + a_hrefs = r.html.xpath('//a/@href') + print(a_hrefs) + def test_html_loading(): doc = """""" @@ -77,4 +80,4 @@ def test_anchor_links(): if __name__ == '__main__': - test_anchor_links() + test_xpath() From df39a9f34c38abe2333da5c4cdc4e07a787c450f Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Wed, 28 Feb 2018 06:59:54 -0500 Subject: [PATCH 4/4] tests Signed-off-by: Kenneth Reitz --- tests/test_requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 8a3c871..5780bde 100644 --- 
a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -60,7 +60,7 @@ def test_xpath(): assert 'no-js' in html.attrs['class'] a_hrefs = r.html.xpath('//a/@href') - print(a_hrefs) + assert '#site-map' in a_hrefs def test_html_loading():