From 454d53d68aa6d3bf665aaa8f85a12a6563b881cc Mon Sep 17 00:00:00 2001 From: j Date: Wed, 26 Jul 2017 13:18:10 +0200 Subject: [PATCH] fix ddg results --- ox/web/duckduckgo.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ox/web/duckduckgo.py b/ox/web/duckduckgo.py index a8f7869..b4b3494 100644 --- a/ox/web/duckduckgo.py +++ b/ox/web/duckduckgo.py @@ -6,17 +6,25 @@ from six.moves import urllib import ox from ox import strip_tags, decode_html from ox.cache import read_url +import lxml.html def find(query, timeout=ox.cache.cache_timeout): + """ + Returns tuples with title, url, description + """ if not isinstance(query, bytes): query = query.encode('utf-8') params = urllib.parse.urlencode({'q': query}) url = 'http://duckduckgo.com/html/?' + params data = read_url(url, timeout=timeout).decode('utf-8') + doc = lxml.html.document_fromstring(data) results = [] - regex = '(.*?).*?
(.*?)
' - for r in re.compile(regex, re.DOTALL).findall(data): - results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2])))) + for e in doc.xpath("//a[contains(@class, 'result__a')]"): + url = e.attrib['href'] + if 'uddg=' in url: + url = urllib.parse.unquote(url.split('&uddg=')[-1]) + title = e.text_content() + description = '' + results.append((title, url, description)) return results -