from six.moves import urllib

import ox
from ox import strip_tags, decode_html
from ox.cache import read_url
import lxml.html


def _resolve_redirect(href):
    """Return the destination wrapped inside a DuckDuckGo redirect link.

    Result links may carry the real destination percent-encoded in a
    ``uddg`` query parameter. Links without such a parameter are returned
    unchanged.
    """
    if 'uddg=' not in href:
        return href
    # Parse the query string instead of splitting on '&uddg=' so the
    # parameter is found even when it is the first one, and so that any
    # parameters following it are not glued onto the destination.
    query = urllib.parse.urlparse(href).query
    for param in reversed(query.split('&')):
        name, _, value = param.partition('=')
        if name == 'uddg':
            return urllib.parse.unquote(value)
    # 'uddg=' occurred somewhere outside the query string; keep the
    # previous best-effort behaviour for that case.
    return urllib.parse.unquote(href.split('&uddg=')[-1])


def find(query, timeout=ox.cache.cache_timeout):
    """
    Returns tuples with title, url, description

    query   -- search terms; text is encoded as UTF-8 before being sent
    timeout -- passed through to ox.cache.read_url

    The result is a list of (title, url, description) tuples, one per
    anchor whose class contains 'result__a'. The description is currently
    always an empty string.
    """
    if not isinstance(query, bytes):
        query = query.encode('utf-8')
    params = urllib.parse.urlencode({'q': query})
    url = 'http://duckduckgo.com/html/?' + params
    data = read_url(url, timeout=timeout).decode('utf-8')
    results = []
    if not data.strip():
        # An empty response has nothing to parse; report no results.
        return results
    doc = lxml.html.document_fromstring(data)
    for link in doc.xpath("//a[contains(@class, 'result__a')]"):
        href = link.get('href')
        if href is None:
            # An anchor without a target cannot be reported as a result.
            continue
        title = link.text_content()
        description = ''
        results.append((title, _resolve_redirect(href), description))
    return results