def infobox(query, timeout=DEFAULT_TIMEOUT):
    """
    Return the knowledge-panel ("infobox") data found for query as a dict.

    The search result page is fetched with read_url and parsed with lxml.
    If the page contains an element with class ``kp-wholepage``, the
    returned dict holds two kinds of entries taken from that element:

    - for every link whose href starts with ``http``: the last two labels
      of the link's host (e.g. ``example.com``) mapped to the full href;
      a later link to the same domain overwrites an earlier one
    - for every element carrying a ``data-attrid`` attribute: that
      attribute's value mapped to the element's text content, skipping
      empty text and the "watch"/"media actions" widgets

    An empty dict is returned if the page has no such element.
    """
    import lxml.html

    # The original body passed an undefined name `url` to read_url and
    # never used `query`, so every call raised NameError.
    # NOTE(review): the search endpoint below is an assumption - confirm it
    # matches what read_url is expected to fetch for this module.
    url = 'https://www.google.com/search?q=%s' % quote_plus(query)
    data = read_url(url, timeout=timeout)
    doc = lxml.html.document_fromstring(data)

    result = {}
    wholepage = doc.cssselect('.' + 'kp-wholepage')
    if not wholepage:
        return result
    page = wholepage[0]

    for a in page.cssselect('a'):
        href = a.attrib.get('href', '')
        if not href.startswith('http'):
            continue
        # 'scheme://host/path'.split('/') -> [scheme:, '', host, ...];
        # skip hrefs without a host part instead of raising IndexError.
        parts = href.split('/')
        if len(parts) < 3 or not parts[2]:
            continue
        domain = '.'.join(parts[2].split('.')[-2:])
        result[domain] = href

    # Attributes that only hold action widgets, not information.
    ignored = (
        'kc:/film/film:media_actions_wholepage',
        'action:watch_film',
    )
    for e in page.cssselect('*[data-attrid]'):
        key = e.attrib['data-attrid']
        value = e.text_content()
        if value and key not in ignored:
            result[key] = value
    return result