parse google infobox

2021-08-29 13:43:33 +02:00 · 2021-08-29 13:43:33 +02:00 · ad2ccd4626
commit ad2ccd4626
parent 2172bcb3fb
1 changed files with 25 additions and 0 deletions
--- a/ox/web/google.py
+++ b/ox/web/google.py
@ -17,6 +17,31 @@ def quote_plus(s):
        s = s.encode('utf-8')
    return urllib.parse.quote_plus(s)

+
+def infobox(query, timeout=DEFAULT_TIMEOUT):
+    import lxml.html
+    data = read_url(url, timeout=timeout)
+    doc = lxml.html.document_fromstring(data)
+    k = 'kp-wholepage'
+    wholepage = doc.cssselect('.' + k)
+    infobox = {}
+    if wholepage:
+        page = wholepage[0]
+        for a in page.cssselect('a'):
+            if a.attrib.get('href', '').startswith('http'):
+                domain = '.'.join(a.attrib['href'].split('/')[2].split('.')[-2:])
+                infobox[domain] = a.attrib['href']
+        for e in page.cssselect('*[data-attrid]'):
+            key = e.attrib['data-attrid']
+            value = e.text_content()
+            if value and key not in (
+                'kc:/film/film:media_actions_wholepage',
+                'action:watch_film'
+            ):
+                infobox[key] = value
+    return infobox
+
+
 def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
    """
    Return max_results tuples with title, url, description