parse google infobox

2021-08-29 13:43:33 +02:00 · 2021-08-29 13:43:33 +02:00 · ad2ccd4626
commit ad2ccd4626
parent 2172bcb3fb
1 changed files with 25 additions and 0 deletions
--- a/ox/web/google.py
+++ b/ox/web/google.py
@@ -17,6 +17,31 @@ def quote_plus(s):
        s = s.encode('utf-8')
    return urllib.parse.quote_plus(s)
 def infobox(query, timeout=DEFAULT_TIMEOUT):
    """
    Return a dict with the contents of the Google infobox for query

    The result page is fetched with read_url and the first element with
    class "kp-wholepage" is inspected. Two kinds of entries end up in the
    returned dict:

    - for every absolute link inside the box, the last two labels of its
      host (e.g. "example.com") mapped to the full href; a later link to
      the same domain replaces an earlier one
    - for every element carrying a data-attrid attribute, that attribute
      value mapped to the element's text content (empty values and the
      watch/media action entries are left out)

    Both kinds share one dict, so a data-attrid key equal to a domain would
    overwrite it. An empty dict is returned if the page has no infobox.
    """
    import lxml.html
    # Build the request from query; previously `url` was never assigned here
    # and `query` was ignored.
    # NOTE(review): search endpoint assumed -- confirm it matches the one
    # used by find() in this module.
    url = 'https://www.google.com/search?q=%s' % quote_plus(query)
    data = read_url(url, timeout=timeout)
    doc = lxml.html.document_fromstring(data)
    k = 'kp-wholepage'
    wholepage = doc.cssselect('.' + k)
    # Named `result` so the local does not shadow this function's own name.
    result = {}
    if wholepage:
        page = wholepage[0]
        for a in page.cssselect('a'):
            href = a.attrib.get('href', '')
            if not href.startswith('http'):
                continue
            # "scheme://host/path" -> ['scheme:', '', 'host', 'path', ...];
            # skip hrefs without a host part instead of raising IndexError.
            parts = href.split('/')
            if len(parts) < 3 or not parts[2]:
                continue
            domain = '.'.join(parts[2].split('.')[-2:])
            result[domain] = href
        for e in page.cssselect('*[data-attrid]'):
            key = e.attrib['data-attrid']
            value = e.text_content()
            if value and key not in (
                'kc:/film/film:media_actions_wholepage',
                'action:watch_film'
            ):
                result[key] = value
    return result
 def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
    """
    Return max_results tuples with title, url, description