parse google infobox
This commit is contained in:
parent
2172bcb3fb
commit
ad2ccd4626
1 changed files with 25 additions and 0 deletions
|
@@ -17,6 +17,31 @@ def quote_plus(s):
|
|||
s = s.encode('utf-8')
|
||||
return urllib.parse.quote_plus(s)
|
||||
|
||||
|
||||
def infobox(query, timeout=DEFAULT_TIMEOUT):
    """
    Return a dict with the contents of the search result infobox for query

    The result page is fetched with read_url and parsed with lxml; only the
    first element carrying the ``kp-wholepage`` class is inspected.

    The returned dict contains two kinds of entries:
      - for every absolute (``http...``) link inside the panel, the last two
        labels of the link's host (e.g. ``imdb.com``) mapped to the full href;
        a later link on the same domain replaces an earlier one
      - for every element with a ``data-attrid`` attribute, that attribute's
        value mapped to the element's text content (empty text and the
        "watch film" action blocks are skipped)

    An empty dict is returned if the page has no ``kp-wholepage`` element.
    """
    import lxml.html

    # The search URL has to be derived from the query: previously ``url`` was
    # read here without ever being assigned and ``query`` was ignored.
    # NOTE(review): endpoint assumed to be the regular Google web search,
    # confirm against the other functions in this module.
    url = 'https://www.google.com/search?q=%s' % quote_plus(query)
    data = read_url(url, timeout=timeout)
    doc = lxml.html.document_fromstring(data)

    result = {}
    panels = doc.cssselect('.kp-wholepage')
    if not panels:
        return result
    page = panels[0]

    # Outbound links, keyed by the last two labels of their host.
    for link in page.cssselect('a'):
        href = link.attrib.get('href', '')
        if not href.startswith('http'):
            continue
        try:
            host = urllib.parse.urlsplit(href).netloc
        except ValueError:
            # malformed link, nothing sensible to key it by
            continue
        if not host:
            # e.g. "http:foo" - starts with http but carries no host part
            continue
        result['.'.join(host.split('.')[-2:])] = href

    # Attribute blocks; these run second so that they win on a key collision,
    # same as before.
    skipped = (
        'kc:/film/film:media_actions_wholepage',
        'action:watch_film',
    )
    for element in page.cssselect('*[data-attrid]'):
        key = element.attrib['data-attrid']
        value = element.text_content()
        if value and key not in skipped:
            result[key] = value
    return result
|
||||
|
||||
|
||||
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
|
||||
"""
|
||||
Return max_results tuples with title, url, description
|
||||
|
|
Loading…
Reference in a new issue