parse google infobox
This commit is contained in:
parent
2172bcb3fb
commit
ad2ccd4626
1 changed file with 25 additions and 0 deletions
|
@@ -17,6 +17,31 @@ def quote_plus(s):
|
||||||
s = s.encode('utf-8')
|
s = s.encode('utf-8')
|
||||||
return urllib.parse.quote_plus(s)
|
return urllib.parse.quote_plus(s)
|
||||||
|
|
||||||
|
|
||||||
|
def infobox(query, timeout=DEFAULT_TIMEOUT):
    """
    Return a dict scraped from the Google knowledge panel for query

    The result page is fetched with read_url and parsed with lxml. If the
    page contains an element with class ``kp-wholepage``, the returned dict
    holds two kinds of entries taken from the first such element:

    - for every link whose href starts with ``http``: the last two
      dot-separated labels of the link's host mapped to the full href
      (a later link with the same host suffix overwrites an earlier one)
    - for every element carrying a ``data-attrid`` attribute: that
      attribute's value mapped to the element's text content, skipping
      empty text and a small set of action-only attributes

    An empty dict is returned when no such element is found.
    """
    import lxml.html

    # Attribute ids deliberately left out of the result.
    skipped_keys = (
        'kc:/film/film:media_actions_wholepage',
        'action:watch_film',
    )

    # The previous body read an undefined ``url`` and never used ``query``;
    # build the search URL from the query so the function is callable.
    # NOTE(review): confirm the endpoint matches the one used by find().
    url = 'https://www.google.com/search?q=%s' % quote_plus(query)
    data = read_url(url, timeout=timeout)
    doc = lxml.html.document_fromstring(data)

    result = {}
    wholepage = doc.cssselect('.kp-wholepage')
    if not wholepage:
        return result
    page = wholepage[0]

    for a in page.cssselect('a'):
        href = a.attrib.get('href', '')
        if not href.startswith('http'):
            continue
        # 'scheme://host/path' -> ['scheme:', '', 'host', 'path', ...];
        # skip malformed hrefs (e.g. 'http:foo') instead of raising IndexError.
        parts = href.split('/')
        if len(parts) < 3 or not parts[2]:
            continue
        domain = '.'.join(parts[2].split('.')[-2:])
        result[domain] = href

    for e in page.cssselect('*[data-attrid]'):
        key = e.attrib['data-attrid']
        value = e.text_content()
        if value and key not in skipped_keys:
            result[key] = value

    return result
|
||||||
|
|
||||||
|
|
||||||
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
|
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
|
||||||
"""
|
"""
|
||||||
Return max_results tuples with title, url, description
|
Return max_results tuples with title, url, description
|
||||||
|
|
Loading…
Reference in a new issue