Parse the Google results page directly; the AJAX JSON API is limited to 50 requests

This commit is contained in:
j 2011-11-28 14:24:21 +01:00
parent 0614f1e5b1
commit b7aba8bfcf

View file

@ -1,36 +1,16 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
import time
import urllib import urllib
import urllib2
import weakref
import threading
import Queue
import ox import ox
from ox import stripTags from ox import stripTags
from ox.utils import json
'''
usage:
import google
google.find(query)
for result in google.find(query): result
result is title, url, description
google.find(query, max_results)
FIXME: how to search deeper than the first page?
'''
# Module-wide defaults.
DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60  # cache lifetime in seconds (one day)
def readUrlUnicode(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
    """
    Fetch url through ox's caching layer and return the body as unicode.

    Thin wrapper around ox.cache.readUrlUnicode so the module-level
    DEFAULT_TIMEOUT is applied when the caller does not pass one.
    """
    return ox.cache.readUrlUnicode(url, data, headers, timeout)
def quote_plus(s): def quote_plus(s):
if not isinstance(s, str): if not isinstance(s, str):
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
    """
    Return up to max_results tuples of (title, url, description) for query.

    Scrapes the Google HTML results page directly (the AJAX JSON API is
    rate-limited, hence the page parse). The description element of each
    tuple is currently always the empty string.

    >>> find("The Matrix site:imdb.com", 1)[0][0]
    u'The Matrix (1999) - IMDb'
    >>> find("The Matrix site:imdb.com", 1)[0][1]
    u'http://www.imdb.com/title/tt0133093/'
    """
    url = 'http://google.com/search?q=%s' % quote_plus(query)
    data = readUrlUnicode(url, timeout=timeout)
    results = []
    # NOTE(review): this screen-scraping regex is tied to Google's current
    # markup (result links carry class=l) and will break if the page changes.
    # Fix: use a raw string for the regex pattern (\S in a plain literal only
    # works by accident of Python's unknown-escape passthrough).
    link_re = re.compile(r'<a href="(\S+?)" class=l .*?>(.*?)</a>')
    for match in link_re.findall(data):
        results.append((stripTags(match[1]), match[0], ''))
        if len(results) >= max_results:
            break
    return results
def _find(query, timeout=DEFAULT_TIMEOUT):
    """
    Query the Google AJAX search API and return its parsed result list.

    >>> _find("The Matrix site:imdb.com")[0]['titleNoFormatting']
    'The Matrix (1999) - IMDb'
    >>> _find("The Matrix site:imdb.com")[0]['url']
    'http://www.imdb.com/title/tt0133093/'
    """
    api_url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query)
    raw = ox.cache.readUrl(api_url, timeout=timeout)
    payload = json.loads(raw)
    return payload['responseData']['results']