use googleapi for all google queries

This commit is contained in:
j 2010-07-28 15:04:43 +02:00
parent 79286b4619
commit 8569759865
2 changed files with 34 additions and 15 deletions

View file

@@ -5,3 +5,8 @@ try:
except: except:
from datetime import datetime from datetime import datetime
try:
import simplejson as json
except:
import json

View file

@@ -7,11 +7,10 @@ import urllib2
import weakref import weakref
import threading import threading
import Queue import Queue
import simplejson
import ox import ox
from ox import stripTags from ox import stripTags
from ox.utils import json
''' '''
@@ -34,24 +33,39 @@ def readUrl(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIME
return ox.cache.readUrl(url, data, headers, timeout) return ox.cache.readUrl(url, data, headers, timeout)
def quote_plus(s): def quote_plus(s):
return urllib.quote_plus(s.encode('utf-8')) if not isinstance(s, str):
s = s.encode('utf-8')
return urllib.quote_plus(s)
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
url = "http://www.google.com/search?q=%s" % quote_plus(query) """
data = readUrl(url, timeout=timeout) Return max_results tuples with title, url, description
link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
r'.*?(?:<br>|<table.*?>)' + \ >>> find("The Matrix site:imdb.com", 1)[0][0]
r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)' u'The Matrix (1999)'
>>> find("The Matrix site:imdb.com", 1)[0][1]
u'http://www.imdb.com/title/tt0133093/'
"""
_results = _find(query)
results = [] results = []
for match in re.compile(link_re, re.DOTALL).finditer(data): for r in _results:
(name, url, desc) = match.group('name', 'url', 'desc') results.append((r['titleNoFormatting'], r['unescapedUrl'], stripTags(r['content'])))
results.append((stripTags(name), url, stripTags(desc))) if len(results) >= max_results:
if len(results) > max_results: break
results = results[:max_results]
return results return results
def _find(query): def _find(query, timeout=DEFAULT_TIMEOUT):
"""
Return parsed json results from google ajax api
>>> _find("The Matrix site:imdb.com")[0]['titleNoFormatting']
u'The Matrix (1999)'
>>> _find("The Matrix site:imdb.com")[0]['url']
u'http://www.imdb.com/title/tt0133093/'
"""
url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query) url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query)
results = simplejson.loads(ox.cache.readUrlUnicode(url))['responseData']['results'] results = json.loads(ox.cache.readUrlUnicode(url, timeout=timeout))['responseData']['results']
return results return results