_find() method (using ajax api) added to google module

This commit is contained in:
Rolux 2009-07-04 12:37:25 +02:00
parent e01af3fdd7
commit bb8db87d02

View file

@ -1,13 +1,15 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
import time import time
import urllib import urllib
import urllib2 import urllib2
import weakref import weakref
import threading import threading
import Queue import Queue
import simplejson
import oxlib import oxlib
from oxlib import stripTags from oxlib import stripTags
@ -25,26 +27,31 @@ google.find(query, max_results)
FIXME: how to search deeper than the first page?
''' '''
# Default value of find()'s max_results argument, i.e. how many results
# find() returns at most when the caller does not say otherwise.
DEFAULT_MAX_RESULTS = 10
def getUrl(url, data=None, headers=oxlib.net.DEFAULT_HEADERS):
    '''
    Fetch url through oxlib.cache.getUrl.

    url, data and headers are handed on unchanged; the timeout passed along
    is 24*60*60 (presumably seconds, i.e. one day -- confirm against
    oxlib.cache).
    '''
    one_day = 60 * 60 * 24
    return oxlib.cache.getUrl(url, data, headers, one_day)
def quote_plus(s):
    '''
    Quote s for use in a URL query string (spaces become '+').

    Unicode text is encoded as UTF-8 before quoting. A byte string is
    quoted as it is and is assumed to already hold UTF-8.
    '''
    # Only encode real unicode objects: calling .encode() on a Python 2 byte
    # string first decodes it as ASCII and raises UnicodeDecodeError as soon
    # as it contains a non-ASCII byte.
    if isinstance(s, type(u'')):
        s = s.encode('utf-8')
    return urllib.quote_plus(s)
def find(query, max_results=DEFAULT_MAX_RESULTS):
    '''
    Search google for query by scraping the html of one result page.

    Returns a list of (name, url, description) tuples, cut down to the
    first max_results entries. Tags are stripped from name and description;
    the url is returned exactly as it appears in the page.
    '''
    search_url = "http://www.google.com/search?q=%s" % quote_plus(query)
    html = getUrl(search_url)
    # One result = the link itself, then whatever follows the next <br> or
    # <table ...> up to the green url line or the next link.
    pattern = re.compile(
        r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>'
        r'.*?(?:<br>|<table.*?>)'
        r'(?P<desc>.*?)'
        '(?:<font color=#008000>|<a)',
        re.DOTALL)
    hits = [
        (stripTags(m.group('name')), m.group('url'), stripTags(m.group('desc')))
        for m in pattern.finditer(html)
    ]
    return hits[:max_results]
def _find(query):
    '''
    Search google for query through the AJAX web search API.

    Returns the list stored under responseData -> results in the decoded
    JSON response, or an empty list when the response has no responseData.
    '''
    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query)
    # Fetch with getUrl(), the helper this module defines and find() uses;
    # simplejson decodes UTF-8 byte strings on its own, so no separate
    # unicode-returning fetcher is needed.
    response = simplejson.loads(getUrl(url))
    # The API reports failures with "responseData": null; treat that as
    # "no results" rather than failing with a TypeError on the lookup below.
    response_data = response.get('responseData')
    if not response_data:
        return []
    return response_data['results']