From 856975986567bbf5170314a8e58cf6a14c53bdf9 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Wed, 28 Jul 2010 15:04:43 +0200 Subject: [PATCH] use googleapi for all google queries --- ox/utils.py | 5 +++++ ox/web/google.py | 44 +++++++++++++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/ox/utils.py b/ox/utils.py index 332d5e8..40bd606 100644 --- a/ox/utils.py +++ b/ox/utils.py @@ -5,3 +5,8 @@ try: except: from datetime import datetime +try: + import simplejson as json +except: + import json + diff --git a/ox/web/google.py b/ox/web/google.py index a980b2e..3e8aed6 100644 --- a/ox/web/google.py +++ b/ox/web/google.py @@ -7,11 +7,10 @@ import urllib2 import weakref import threading import Queue -import simplejson - import ox from ox import stripTags +from ox.utils import json ''' @@ -34,24 +33,39 @@ def readUrl(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIME return ox.cache.readUrl(url, data, headers, timeout) def quote_plus(s): - return urllib.quote_plus(s.encode('utf-8')) + if not isinstance(s, str): + s = s.encode('utf-8') + return urllib.quote_plus(s) def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): - url = "http://www.google.com/search?q=%s" % quote_plus(query) - data = readUrl(url, timeout=timeout) - link_re = r'(?P.*?)' + \ - r'.*?(?:
|)' + \ - r'(?P.*?)' + '(?:|>> find("The Matrix site:imdb.com", 1)[0][0] + u'The Matrix (1999)' + + >>> find("The Matrix site:imdb.com", 1)[0][1] + u'http://www.imdb.com/title/tt0133093/' + """ + _results = _find(query) results = [] - for match in re.compile(link_re, re.DOTALL).finditer(data): - (name, url, desc) = match.group('name', 'url', 'desc') - results.append((stripTags(name), url, stripTags(desc))) - if len(results) > max_results: - results = results[:max_results] + for r in _results: + results.append((r['titleNoFormatting'], r['unescapedUrl'], stripTags(r['content']))) + if len(results) >= max_results: + break return results -def _find(query): +def _find(query, timeout=DEFAULT_TIMEOUT): + """ + Return parsed json results from google ajax api + + >>> _find("The Matrix site:imdb.com")[0]['titleNoFormatting'] + u'The Matrix (1999)' + + >>> _find("The Matrix site:imdb.com")[0]['url'] + u'http://www.imdb.com/title/tt0133093/' + """ url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query) - results = simplejson.loads(ox.cache.readUrlUnicode(url))['responseData']['results'] + results = json.loads(ox.cache.readUrlUnicode(url, timeout=timeout))['responseData']['results'] return results