Parse the Google results page directly, since the AJAX JSON API is limited to 50 requests

This commit is contained in:
j 2011-11-28 14:24:21 +01:00
parent 0614f1e5b1
commit b7aba8bfcf

View file

@ -1,36 +1,16 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue
import ox
from ox import stripTags
from ox.utils import json
'''
usage:
import google
google.find(query)
for result in google.find(query): result
result is title, url, description
google.find(query, max_results)
FIXME: how to search deeper than the first page?
'''
# Maximum number of (title, url, description) tuples returned by find()
# when the caller does not ask for more.
DEFAULT_MAX_RESULTS = 10
# Default cache lifetime for fetched pages: 24*60*60 seconds = one day.
DEFAULT_TIMEOUT = 24*60*60
def readUrl(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
    """Fetch url as bytes through the ox cache (default cache lifetime: one day)."""
    # Thin delegation; arguments are passed through positionally unchanged.
    response = ox.cache.readUrl(url, data, headers, timeout)
    return response
def readUrlUnicode(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
    """Fetch url as decoded text through the ox cache (default cache lifetime: one day)."""
    # Thin delegation; arguments are passed through positionally unchanged.
    response = ox.cache.readUrlUnicode(url, data, headers, timeout)
    return response
def quote_plus(s):
if not isinstance(s, str):
@ -42,30 +22,17 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
Return max_results tuples with title, url, description
>>> find("The Matrix site:imdb.com", 1)[0][0]
'The Matrix (1999) - IMDb'
u'The Matrix (1999) - IMDb'
>>> find("The Matrix site:imdb.com", 1)[0][1]
'http://www.imdb.com/title/tt0133093/'
u'http://www.imdb.com/title/tt0133093/'
"""
_results = _find(query, timeout=timeout)
url = 'http://google.com/search?q=%s' % quote_plus(query)
data = readUrlUnicode(url, timeout=timeout)
results = []
for r in _results:
results.append((r['titleNoFormatting'], r['unescapedUrl'], stripTags(r['content'])))
for a in re.compile('<a href="(\S+?)" class=l .*?>(.*?)</a>').findall(data):
results.append((stripTags(a[1]), a[0], ''))
if len(results) >= max_results:
break
return results
def _find(query, timeout=DEFAULT_TIMEOUT):
    """
    Query the google ajax search api and return the parsed json result list.

    Each result is a dict as returned by the api (keys include
    'titleNoFormatting', 'unescapedUrl', 'url', 'content').

    >>> _find("The Matrix site:imdb.com")[0]['titleNoFormatting']
    'The Matrix (1999) - IMDb'

    >>> _find("The Matrix site:imdb.com")[0]['url']
    'http://www.imdb.com/title/tt0133093/'
    """
    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query)
    # Responses are cached; decode the json envelope and unwrap the result list.
    data = ox.cache.readUrl(url, timeout=timeout)
    response = json.loads(data)
    return response['responseData']['results']