Replace google.find with a simple find function that only fetches the first result page

2008-04-29 16:27:19 +02:00 · 2008-04-29 16:27:19 +02:00 · 7a53ee62b9
commit 7a53ee62b9
parent 101f230bd1
1 changed files with 13 additions and 148 deletions
--- a/ox/google.py
+++ b/ox/google.py
@@ -14,8 +14,6 @@ from oxutils import stripTags


 '''
-FIXME this module should be replaced by something smaller. a simple find function would do.
-
 usage:
 import google
 google.find(query)
@@ -27,6 +25,7 @@ result is title, url, description

 google.find(query, max_results)

+FIXME: how to search deeper than the first page?
 '''
 DEFAULT_MAX_RESULTS = 10

@@ -37,152 +36,18 @@ def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
 def quote_plus(s):
  return urllib.quote_plus(s.encode('utf-8'))

-def get_search_page_links(page, results_per_page, begin, end, link_re):
-  """
-  Given str contents of search result page, return list of links.
-
-  Returns list of (name, url, desc) str tuples.  See make_searcher()
-  for a description of results_per_page and link_re.
-  """
-  if begin is not None and begin in page:
-    page = page[page.index(begin):]
-  if end is not None and end in page:
-    page = page[:page.index(end)]
-  ans = []
-  for match in re.compile(link_re, re.DOTALL).finditer(page):
+def find(query, max_results=DEFAULT_MAX_RESULTS):
+  '''Return up to max_results (title, url, description) tuples from the first Google result page.'''
+  url = "http://www.google.com/search?q=%s" % quote_plus(query)
+  data = getUrl(url)
+  link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' +  \
+            r'.*?(?:<br>|<table.*?>)' +  \
+            r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
+  results = []
+  for match in re.compile(link_re, re.DOTALL).finditer(data):
    (name, url, desc) = match.group('name', 'url', 'desc')
-    ans += [(stripTags(name), url, stripTags(desc))]
-  return ans
+    # append the tuple itself; wrapping it in a list would nest every hit one level too deep
+    results.append((stripTags(name), url, stripTags(desc)))
+  return results[:max_results]


-def nonblocking(f, blocking_return=None, sleep_time=0.01):
-  """
-  Wrap a callable which returns an iter so that it no longer blocks.
-
-  The wrapped iterator returns blocking_return while callable f is
-  blocking.  The callable f is called in a background thread.  If the
-  wrapped iterator is deleted, then the iterator returned by f is
-  deleted also and the background thread is terminated.
-  """
-  def g(*args, **kwargs):
-    f_iter = f(*args, **kwargs)
-    g_iter = None
-    def run():
-      while True:
-        g_obj = g_iter()
-        if g_obj is None:
-          return
-        if g_obj.q.qsize() == 0:
-          try:
-            f_next = f_iter.next()
-          except Exception, e:
-            g_obj.exc = e
-            return
-          g_obj.q.put(f_next)
-        else:
-          del g_obj
-          time.sleep(sleep_time)
-    class Iter:
-      def __init__(self):
-        self.q = Queue.Queue()
-        self.exc = None
-        self.thread = threading.Thread(target=run)
-        self.thread.setDaemon(True)
-      def next(self):
-        if self.exc is not None:
-          raise self.exc
-        try:
-          return self.q.get_nowait()
-        except Queue.Empty:
-          return blocking_return
-      def __iter__(self):
-        return self
-
-    obj = Iter()
-    g_iter = weakref.ref(obj)
-    obj.thread.start()
-    try:
-      return obj
-    finally:
-      del obj
-  return g
-def make_searcher(query_url, results_per_page, page_url, page_mode,
-                  begin, end, link_re):
-  """
-  Return a search function for the given search engine.
-
-  Here query_url is the URL for the initial search, with %(q)s for
-  the query string, results_per_page is the number of search results
-  per page, page_url is the URL for the 2nd and subsequent pages of
-  search results, with %(q)s for the query string and %(n)s for the
-  page "number."  Here page_mode controls the actual value for the
-  page "number:"
-
-   - page_mode='page0':   Use 0-based index of the page.
-   - page_mode='page1':   Use 1-based index of the page.
-   - page_mode='offset0': Use 0-based index of the search result,
-                          which is a multiple of results_per_page.
-   - page_mode='offset1': Use 1-based index of the search result
-                          (one plus a multiple of results_per_page).
-
-  If begin is not None, then only text after the first occurrence of
-  begin will be used in the search results page.  If end is not None,
-  then only text before the first occurrence of end will be used.
-
-  Finally, link_re is a regex string (see module re) which matches
-  three named groups: 'name', 'url', and 'desc'.  These correspond to
-  the name, URL and description of each search result.  The regex is
-  applied in re.DOTALL mode.
-
-  Returns a search() function which has the same interface as
-  described in the module docstring.
-  """
-  def search_blocking(query, max_results):
-    last_links = None
-    page_num = 0
-    q = Queue.Queue()
-    for i in range(max_results):
-      if q.qsize() == 0:
-        if page_num == 0:
-          page = getUrl(query_url % {'q': quote_plus(query)})
-        else:
-          if page_mode == 'page0':
-            n = page_num
-          elif page_mode == 'page1':
-            n = page_num + 1
-          elif page_mode == 'offset0':
-            n = page_num * results_per_page
-          elif page_mode == 'offset1':
-            n = page_num * results_per_page + 1
-          else:
-            raise ValueError('unknown page mode')
-          page = getUrl(page_url % {'n': n, 'q': quote_plus(query)})
-        page_num += 1
-        links = get_search_page_links(page, results_per_page, begin, end, link_re)
-        if len(links) == 0 or links == last_links:
-          break
-        last_links = links
-        for link in links:
-          q.put(link)
-      yield q.get()
-
-  search_nonblocking = nonblocking(search_blocking)
-
-  def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
-    """
-    See docstring for web_search module.
-    """
-    if blocking:
-      return search_blocking(query, max_results)
-    else:
-      return search_nonblocking(query, max_results)
-
-  return search
-
-find = make_searcher('http://www.google.com/search?q=%(q)s', 10,
-                          'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
-                          None, None,
-                          r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' +
-                          r'.*?(?:<br>|<table.*?>)' +
-                          r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)')
-