python-oxweb/ox/google.py

# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue

import oxutils 
from oxutils import stripTags


'''
FIXME: this module should be replaced by something smaller; a simple find function would do.

usage:
import google
google.find(query)
<generator object at 0x833aeac>

for result in google.find(query): result

each result is a (title, url, description) tuple

google.find(query, max_results)

'''
# Default for the max_results argument of the search function built by
# make_searcher(): the number of results produced when the caller gives none.
DEFAULT_MAX_RESULTS = 10

def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
  """
  Fetch url through oxutils.cache.getUrl using this module's fixed timeout.

  url, data and headers are passed through unchanged.  The fourth
  argument is always 24*60*60 (presumably one day in seconds, used as
  the cache timeout -- confirm against oxutils.cache.getUrl).
  """
  one_day = 24 * 60 * 60
  return oxutils.cache.getUrl(url, data, headers, one_day)

def quote_plus(s):
  return urllib.quote_plus(s.encode('utf-8'))

def get_search_page_links(page, results_per_page, begin, end, link_re):
  """
  Extract the result links from the text of one search results page.

  If begin is not None and occurs in page, everything before its first
  occurrence is dropped.  If end is not None and occurs in what remains,
  everything from its first occurrence on is dropped.  link_re is then
  applied in re.DOTALL mode and each match contributes one result.

  Returns a list of (name, url, desc) tuples, where name and desc have
  been passed through stripTags and url is taken as matched.
  results_per_page is accepted but not used here.
  """
  if begin is not None:
    start = page.find(begin)
    if start != -1:
      page = page[start:]
  if end is not None:
    stop = page.find(end)
    if stop != -1:
      page = page[:stop]
  pattern = re.compile(link_re, re.DOTALL)
  return [(stripTags(m.group('name')), m.group('url'), stripTags(m.group('desc')))
          for m in pattern.finditer(page)]


def nonblocking(f, blocking_return=None, sleep_time=0.01):
  """
  Wrap a callable which returns an iter so that it no longer blocks.

  The returned callable g calls f(*args, **kwargs) right away in the
  caller's thread and then advances the iterator f returned from a
  daemon background thread.  g returns an iterator whose next() never
  waits: it hands back the buffered item if there is one and
  blocking_return otherwise, so consumers will see blocking_return
  whenever no item is ready yet.

  Any Exception raised while advancing f's iterator (StopIteration at
  exhaustion included) is stored on the wrapped iterator and raised
  from its next() from then on.

  The background thread refers to the wrapped iterator only through a
  weak reference.  If the wrapped iterator is deleted, the thread
  returns the next time it checks, which also releases the iterator
  returned by f.  sleep_time is the pause, in seconds, between checks
  while a buffered item is still waiting to be consumed.
  """
  def g(*args, **kwargs):
    # f is invoked here, in the caller's thread; only f_iter.next() runs
    # in the background thread.
    f_iter = f(*args, **kwargs)
    # Placeholder; rebound to a weak reference to the Iter instance below,
    # before the thread is started.
    g_iter = None
    def run():
      # Producer loop executed by the background thread.
      while True:
        # Resolve the weak reference; None means the consumer's iterator
        # has been garbage collected and the thread should stop.
        g_obj = g_iter()
        if g_obj is None:
          return
        # Buffer at most one item: fetch only when the queue is empty.
        if g_obj.q.qsize() == 0:
          try:
            f_next = f_iter.next()
          except Exception, e:
            # Includes StopIteration: hand the exception to the consumer
            # via Iter.next() and end the thread.
            g_obj.exc = e
            return
          g_obj.q.put(f_next)
        else:
          # Drop the strong reference before sleeping so this thread does
          # not keep the wrapped iterator alive while it waits.
          del g_obj
          time.sleep(sleep_time)
    class Iter:
      # Consumer-side iterator returned to the caller of g.
      def __init__(self):
        self.q = Queue.Queue()   # items produced by run(), not yet consumed
        self.exc = None          # exception raised by f_iter.next(), if any
        self.thread = threading.Thread(target=run)
        # Daemon thread: it will not keep the interpreter alive at exit.
        self.thread.setDaemon(True)
      def next(self):
        # A stored exception takes priority and is raised on every call.
        if self.exc is not None:
          raise self.exc
        try:
          return self.q.get_nowait()
        except Queue.Empty:
          # Nothing ready yet: report that instead of waiting.
          return blocking_return
      def __iter__(self):
        return self

    obj = Iter()
    # Must be set before the thread starts, since run() calls g_iter().
    g_iter = weakref.ref(obj)
    obj.thread.start()
    try:
      return obj
    finally:
      # Drop this frame's own reference so only the caller's reference
      # keeps the wrapped iterator alive.
      del obj
  return g
def make_searcher(query_url, results_per_page, page_url, page_mode,
                  begin, end, link_re):
  """
  Build and return a search function for one search engine.

  Arguments:

   - query_url: URL template for the first results page; %(q)s is
     replaced by the quoted query string.
   - results_per_page: number of search results per page.
   - page_url: URL template for the 2nd and later results pages, with
     %(q)s for the quoted query string and %(n)s for the page "number".
   - page_mode: how the page "number" is computed:
       'page0'   -- 0-based index of the page
       'page1'   -- 1-based index of the page
       'offset0' -- 0-based index of the search result, which is a
                    multiple of results_per_page
       'offset1' -- 1-based index of the search result (one plus a
                    multiple of results_per_page)
     Any other value raises ValueError once a second page is needed.
   - begin: if not None, only text from the first occurrence of begin
     onwards is used in each results page.
   - end: if not None, only text before the first occurrence of end is
     used.
   - link_re: regex string (see module re) with three named groups,
     'name', 'url' and 'desc', for the name, URL and description of
     each search result.  It is applied in re.DOTALL mode.

  Returns search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True),
  which returns an iterator over (name, url, desc) results.
  """
  def page_number(page_index):
    # Translate the 0-based page index into the value substituted for %(n)s.
    if page_mode == 'page0':
      return page_index
    if page_mode == 'page1':
      return page_index + 1
    if page_mode == 'offset0':
      return page_index * results_per_page
    if page_mode == 'offset1':
      return page_index * results_per_page + 1
    raise ValueError('unknown page mode')

  def fetch_page(query, page_index):
    # The first page uses query_url; every later page uses page_url.
    if page_index == 0:
      return getUrl(query_url % {'q': quote_plus(query)})
    n = page_number(page_index)
    return getUrl(page_url % {'n': n, 'q': quote_plus(query)})

  def search_blocking(query, max_results):
    # Generator producing at most max_results results, fetching a new
    # page only when every link of the previous page has been handed out.
    previous_links = None
    pages_fetched = 0
    pending = Queue.Queue()
    for _ in range(max_results):
      if pending.qsize() == 0:
        page = fetch_page(query, pages_fetched)
        pages_fetched += 1
        links = get_search_page_links(page, results_per_page, begin, end, link_re)
        # Stop on an empty page or when the engine repeats the last page.
        if not links or links == previous_links:
          break
        previous_links = links
        for link in links:
          pending.put(link)
      yield pending.get()

  search_nonblocking = nonblocking(search_blocking)

  def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
    """
    Return an iterator over (name, url, desc) results for query.

    At most max_results results are produced.  If blocking is true the
    iterator is the plain generator; otherwise it is that generator
    wrapped by nonblocking().
    """
    if not blocking:
      return search_nonblocking(query, max_results)
    return search_blocking(query, max_results)

  return search

# Google web search: 10 results per page; later pages are requested with
# the 0-based result offset in the "start" parameter ('offset0').  The
# whole page is scanned (no begin/end markers).
find = make_searcher(
  query_url='http://www.google.com/search?q=%(q)s',
  results_per_page=10,
  page_url='http://www.google.com/search?start=%(n)d&q=%(q)s',
  page_mode='offset0',
  begin=None,
  end=None,
  link_re=(r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>'
           r'.*?(?:<br>|<table.*?>)'
           r'(?P<desc>.*?)' '(?:<font color=#008000>|<a)'))
lets start with google and imdb 2008-04-28 09:52:21 +00:00			`# -- Mode: Python; --`
			`# -- coding: utf-8 --`
			`# vi:si:et:sw=2:sts=2:ts=2`
			`import re`
			`import time`
			`import urllib`
			`import urllib2`
			`import weakref`
			`import threading`
			`import Queue`

			`import oxutils`
			`from oxutils import stripTags`


			`'''`
spelling 2008-04-29 14:20:03 +00:00			`FIXME this module should be replaced by something smaller. a simple find function would do.`

lets start with google and imdb 2008-04-28 09:52:21 +00:00			`usage:`
			`import google`
			`google.find(query)`
			`<generator object at 0x833aeac>`

			`for result in google.find(query): result`

			`result is title, url, description`

			`google.find(query, max_results)`

			`'''`
			`DEFAULT_MAX_RESULTS = 10`

			`def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):`
			`google_timeout=246060`
			`return oxutils.cache.getUrl(url, data, headers, google_timeout)`

			`def quote_plus(s):`
			`return urllib.quote_plus(s.encode('utf-8'))`

			`def get_search_page_links(page, results_per_page, begin, end, link_re):`
			`"""`
			`Given str contents of search result page, return list of links.`

			`Returns list of (name, url, desc) str tuples. See make_searcher()`
			`for a description of results_per_page and link_re.`
			`"""`
			`if begin is not None and begin in page:`
			`page = page[page.index(begin):]`
			`if end is not None and end in page:`
			`page = page[:page.index(end)]`
			`ans = []`
			`for match in re.compile(link_re, re.DOTALL).finditer(page):`
			`(name, url, desc) = match.group('name', 'url', 'desc')`
			`ans += [(stripTags(name), url, stripTags(desc))]`
			`return ans`


			`def nonblocking(f, blocking_return=None, sleep_time=0.01):`
			`"""`
			`Wrap a callable which returns an iter so that it no longer blocks.`

			`The wrapped iterator returns blocking_return while callable f is`
			`blocking. The callable f is called in a background thread. If the`
			`wrapped iterator is deleted, then the iterator returned by f is`
			`deleted also and the background thread is terminated.`
			`"""`
			`def g(args, *kwargs):`
			`f_iter = f(args, *kwargs)`
			`g_iter = None`
			`def run():`
			`while True:`
			`g_obj = g_iter()`
			`if g_obj is None:`
			`return`
			`if g_obj.q.qsize() == 0:`
			`try:`
			`f_next = f_iter.next()`
			`except Exception, e:`
			`g_obj.exc = e`
			`return`
			`g_obj.q.put(f_next)`
			`else:`
			`del g_obj`
			`time.sleep(sleep_time)`
			`class Iter:`
			`def __init__(self):`
			`self.q = Queue.Queue()`
			`self.exc = None`
			`self.thread = threading.Thread(target=run)`
			`self.thread.setDaemon(True)`
			`def next(self):`
			`if self.exc is not None:`
			`raise self.exc`
			`try:`
			`return self.q.get_nowait()`
			`except Queue.Empty:`
			`return blocking_return`
			`def __iter__(self):`
			`return self`

			`obj = Iter()`
			`g_iter = weakref.ref(obj)`
			`obj.thread.start()`
			`try:`
			`return obj`
			`finally:`
			`del obj`
			`return g`
			`def make_searcher(query_url, results_per_page, page_url, page_mode,`
			`begin, end, link_re):`
			`"""`
			`Return a search function for the given search engine.`

			`Here query_url is the URL for the initial search, with %(q)s for`
			`the query string, results_per_page is the number of search results`
			`per page, page_url is the URL for the 2nd and subsequent pages of`
			`search results, with %(q)s for the query string and %(n)s for the`
			`page "number." Here page_mode controls the actual value for the`
			`page "number:"`

			`- page_mode='page0': Use 0-based index of the page.`
			`- page_mode='page1': Use 1-based index of the page.`
			`- page_mode='offset0': Use 0-based index of the search result,`
			`which is a multiple of results_per_page.`
			`- page_mode='offset1': Use 1-based index of the search result`
			`(one plus a multiple of results_per_page).`

			`If begin is not None, then only text after the first occurrence of`
			`begin will be used in the search results page. If end is not None,`
			`then only text before the first occurrence of end will be used.`

			`Finally, link_re is a regex string (see module re) which matches`
			`three named groups: 'name', 'url', and 'desc'. These correspond to`
			`the name, URL and description of each search result. The regex is`
			`applied in re.DOTALL mode.`

			`Returns a search() function which has the same interface as`
			`described in the module docstring.`
			`"""`
			`def search_blocking(query, max_results):`
			`last_links = None`
			`page_num = 0`
			`q = Queue.Queue()`
			`for i in range(max_results):`
			`if q.qsize() == 0:`
			`if page_num == 0:`
			`page = getUrl(query_url % {'q': quote_plus(query)})`
			`else:`
			`if page_mode == 'page0':`
			`n = page_num`
			`elif page_mode == 'page1':`
			`n = page_num + 1`
			`elif page_mode == 'offset0':`
			`n = page_num * results_per_page`
			`elif page_mode == 'offset1':`
			`n = page_num * results_per_page + 1`
			`else:`
			`raise ValueError('unknown page mode')`
			`page = getUrl(page_url % {'n': n, 'q': quote_plus(query)})`
			`page_num += 1`
			`links = get_search_page_links(page, results_per_page, begin, end, link_re)`
			`if len(links) == 0 or links == last_links:`
			`break`
			`last_links = links`
			`for link in links:`
			`q.put(link)`
			`yield q.get()`

			`search_nonblocking = nonblocking(search_blocking)`

			`def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):`
			`"""`
			`See docstring for web_search module.`
			`"""`
			`if blocking:`
			`return search_blocking(query, max_results)`
			`else:`
			`return search_nonblocking(query, max_results)`

			`return search`

			`find = make_searcher('http://www.google.com/search?q=%(q)s', 10,`
			`'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',`
			`None, None,`
			`r'<a href="(?P<url>[^"]?)" class=l.?>(?P<name>.*?)</a>' +`
			`r'.?(?:<br>\|<table.?>)' +`
			`r'(?P<desc>.*?)' + '(?:<font color=#008000>\|<a)')`