diff --git a/ox/google.py b/ox/google.py
index da508ec..cf08686 100644
--- a/ox/google.py
+++ b/ox/google.py
@@ -14,8 +14,6 @@ from oxutils import stripTags
'''
-FIXME this module should be replaced by something smaller. a simple find function would do.
-
usage:
import google
google.find(query)
@@ -27,6 +25,7 @@ result is title, url, description
google.find(query, max_results)
+FIXME: how to search deeper than the first page?
'''
DEFAULT_MAX_RESULTS = 10
@@ -37,152 +36,18 @@ def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
def quote_plus(s):
return urllib.quote_plus(s.encode('utf-8'))
-def get_search_page_links(page, results_per_page, begin, end, link_re):
- """
- Given str contents of search result page, return list of links.
-
- Returns list of (name, url, desc) str tuples. See make_searcher()
- for a description of results_per_page and link_re.
- """
- if begin is not None and begin in page:
- page = page[page.index(begin):]
- if end is not None and end in page:
- page = page[:page.index(end)]
- ans = []
- for match in re.compile(link_re, re.DOTALL).finditer(page):
+def find(query, max_results=DEFAULT_MAX_RESULTS):
+ url = "http://www.google.com/search?q=%s" % quote_plus(query)
+ data = getUrl(url)
+ link_re = r'(?P.*?)' + \
+ r'.*?(?:
|
)' + \
+ r'(?P.*?)' + '(?:| max_results:
+ results = results[:max_results]
+ return results
-def nonblocking(f, blocking_return=None, sleep_time=0.01):
- """
- Wrap a callable which returns an iter so that it no longer blocks.
-
- The wrapped iterator returns blocking_return while callable f is
- blocking. The callable f is called in a background thread. If the
- wrapped iterator is deleted, then the iterator returned by f is
- deleted also and the background thread is terminated.
- """
- def g(*args, **kwargs):
- f_iter = f(*args, **kwargs)
- g_iter = None
- def run():
- while True:
- g_obj = g_iter()
- if g_obj is None:
- return
- if g_obj.q.qsize() == 0:
- try:
- f_next = f_iter.next()
- except Exception, e:
- g_obj.exc = e
- return
- g_obj.q.put(f_next)
- else:
- del g_obj
- time.sleep(sleep_time)
- class Iter:
- def __init__(self):
- self.q = Queue.Queue()
- self.exc = None
- self.thread = threading.Thread(target=run)
- self.thread.setDaemon(True)
- def next(self):
- if self.exc is not None:
- raise self.exc
- try:
- return self.q.get_nowait()
- except Queue.Empty:
- return blocking_return
- def __iter__(self):
- return self
-
- obj = Iter()
- g_iter = weakref.ref(obj)
- obj.thread.start()
- try:
- return obj
- finally:
- del obj
- return g
-def make_searcher(query_url, results_per_page, page_url, page_mode,
- begin, end, link_re):
- """
- Return a search function for the given search engine.
-
- Here query_url is the URL for the initial search, with %(q)s for
- the query string, results_per_page is the number of search results
- per page, page_url is the URL for the 2nd and subsequent pages of
- search results, with %(q)s for the query string and %(n)s for the
- page "number." Here page_mode controls the actual value for the
- page "number:"
-
- - page_mode='page0': Use 0-based index of the page.
- - page_mode='page1': Use 1-based index of the page.
- - page_mode='offset0': Use 0-based index of the search result,
- which is a multiple of results_per_page.
- - page_mode='offset1': Use 1-based index of the search result
- (one plus a multiple of results_per_page).
-
- If begin is not None, then only text after the first occurrence of
- begin will be used in the search results page. If end is not None,
- then only text before the first occurrence of end will be used.
-
- Finally, link_re is a regex string (see module re) which matches
- three named groups: 'name', 'url', and 'desc'. These correspond to
- the name, URL and description of each search result. The regex is
- applied in re.DOTALL mode.
-
- Returns a search() function which has the same interface as
- described in the module docstring.
- """
- def search_blocking(query, max_results):
- last_links = None
- page_num = 0
- q = Queue.Queue()
- for i in range(max_results):
- if q.qsize() == 0:
- if page_num == 0:
- page = getUrl(query_url % {'q': quote_plus(query)})
- else:
- if page_mode == 'page0':
- n = page_num
- elif page_mode == 'page1':
- n = page_num + 1
- elif page_mode == 'offset0':
- n = page_num * results_per_page
- elif page_mode == 'offset1':
- n = page_num * results_per_page + 1
- else:
- raise ValueError('unknown page mode')
- page = getUrl(page_url % {'n': n, 'q': quote_plus(query)})
- page_num += 1
- links = get_search_page_links(page, results_per_page, begin, end, link_re)
- if len(links) == 0 or links == last_links:
- break
- last_links = links
- for link in links:
- q.put(link)
- yield q.get()
-
- search_nonblocking = nonblocking(search_blocking)
-
- def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
- """
- See docstring for web_search module.
- """
- if blocking:
- return search_blocking(query, max_results)
- else:
- return search_nonblocking(query, max_results)
-
- return search
-
-find = make_searcher('http://www.google.com/search?q=%(q)s', 10,
- 'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
- None, None,
- r'(?P.*?)' +
- r'.*?(?:
|)' +
- r'(?P.*?)' + '(?:|