diff --git a/ox/google.py b/ox/google.py
index da508ec..cf08686 100644
--- a/ox/google.py
+++ b/ox/google.py
@@ -14,8 +14,6 @@ from oxutils import stripTags
 
 '''
-FIXME this module should be replaced by something smaller. a simple find function would do.
-
 usage:
 import google
 google.find(query)
 
@@ -27,6 +25,7 @@ result is title, url, description
 
 google.find(query, max_results)
 
+FIXME: how to search deeper than first page?
 '''
 DEFAULT_MAX_RESULTS = 10
 
@@ -37,152 +36,18 @@ def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
 def quote_plus(s):
     return urllib.quote_plus(s.encode('utf-8'))
 
-def get_search_page_links(page, results_per_page, begin, end, link_re):
-    """
-    Given str contents of search result page, return list of links.
-
-    Returns list of (name, url, desc) str tuples. See make_searcher()
-    for a description of results_per_page and link_re.
-    """
-    if begin is not None and begin in page:
-        page = page[page.index(begin):]
-    if end is not None and end in page:
-        page = page[:page.index(end)]
-    ans = []
-    for match in re.compile(link_re, re.DOTALL).finditer(page):
+def find(query, max_results=DEFAULT_MAX_RESULTS):
+    url = "http://www.google.com/search?q=%s" % quote_plus(query)
+    data = getUrl(url)
+    link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
+              r'.*?(?:<br>|<table.*?>)' + \
+              r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
+    results = []
+    for match in re.compile(link_re, re.DOTALL).finditer(data):
         (name, url, desc) = match.group('name', 'url', 'desc')
-        ans += [(stripTags(name), url, stripTags(desc))]
-    return ans
+        results.append((stripTags(name), url, stripTags(desc)))
+    if len(results) > max_results:
+        results = results[:max_results]
+    return results
 
-def nonblocking(f, blocking_return=None, sleep_time=0.01):
-    """
-    Wrap a callable which returns an iter so that it no longer blocks.
-
-    The wrapped iterator returns blocking_return while callable f is
-    blocking. The callable f is called in a background thread. If the
-    wrapped iterator is deleted, then the iterator returned by f is
-    deleted also and the background thread is terminated.
-    """
-    def g(*args, **kwargs):
-        f_iter = f(*args, **kwargs)
-        g_iter = None
-        def run():
-            while True:
-                g_obj = g_iter()
-                if g_obj is None:
-                    return
-                if g_obj.q.qsize() == 0:
-                    try:
-                        f_next = f_iter.next()
-                    except Exception, e:
-                        g_obj.exc = e
-                        return
-                    g_obj.q.put(f_next)
-                else:
-                    del g_obj
-                    time.sleep(sleep_time)
-        class Iter:
-            def __init__(self):
-                self.q = Queue.Queue()
-                self.exc = None
-                self.thread = threading.Thread(target=run)
-                self.thread.setDaemon(True)
-            def next(self):
-                if self.exc is not None:
-                    raise self.exc
-                try:
-                    return self.q.get_nowait()
-                except Queue.Empty:
-                    return blocking_return
-            def __iter__(self):
-                return self
-
-        obj = Iter()
-        g_iter = weakref.ref(obj)
-        obj.thread.start()
-        try:
-            return obj
-        finally:
-            del obj
-    return g
 
-def make_searcher(query_url, results_per_page, page_url, page_mode,
-                  begin, end, link_re):
-    """
-    Return a search function for the given search engine.
-
-    Here query_url is the URL for the initial search, with %(q)s for
-    the query string, results_per_page is the number of search results
-    per page, page_url is the URL for the 2nd and subsequent pages of
-    search results, with %(q)s for the query string and %(n)s for the
-    page "number." Here page_mode controls the actual value for the
-    page "number:"
-
-     - page_mode='page0':   Use 0-based index of the page.
-     - page_mode='page1':   Use 1-based index of the page.
-     - page_mode='offset0': Use 0-based index of the search result,
-                            which is a multiple of results_per_page.
-     - page_mode='offset1': Use 1-based index of the search result
-                            (one plus a multiple of results_per_page).
-
-    If begin is not None, then only text after the first occurrence of
-    begin will be used in the search results page. If end is not None,
-    then only text before the first occurrence of end will be used.
-
-    Finally, link_re is a regex string (see module re) which matches
-    three named groups: 'name', 'url', and 'desc'. These correspond to
-    the name, URL and description of each search result. The regex is
-    applied in re.DOTALL mode.
-
-    Returns a search() function which has the same interface as
-    described in the module docstring.
- """ - def search_blocking(query, max_results): - last_links = None - page_num = 0 - q = Queue.Queue() - for i in range(max_results): - if q.qsize() == 0: - if page_num == 0: - page = getUrl(query_url % {'q': quote_plus(query)}) - else: - if page_mode == 'page0': - n = page_num - elif page_mode == 'page1': - n = page_num + 1 - elif page_mode == 'offset0': - n = page_num * results_per_page - elif page_mode == 'offset1': - n = page_num * results_per_page + 1 - else: - raise ValueError('unknown page mode') - page = getUrl(page_url % {'n': n, 'q': quote_plus(query)}) - page_num += 1 - links = get_search_page_links(page, results_per_page, begin, end, link_re) - if len(links) == 0 or links == last_links: - break - last_links = links - for link in links: - q.put(link) - yield q.get() - - search_nonblocking = nonblocking(search_blocking) - - def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True): - """ - See docstring for web_search module. - """ - if blocking: - return search_blocking(query, max_results) - else: - return search_nonblocking(query, max_results) - - return search - -find = make_searcher('http://www.google.com/search?q=%(q)s', 10, - 'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0', - None, None, - r'(?P.*?)' + - r'.*?(?:
-                     r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)')
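For reference, a minimal sketch of what the new find() extracts with its link_re. The regex mirrors the one in the patch; the HTML snippet and the example.com values are invented stand-ins shaped like Google's result markup of the era (the live markup has long since changed), and the commented call at the end additionally assumes a working oxutils install and network access.

# Illustration only: exercise the patch's link_re against a canned,
# 2008-era-shaped Google results snippet (invented for this example).
import re

link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
          r'.*?(?:<br>|<table.*?>)' + \
          r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'

page = ('<a href="http://example.com/" class=l>Example <b>Site</b></a>'
        '<br>A short description.<font color=#008000>example.com</font>')

for match in re.compile(link_re, re.DOTALL).finditer(page):
    # find() passes name and desc through stripTags() before returning
    print match.group('name', 'url', 'desc')
# -> ('Example <b>Site</b>', 'http://example.com/', 'A short description.')

# Real usage, as in the module docstring (needs oxutils + network):
# import google
# for name, url, desc in google.find('python', max_results=5):
#     print '%s <%s>: %s' % (name, url, desc)

Note the trade-off the new FIXME records: unlike make_searcher, which paged through results via the start=%(n)d URL, find() fetches only the first results page, so max_results is merely a truncation limit.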