# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue

import oxutils
from oxutils import stripTags

'''
FIXME: this function should be replaced by a smaller, more minimal find function.

Usage:

    import google
    google.find(query)
    <generator object at 0x833aeac>

    for result in google.find(query): result

    Each result is a (title, url, description) tuple.

    google.find(query, max_results)
'''

DEFAULT_MAX_RESULTS = 10

def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
  # Cache Google result pages for one day.
  google_timeout = 24 * 60 * 60
  return oxutils.cache.getUrl(url, data, headers, google_timeout)

def quote_plus(s):
  return urllib.quote_plus(s.encode('utf-8'))

def get_search_page_links(page, results_per_page, begin, end, link_re):
  """
  Given the str contents of a search result page, return a list of links.

  Returns a list of (name, url, desc) str tuples. See make_searcher()
  for a description of results_per_page and link_re.
  """
  if begin is not None and begin in page:
    page = page[page.index(begin):]
  if end is not None and end in page:
    page = page[:page.index(end)]
  ans = []
  for match in re.compile(link_re, re.DOTALL).finditer(page):
    (name, url, desc) = match.group('name', 'url', 'desc')
    ans += [(stripTags(name), url, stripTags(desc))]
  return ans
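
# Illustrative example, not from the original source; the HTML and regex
# are hypothetical:
#
#   page = '<a href="http://example.com/">Example <b>Site</b></a> - a <i>demo</i>'
#   link_re = r'<a href="(?P<url>[^"]*)">(?P<name>.*?)</a> - (?P<desc>.*)'
#   get_search_page_links(page, 10, None, None, link_re)
#   # -> [('Example Site', 'http://example.com/', 'a demo')]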

def nonblocking(f, blocking_return=None, sleep_time=0.01):
  """
  Wrap a callable which returns an iter so that it no longer blocks.

  The wrapped iterator returns blocking_return while the callable f is
  blocking. The callable f is called in a background thread. If the
  wrapped iterator is deleted, then the iterator returned by f is
  deleted also and the background thread is terminated.
  """
  def g(*args, **kwargs):
    f_iter = f(*args, **kwargs)
    g_iter = None
    def run():
      while True:
        # Take a strong reference each pass; if the weakref is dead,
        # the consumer is gone and the thread can exit.
        g_obj = g_iter()
        if g_obj is None:
          return
        if g_obj.q.qsize() == 0:
          try:
            f_next = f_iter.next()
          except Exception, e:
            # Includes StopIteration, which Iter.next() re-raises.
            g_obj.exc = e
            return
          g_obj.q.put(f_next)
        else:
          # Drop the strong reference while sleeping so the iterator
          # can be garbage collected.
          del g_obj
          time.sleep(sleep_time)
    class Iter:
      def __init__(self):
        self.q = Queue.Queue()
        self.exc = None
        self.thread = threading.Thread(target=run)
        self.thread.setDaemon(True)
      def next(self):
        if self.exc is not None:
          raise self.exc
        try:
          return self.q.get_nowait()
        except Queue.Empty:
          return blocking_return
      def __iter__(self):
        return self

    obj = Iter()
    g_iter = weakref.ref(obj)
    obj.thread.start()
    try:
      return obj
    finally:
      # Drop the local reference so only the caller keeps obj alive.
      del obj
  return g
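
# A minimal sketch, not from the original source, of how nonblocking() is
# meant to be used; slow_source() and handle() are hypothetical names:
#
#   fast_source = nonblocking(slow_source)
#   for item in fast_source():
#     if item is None:      # blocking_return: slow_source is still working
#       time.sleep(0.1)
#       continue
#     handle(item)          # a real result from slow_source
#
# Iteration ends when the background thread stores the StopIteration raised
# by slow_source into exc and Iter.next() re-raises it.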

def make_searcher(query_url, results_per_page, page_url, page_mode,
                  begin, end, link_re):
  """
  Return a search function for the given search engine.

  Here query_url is the URL for the initial search, with %(q)s for the
  query string; results_per_page is the number of search results per
  page; and page_url is the URL for the 2nd and subsequent pages of
  search results, with %(q)s for the query string and %(n)d for the
  page "number". Here page_mode controls the actual value used for the
  page "number":

   - page_mode='page0':   use the 0-based index of the page.
   - page_mode='page1':   use the 1-based index of the page.
   - page_mode='offset0': use the 0-based index of the search result,
     which is a multiple of results_per_page.
   - page_mode='offset1': use the 1-based index of the search result
     (one plus a multiple of results_per_page).

  If begin is not None, then only text after the first occurrence of
  begin will be used in the search results page. If end is not None,
  then only text before the first occurrence of end will be used.

  Finally, link_re is a regex string (see module re) which matches
  three named groups: 'name', 'url', and 'desc'. These correspond to
  the name, URL and description of each search result. The regex is
  applied in re.DOTALL mode.

  Returns a search() function which has the same interface as described
  in the module docstring.
  """
  def search_blocking(query, max_results):
    last_links = None
    page_num = 0
    q = Queue.Queue()
    for i in range(max_results):
      if q.qsize() == 0:
        # The queue is drained: fetch and parse the next results page.
        if page_num == 0:
          page = getUrl(query_url % {'q': quote_plus(query)})
        else:
          if page_mode == 'page0':
            n = page_num
          elif page_mode == 'page1':
            n = page_num + 1
          elif page_mode == 'offset0':
            n = page_num * results_per_page
          elif page_mode == 'offset1':
            n = page_num * results_per_page + 1
          else:
            raise ValueError('unknown page mode')
          page = getUrl(page_url % {'n': n, 'q': quote_plus(query)})
        page_num += 1
        links = get_search_page_links(page, results_per_page, begin, end, link_re)
        # Stop on an empty page or a repeat of the previous page.
        if len(links) == 0 or links == last_links:
          break
        last_links = links
        for link in links:
          q.put(link)
      yield q.get()

  search_nonblocking = nonblocking(search_blocking)

  def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
    """
    See the module docstring for the interface.
    """
    if blocking:
      return search_blocking(query, max_results)
    else:
      return search_nonblocking(query, max_results)

  return search

find = make_searcher('http://www.google.com/search?q=%(q)s', 10,
                     'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
                     None, None,
                     r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' +
                     r'.*?(?:<br>|<table.*?>)' +
                     r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)')
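
if __name__ == '__main__':
  # Minimal smoke test, not part of the original module: print the first
  # few results for a sample query. Note that the link_re above targets an
  # old Google results page layout and may no longer match the live site.
  for title, url, desc in find('python', 3):
    print '%s\n  %s\n  %s\n' % (title, url, desc)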