Just a one-page find version of google.find.
This commit is contained in:
parent
101f230bd1
commit
7a53ee62b9
1 changed file with 13 additions and 148 deletions
161
ox/google.py
161
ox/google.py
|
@ -14,8 +14,6 @@ from oxutils import stripTags
|
|||
|
||||
|
||||
'''
FIXME this module should be replaced by something smaller. a simple find function would do.

usage:
import google
google.find(query)

result is title, url, description

google.find(query, max_results)

FIXME: how to search deeper than the first page?
'''
# Default number of (title, url, description) results returned by find().
DEFAULT_MAX_RESULTS = 10
|
||||
|
||||
|
@ -37,152 +36,18 @@ def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
|
|||
def quote_plus(s):
    """URL-quote a unicode query string as UTF-8, with spaces as '+'."""
    utf8_bytes = s.encode('utf-8')
    return urllib.quote_plus(utf8_bytes)
|
||||
|
||||
def get_search_page_links(page, results_per_page, begin, end, link_re):
|
||||
"""
|
||||
Given str contents of search result page, return list of links.
|
||||
|
||||
Returns list of (name, url, desc) str tuples. See make_searcher()
|
||||
for a description of results_per_page and link_re.
|
||||
"""
|
||||
if begin is not None and begin in page:
|
||||
page = page[page.index(begin):]
|
||||
if end is not None and end in page:
|
||||
page = page[:page.index(end)]
|
||||
ans = []
|
||||
for match in re.compile(link_re, re.DOTALL).finditer(page):
|
||||
def find(query, max_results=DEFAULT_MAX_RESULTS):
    """
    Search Google for query and return up to max_results results.

    Only the first result page is fetched, so more than one page's worth
    of results cannot be returned (see module FIXME).

    Returns a list of (title, url, description) str tuples with HTML
    tags stripped from title and description.
    """
    url = "http://www.google.com/search?q=%s" % quote_plus(query)
    data = getUrl(url)
    # Each result link: anchor with class=l, then (after a <br> or table)
    # the description, terminated by the green URL line or the next anchor.
    link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
        r'.*?(?:<br>|<table.*?>)' + \
        r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
    results = []
    for match in re.compile(link_re, re.DOTALL).finditer(data):
        (name, url, desc) = match.group('name', 'url', 'desc')
        # append the tuple itself -- wrapping it in a one-element list
        # would yield a list of lists, not the documented tuples
        results.append((stripTags(name), url, stripTags(desc)))
        # stop as soon as we have enough instead of truncating afterwards
        if len(results) >= max_results:
            break
    return results
|
||||
|
||||
|
||||
def nonblocking(f, blocking_return=None, sleep_time=0.01):
    """
    Wrap a callable which returns an iter so that it no longer blocks.

    The wrapped iterator returns blocking_return while callable f is
    blocking. The callable f is called in a background thread. If the
    wrapped iterator is deleted, then the iterator returned by f is
    deleted also and the background thread is terminated.
    """
    def g(*args, **kwargs):
        f_iter = f(*args, **kwargs)
        # g_iter is rebound below to a weakref to the Iter instance; the
        # background thread uses it to notice when the consumer has
        # dropped the iterator.
        g_iter = None
        def run():
            while True:
                # Dereference the weakref; None means the consumer deleted
                # the Iter object, so the producer thread can exit.
                g_obj = g_iter()
                if g_obj is None:
                    return
                # Only fetch the next item when the queue is empty, so at
                # most one item is buffered ahead of the consumer.
                if g_obj.q.qsize() == 0:
                    try:
                        f_next = f_iter.next()
                    except Exception, e:
                        # Stash the exception; Iter.next() re-raises it in
                        # the consumer's thread.
                        g_obj.exc = e
                        return
                    g_obj.q.put(f_next)
                else:
                    # Drop our strong reference before sleeping so the
                    # weakref can die while we wait.
                    del g_obj
                time.sleep(sleep_time)
        class Iter:
            # Non-blocking iterator handed back to the caller; items are
            # delivered through self.q by the background thread.
            def __init__(self):
                self.q = Queue.Queue()
                self.exc = None          # exception raised by f_iter, if any
                self.thread = threading.Thread(target=run)
                self.thread.setDaemon(True)  # don't keep the process alive
            def next(self):
                if self.exc is not None:
                    raise self.exc
                try:
                    return self.q.get_nowait()
                except Queue.Empty:
                    # Producer hasn't delivered yet: signal "still blocking".
                    return blocking_return
            def __iter__(self):
                return self
        obj = Iter()
        # Weak reference only: the consumer owns obj; when it goes away the
        # run() loop sees None and terminates.
        g_iter = weakref.ref(obj)
        obj.thread.start()
        try:
            return obj
        finally:
            # Drop the local strong reference so the consumer's reference
            # is the only one keeping the iterator (and thread) alive.
            del obj
    return g
|
||||
def make_searcher(query_url, results_per_page, page_url, page_mode,
        begin, end, link_re):
    """
    Return a search function for the given search engine.

    Here query_url is the URL for the initial search, with %(q)s for
    the query string, results_per_page is the number of search results
    per page, page_url is the URL for the 2nd and subsequent pages of
    search results, with %(q)s for the query string and %(n)s for the
    page "number." Here page_mode controls the actual value for the
    page "number:"

     - page_mode='page0':   Use 0-based index of the page.
     - page_mode='page1':   Use 1-based index of the page.
     - page_mode='offset0': Use 0-based index of the search result,
                            which is a multiple of results_per_page.
     - page_mode='offset1': Use 1-based index of the search result
                            (one plus a multiple of results_per_page).

    If begin is not None, then only text after the first occurrence of
    begin will be used in the search results page. If end is not None,
    then only text before the first occurrence of end will be used.

    Finally, link_re is a regex string (see module re) which matches
    three named groups: 'name', 'url', and 'desc'. These correspond to
    the name, URL and description of each search result. The regex is
    applied in re.DOTALL mode.

    Returns a search() function which has the same interface as
    described in the module docstring.
    """
    def search_blocking(query, max_results):
        # Generator yielding one (name, url, desc) link at a time,
        # fetching further result pages lazily as the buffer empties.
        last_links = None
        page_num = 0
        q = Queue.Queue()
        for i in range(max_results):
            if q.qsize() == 0:
                # Buffer exhausted: fetch the next results page.
                if page_num == 0:
                    page = getUrl(query_url % {'q': quote_plus(query)})
                else:
                    # Translate the page index into the engine's paging
                    # scheme (see page_mode in the docstring above).
                    if page_mode == 'page0':
                        n = page_num
                    elif page_mode == 'page1':
                        n = page_num + 1
                    elif page_mode == 'offset0':
                        n = page_num * results_per_page
                    elif page_mode == 'offset1':
                        n = page_num * results_per_page + 1
                    else:
                        raise ValueError('unknown page mode')
                    page = getUrl(page_url % {'n': n, 'q': quote_plus(query)})
                page_num += 1
                links = get_search_page_links(page, results_per_page, begin, end, link_re)
                # Stop on an empty page or when the engine repeats the
                # previous page (no more results).
                if len(links) == 0 or links == last_links:
                    break
                last_links = links
                for link in links:
                    q.put(link)
            yield q.get()

    # Non-blocking variant: yields blocking_return (None) while a page
    # fetch is in flight; see nonblocking() above.
    search_nonblocking = nonblocking(search_blocking)

    def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
        """
        See docstring for web_search module.
        """
        if blocking:
            return search_blocking(query, max_results)
        else:
            return search_nonblocking(query, max_results)

    return search
|
||||
|
||||
# Public Google search function: first page via /search?q=..., later pages
# addressed by 0-based result offset through the start= parameter.
find = make_searcher(
    'http://www.google.com/search?q=%(q)s', 10,
    'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
    None, None,
    # anchor with class=l, then (after <br> or a table) the description,
    # terminated by the green URL line or the next anchor
    r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>'
    r'.*?(?:<br>|<table.*?>)'
    r'(?P<desc>.*?)' '(?:<font color=#008000>|<a)')
|
||||
|
||||
|
|
Loading…
Reference in a new issue