just a one page find version of google.find
parent 101f230bd1
commit 7a53ee62b9
1 changed file with 13 additions and 148 deletions
ox/google.py
@@ -14,8 +14,6 @@ from oxutils import stripTags
 
 
 '''
-FIXME this module should be replaced by something smaller. a simple find function would do.
-
 usage:
 import google
 google.find(query)
@@ -27,6 +25,7 @@ result is title, url, description
 
 google.find(query, max_results)
 
+FIXME: how to search deeper than the first page?
 '''
 DEFAULT_MAX_RESULTS = 10
 
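A note on the FIXME added above: the paging scheme this commit deletes further down drove Google's start parameter through make_searcher's 'offset0' mode, i.e. a 0-based result offset in steps of 10. A minimal sketch of how find could be extended along those lines, reusing the module's getUrl, quote_plus, and stripTags helpers; find_paged and its pages parameter are hypothetical, not part of this commit:

    import re

    # Hedged sketch, not committed code: walk result pages with the
    # 'start' offset the deleted make_searcher configuration used
    # ('offset0' mode: 0-based result offset, 10 results per page).
    def find_paged(query, pages=2, results_per_page=10):
        link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
            r'.*?(?:<br>|<table.*?>)' + \
            r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
        results = []
        for page_num in range(pages):
            offset = page_num * results_per_page
            url = 'http://www.google.com/search?start=%d&q=%s' % (offset, quote_plus(query))
            data = getUrl(url)
            for match in re.compile(link_re, re.DOTALL).finditer(data):
                name, hit_url, desc = match.group('name', 'url', 'desc')
                results.append((stripTags(name), hit_url, stripTags(desc)))
        return results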
@@ -37,152 +36,18 @@ def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
 def quote_plus(s):
     return urllib.quote_plus(s.encode('utf-8'))
 
-def get_search_page_links(page, results_per_page, begin, end, link_re):
-    """
-    Given str contents of search result page, return list of links.
-
-    Returns list of (name, url, desc) str tuples. See make_searcher()
-    for a description of results_per_page and link_re.
-    """
-    if begin is not None and begin in page:
-        page = page[page.index(begin):]
-    if end is not None and end in page:
-        page = page[:page.index(end)]
-    ans = []
-    for match in re.compile(link_re, re.DOTALL).finditer(page):
-        (name, url, desc) = match.group('name', 'url', 'desc')
-        ans += [(stripTags(name), url, stripTags(desc))]
-    return ans
-
-def nonblocking(f, blocking_return=None, sleep_time=0.01):
-    """
-    Wrap a callable which returns an iter so that it no longer blocks.
-
-    The wrapped iterator returns blocking_return while callable f is
-    blocking. The callable f is called in a background thread. If the
-    wrapped iterator is deleted, then the iterator returned by f is
-    deleted also and the background thread is terminated.
-    """
-    def g(*args, **kwargs):
-        f_iter = f(*args, **kwargs)
-        g_iter = None
-        def run():
-            while True:
-                g_obj = g_iter()
-                if g_obj is None:
-                    return
-                if g_obj.q.qsize() == 0:
-                    try:
-                        f_next = f_iter.next()
-                    except Exception, e:
-                        g_obj.exc = e
-                        return
-                    g_obj.q.put(f_next)
-                else:
-                    del g_obj
-                time.sleep(sleep_time)
-        class Iter:
-            def __init__(self):
-                self.q = Queue.Queue()
-                self.exc = None
-                self.thread = threading.Thread(target=run)
-                self.thread.setDaemon(True)
-            def next(self):
-                if self.exc is not None:
-                    raise self.exc
-                try:
-                    return self.q.get_nowait()
-                except Queue.Empty:
-                    return blocking_return
-            def __iter__(self):
-                return self
-
-        obj = Iter()
-        g_iter = weakref.ref(obj)
-        obj.thread.start()
-        try:
-            return obj
-        finally:
-            del obj
-    return g
-
-def make_searcher(query_url, results_per_page, page_url, page_mode,
-                  begin, end, link_re):
-    """
-    Return a search function for the given search engine.
-
-    Here query_url is the URL for the initial search, with %(q)s for
-    the query string, results_per_page is the number of search results
-    per page, page_url is the URL for the 2nd and subsequent pages of
-    search results, with %(q)s for the query string and %(n)s for the
-    page "number." Here page_mode controls the actual value for the
-    page "number:"
-
-     - page_mode='page0': Use 0-based index of the page.
-     - page_mode='page1': Use 1-based index of the page.
-     - page_mode='offset0': Use 0-based index of the search result,
-       which is a multiple of results_per_page.
-     - page_mode='offset1': Use 1-based index of the search result
-       (one plus a multiple of results_per_page).
-
-    If begin is not None, then only text after the first occurrence of
-    begin will be used in the search results page. If end is not None,
-    then only text before the first occurrence of end will be used.
-
-    Finally, link_re is a regex string (see module re) which matches
-    three named groups: 'name', 'url', and 'desc'. These correspond to
-    the name, URL and description of each search result. The regex is
-    applied in re.DOTALL mode.
-
-    Returns a search() function which has the same interface as
-    described in the module docstring.
-    """
-    def search_blocking(query, max_results):
-        last_links = None
-        page_num = 0
-        q = Queue.Queue()
-        for i in range(max_results):
-            if q.qsize() == 0:
-                if page_num == 0:
-                    page = getUrl(query_url % {'q': quote_plus(query)})
-                else:
-                    if page_mode == 'page0':
-                        n = page_num
-                    elif page_mode == 'page1':
-                        n = page_num + 1
-                    elif page_mode == 'offset0':
-                        n = page_num * results_per_page
-                    elif page_mode == 'offset1':
-                        n = page_num * results_per_page + 1
-                    else:
-                        raise ValueError('unknown page mode')
-                    page = getUrl(page_url % {'n': n, 'q': quote_plus(query)})
-                page_num += 1
-                links = get_search_page_links(page, results_per_page, begin, end, link_re)
-                if len(links) == 0 or links == last_links:
-                    break
-                last_links = links
-                for link in links:
-                    q.put(link)
-            yield q.get()
-
-    search_nonblocking = nonblocking(search_blocking)
-
-    def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
-        """
-        See docstring for web_search module.
-        """
-        if blocking:
-            return search_blocking(query, max_results)
-        else:
-            return search_nonblocking(query, max_results)
-
-    return search
-
-find = make_searcher('http://www.google.com/search?q=%(q)s', 10,
-    'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
-    None, None,
-    r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' +
-    r'.*?(?:<br>|<table.*?>)' +
-    r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)')
+def find(query, max_results=DEFAULT_MAX_RESULTS):
+    url = "http://www.google.com/search?q=%s" % quote_plus(query)
+    data = getUrl(url)
+    link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
+        r'.*?(?:<br>|<table.*?>)' + \
+        r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
+    results = []
+    for match in re.compile(link_re, re.DOTALL).finditer(data):
+        (name, url, desc) = match.group('name', 'url', 'desc')
+        results.append((stripTags(name), url, stripTags(desc)))
+    if len(results) > max_results:
+        results = results[:max_results]
+    return results
 
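With this change, google.find is the module's whole surface. Per the docstring context above ("result is title, url, description"), each hit is a (title, url, description) tuple of tag-stripped strings, and at most max_results of them come back. A hedged usage sketch; the query string is only an example:

    import google

    # Each result is (title, url, description) with HTML tags stripped;
    # max_results defaults to DEFAULT_MAX_RESULTS (10).
    for title, url, desc in google.find('open movie database', max_results=5):
        print '%s <%s>' % (title, url)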
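The scraping regex carried over from the old module is the fragile part of the new find. To make its three named groups concrete, here it is run against a fabricated fragment shaped the way it expects (illustrative only, not captured Google output):

    import re

    link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
        r'.*?(?:<br>|<table.*?>)' + \
        r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
    # Fabricated sample matching the pattern, for illustration only.
    sample = ('<a href="http://example.com/" class=l><b>Example</b></a>'
              '<br>An example result snippet.<font color=#008000>example.com</font>')
    m = re.compile(link_re, re.DOTALL).search(sample)
    print m.group('url')   # http://example.com/
    print m.group('name')  # <b>Example</b>  (stripTags would drop the <b> tags)
    print m.group('desc')  # An example result snippet.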