def searchByImdb(imdb_id):
    """Return the URL of the first Wikipedia search hit for an IMDb id.

    Runs a full-text search on en.wikipedia.org for ``imdb_id <imdb_id>``
    and returns the link of the first matching result item, prefixed with
    the site root.

    Args:
        imdb_id: IMDb id as a string of exactly seven ASCII digits
            (NOTE(review): the seven-digit format is inferred from the
            length check; confirm against callers).

    Returns:
        The article URL as a string, or '' when the id is missing or
        malformed, or when the results page holds no usable link.
    """
    # Reject None/empty, wrong-length and non-digit input up front: the id
    # is concatenated into the query string unescaped, so anything else
    # could inject extra URL parameters.
    if not imdb_id or len(imdb_id) != 7:
        return ''
    if any(char not in '0123456789' for char in imdb_id):
        return ''

    url = ("http://en.wikipedia.org/w/index.php?title=Special%3ASearch"
           "&search=imdb_id%20" + imdb_id + "&fulltext=Search")
    data = read_url(url)
    soup = BeautifulSoup(data)

    # NOTE(review): result items are recognised purely by this inline
    # style attribute; confirm it still matches the search page markup.
    results = soup('li', {"style": "padding-bottom: 1em;"})
    if not results:
        return ''

    # Guard against a result item without a link instead of raising
    # IndexError/KeyError on unexpected markup.
    links = results[0]('a')
    if not links:
        return ''
    href = links[0].get('href')
    if not href:
        return ''
    return "http://en.wikipedia.org%s" % href