From 3f7e8a8927efc4882e146b136838c64e4efbc997 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Sat, 15 Sep 2007 20:05:31 +0000
Subject: [PATCH] add wikipedia.searchByImdb

---
Notes (ignored by git am):
 - searchByImdb was revised after the original commit: the edit url is
   entity-unescaped, pages without an edit link are skipped, a match at
   offset 0 counts, and a missing director no longer puts "None" into the
   query. The blob id on the index line of scrapeit/wikipedia.py still
   refers to the original content.
 - Leading whitespace was reconstructed assuming 2-space indentation
   (see the vi modeline); confirm the context lines against
   scrapeit/imdb.py, or apply with --ignore-whitespace.

 scrapeit/imdb.py      |  2 +-
 scrapeit/wikipedia.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 scrapeit/wikipedia.py

diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py
index dc198d3..004ed44 100644
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@@ -199,7 +199,7 @@ class IMDb:
         title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
       else:
         title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
-    return title
+    return imdbpy_utils.normalizeTitle(title)
 
   def parseYear(self):
     year = ''
diff --git a/scrapeit/wikipedia.py b/scrapeit/wikipedia.py
new file mode 100644
index 0000000..54a21e4
--- /dev/null
+++ b/scrapeit/wikipedia.py
@@ -0,0 +1,54 @@
+# -*- Mode: Python; -*-
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=2:sts=2:ts=2
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from utils import read_url
+from imdb import IMDb
+from google import google
+
+
+def searchByImdb(imdb_id, title=None, director=None):
+  """Return the url of the English Wikipedia article for an IMDb title.
+
+  Google is asked for en.wikipedia.org pages matching title and director.
+  A page is accepted when its wiki source, fetched through the edit link
+  of the page, contains imdb_id within the 50 characters that start at
+  the first occurrence of 'imdb_id'.
+
+  imdb_id  -- IMDb id that has to appear in the article source
+  title    -- title to search for; if empty, title and director are
+              both taken from IMDb
+  director -- optional director name added to the search query
+
+  Returns the url of the first matching page, or '' if there is none.
+  """
+  if not title:
+    i = IMDb(imdb_id)
+    title = i.parseTitle()
+    director = i.parseCredits()['director']
+    if director:
+      director = director[0]
+    else:
+      director = ''
+  # a title may be passed without a director; keep 'None' out of the query
+  query = '"%s" %s site:en.wikipedia.org' % (title, director or '')
+  for g in google(query, 20):
+    url = g[1]
+    soup = BeautifulSoup(read_url(url))
+    edit_links = soup('a', dict(href=re.compile('action=edit'),
+      title=re.compile('You can edit this page.')))
+    if not edit_links:
+      # without an edit link the page source cannot be checked
+      continue
+    # hrefs are HTML-escaped in the page source; turn '&amp;' back into '&'
+    edit_url = edit_links[0]['href'].replace('&amp;', '&')
+    edit_url = "http://en.wikipedia.org%s" % edit_url
+    data = read_url(edit_url)
+    # find() returns -1 when there is no match; offset 0 is a valid hit
+    w_imdb_id = data.find('imdb_id')
+    if w_imdb_id >= 0 and imdb_id in data[w_imdb_id:w_imdb_id+50]:
+      return url
+  return ''