add wikipedia.searchByImdb
This commit is contained in:
parent
7581cf3501
commit
3f7e8a8927
2 changed files with 35 additions and 1 deletions
|
@ -199,7 +199,7 @@ class IMDb:
|
||||||
title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
|
title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
|
||||||
else:
|
else:
|
||||||
title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
|
title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
|
||||||
return title
|
return imdbpy_utils.normalizeTitle(title)
|
||||||
|
|
||||||
def parseYear(self):
|
def parseYear(self):
|
||||||
year = ''
|
year = ''
|
||||||
|
|
34
scrapeit/wikipedia.py
Normal file
34
scrapeit/wikipedia.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
# -*- Mode: Python; -*-
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=2:sts=2:ts=2
|
||||||
|
import re
|
||||||
|
|
||||||
|
from BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
|
from utils import read_url
|
||||||
|
from imdb import IMDb
|
||||||
|
from google import google
|
||||||
|
|
||||||
|
|
||||||
|
def searchByImdb(imdb_id, title=None, director=None):
  """Return the URL of the English Wikipedia article that cites imdb_id.

  Searches Google for the title (and director, if known) restricted to
  en.wikipedia.org, then inspects the wiki source of each hit for an
  'imdb_id' field that contains the given id.

  imdb_id  -- IMDb id to look for in the article source
  title    -- title to search for; when empty, both title and director are
              fetched from IMDb via IMDb(imdb_id)
  director -- optional director name used to narrow the search

  Returns the url of the first matching search hit, or '' if none of the
  hits mention imdb_id.
  """
  if not title:
    i = IMDb(imdb_id)
    title = i.parseTitle()
    director = i.parseCredits()['director']
    if director:
      # only the first credited director is used for the query
      director = director[0]
  if not director:
    # keep a missing director out of the query instead of searching for "None"
    director = ''

  query = '"%s" %s site:en.wikipedia.org' % (title, director)
  # attributes identifying the "edit this page" link of an article
  edit_link = dict(href=re.compile('action=edit'),
                   title=re.compile('You can edit this page.'))

  for g in google(query, 20):
    url = g[1]  # presumably each result is a sequence with the url at index 1
    soup = BeautifulSoup(read_url(url))
    links = soup('a', edit_link)
    if not links:
      # no edit link on this page (e.g. not an editable article): skip the
      # hit rather than aborting the whole search with an IndexError
      continue
    # the href is taken from raw HTML, so '&' in the query string is still
    # entity-encoded and has to be decoded before the url can be requested
    edit_url = "http://en.wikipedia.org%s" % links[0]['href'].replace('&amp;', '&')
    data = read_url(edit_url)
    # find() returns -1 when 'imdb_id' is absent; offset 0 is a valid match
    w_imdb_id = data.find('imdb_id')
    if w_imdb_id >= 0 and imdb_id in data[w_imdb_id:w_imdb_id+50]:
      return url
  return ''
|
Loading…
Reference in a new issue