510 lines
16 KiB
Python
510 lines
16 KiB
Python
# -*- Mode: Python; -*-
|
|
# -*- coding: utf-8 -*-
|
|
# vi:si:et:sw=2:sts=2:ts=2
|
|
|
|
import urllib2
|
|
from urllib import quote
|
|
import re, time
|
|
import os
|
|
|
|
from elementtree.ElementTree import parse, tostring
|
|
from BeautifulSoup import BeautifulSoup
|
|
|
|
from google import google
|
|
from utils import stripTags, htmldecode
|
|
|
|
import utils
|
|
import chardet
|
|
|
|
# Root directory of the on-disk page cache. read_url() and read_url_utf8()
# store each fetched page below it, at the URL with 'http://' removed.
cache_base = "/var/cache/scrapeit/cache/"
|
|
|
|
def read_url_utf8(url):
  '''
  Return the content of url as a unicode string, going through the cache.

  The raw bytes are obtained with read_url(), which serves the page from
  the on-disk cache or downloads and caches it. The encoding is guessed
  with chardet; when chardet cannot name one, latin-1 is used.
  '''
  data = read_url(url)
  encoding = chardet.detect(data)['encoding']
  if not encoding:
    # latin-1 maps every byte to a character, so decoding cannot fail.
    encoding = 'latin-1'
  return unicode(data, encoding)
|
|
|
|
def read_url(url):
  '''
  Return the raw content of url, using the on-disk cache below cache_base.

  The cache file is the URL without its 'http://' prefix; a URL ending in
  '/' (or one that maps onto an existing directory) is stored as
  index.html inside that directory. On a cache miss the page is fetched
  with utils.read_url() and written to the cache before it is returned.

  Files are opened in binary mode because this function is also used for
  images (see get_image), and handles are closed even when I/O fails.
  '''
  path = os.path.join(cache_base, url.replace('http://', ''))
  if path.endswith('/'):
    path = "%sindex.html" % path
  if os.path.isdir(path):
    path = "%s/index.html" % path

  if os.path.exists(path):
    f = open(path, 'rb')
    try:
      return f.read()
    finally:
      f.close()

  data = utils.read_url(url)
  folder = os.path.dirname(path)
  if not os.path.exists(folder):
    os.makedirs(folder)
  f = open(path, 'wb')
  try:
    f.write(data)
  finally:
    f.close()
  return data
|
|
|
|
def _get_data(url):
  '''
  Return the content of url via read_url(), or None when reading fails.

  Failures are reported on stdout instead of being raised. Only Exception
  subclasses are swallowed, so Ctrl-C and sys.exit() still propagate.
  '''
  try:
    return read_url(url)
  except Exception:
    print("error reading data from %s" % (url,))
    return None
|
|
|
|
def get_image(url):
  '''Return the content found at url, exactly as read_url() delivers it.'''
  image_data = read_url(url)
  return image_data
|
|
|
|
def _castList(data, regexp):
|
|
soup = re.compile(regexp).findall(data)
|
|
if soup:
|
|
soup = BeautifulSoup(soup[0])
|
|
names = []
|
|
for i in soup('a', {'href': re.compile('/name/nm')}):
|
|
if i.string:
|
|
cast = stripTags(i.string)
|
|
if cast not in names:
|
|
names.append(cast)
|
|
return names
|
|
return []
|
|
|
|
def _getTerm(data, regexp):
|
|
term = ''
|
|
try:
|
|
reg = re.compile(regexp, re.IGNORECASE)
|
|
m = reg.search(data)
|
|
if m:
|
|
term = stripTags(m.group(1)).strip()
|
|
except:
|
|
print "waring, parsing failed for", regexp
|
|
return term.encode('utf8')
|
|
|
|
|
|
class IMDb:
  '''
  Scraper for the IMDb pages of one title.

  imdb is the numeric part of the title id (the digits after 'tt'). Each
  sub-page (credits, plot, episodes, ...) has a get*() method that fetches
  it once through read_url_utf8() and keeps the source on the instance,
  and a parse*() method that extracts data from that source. parse()
  combines everything into one dict.
  '''

  def __init__(self, imdb):
    self.imdb = imdb
    self.pageSource = None
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    self.businessSource = None
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsSource = None
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsSource = None
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesSource = None
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordSource = None
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotSource = None
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoSource = None
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaSource = None
    self.triviaUrl = "%strivia" % self.pageUrl

  def getPage(self, forcereload = False):
    '''Return the main title page source, fetching it on first use or when forcereload is set.'''
    if forcereload or not self.pageSource:
      self.pageSource = read_url_utf8(self.pageUrl)
    return self.pageSource

  def parse_raw_value(self, key, value):
    '''
    Convert the raw HTML value of one info block into its parsed form.

    runtime        -> int: minutes * 60, or the plain number when the
                      value is given in 'sec', or 0 when neither is found
    country,
    language       -> list, split on ' / '
    genre          -> list, split on ' / ', with 'more' removed
    tagline        -> string with 'more' removed
    plot_outline   -> string without '(view trailer)' / trailing 'more'
    tv_series      -> id digits of the first linked title, or ''
    also_known_as  -> the 'International: English title' entry, else the
                      'USA' entry, else ''
    Any other key is printed and returned unchanged.
    '''
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      # unicode(x, 'utf-8') raises TypeError for input that already is
      # unicode, so only decode byte strings.
      if not isinstance(value, unicode):
        value = unicode(value, 'utf-8')
      value = stripTags(value).strip()
    if key == 'runtime':
      parsed_value = _getTerm(value, '(.*?) min')
      parsed_value = _getTerm(parsed_value, '([0-9]+)')
      if not parsed_value:
        parsed_value = _getTerm(value, '(.*?) sec')
        parsed_value = _getTerm(parsed_value, '([0-9]+)')
        if not parsed_value:
          parsed_value = 0
        else:
          parsed_value = int(parsed_value)
      else:
        parsed_value = int(parsed_value) * 60
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    elif key == 'also_known_as':
      parsed_value = ''
      m = re.compile(r'(.*) \(International: English title').findall(value)
      if m:
        parsed_value = m[0]
      else:
        m = re.compile(r'(.*) \(USA').findall(value)
        if m:
          parsed_value = m[0]
      # The greedy group also captures the preceding entries; keep only the
      # last one and drop any parenthesised remark.
      parsed_value = parsed_value.split('<br />')[-1].split('(')[0].strip()
    else:
      print(value)
      parsed_value = value
    return parsed_value

  def _titleText(self):
    '''Return the tag-stripped text of the title element of the main page, or '' when there is none.'''
    soup = BeautifulSoup(self.getPage())
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if not html_title:
      return ''
    return stripTags(str(html_title[0]))

  def parseTitle(self):
    '''
    Return the title from the main page without the year suffix, the type
    markers (TV-Series, (mini), (VG), (V), (TV)) and enclosing double quotes.
    '''
    title = self._titleText()
    # Also covers disambiguated years such as (1999/I) or (1999/II).
    title = re.sub(r'\(\d{4}(?:/[IVXLC]+)?\)', '', title)
    for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
      title = title.replace(t, '')
    title = title.strip()
    if title.startswith('"') and title.endswith('"'):
      title = title[1:-1]
    return title

  def parseYear(self):
    '''Return the four-digit year found in the title text, "(YYYY)" or "(YYYY/...", or '' when absent.'''
    html_title = self._titleText()
    year = re.compile(r'\((\d\d\d\d)\)').findall(html_title)
    if not year:
      year = re.compile(r'\((\d\d\d\d)/').findall(html_title)
    if year:
      return year[0]
    return ''

  def parse(self):
    '''
    Scrape all pages of the title and return the combined dict.

    Keys: poster, year, title, rating (rating * 1000 as int, -1 if not
    found), votes (-1 if not found), runtime, language, genre, country,
    tagline, plot_outline, title_english, episode_of, episodes, credits,
    plot, keywords, trivia, connections, release_date, business.
    The dict is also stored as self.IMDbDict.
    '''
    data = self.getPage()
    IMDbDict = {}
    # Poster
    IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
    if not IMDbDict['poster']:
      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
    # Title, Year
    IMDbDict['year'] = self.parseYear()
    IMDbDict['title'] = self.parseTitle()

    # Rating: only accept a number in front of '/10', so unrelated bold text
    # on the same line cannot reach float() and raise.
    m = re.compile(r'<b>\s*(\d+(?:\.\d+)?)\s*/10</b>', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
    else:
      IMDbDict['rating'] = -1
    # Votes: digits with thousands separators only.
    m = re.compile(r'<small>\(<a href="ratings">(\d[\d,]*) votes</a>\)</small>', re.IGNORECASE).findall(data)
    if m:
      IMDbDict['votes'] = int(m[0].replace(',', ''))
    else:
      IMDbDict['votes'] = -1

    data = data.replace('\n', ' ')
    # some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      key = str(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = str(info).split('</h5>')[1]
        # Heading text minus its last character (presumably the trailing
        # ':'), lower-cased, spaces replaced by underscores.
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)

    # 'also_known_as' is always present (pre-set to '' above), so a pop()
    # default would never be used; fall back to the title explicitly.
    IMDbDict['title_english'] = IMDbDict.pop('also_known_as', '') or IMDbDict['title']
    # is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = self.parseEpisodes()
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()

    IMDbDict['trivia'] = self.parseTrivia()
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    self.IMDbDict = IMDbDict
    return self.IMDbDict

  def getCredits(self, forcereload = False):
    '''Return the fullcredits page source, fetching it on first use or when forcereload is set.'''
    if forcereload or not self.creditsSource:
      self.creditsSource = read_url_utf8(self.creditsUrl)
    return self.creditsSource

  def parseCredits(self):
    '''
    Return a dict with the lists 'director', 'writer' and 'producer'
    (names) and 'cast' (tuples of (name, role)). Also stored as
    self.credits.
    '''
    data = self.getCredits()
    credits = {}
    credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
    credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
    credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
    credits['cast'] = []
    soup = BeautifulSoup(data)
    cast = soup('table', {'class': 'cast'})
    if cast:
      cast = str(cast[0])
      names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
      for name in names:
        real_name = name[0]
        role_name = name[1]
        if role_name:
          role_name = role_name.split('(')[0].replace('/ ...','').strip()
        credits['cast'].append((stripTags(real_name), stripTags(role_name)))
    self.credits = credits
    return self.credits

  def getPlot(self, forcereload = False):
    '''Return the plotsummary page source, fetching it on first use or when forcereload is set.'''
    if forcereload or not self.plotSource:
      self.plotSource = read_url_utf8(self.plotUrl)
    return self.plotSource

  def parsePlot(self):
    '''
    Return the text of the first <p class="plotpar"> up to its first <i>
    tag, tag-stripped, or u'' when there is none. Also stored as self.plot.
    '''
    soup = BeautifulSoup(self.getPlot())
    plot = soup('p', {'class':'plotpar'})
    if plot:
      plot = unicode(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot

  def getEpisodes(self, forcereload = False):
    '''Return the episodes page source, fetching it on first use or when forcereload is set.'''
    if forcereload or not self.episodesSource:
      self.episodesSource = read_url_utf8(self.episodesUrl)
    return self.episodesSource

  def parseEpisodes(self):
    '''
    Return a dict mapping 'SxxEyy' to {'imdb', 'title', 'description'} for
    every episode heading found. Entries that cannot be parsed are skipped
    after printing the traceback. Also stored as self.episodes.
    '''
    episodes = {}
    cdata = self.getEpisodes().replace('\r\n',' ')
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
    reg = re.compile(regexp, re.IGNORECASE)
    for match in reg.findall(cdata):
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        description = htmldecode(match[4])
        description = stripTags(description.split('Next US airings:')[0])
        # Assign the finished entry in one go so a failure above cannot
        # leave a half-filled episode behind.
        episodes[episode] = {
          'imdb': match[2],
          'title': match[3].strip(),
          'description': description,
        }
      except Exception:
        import traceback
        traceback.print_exc()
    self.episodes = episodes
    return self.episodes

  def getKeywords(self, forcereload = False):
    '''Return the keywords page source, fetching it on first use or when forcereload is set.'''
    if forcereload or not self.keywordSource:
      self.keywordSource = read_url_utf8(self.keywordUrl)
    return self.keywordSource

  def parseKeywords(self):
    '''Return the decoded text of every link whose href starts with /keyword/. Also stored as self.keywords.'''
    soup = BeautifulSoup(self.getKeywords())
    keywords = []
    for key in soup('a', {'href': re.compile('^/keyword/')}):
      keywords.append(htmldecode(key.string))
    self.keywords = keywords
    return self.keywords

  def getTrivia(self, forcereload = False):
    '''Return the trivia page source, fetching it on first use or when forcereload is set.'''
    if forcereload or not self.triviaSource:
      self.triviaSource = read_url_utf8(self.triviaUrl)
    return self.triviaSource

  def parseTrivia(self):
    '''
    Return the HTML content of every <li> inside <ul class="trivia">, with
    '<br />' removed and the enclosing <li> tags stripped. Also stored as
    self.trivia.
    '''
    trivia = []
    soup = BeautifulSoup(self.getTrivia())
    for i in soup('ul', {'class': "trivia"}):
      for t in i('li'):
        t = str(t).replace('<br />', '').strip()
        if t.startswith('<li>') and t.endswith('</li>'):
          t = t[4:-5].strip()
        trivia.append(t)
    self.trivia = trivia
    return self.trivia

  def getConnections(self, forcereload = False):
    '''Return the movieconnections page source, fetching it on first use or when forcereload is set.'''
    if forcereload or not self.connectionsSource:
      self.connectionsSource = read_url_utf8(self.connectionsUrl)
    return self.connectionsSource

  def parseConnections(self):
    '''
    Return a dict mapping each <h5> heading of the connections page to the
    list of 7-digit title ids linked below it. Returns {} when the page
    has no tn15content block.
    '''
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return connections
    title_id = re.compile(r'/title/tt(\d{7})')
    blocks = str(content[0]).split('<h5>')[1:]
    for c in blocks:
      connection = c.split('</h5>')[0]
      if connection:
        cs = BeautifulSoup(c)
        # relation -> list of imdb ids
        ids = []
        for a in cs('a', {'href': re.compile('/title/tt')}):
          m = title_id.search(a.get('href'))
          if m:
            ids.append(m.group(1))
        connections[connection] = ids
    return connections

  def getReleaseinfo(self, forcereload = False):
    '''Return the releaseinfo page source, fetching it on first use or when forcereload is set.'''
    if forcereload or not self.releaseinfoSource:
      self.releaseinfoSource = read_url_utf8(self.releaseinfoUrl)
    return self.releaseinfoSource

  def parseReleaseinfo(self):
    '''
    Return the first release date of the release table that parses as
    "%d %B %Y", formatted as YYYY-MM-DD, or None when there is no table or
    no parsable date.
    '''
    soup = BeautifulSoup(self.getReleaseinfo())
    tables = soup('table',{'border': '0', 'cellpadding':'2'})
    if not tables:
      return None
    for row in tables[0]('tr'):
      d = row('td', {'align':'right'})
      if d:
        possible_date = stripTags(str(d[0])).strip()
        try:
          rdate = time.strptime(possible_date, "%d %B %Y")
        except ValueError:
          # Not a full day/month/year date; try the next row.
          continue
        return time.strftime('%Y-%m-%d', rdate)
    return None

  def getBusiness(self, forcereload = False):
    '''Return the business page source, fetching it on first use or when forcereload is set.'''
    if forcereload or not self.businessSource:
      self.businessSource = read_url_utf8(self.businessUrl)
    return self.businessSource

  def parseBusiness(self):
    '''
    Return {'budget', 'gross', 'profit'} from the business page.

    budget and gross are the largest '$' amount listed under the 'Budget'
    and 'Gross' headings (0 when missing); profit is gross - budget when
    both are known, else 0.
    '''
    soup = BeautifulSoup(self.getBusiness())
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return business
    blocks = str(content[0]).split('<h5>')[1:]
    for c in blocks:
      line = c.split('</h5>')
      if len(line) < 2:
        # Heading without a closing tag: nothing to read below it.
        continue
      title = line[0]
      line = line[1]
      if title in ['Budget', 'Gross']:
        values = []
        for value in re.compile(r'\$(.*?) ').findall(line):
          try:
            values.append(int(value.replace(',','')))
          except ValueError:
            # Amount is not a plain integer; ignore it.
            pass
        if values:
          business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business
|
|
|
|
def _open_url(url):
  '''Request url with utils.DEFAULT_HEADERS; return (body, final url after redirects).'''
  req = urllib2.Request(url, None, utils.DEFAULT_HEADERS)
  u = urllib2.urlopen(req)
  try:
    return u.read(), u.url
  finally:
    u.close()


def guess(title, director=''):
  '''
  Guess the IMDb id (the 7 characters after 'tt') for title.

  title is cut at the first '-', '(' and '.' and stripped. Lookups are
  tried in this order, the first hit wins:
    1. google search restricted to imdb.com (with director if given)
    2. imdb find page: a redirect to a title page, else the first entry
       under 'Popular Results'
    3. imdb find page restricted to titles/akas: a redirect to a title page
  Returns None when nothing is found or an imdb request fails.
  '''
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  try:
    query = quote(title.encode('utf-8'))
  except UnicodeDecodeError:
    # title is a byte string with non-ASCII bytes (e.g. from sys.argv);
    # it cannot be re-encoded, so quote it as it is.
    query = quote(title)

  title_prefix = 'http://www.imdb.com/title/tt'
  id_start = len(title_prefix)
  id_end = id_start + 7

  #lets first try google
  #i.e. site:imdb.com Michael Stevens Sin
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google(search, 1):
    if url.startswith(title_prefix):
      return url[id_start:id_end]

  try:
    data, return_url = _open_url('http://www.imdb.com/find?q=%s' % query)
  except Exception:
    return None
  if return_url.startswith(title_prefix):
    return return_url[id_start:id_end]
  if data:
    imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id

  try:
    data, return_url = _open_url('http://www.imdb.com/find?q=%s;s=tt;site=aka' % query)
  except Exception:
    # Same best-effort contract as the first request.
    return None
  if return_url.startswith(title_prefix):
    return return_url[id_start:id_end]

  return None
|
|
|
|
def getEpisodeData(title, episode, show_url = None):
  '''
  Collect information about an episode.

  title is the show title, episode the 'SxxEyy' key as produced by
  IMDb.parseEpisodes(). The show is identified from show_url when given
  (its title/tt<digits> part), otherwise by guess(title).

  Returns dict with title, show, description and episode, plus imdb when
  the episode was found. When the show or the episode cannot be
  identified, title and description stay empty.
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  if not show_url:
    imdbid = guess(title)
  else:
    m = re.search(r'title/tt(\d+)', show_url)
    if m:
      imdbid = "%07d" % int(m.group(1))
    else:
      imdbid = None
  if imdbid:
    info = IMDb(imdbid).parse()['episodes'].get(episode)
    if info:
      episodeData['title'] = info['title']
      episodeData['description'] = info['description']
      episodeData['imdb'] = info['imdb']
  return episodeData
|
|
|
|
|
|
# Command line use: print the guessed imdb id for the title given as argument.
if __name__ == '__main__':
  import sys
  if len(sys.argv) < 2:
    sys.stderr.write("usage: %s <title>\n" % sys.argv[0])
    sys.exit(1)
  print("imdb: %s" % (guess(sys.argv[1]),))
|