# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import urllib2
from urllib import quote
import re, time
import os
from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup
from google import google
from utils import stripTags, read_url_utf8, htmldecode
import utils
def read_url(url):
base = "/var/cache/scrapeit/cache/"
path = os.path.join(base, url.replace('http://',''))
if path.endswith('/'):
path = "%sindex.html" % path
if os.path.isdir(path):
path = "%s/index.html" % path
if os.path.exists(path):
f = open(path)
data = f.read()
f.close()
return data
else:
data = utils.read_url(url)
folder = os.path.dirname(path)
if not os.path.exists(folder):
os.makedirs(folder)
f = open(path, 'w')
f.write(data)
f.close()
return data
def _get_data(url):
data = None
try:
data = read_url(url)
except:
print "error reading data from", url
return data
def get_image(url):
return read_url(url)
def _castList(data, regexp):
soup = re.compile(regexp).findall(data)
if soup:
soup = BeautifulSoup(soup[0])
names = []
for i in soup('a', {'href': re.compile('/name/nm')}):
if i.string:
cast = stripTags(i.string)
if cast not in names:
names.append(cast)
return names
return []
def _getTerm(data, regexp):
term = ''
try:
reg = re.compile(regexp, re.IGNORECASE)
m = reg.search(data)
if m:
term = stripTags(m.group(1)).strip()
except:
print "waring, parsing failed for", regexp
return term.encode('utf8')
class IMDb:
def __init__(self, imdb):
self.imdb = imdb
self.pageSource = None
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessSource = None
self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsSource = None
self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsSource = None
self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesSource = None
self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordSource = None
self.keywordUrl = "%skeywords" % self.pageUrl
self.plotSource = None
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoSource = None
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.triviaSource = None
self.triviaUrl = "%strivia" % self.pageUrl
def getPage(self, forcereload = False):
if forcereload or not self.pageSource:
self.pageSource = read_url_utf8(self.pageUrl)
return self.pageSource
def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = unicode(value, 'utf-8')
value = stripTags(value).strip()
if key == 'runtime':
parsed_value = _getTerm(value, '(.*?) min')
parsed_value = _getTerm(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = _getTerm(value, '(.*?) sec')
parsed_value = _getTerm(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
parsed_value = int(parsed_value)
else:
parsed_value = int(parsed_value) * 60
elif key in ('country', 'language'):
parsed_value = value.split(' / ')
elif key == 'genre':
parsed_value = value.replace('more', '').strip().split(' / ')
elif key == 'tagline':
parsed_value = value.replace('more', '').strip()
elif key == 'plot_outline':
parsed_value = value.replace('(view trailer)', '').strip()
if parsed_value.endswith('more'):
parsed_value = parsed_value[:-4].strip()
elif key == 'tv_series':
m = re.compile('(.*?)').findall(value)
if m:
parsed_value = m[0][0]
else:
parsed_value = ''
elif key == 'also_known_as':
parsed_value = ''
m = re.compile('(.*) \(International: English title').findall(value)
if m:
parsed_value = m[0]
else:
m = re.compile('(.*) \(USA').findall(value)
if m:
parsed_value = m[0]
parsed_value = parsed_value.split('
')[-1].split('(')[0].strip()
else:
print value
parsed_value = value
return parsed_value
def parseTitle(self):
title = ''
data = self.getPage()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = str(html_title[0])
title = stripTags(html_title)
title = re.sub('\(\d\d\d\d\)', '', title)
title = re.sub('\(\d\d\d\d/I\)', '', title)
for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
return title
def parseYear(self):
year = ''
data = self.getPage()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = str(html_title[0])
html_title = stripTags(html_title)
year = re.compile('\((\d\d\d\d)\)').findall(html_title)
if not year:
year = re.compile('\((\d\d\d\d)/').findall(html_title)
if year:
year = year[0]
else: year = ''
return year
def parse(self):
data = self.getPage()
IMDbDict ={}
#Poster
IMDbDict['poster'] = _getTerm(data, 'name="poster".*?(.*?)/10', re.IGNORECASE).search(data)
if m:
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
else:
IMDbDict['rating'] = -1
#Votes
m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data)
if m:
IMDbDict['votes'] = int(m[0].replace(',', ''))
else:
IMDbDict['votes'] = -1
data = data.replace('\n',' ')
#some values
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
for key in keys:
IMDbDict[key] = ''
IMDbDict['runtime'] = 0
soup = BeautifulSoup(data)
for info in soup('div', {'class': 'info'}):
key = str(info).split('')[0].split('