# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import urllib2
from urllib import quote
import re, time
import os

from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup

from google import google
from utils import stripTags, read_url_utf8, htmldecode
import utils


def read_url(url):
  # Disk-cached variant of utils.read_url: pages are stored below `base`,
  # keyed by URL, and served from the cache on later calls.
  base = "/var/cache/scrapeit/cache/"
  path = os.path.join(base, url.replace('http://', ''))
  if path.endswith('/'):
    path = "%sindex.html" % path
  if os.path.isdir(path):
    path = "%s/index.html" % path
  if os.path.exists(path):
    f = open(path)
    data = f.read()
    f.close()
    return data
  else:
    data = utils.read_url(url)
    folder = os.path.dirname(path)
    if not os.path.exists(folder):
      os.makedirs(folder)
    f = open(path, 'w')
    f.write(data)
    f.close()
    return data

def _get_data(url):
  data = None
  try:
    data = read_url(url)
  except:
    print "error reading data from", url
  return data

def get_image(url):
  return read_url(url)

def _castList(data, regexp):
  soup = re.compile(regexp).findall(data)
  if soup:
    soup = BeautifulSoup(soup[0])
    names = []
    for i in soup('a', {'href': re.compile('/name/nm')}):
      if i.string:
        cast = stripTags(i.string)
        if cast not in names:
          names.append(cast)
    return names
  return []

def _getTerm(data, regexp):
  term = ''
  try:
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.search(data)
    if m:
      term = stripTags(m.group(1)).strip()
  except:
    print "warning: parsing failed for", regexp
  return term.encode('utf8')


class IMDb:
  def __init__(self, imdb):
    self.imdb = imdb
    self.pageSource = None
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    self.businessSource = None
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsSource = None
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsSource = None
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesSource = None
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordSource = None
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotSource = None
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoSource = None
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaSource = None
    self.triviaUrl = "%strivia" % self.pageUrl

  def getPage(self, forcereload=False):
    if forcereload or not self.pageSource:
      self.pageSource = read_url_utf8(self.pageUrl)
    return self.pageSource

  def parse_raw_value(self, key, value):
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      value = unicode(value, 'utf-8')
      value = stripTags(value).strip()
    if key == 'runtime':
      # normalize runtime to seconds
      parsed_value = _getTerm(value, '(.*?) min')
      parsed_value = _getTerm(parsed_value, '([0-9]+)')
      if not parsed_value:
        parsed_value = _getTerm(value, '(.*?) sec')
        parsed_value = _getTerm(parsed_value, '([0-9]+)')
        if not parsed_value:
          parsed_value = 0
        else:
          parsed_value = int(parsed_value)
      else:
        parsed_value = int(parsed_value) * 60
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    elif key == 'also_known_as':
      parsed_value = ''
      m = re.compile('(.*) \(International: English title').findall(value)
      if m:
        parsed_value = m[0]
      else:
        m = re.compile('(.*) \(USA').findall(value)
        if m:
          parsed_value = m[0]
      parsed_value = parsed_value.split('<br />')[-1].split('(')[0].strip()
    else:
      print value
      parsed_value = value
    return parsed_value
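  # Illustrative conversions, derived from the branches above (inputs are
  # made-up examples, not scraped data):
  #   parse_raw_value('runtime', '110 min')         -> 6600  (seconds)
  #   parse_raw_value('country', 'USA / Germany')   -> ['USA', 'Germany']
  #   parse_raw_value('genre', 'Drama / Crime more') -> ['Drama', 'Crime']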
  def parseTitle(self):
    title = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = str(html_title[0])
      title = stripTags(html_title)
      title = re.sub('\(\d\d\d\d\)', '', title)
      title = re.sub('\(\d\d\d\d/I\)', '', title)
      for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
        title = title.replace(t, '')
      title = title.strip()
      if title.startswith('"') and title.endswith('"'):
        title = title[1:-1]
    return title

  def parseYear(self):
    year = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = str(html_title[0])
      html_title = stripTags(html_title)
      year = re.compile('\((\d\d\d\d)\)').findall(html_title)
      if not year:
        year = re.compile('\((\d\d\d\d)/').findall(html_title)
      if year:
        year = year[0]
      else:
        year = ''
    return year

  def parse(self):
    data = self.getPage()
    IMDbDict = {}
    #Poster
    IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
    #Title / Year
    IMDbDict['title'] = self.parseTitle()
    IMDbDict['year'] = self.parseYear()
    #Rating
    m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
    else:
      IMDbDict['rating'] = -1
    #Votes
    m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data)
    if m:
      IMDbDict['votes'] = int(m[0].replace(',', ''))
    else:
      IMDbDict['votes'] = -1

    data = data.replace('\n', ' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      key = str(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = str(info).split('</h5>')[1]
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)
    IMDbDict['title_english'] = IMDbDict.pop('also_known_as', '') or IMDbDict['title']
    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = self.parseEpisodes()
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()
    IMDbDict['trivia'] = self.parseTrivia()
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    self.IMDbDict = IMDbDict
    return self.IMDbDict

  def getCredits(self, forcereload=False):
    if forcereload or not self.creditsSource:
      self.creditsSource = read_url_utf8(self.creditsUrl)
    return self.creditsSource

  def parseCredits(self):
    data = self.getCredits()
    credits = {}
    credits['director'] = _castList(data, 'Directed by</b>.*?(<table.*?</table>)')
    credits['writer'] = _castList(data, 'Writing credits</b>.*?(<table.*?</table>)')
    credits['producer'] = _castList(data, 'Produced by</b>.*?(<table.*?</table>)')
    #credits['cast'] = _castList(data, 'Cast</b>.*?(<table.*?</table>)')
    credits['cast'] = []
    soup = BeautifulSoup(data)
    cast = soup('table', {'class': 'cast'})
    if cast:
      cast = str(cast[0])
      names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?<td class="char">(.*?)</td>').findall(cast)
      for name in names:
        real_name = name[0]
        role_name = name[1]
        if role_name:
          role_name = role_name.split('(')[0].replace('/ ...', '').strip()
        credits['cast'].append((stripTags(real_name), stripTags(role_name)))
    self.credits = credits
    return self.credits

  def getPlot(self, forcereload=False):
    if forcereload or not self.plotSource:
      self.plotSource = read_url_utf8(self.plotUrl)
    return self.plotSource

  def parsePlot(self):
    soup = BeautifulSoup(self.getPlot())
    plot = soup('p', {'class': 'plotpar'})
    if plot:
      # drop the trailing "<i>...</i>" author credit from the summary
      plot = unicode(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot

  def getEpisodes(self, forcereload=False):
    if forcereload or not self.episodesSource:
      self.episodesSource = read_url_utf8(self.episodesUrl)
    return self.episodesSource
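  # The episode scraping below assumes the circa-2007 episodes page markup,
  # roughly (id and text are placeholders):
  #   <h4>Season 1, Episode 2: <a href="/title/tt0000000/">Title</a></h4>
  #   ... <br> Description <br>
  # The five regex groups map to season, episode, imdb id, title, description.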
  def parseEpisodes(self):
    episodes = {}
    cdata = self.getEpisodes().replace('\r\n', ' ')
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4><br>.*?<br>(.*?)<br>'''
    #regexp = r'''Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a>.*?<br>(.*?)<br>'''
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.findall(cdata)
    for match in m:
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        episodes[episode] = {}
        episodes[episode]['imdb'] = match[2]
        episodes[episode]['title'] = match[3].strip()
        description = htmldecode(match[4])
        description = stripTags(description.split('Next US airings:')[0])
        episodes[episode]['description'] = description
      except:
        import traceback
        traceback.print_exc()
    self.episodes = episodes
    return self.episodes

  def getKeywords(self, forcereload=False):
    if forcereload or not self.keywordSource:
      self.keywordSource = read_url_utf8(self.keywordUrl)
    return self.keywordSource

  def parseKeywords(self):
    soup = BeautifulSoup(self.getKeywords())
    keywords = []
    for key in soup('a', {'href': re.compile('/keyword')}):
      keywords.append(htmldecode(key.string))
    self.keywords = keywords
    return self.keywords

  def getTrivia(self, forcereload=False):
    if forcereload or not self.triviaSource:
      self.triviaSource = read_url_utf8(self.triviaUrl)
    return self.triviaSource
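  # parseTrivia below expects each item as <li>...</li> inside a
  # <ul class="trivia"> list; stray <br /> tags are dropped and the
  # wrapping li tags stripped off.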
  def parseTrivia(self):
    trivia = []
    soup = BeautifulSoup(self.getTrivia())
    for i in soup('ul', {'class': "trivia"}):
      for t in i('li'):
        t = str(t).replace('<br />', '').strip()
        if t.startswith('<li>') and t.endswith('</li>'):
          t = t[4:-5].strip()
          trivia.append(t)
    self.trivia = trivia
    return self.trivia
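  # The movieconnections page groups linked titles under <h5>relation</h5>
  # headings such as "Follows" or "Referenced in"; parseConnections maps each
  # relation name to the imdb ids found in its block.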
  def getConnections(self, forcereload=False):
    if forcereload or not self.connectionsSource:
      self.connectionsSource = read_url_utf8(self.connectionsUrl)
    return self.connectionsSource

  def parseConnections(self):
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})[0]
    blocks = str(content).split('<h5>')[1:]
    for c in blocks:
      connection = c.split('</h5>')[0]
      cs = BeautifulSoup(c)
      if connection:
        #relation -> list of imdb ids
        connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
    return connections
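  # Example of the returned shape (ids are illustrative, not real lookups):
  #   {'Follows': ['0123456'], 'Referenced in': ['0234567', '0345678']}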
  def getReleaseinfo(self, forcereload=False):
    if forcereload or not self.releaseinfoSource:
      self.releaseinfoSource = read_url_utf8(self.releaseinfoUrl)
    return self.releaseinfoSource

  def parseReleaseinfo(self):
    soup = BeautifulSoup(self.getReleaseinfo())
    for row in soup('table', {'border': '0', 'cellpadding': '2'})[0]('tr'):
      d = row('td', {'align': 'right'})
      if d:
        try:
          possible_date = stripTags(str(d[0])).strip()
          rdate = time.strptime(possible_date, "%d %B %Y")
          rdate = time.strftime('%Y-%m-%d', rdate)
          return rdate
        except:
          pass
    return None

  def getBusiness(self, forcereload=False):
    if forcereload or not self.businessSource:
      self.businessSource = read_url_utf8(self.businessUrl)
    return self.businessSource

  def parseBusiness(self):
    soup = BeautifulSoup(self.getBusiness())
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    content = soup('div', {'id': 'tn15content'})[0]
    blocks = str(content).split('<h5>')[1:]
    for c in blocks:
      line = c.split('</h5>')
      if len(line) > 1:
        title = line[0]
        line = line[1]
        if title in ['Budget', 'Gross']:
          values = re.compile('\$(.*?) ').findall(line)
          values = [int(value.replace(',', '')) for value in values]
          if values:
            business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business


def guess(title, director=''):
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
  return_url = ''

  #let's first try google
  #i.e. site:imdb.com Michael Stevens Sin
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google(search, 1):
    if url.startswith('http://www.imdb.com/title/tt'):
      return url[28:35]

  #fall back to imdb's own search
  try:
    req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
    data = u.read()
    return_url = u.url
    u.close()
  except:
    return None
  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]
  if data:
    imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id
  return None
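
if __name__ == '__main__':
  # Minimal usage sketch: assumes network access and a writable
  # /var/cache/scrapeit/cache/ directory; '0133093' is The Matrix.
  imdb = IMDb('0133093')
  info = imdb.parse()
  print info['title'], info['year']
  print guess('The Matrix')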