# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import urllib2
from urllib import quote
import re, time
import os

from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup
from google import google
from utils import stripTags, htmldecode
import utils
import chardet

cache_base = "/var/cache/scrapeit/cache/"

def read_url_utf8(url):
  data = read_url(url)
  encoding = chardet.detect(data)['encoding']
  if not encoding:
    encoding = 'latin-1'
  data = unicode(data, encoding)
  return data

def read_url(url):
  path = os.path.join(cache_base, url.replace('http://', ''))
  if path.endswith('/'):
    path = "%sindex.html" % path
  if os.path.isdir(path):
    path = "%s/index.html" % path
  if os.path.exists(path):
    f = open(path)
    data = f.read()
    f.close()
    return data
  else:
    data = utils.read_url(url)
    folder = os.path.dirname(path)
    if not os.path.exists(folder):
      os.makedirs(folder)
    f = open(path, 'w')
    f.write(data)
    f.close()
    return data

def _get_data(url):
  data = None
  try:
    data = read_url(url)
  except:
    print "error reading data from", url
  return data

def get_image(url):
  return read_url(url)

def _castList(data, regexp):
  soup = re.compile(regexp).findall(data)
  if soup:
    soup = BeautifulSoup(soup[0])
    names = []
    for i in soup('a', {'href': re.compile('/name/nm')}):
      if i.string:
        cast = stripTags(i.string)
        if cast not in names:
          names.append(cast)
    return names
  return []

def _getTerm(data, regexp):
  term = ''
  try:
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.search(data)
    if m:
      term = stripTags(m.group(1)).strip()
  except:
    print "warning, parsing failed for", regexp
  return term.encode('utf8')


class IMDb:
  def __init__(self, imdb):
    self.imdb = imdb
    self.pageSource = None
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    self.businessSource = None
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsSource = None
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsSource = None
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesSource = None
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordSource = None
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotSource = None
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoSource = None
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaSource = None
    self.triviaUrl = "%strivia" % self.pageUrl
    self.locationSource = None
    self.locationUrl = "%slocations" % self.pageUrl

  def getPage(self, forcereload=False):
    if forcereload or not self.pageSource:
      self.pageSource = read_url_utf8(self.pageUrl)
    return self.pageSource

  def parse_raw_value(self, key, value):
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      value = unicode(value, 'utf-8')
    value = stripTags(value).strip()
    if key == 'runtime':
      #runtime is given as "x min" or "x sec"; normalize to seconds
      parsed_value = _getTerm(value, '(.*?) min')
      parsed_value = _getTerm(parsed_value, '([0-9]+)')
      if not parsed_value:
        parsed_value = _getTerm(value, '(.*?) sec')
        parsed_value = _getTerm(parsed_value, '([0-9]+)')
        if not parsed_value:
          parsed_value = 0
        else:
          parsed_value = int(parsed_value)
      else:
        parsed_value = int(parsed_value) * 60
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      #the value links to the series page; the anchor markup here is assumed
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    elif key == 'also_known_as':
      parsed_value = ''
      m = re.compile('(.*) \(International: English title').findall(value)
      if m:
        parsed_value = m[0]
      else:
        m = re.compile('(.*) \(USA').findall(value)
        if m:
          parsed_value = m[0]
      parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
      director = self.parseCredits().get('director', None)
      if director:
        director = director[0]
        parsed_value = parsed_value.replace(director, '')
        if parsed_value.startswith("'s"):
          parsed_value = parsed_value[2:].strip()
      parsed_value = parsed_value.strip()
    else:
      print value
      parsed_value = value
    return parsed_value

  def parseTitle(self):
    title = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = str(html_title[0])
      html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
      title = stripTags(html_title)
      title = re.sub('\(\d\d\d\d\)', '', title)
      title = re.sub('\(\d\d\d\d/I*\)', '', title)
      for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
        title = title.replace(t, '')
      if title.find(u'\xa0') > -1:
        title = title[:title.find(u'\xa0')]
      title = title.strip()
      if title.startswith('"') and title.endswith('"'):
        title = title[1:-1]
      elif title.startswith('"') and title.find('"', 1) > 0 and \
          title.find('"', 1) == title.rfind('"'):
        #series episode: '"Show" Episode' -> 'Show: Episode'
        title = title[1:title.rfind('"')] + ':' + title[title.rfind('"')+1:]
    return title

  def parseYear(self):
    year = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = str(html_title[0])
      html_title = stripTags(html_title)
      year = re.compile('\((\d\d\d\d)\)').findall(html_title)
      if not year:
        year = re.compile('\((\d\d\d\d)/').findall(html_title)
      if year:
        year = year[0]
      else:
        year = ''
    return year

  def parse(self):
    data = self.getPage()
    IMDbDict = {}
    #Poster (the img markup in this pattern is assumed)
    IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
    #Title / Year, needed below for title_english
    IMDbDict['title'] = self.parseTitle()
    IMDbDict['year'] = self.parseYear()
    #Rating, e.g. "7.4/10" -> 7400 (the b markup in this pattern is assumed)
    m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
    else:
      IMDbDict['rating'] = -1
    #Votes
    m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data)
    if m:
      IMDbDict['votes'] = int(m[0].replace(',', ''))
    else:
      IMDbDict['votes'] = -1

    data = data.replace('\n', ' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    #each info block: <h5>Key:</h5> value (markup assumed)
    for info in soup('div', {'class': 'info'}):
      key = str(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = str(info).split('</h5>')[1]
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)
    IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = self.parseEpisodes()
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()
    IMDbDict['trivia'] = self.parseTrivia()
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['locations'] = self.parseLocations()
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    self.IMDbDict = IMDbDict
    return self.IMDbDict

  def getCredits(self, forcereload=False):
    if forcereload or not self.creditsSource:
      self.creditsSource = read_url_utf8(self.creditsUrl)
    return self.creditsSource

  def parseCredits(self):
    data = self.getCredits()
    credits = {}
    #the table markup in these patterns is assumed; _castList pulls the
    #/name/nm links out of the captured fragment
    credits['director'] = _castList(data, 'Directed by</b>.*?<table.*?>(.*?)</table>')
    credits['writer'] = _castList(data, 'Writing credits</b>.*?<table.*?>(.*?)</table>')
    credits['producer'] = _castList(data, 'Produced by</b>.*?<table.*?>(.*?)</table>')
    #credits['cast'] = _castList(data, 'Cast</b>.*?<table.*?>(.*?)</table>')
    credits['cast'] = []
    soup = BeautifulSoup(data)
    cast = soup('table', {'class': 'cast'})
    if cast:
      cast = str(cast[0]).replace(u'\xa0', ' ')
      #the cell markup in this pattern is assumed
      names = re.compile('<td class="nm">(.*?)</td>.*?<td class="char">(.*?)</td>').findall(cast)
      for name in names:
        real_name = name[0]
        role_name = name[1]
        if role_name:
          role_name = role_name.split('(')[0].replace('/ ...', '')
        credits['cast'].append((stripTags(real_name), stripTags(role_name)))
    self.credits = credits
    return self.credits

  def getPlot(self, forcereload=False):
    if forcereload or not self.plotSource:
      self.plotSource = read_url_utf8(self.plotUrl)
    return self.plotSource

  def parsePlot(self):
    soup = BeautifulSoup(self.getPlot())
    plot = soup('p', {'class': 'plotpar'})
    if plot:
      #cut the author credit; the <i> split marker is assumed
      plot = unicode(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot

  def getEpisodes(self, forcereload=False):
    if forcereload or not self.episodesSource:
      self.episodesSource = read_url_utf8(self.episodesUrl)
    return self.episodesSource

  def parseEpisodes(self):
    episodes = {}
    cdata = self.getEpisodes().replace('\r\n', ' ')
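    #The pattern below expects fragments like (markup assumed):
    #  <h4>Season 1, Episode 3: <a href="/title/tt0123456/">Title</a></h4>
    #  ... <br>Episode description. Next US airings: ...<br>
    #capturing season, episode number, imdb id, title and description.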
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?<br>(.*?)<br>'''
    #regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?<br>(.*?)</td>'''
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.findall(cdata)
    for match in m:
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        episodes[episode] = {}
        episodes[episode]['imdb'] = match[2]
        episodes[episode]['title'] = match[3].strip()
        description = htmldecode(match[4])
        description = stripTags(description.split('Next US airings:')[0])
        episodes[episode]['description'] = description
      except:
        import traceback
        traceback.print_exc()
    self.episodes = episodes
    return self.episodes

  def getLocations(self, forcereload=False):
    if forcereload or not self.locationSource:
      self.locationSource = read_url_utf8(self.locationUrl)
    return self.locationSource

  def parseLocations(self):
    soup = BeautifulSoup(self.getLocations())
    locations = []
    for key in soup('a', {'href': re.compile('^/List')}):
      locations.append(htmldecode(key.string))
    self.locations = locations
    return self.locations

  def getKeywords(self, forcereload=False):
    if forcereload or not self.keywordSource:
      self.keywordSource = read_url_utf8(self.keywordUrl)
    return self.keywordSource

  def parseKeywords(self):
    soup = BeautifulSoup(self.getKeywords())
    keywords = []
    for key in soup('a', {'href': re.compile('^/keyword/')}):
      k = htmldecode(key.string)
      k = k.replace(u'\xa0', ' ')
      keywords.append(k)
    self.keywords = keywords
    return self.keywords

  def getTrivia(self, forcereload=False):
    if forcereload or not self.triviaSource:
      self.triviaSource = read_url_utf8(self.triviaUrl)
    return self.triviaSource

  def parseTrivia(self):
    trivia = []
    soup = BeautifulSoup(self.getTrivia())
    for i in soup('ul', {'class': "trivia"}):
      for t in i('li'):
        t = str(t).replace('<br />', '').strip()
        if t.startswith('<li>') and t.endswith('</li>'):
          t = t[4:-5].strip()
        trivia.append(t)
    self.trivia = trivia
    return self.trivia

  def getConnections(self, forcereload=False):
    if forcereload or not self.connectionsSource:
      self.connectionsSource = read_url_utf8(self.connectionsUrl)
    return self.connectionsSource

  def parseConnections(self):
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})[0]
    blocks = str(content).split('<h5>')[1:]
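    #Each block is expected to look like (markup assumed):
    #  Follows</h5> ... <a href="/title/tt0123456/">Some Title</a> ...
    #the text before </h5> names the relation, the anchors carry the
    #connected titles.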
    for c in blocks:
      connection = c.split('</h5>')[0]
      cs = BeautifulSoup(c)
      if connection:
        #relation -> list of imdb ids
        connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
    return connections

  def getReleaseinfo(self, forcereload=False):
    if forcereload or not self.releaseinfoSource:
      self.releaseinfoSource = read_url_utf8(self.releaseinfoUrl)
    return self.releaseinfoSource

  def parseReleaseinfo(self):
    soup = BeautifulSoup(self.getReleaseinfo())
    for row in soup('table', {'border': '0', 'cellpadding': '2'})[0]('tr'):
      d = row('td', {'align': 'right'})
      if d:
        try:
          possible_date = stripTags(str(d[0])).strip()
          rdate = time.strptime(possible_date, "%d %B %Y")
          rdate = time.strftime('%Y-%m-%d', rdate)
          return rdate
        except:
          pass
    return None

  def getBusiness(self, forcereload=False):
    if forcereload or not self.businessSource:
      self.businessSource = read_url_utf8(self.businessUrl)
    return self.businessSource

  def parseBusiness(self):
    soup = BeautifulSoup(self.getBusiness())
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    content = soup('div', {'id': 'tn15content'})[0]
    blocks = str(content).split('<h5>')[1:]
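    #Blocks are expected to look like "Budget</h5> $25,000,000 (estimated)"
    #(markup assumed); '\$(.*?) ' then yields '25,000,000' -> 25000000 and
    #the largest amount per block is kept.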
    for c in blocks:
      line = c.split('</h5>')
      if line:
        title = line[0]
        line = line[1]
        if title in ['Budget', 'Gross']:
          values = re.compile('\$(.*?) ').findall(line)
          values = [int(value.replace(',', '')) for value in values]
          if values:
            business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business

def guess(title, director=''):
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
  return_url = ''

  #let's first try google
  #i.e. site:imdb.com Michael Stevens "Sin"
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google(search, 1):
    if url.startswith('http://www.imdb.com/title/tt'):
      return url[28:35]

  try:
    req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
    data = u.read()
    return_url = u.url
    u.close()
  except:
    return None
  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]
  if data:
    #fall back to the first entry under "Popular Results"; the list and
    #anchor markup in this pattern is assumed, the seven dots capture the id
    imdb_id = _getTerm(data.replace('\n', ' '),
                       'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id
  return None
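
if __name__ == '__main__':
  #Minimal usage sketch, not part of the scraper itself: resolve a title
  #to an imdb id, then scrape and print a few fields. Assumes network
  #access, a writable cache_base and a working google module; the
  #title/director values are just examples.
  imdb_id = guess(u'Sin', director='Michael Stevens')
  if imdb_id:
    info = IMDb(imdb_id).parse()
    print info['title'], info['year'], info['rating']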