python-oxweb/oxweb/imdb.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
from urllib import quote, unquote
import re
import os
import time

import chardet
import oxlib
from oxlib import stripTags, decodeHtml, findRe, findString
import oxlib.cache
from oxlib.normalize import normalizeTitle, normalizeImdbId
from oxlib import *

import google

'''
    IMDb data never times out in the cache; to refresh it, remove the data
    from the cache folder.
'''
def readUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
    '''
    Read url through oxlib.cache.readUrlUnicode and return its result.

    All four arguments are forwarded positionally and unchanged; this
    wrapper only changes the default timeout to -1.
    '''
    cached_reader = oxlib.cache.readUrlUnicode
    return cached_reader(url, data, headers, timeout)

'''
check if result is valid while updating
def validate(result, header):
    return header['status'] == u'200'

try:
    d = oxlib.cache.readUrlUnicode(url, data, headers, timeout=0, valid=validate)
except oxlib.cache.InvalidResult, e:
    print e.headers

'''
def getMovieId(title, director='', year=''):
    '''
    Search Google (restricted to imdb.com) for a title and return the
    7-character id taken from the first IMDb title URL among the results,
    or '' when none of the results is a title URL.

    >>> getMovieId('The Matrix')
    '0133093'
    '''
    title_prefix = 'http://www.imdb.com/title/tt'
    if year:
        title = "%s (%s)" % (title, year)
    if not director:
        query = 'site:imdb.com "%s"' % title
    else:
        query = 'site:imdb.com %s "%s"' % (director, title)
    for (_name, link, _desc) in google.find(query, 3, timeout=-1):
        if not link.startswith(title_prefix):
            continue
        # the id is the 7 characters directly after ".../title/tt"
        start = len(title_prefix)
        return link[start:start + 7]
    return ''

def getMovieData(imdbId):
    '''
    Return the dictionary produced by IMDb(imdbId).parse().
    '''
    movie = IMDb(imdbId)
    return movie.parse()

# internal functions below
def getUrlBase(imdbId):
    '''
    Return the base URL of the IMDb title page for imdbId, with a
    trailing slash so that sub-page names can be appended directly.
    '''
    template = "http://www.imdb.com/title/tt%s/"
    return template % imdbId

def getRawMovieData(imdbId):
    '''
    Collect the output of the individual page scrapers into one dict.

    Starts from the getMovieInfo() dict and adds (or overwrites) one key
    per sub-page; images and trailers are nested under 'media'.
    '''
    movie_id = normalizeImdbId(imdbId)
    record = getMovieInfo(movie_id)

    first_batch = (
        ('credits', getMovieCredits),
        ('poster', getMoviePoster),
        ('company credits', getMovieCompanyCredits),
        ('filming locations', getMovieLocations),
        ('movie connections', getMovieConnections),
        ('external reviews', getMovieExternalReviews),
        ('trivia', getMovieTrivia),
        ('keywords', getMovieKeywords),
    )
    for key, scrape in first_batch:
        record[key] = scrape(movie_id)

    media = {}
    media['images'] = getMovieImages(movie_id)
    media['trailers'] = getMovieTrailers(movie_id)
    record['media'] = media

    second_batch = (
        ('plotsummary', getMoviePlot),
        ('release dates', getMovieReleaseDates),
        ('release date', getMovieReleaseDate),
    )
    for key, scrape in second_batch:
        record[key] = scrape(movie_id)
    return record

def getMovieInfo(imdbId, timeout=-1):
    '''
    Scrape the main title page of imdbId and return a dict of its fields.

    The dict always contains 'poster', 'title', 'year', 'rating' (float,
    or -1 when no rating was found) and 'votes' (int, or -1). In addition
    every "<h5>label:</h5>" info block found on the page is stored under
    its lower-cased label, as a string or as a list of strings depending
    on the separators found in the text. Episode pages may additionally
    get 'series_imdb', 'series title', 'episode title', 'season' and
    'episode'.

    timeout is forwarded to readUrlUnicode.
    '''
    data = readUrlUnicode(getUrlBase(imdbId), timeout=timeout)
    info = dict()
    info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
    if info['poster'] and '_V' in info['poster']:
        # drop the "._V..." suffix from the image URL and re-append ".jpg"
        info['poster']= "%s.jpg" % info['poster'].split('._V')[0]

    # every info block: i[0] is the label markup, i[1] the value markup
    for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
        title = stripTags(i[0]).strip().lower()
        if title in ('genre', ):
            # cut the genre value before the "more" link / closing div
            txt = i[1].split('<a class="tn15more')[0].split('</div>')[0]
        else:
            txt= i[1]
        txt = stripTags(txt).strip()

        def cleanUp(k):
            # decode entities, turn non-breaking spaces into plain spaces
            # and remove a trailing "more" link text
            k = decodeHtml(k).replace(u'\xa0', ' ').strip()
            if k.endswith('more'): k=k[:-len('more')].strip()
            return k
        txt = cleanUp(txt)
        # free-text fields are kept as one string; everything else is split
        # into a list when it contains '|' or ', '
        if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline', 'original air date'):
            if '|' in txt:
                txt = [cleanUp(k) for k in txt.split('|')]
            elif ', ' in txt:
                txt = [cleanUp(k) for k in txt.split(', ')]
            elif title in ('country', 'language', 'genre'):
                # these three are always lists, even with a single entry
                txt = [cleanUp(txt), ]
        if title == 'tv series':
            info['series_imdb'] = findRe(i[1], 'tt(\d{7})')
        if title == 'original air date':
            # last line is kept as episode info, first line as the date
            info['series_episode_info'] = txt.split('\n')[-1].strip()
            txt = txt.split('\n')[0].strip()
        if not title.startswith('moviemeter'):
            info[title] = txt
    # fields that are deliberately not part of the result
    for key in ('user comments', 'writers (wga)', 'plot keywords'):
       if key in info:
          del info[key]
    if 'release date' in info:
        if isinstance(info['release date'], list):
            info['release date'] = info['release date'][0]
        info['release date'] = info['release date'].split('\n')[0]
    if 'plot' in info:
        # strip the synopsis link texts from the plot
        info['plot'] = info['plot'].split('| add synopsis')[0].strip()
        info['plot'] = info['plot'].split('| full synopsis')[0].strip()
        if info['plot'] in ('add synopsis', 'full synopsis'):
            info['plot'] = ''

    #get Title
    title = ''
    year = ''
    html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
    if not html_title:
        # fall back to the page <title> when the title div is missing
        html_title = findRe(data, '<title>(.*?)</title>')
    else:
        html_title = html_title.split('<span class="pro-link">')[0]
    if html_title:
        html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
        title = stripTags(html_title)
        title = decodeHtml(title)
        year = findRe(title, '\((\d{4})\)')
        if not year:
            # also accept "(1999" followed by something other than ")"
            year = findRe(title, '\((\d{4})')
        # remove the "(YYYY)", "(????)" or "(YYYY/II)" marker from the title
        _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
        if _y:
            title = title.replace(_y, '')
        for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
            title = title.replace(t, '')
    title = title.strip()
    if title.find(u'\xa0') > -1:
        # keep only the part before the first non-breaking space
        title = title[:title.find(u'\xa0')].strip()
    if title.startswith('"') and title.endswith('"'):
        title = title[1:-1]
    info['title'] = normalizeTitle(title)
    info['year'] = year

    #Series
    # a title of the form '"Series" Episode' (exactly two quotes, the first
    # one leading) is split into series title and episode title
    if title.startswith('"') and title.find('"',1) > 0 and \
        title.find('"',1) == title.rfind('"'):
        episode_title = title[title.rfind('"')+1:]
        episode_title = re.sub("\?{4}", "", episode_title).strip()
        episode_title = re.sub("\d{4}", "", episode_title).strip()
        if episode_title == '-': episode_title=''
        title = normalizeTitle(title[1:title.rfind('"')])
        if episode_title:
            info['episode title'] = episode_title
            info['series title'] = title
            info['title'] = "%s: %s" % (title, episode_title)
        else:
            info['title'] = title

    se = re.compile("Season (\d*), Episode (\d*)\)").findall(info.get('series_episode_info', ''))
    if se:
        info['season'] = int(se[0][0])
        info['episode'] = int(se[0][1])
        # NOTE(review): 'series title' and 'episode title' are only set in
        # the branch above when an episode title was found; if that branch
        # did not run this lookup raises KeyError -- confirm and guard.
        info['title'] = "%s (S%02dE%02d) %s" % (
                    info['series title'], info['season'], info['episode'], info['episode title'])
        info['title'] = info['title'].strip()
        del info['series_episode_info']

    #Rating
    rating = findRe(data, '<b>([\d\.]*?)/10</b>')
    if rating:
        info['rating'] = float(rating)
    else:
        info['rating'] = -1

    #Votes
    info['votes'] = -1
    if "user rating" in info:
        # the field may have been split into a list above; re-join it
        if isinstance(info['user rating'], list):
            info['user rating'] = ' '.join(info['user rating'])
        votes = findRe(info['user rating'], '([\d,]*?) votes')
        if votes:
            info['votes'] = int(votes.replace(',', ''))
    return info

def getMovieRuntimeSeconds(imdbId):
    '''
    Return the runtime of imdbId in seconds.

    Returns -1 when the title page has no 'runtime' field and 0 when the
    field is present but contains neither a "min" nor a "sec" figure.
    Only the first runtime entry is used when several are listed.
    '''
    info = getMovieInfo(imdbId)
    if 'runtime' not in info:
        return -1

    runtime = info['runtime']
    # getMovieInfo stores a list only when the field contained '|' or ', ';
    # a single runtime such as "120 min" is stored as a plain string.
    # Indexing that string with [0] would yield just its first character,
    # so only index when the value really is a sequence of entries.
    if isinstance(runtime, (list, tuple)):
        value = runtime[0] if runtime else ''
    else:
        value = runtime

    minutes = findRe(findRe(value, '(.*?) min'), '([0-9]+)')
    if minutes:
        return int(minutes) * 60

    seconds = findRe(findRe(value, '(.*?) sec'), '([0-9]+)')
    if seconds:
        return int(seconds)
    return 0

def getMoviePoster(imdbId):
    '''
    Return the 'poster' entry of getMovieInfo(imdbId).
    '''
    return getMovieInfo(imdbId)['poster']

def getMovieYear(imdbId):
    '''
    Return the 'year' entry of getMovieInfo(imdbId).

    >>> getMovieYear('0315404')
    u'1964'

    >>> getMovieYear('0734840')
    u'1990'

    >>> getMovieYear('0815352')
    u'1964'
    '''
    return getMovieInfo(imdbId)['year']

def getMovieTitle(imdbId):
    '''
    Return the 'title' entry of getMovieInfo(imdbId).

    >>> getMovieTitle('0306414')
    u'The Wire'

    >>> getMovieTitle('0734840')
    u'Twin Peaks (S01E02) Episode #1.2'

    >>> getMovieTitle('0749451')
    u'The Wire (S01E01) The Target'
    '''
    return getMovieInfo(imdbId)['title']

def getMovieAKATitles(imdbId):
    '''
    Return the alternative titles listed in the "akas" table of the
    release info page as a list of two-item tuples, one per table row.

    >>> getMovieAKATitles('0040980')
    [(u'Frauen der Nacht', u'Germany'),
     (u'Les femmes de la nuit', u'France'),
     (u'Women of the Night', u'(undefined)')]
    '''
    page = readUrlUnicode("%sreleaseinfo" % getUrlBase(imdbId))
    # isolate the table that follows the "akas" anchor, then read its rows
    akas_table = findRe(page, 'name="akas".*?<table.*?>(.*?)</table>')
    row_pattern = re.compile("td>(.*?)</td>\n\n<td>(.*)</td>")
    return row_pattern.findall(akas_table)

def creditList(data, section=None):
    '''
    Extract [name, role] pairs from one section of a full-credits page.

    data is the HTML of the section. section selects the row pattern
    ('cast' rows differ from all other rows) and, for 'writers', turns on
    extra clean-up of the second column.
    '''
    if section == 'cast':
        row_pattern = '''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>'''
    else:
        row_pattern = '''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>'''

    rows = []
    for first_html, second_html in re.compile(row_pattern).findall(data):
        first = stripTags(decodeHtml(first_html).strip())
        second = stripTags(decodeHtml(second_html).strip())
        if section == 'writers':
            # drop line breaks and parentheses, and a dangling " and"
            second = second.replace('<br>', '').strip().replace(')', '').replace('(','')
            if second.endswith(' and'):
                second = second[:-4]
        rows.append([first, second])
    return rows

def getMovieCredits(imdbId):
    '''
    Return the full credits of imdbId as a dict that maps each section
    name (the name attribute of the section's glossary link) to the list
    produced by creditList() for that section.
    '''
    page = readUrlUnicode("%sfullcredits" % getUrlBase(imdbId))
    #<a class="glossary" name="writers" href="/glossary/W#writer">Writing credits</a>
    section_pattern = re.compile('''name="(.*?)".*? href="/Glossary''', re.IGNORECASE)
    credits = {}
    for chunk in page.split('<h5>'):
        names = section_pattern.findall(chunk)
        if not names:
            continue
        credits[names[0]] = creditList(chunk, names[0])
    return credits

def getMovieTrailers(imdbId):
    '''
    Return a list of dicts, one per link in the first "video-gallery" div
    of the trailers page. Each dict has the keys 'title', 'url', 'iframe'
    and 'flv'; the player page of every trailer is fetched to find 'flv'.
    '''
    from BeautifulSoup import BeautifulSoup

    page = readUrlUnicode("%strailers" % getUrlBase(imdbId))
    galleries = BeautifulSoup(page)('div', {'class':"video-gallery"})
    if not galleries:
        return []

    trailers = []
    for anchor in galleries[0]('a'):
        link = 'http://www.imdb.com' + anchor['href']
        player = "http://www.imdb.com/video/trailer/%s/player" % findRe(link, '/(vi\d*?)/')
        # the player page passes the video file to the flash player
        player_html = readUrlUnicode(player)
        trailers.append({
            'title': stripTags(unicode(anchor)).strip(),
            'url': link,
            'iframe': player,
            'flv': unquote(findRe(player_html, 'addVariable\("file", "(.*?)"')),
        })
    return trailers

def getMovieQuotes(imdbId):
    '''
    Return the quotes page of imdbId as a list of (speaker, text) tuples,
    both stripped of surrounding whitespace.
    '''
    page = readUrlUnicode("%squotes" % getUrlBase(imdbId))
    quote_pattern = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL)
    matches = quote_pattern.findall(findString(page, '<a name="q'))
    return [(match[0].strip(), match[1].strip()) for match in matches]

def getMoviePlot(imdbId):
    '''
    Return the first plot paragraph of the plot summary page, stripped.
    '''
    page = readUrlUnicode("%splotsummary" % getUrlBase(imdbId))
    paragraph = findRe(page, '<p class="plotpar">(.*?)<i>')
    # keep only what precedes the first closing </p>
    first_part = paragraph.split('</p>')[0]
    return first_part.strip()

def getMovieTechnical(imdbId):
    '''
    Return the technical page of imdbId as a dict that maps each <h5>
    heading to the text following it up to the next <br/>; both stripped.
    '''
    page = readUrlUnicode("%stechnical" % getUrlBase(imdbId))
    entries = re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(page)
    return dict((heading.strip(), text.strip()) for heading, text in entries)

def getMovieCompanyCredits(imdbId):
    '''
    Return the company credits of imdbId as a dict that maps each <h2>
    heading (stripped) to the list of raw <li> contents of its list.
    '''
    page = readUrlUnicode("%scompanycredits" % getUrlBase(imdbId))
    group_pattern = re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>')
    item_pattern = re.compile('<li>(.*?)</li>')
    credits = {}
    for heading, items_html in group_pattern.findall(page):
        credits[heading.strip()] = item_pattern.findall(items_html)
    return credits

def getMovieLocations(imdbId):
    '''
    Return the link texts of the locations page of imdbId as a list of
    HTML-decoded strings.
    '''
    page = readUrlUnicode("%slocations" % getUrlBase(imdbId))
    link_pattern = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>')
    return [decodeHtml(place) for place in link_pattern.findall(page)]

def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
    '''
    Return {key: {image url: alt text}} for each media index category in
    keys. Image URLs have their "._V..." suffix replaced by ".jpg". In the
    'still_frame' category, images whose URL contains "_CR0" are skipped.
    '''
    image_pattern = re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''')
    photos = {}
    for key in keys:
        page = readUrlUnicode("%smediaindex?refine=%s" % (getUrlBase(imdbId), key))
        found = {}
        for alt_text, source in image_pattern.findall(page):
            if key == 'still_frame' and "_CR0" in source:
                continue
            found["%s.jpg" % source.split('._V')[0]] = alt_text
        photos[key] = found
    return photos

def getMovieStills(imdbId):
    '''
    Return the 'still_frame' images of imdbId as {image url: alt text}.
    '''
    images = getMovieImages(imdbId, ['still_frame'])
    return images['still_frame']

def getMoviePosters(imdbId):
    '''
    Return the 'poster' images of imdbId as {image url: title}. The poster
    from the main title page, if there is one, is added (or relabelled)
    with the title 'main poster'.
    '''
    posters = getMovieImages(imdbId, ['poster'])['poster']
    main_poster = getMoviePoster(imdbId)
    if not main_poster:
        return posters
    posters[main_poster] = 'main poster'
    return posters
  
def getMovieTrivia(imdbId):
    '''
    Return the trivia entries of imdbId as a list of cleaned-up strings.

    Entries are read from the <li> items of the "trivia" list; when that
    yields nothing, the "sodatext" divs of the page are used instead.
    '''
    url = "%strivia" % getUrlBase(imdbId)
    data_ = readUrlUnicode(url)
    data = findRe(data_, '<ul class="trivia">(.*?)</ul>')
    trivia = re.compile('<li>(.*?)</li>', re.DOTALL).findall(data)

    def clean(t):
        t = decodeHtml(t)
        # The character replaced here was lost from the source literal,
        # leaving replace(u'', '"'), which inserts a quote between every
        # character. NOTE(review): presumably it was a typographic double
        # quote; the curly quotes and their C1 (cp1252) forms are mapped to
        # a plain '"' via explicit escapes -- confirm against real pages.
        for mark in (u'\x93', u'\x94', u'\u201c', u'\u201d'):
            t = t.replace(mark, '"')
        t = t.strip()
        # remove trailing line breaks, longest form first
        for s in ('<br><br>', '<br>\n<br>', '<br>'):
            if t.endswith(s):
                t = t[:-len(s)].strip()
        return t.strip()

    if not trivia:
        trivia = re.compile('<div class="sodatext">(.*?)<span', re.DOTALL).findall(data_)

    return [clean(t) for t in trivia]

def getMovieConnections(imdbId):
    '''
    Return the movie connections of imdbId as a dict that maps each <h5>
    heading to the list of 7-digit title ids linked in its section.
    '''
    page = readUrlUnicode("%smovieconnections" % getUrlBase(imdbId))
    section_pattern = re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL)
    link_pattern = re.compile('''<a href="/title/tt(\d{7})/">''')
    connections = {}
    for heading, body in section_pattern.findall(page):
        connections[unicode(heading)] = link_pattern.findall(body)
    return connections

def getMovieKeywords(imdbId):
    '''
    Return the keyword link texts of the keywords page of imdbId, HTML
    decoded and with non-breaking spaces replaced by plain spaces.
    '''
    page = readUrlUnicode("%skeywords" % getUrlBase(imdbId))
    link_pattern = re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''')
    return [decodeHtml(text).replace(u'\xa0', ' ')
            for text in link_pattern.findall(page)]

def getMovieExternalReviews(imdbId):
    '''
    Return the external reviews of imdbId as a dict {review url: link text},
    read from the first ordered list of the external reviews page.
    '''
    page = readUrlUnicode("%sexternalreviews" % getUrlBase(imdbId))
    listing = findRe(page, '<ol>(.*?)</ol>')
    link_pattern = re.compile('<li><a href="(http.*?)".*?>(.*?)</a></li>')
    # findall yields (url, text) pairs; later duplicates replace earlier ones
    return dict(link_pattern.findall(listing))

def getMovieReleaseDate(imdbId):
    '''
    Return the smallest date (second tuple item) among the entries of
    getMovieReleaseDates(imdbId), or None when there are no entries.
    Dates are compared as returned by _parseDate.
    '''
    earliest = None
    for release in getMovieReleaseDates(imdbId):
        candidate = release[1]
        # keep the current value only if it is set and not later
        if earliest and not candidate < earliest:
            continue
        earliest = candidate
    return earliest

def _parseDate(d):
    '''
    >>>_parseDate('3 March 1972')
    '1972-03-03'
    '''
    try:
        parsed_date = time.strptime(d, "%d %B %Y")
        parsed_date = '%s-%02d-%02d' % (parsed_date.tm_year, parsed_date.tm_mon, parsed_date.tm_mday)
        return parsed_date
    except:
        try:
            parsed_date = time.strptime(d, "%B %Y")
            parsed_date = '%s-%02d-01' % (parsed_date.tm_year, parsed_date.tm_mon)
            return parsed_date
        except:
            pass
        try:
            parsed_date = time.strptime(d, "%Y")
            parsed_date = '%s-01-01' % (parsed_date.tm_year)
            return parsed_date
        except:
            pass
    return d

def getMovieReleaseDates(imdbId):
    '''
    Return the rows of the release info table of imdbId as a list of
    3-tuples: (first column text, date converted by _parseDate,
    HTML-decoded third column text).
    '''
    page = readUrlUnicode("%sreleaseinfo" % getUrlBase(imdbId))
    row_pattern = re.compile(
        '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>''',
        re.DOTALL)
    return [
        (stripTags(left).strip(),
         _parseDate(stripTags(date).strip()),
         decodeHtml(stripTags(right).strip()))
        for left, date, right in row_pattern.findall(page)
    ]

def getMovieBusinessSum(imdbId):
    '''
    Return {'budget': int, 'gross': int, 'profit': int} for imdbId.

    Budget is the first '$' entry of the budget list, or the first entry
    of any currency when there is no '$' entry. Gross is the first '$'
    entry of the gross list. Profit is gross minus budget and is only
    computed when both are non-zero; missing figures stay 0.
    '''
    business = getMovieBusiness(imdbId)
    totals = {'budget': 0, 'gross': 0, 'profit': 0}

    if 'budget' in business:
        all_budgets = business['budget']
        dollar_budgets = [entry for entry in all_budgets if entry.startswith('$')]
        chosen = dollar_budgets or all_budgets
        totals['budget'] = int(intValue(chosen[0].replace(',', '')))

    if 'gross' in business:
        dollar_gross = [entry for entry in business['gross'] if entry.startswith('$')]
        if dollar_gross:
            totals['gross'] = int(intValue(dollar_gross[0].replace(',', '')))

    if totals['budget'] and totals['gross']:
        totals['profit'] = totals['gross'] - totals['budget']
    return totals

def getMovieFlimingDates(imdbId):
    '''
    Return the first 'filming dates' entry of the business page of imdbId,
    or '' when there is none.
    '''
    dates = getMovieBusiness(imdbId).get('filming dates')
    if dates:
        return dates[0]
    return ''

def getMovieBusiness(imdbId):
    '''
    Return the business page of imdbId as a dict that maps each lower-cased
    <h5> heading to the list of its <br/>-separated entries, with tags
    stripped and HTML entities decoded.
    '''
    page = readUrlUnicode("%sbusiness" % getUrlBase(imdbId))
    section_pattern = re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL)
    business = {}
    for heading, body in section_pattern.findall(page):
        entries = []
        for chunk in body.split('<br/>'):
            entries.append(decodeHtml(stripTags(chunk).strip()))
        business[stripTags(heading).strip().lower()] = entries
    return business

def getMovieEpisodes(imdbId):
    '''
    Return the episodes listed on the episodes page of imdbId.

    The result maps 'SxxEyy' keys to dicts with the keys 'imdb', 'title',
    'description' and 'date'. 'title' is u'' for generic "Episode #<season>"
    titles and 'date' is '' when the air date cannot be parsed. An entry
    that fails to parse is reported with a traceback and does not stop the
    remaining entries from being processed.
    '''
    import traceback

    url = "%sepisodes" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    episodes = {}
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
    for r in re.compile(regexp, re.DOTALL).findall(data):
        try:
            season = int(r[0])
            episode = "S%02dE%02d" % (season, int(r[1]))
            entry = episodes[episode] = {}
            entry['imdb'] = r[2]
            entry['title'] = r[3].strip()
            if entry['title'].startswith('Episode #%d' % season):
                entry['title'] = u''
            description = decodeHtml(r[5])
            description = stripTags(description.split('Next US airings:')[0])
            entry['description'] = description.strip()
            entry['date'] = ''
            try:
                d = stripTags(r[4]).replace('Original Air Date: ', '')
                entry['date'] = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
            except ValueError:
                # air date missing or in another format: keep ''
                pass
        except Exception:
            # print_exc() writes the traceback itself and returns None, so
            # it must not be wrapped in a print statement
            traceback.print_exc()
    return episodes

'''the old code below'''

class IMDb:
    '''
    Older object-style interface around the module-level scraper functions
    for a single title. parse() builds the combined result dictionary.
    '''
    def __init__(self, imdbId):
        # imdbId is stored as given; pageUrl is the title's main page
        self.imdb = imdbId
        self.pageUrl = getUrlBase(imdbId)

    def getPage(self):
        '''Return the content of the main title page.'''
        return readUrlUnicode(self.pageUrl)

    def parse_raw_value(self, key, value):
        '''
        Convert the raw HTML value of the info field key into its parsed
        form: seconds for 'runtime', lists for 'country', 'language' and
        'genre', strings for the other known keys. Unknown keys are printed
        and returned unchanged.
        '''
        if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
            # cut off the "more" link and remove the markup
            value = stripTags(value.split('<a class="tn15more')[0]).strip()
        if key == 'runtime':
            # the raw value is ignored; the runtime is looked up separately
            parsed_value = getMovieRuntimeSeconds(self.imdb)
        elif key in ('country', 'language'):
            # entries are separated by ' / ' or, failing that, ' | '
            parsed_value = value.split(' / ')
            if len(parsed_value) == 1:
                parsed_value = parsed_value[0].split(' | ')
            parsed_value = [v.strip() for v in parsed_value]
        elif key == 'genre':
            # NOTE(review): replace('more', '') removes "more" anywhere in
            # the text, not only a trailing link label -- confirm intended.
            parsed_value = value.replace('more', '').strip().split(' / ')
            if len(parsed_value) == 1:
                parsed_value = parsed_value[0].split(' | ')
            parsed_value = [v.strip() for v in parsed_value]
        elif key == 'tagline':
            parsed_value = value.replace('more', '').strip()
        elif key == 'plot_outline':
            parsed_value = value.replace('(view trailer)', '').strip()
            if parsed_value.endswith('more'):
                parsed_value = parsed_value[:-4].strip()
        elif key == 'tv_series':
            # id of the series this episode belongs to, or ''
            m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
            if m:
                parsed_value = m[0][0]
            else:
                parsed_value = ''
        elif key == 'also_known_as':
            # prefer the international English title, then the USA title
            parsed_value = ''
            m = re.compile('(.*) \(International: English title').findall(value)
            if m:
                parsed_value = m[0]
            else:
                m = re.compile('(.*) \(USA').findall(value)
                if m:
                    parsed_value = m[0]
            # keep the last <br />-separated entry, without its "(...)" part
            parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
            # remove a leading "<director>'s" from the title
            director = self.getCredits().get('director', None)
            if director:
                director = director[0]
                parsed_value = parsed_value.replace(director, '')
            if parsed_value.startswith("'s"):
                parsed_value = parsed_value[2:].strip()
            parsed_value = decodeHtml(parsed_value.strip())
        else:
            # unknown key: print the raw value and pass it through
            print value
            parsed_value = value
        return parsed_value

    def parseYear(self):
        '''Return the year of the title via getMovieYear.'''
        return getMovieYear(self.imdb)

    def parse(self):
        '''
        Build, store in self.IMDbDict and return the combined dictionary
        for this title: basic info from the main page plus the results of
        the per-page scraper functions.
        '''
        from BeautifulSoup import BeautifulSoup

        data = self.getPage()
        IMDbDict ={}
        info = getMovieInfo(self.imdb)
        #Poster
        IMDbDict['poster'] = getMoviePoster(self.imdb)
        if not IMDbDict['poster']:
            # placeholder image used when the title has no poster
            IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
        #Title, Year
        IMDbDict['year'] = self.parseYear()
        IMDbDict['title'] = getMovieTitle(self.imdb)

        #Rating
        #FIXME: in the future this could be just:
        #m = findRe(data, '<span id="voteuser">(.*?)</span>')
        m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
        if m:
            r = stripTags(m.group(1))
            if r:
                # rating is stored as an integer, scaled by 1000
                IMDbDict['rating'] = int(float(r) * 1000)
            else:
                IMDbDict['rating'] = -1
        else:
            IMDbDict['rating'] = -1
        #Votes
        IMDbDict['votes'] = info['votes']

        data = data.replace('\n',' ')
        #some values
        keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
        for key in keys:
            IMDbDict[key] = ''
        IMDbDict['runtime'] = 0
        soup = BeautifulSoup(data)
        # NOTE: the loop variable reuses the name `info`, so the getMovieInfo
        # result read above is no longer reachable after this loop
        for info in soup('div', {'class': 'info'}):
            key = unicode(info).split('</h5>')[0].split('<h5>')
            if len(key) > 1:
                raw_value = unicode(info).split('</h5>')[1]
                # "Plot Outline:" -> "plot_outline"
                key = key[1][:-1].lower().replace(' ', '_')
                if key in keys:
                    IMDbDict[key] = self.parse_raw_value(key, raw_value)
        # NOTE(review): 'also_known_as' is always present (initialised to ''
        # above), so the pop() default of the title is never used -- confirm
        # whether an empty english title should fall back to the title.
        IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
        #is episode
        IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

        IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
        if IMDbDict['episodes']:
            IMDbDict['tvshow'] = True
        else:
            IMDbDict['tvshow'] = False
        IMDbDict['credits'] = self.getCredits()
        IMDbDict['plot'] = getMoviePlot(self.imdb)
        IMDbDict['keywords'] = getMovieKeywords(self.imdb)
        IMDbDict['trivia'] = getMovieTrivia(self.imdb)
        IMDbDict['connections'] = getMovieConnections(self.imdb)
        IMDbDict['locations'] = getMovieLocations(self.imdb)
        IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
        IMDbDict['business'] = getMovieBusinessSum(self.imdb)
        IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
        IMDbDict['stills'] = getMovieStills(self.imdb)
        #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
        self.IMDbDict = IMDbDict

        if IMDbDict['episode_of']:
            # episodes inherit missing country/language from their series
            # NOTE(review): episode_of[key] raises KeyError when the series
            # page lacks that field -- confirm whether that can happen.
            episode_of = getMovieInfo(IMDbDict['episode_of'])
            for key in ('country', 'language'):
                if not IMDbDict[key]:
                    IMDbDict[key] = episode_of[key]
        return self.IMDbDict

    def getCredits(self):
        '''
        Return (and store in self.credits) a dict with name lists for
        'director', 'writer', 'producer', 'cinematographer' and 'editor',
        and a list of two-item tuples for 'cast', built from
        getMovieCredits.
        '''
        raw_credits = getMovieCredits(self.imdb)
        credits = {}

        def getNames(creditList):
            # first column of every credit row, cleaned of markup
            return [stripTags(decodeHtml(c[0])) for c in creditList]

        # a missing section defaults to '', which yields an empty list
        credits['director'] = getNames(raw_credits.get('directors', ''))
        credits['writer'] = getNames(raw_credits.get('writers', ''))
        credits['producer'] = getNames(raw_credits.get('producers', ''))
        credits['cinematographer'] = getNames(raw_credits.get('cinematographers', ''))
        credits['editor'] = getNames(raw_credits.get('editors', ''))
        credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]

        self.credits = credits
        return self.credits


def _readImdbSearch(url):
    '''Fetch url; return (body, final url after redirects), always closing the handle.'''
    req = urllib2.Request(url, None, oxlib.net.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
    try:
        return u.read(), u.url
    finally:
        u.close()


def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
    '''
    Guess the IMDb id for title (optionally narrowed by director).

    Tries, in order: a Google search restricted to imdb.com, the IMDb
    search page (either a redirect to a title page or the first "Popular
    Results" entry), and the IMDb AKA title search. Returns the id as a
    string, or None when nothing was found or the first IMDb request
    failed. Errors of the final AKA request are not caught.
    '''
    #FIXME: proper file -> title
    # keep only what precedes the first '-', '(' or '.'
    title = title.split('-')[0]
    title = title.split('(')[0]
    title = title.split('.')[0]
    title = title.strip()
    title_prefix = 'http://www.imdb.com/title/tt'
    quoted_title = quote(title.encode('utf-8'))

    # let's first try google
    #i.e. site:imdb.com Michael Stevens Sin
    if director:
        search = 'site:imdb.com %s "%s"' % (director, title)
    else:
        search = 'site:imdb.com "%s"' % title
    for (name, url, desc) in google.find(search, 2, timeout=timeout):
        if url.startswith(title_prefix):
            return normalizeImdbId(int(oxlib.intValue(url)))

    try:
        data, return_url = _readImdbSearch('http://www.imdb.com/find?q=%s' % quoted_title)
    except Exception:
        # best effort: a failed search means "no guess"; unlike a bare
        # except this lets KeyboardInterrupt and SystemExit through
        return None
    if return_url.startswith(title_prefix):
        # a unique match redirects straight to the title page
        return return_url[28:35]
    if data:
        imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
        if imdb_id:
            return imdb_id

    # last resort: search the alternative (AKA) titles
    data, return_url = _readImdbSearch(
        'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quoted_title)
    if return_url.startswith(title_prefix):
        return return_url[28:35]

    return None

def getEpisodeData(title, episode, show_url = None):
    '''
      Collect information about an episode.

      title is the name of the show and episode the episode key as used by
      getMovieEpisodes (e.g. 'S01E02'). When show_url is given, the IMDb id
      is taken from it; otherwise it is guessed from title.

      Returns dict with title, show, description and episode, plus imdb
      when the show was found. Raises KeyError when the show was found but
      has no such episode.
    '''
    episodeData = {
        'title': u'',
        'show': title,
        'description': u'',
        'episode': episode,
    }
    if not show_url:
        imdbid = guess(title)
    else:
        # the id is read from the show_url parameter; the previous code
        # referenced an undefined name here and raised NameError
        imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(show_url)[0])
    if imdbid:
        i = IMDb(imdbid).parse()
        details = i['episodes'][episode]
        episodeData['title'] = details['title']
        episodeData['description'] = details['description']
        episodeData['imdb'] = details['imdb']
    return episodeData

def getPersonData(imdbId):
    '''
      Collect information about a person from their imdb name page.

      The id is normalized with normalizeImdbId and the page
      http://www.imdb.com/name/nm<id>/ is read with readUrlUnicode.

      Returns dict with
        'name':   content of the page's <title> element
        'movies': dict mapping each filmography section heading to the
                  list of 7 digit title ids linked in that section
    '''
    personId = normalizeImdbId(imdbId)
    html = readUrlUnicode(u'http://www.imdb.com/name/nm%s/' % personId)
    name = findRe(html, u'<title>(.*?)</title>')

    titleLink = re.compile(u'href="/title/tt(\d{7})/"')
    # only the markup in front of the "Additional Details" heading is scanned
    filmography = html.split(u'<h3>Additional Details</h3>')[0]
    moviesBySection = {}
    # the first piece is whatever precedes the first filmo div, skip it
    for chunk in filmography.split(u'<div class="filmo"')[1:]:
        heading = decodeHtml(findRe(chunk, u'a name=".*?">(.*?):</a></h5>'))
        moviesBySection[heading] = titleLink.findall(chunk)

    return {'name': name, 'movies': moviesBySection}

if __name__ == '__main__':
    import sys
    #print parse(sys.argv[1])
    print "imdb:", guess(sys.argv[1])
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
+								# -*- coding: utf-8 -*-
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								# vi:si:et:sw=4:sts=4:ts=4
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
+								import urllib2
-												get Trailers

											
										
										
											2008-04-29 19:09:10 +00:00
+								from urllib import quote, unquote
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								import re
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
+								import os
 								import time
 								import chardet
-												back to oxlib, package_dir does not work with pip/python setup.py develop

											
										
										
											2009-10-12 15:18:59 +00:00
+								import oxlib
 								from oxlib import stripTags, decodeHtml, findRe, findString
 								import oxlib.cache
 								from oxlib.normalize import normalizeTitle, normalizeImdbId
 								from oxlib import *
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
 								import google
-												never update imdb cache for now

											
										
										
											2009-05-31 20:28:01 +00:00
+								'''
 								    never timeout imdb data, to update cache remove data from cache folder
 								'''
-												back to oxlib, package_dir does not work with pip/python setup.py develop

											
										
										
											2009-10-12 15:18:59 +00:00
+								def readUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
 								    return oxlib.cache.readUrlUnicode(url, data, headers, timeout)
-												reduce parsing time drastically thanks to updated encoding detection and removing BeautifulSoup

											
										
										
											2008-06-17 11:07:53 +00:00
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								'''
 								check if result is valid while updating
 								def validate(result, header):
 								    return header['status'] == u'200'
 								try:
-												back to oxlib, package_dir does not work with pip/python setup.py develop

											
										
										
											2009-10-12 15:18:59 +00:00
+								    d = oxlib.cache.readUrlUnicode(url, data, headers, timeout=0, valid=validate)
 								except oxlib.cache.InvalidResult, e:
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    print e.headers
 								'''
-												- changes to imdb.py
  * user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId


											
										
										
											2008-04-29 16:12:27 +00:00
+								def getMovieId(title, director='', year=''):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    '''
 								    >>> getMovieId('The Matrix')
 								    '0133093'
 								    '''
 								    if year:
 								        title = "%s (%s)" % (title, year)
 								    if director:
 								        query = 'site:imdb.com %s "%s"' % (director, title)
 								    else:
 								        query = 'site:imdb.com "%s"' % title
-												movie title searches can do without being updated every 24 hours

											
										
										
											2009-08-19 21:00:31 +00:00
+								    for (name, url, desc) in google.find(query, 3, timeout=-1):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        if url.startswith('http://www.imdb.com/title/tt'):
 								            return url[28:35]
 								    return ''
-												- changes to imdb.py
  * user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId


											
										
										
											2008-04-29 16:12:27 +00:00
 								def getMovieData(imdbId):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    return IMDb(imdbId).parse()
-												- changes to imdb.py
  * user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId


											
										
										
											2008-04-29 16:12:27 +00:00
 								# internal functions below
 								def getUrlBase(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    return "http://www.imdb.com/title/tt%s/" % imdbId
-												- changes to imdb.py
  * user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId


											
										
										
											2008-04-29 16:12:27 +00:00
 								def getRawMovieData(imdbId):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    imdbId = normalizeImdbId(imdbId)
 								    data = getMovieInfo(imdbId)
 								    data['credits'] = getMovieCredits(imdbId)
 								    data['poster'] = getMoviePoster(imdbId)
 								    data['company credits'] = getMovieCompanyCredits(imdbId)
 								    data['filming locations'] = getMovieLocations(imdbId)
 								    data['movie connections'] = getMovieConnections(imdbId)
 								    data['external reviews'] = getMovieExternalReviews(imdbId)
 								    data['trivia'] = getMovieTrivia(imdbId)
 								    data['keywords'] = getMovieKeywords(imdbId)
 								    data['media'] = {}
 								    data['media']['images'] = getMovieImages(imdbId)
 								    data['media']['trailers'] = getMovieTrailers(imdbId)
 								    data['plotsummary'] = getMoviePlot(imdbId)
 								    data['release dates'] = getMovieReleaseDates(imdbId)
 								    data['release date'] = getMovieReleaseDate(imdbId)
 								    return data
-												add one  dailymotion function

											
										
										
											2008-04-30 13:31:50 +00:00
-												imdb cleanups

											
										
										
											2009-11-30 00:46:56 +00:00
+								def getMovieInfo(imdbId, timeout=-1):
 								    data = readUrlUnicode(getUrlBase(imdbId), timeout=timeout)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    info = dict()
 								    info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
 								    if info['poster'] and '_V' in info['poster']:
 								        info['poster']= "%s.jpg" % info['poster'].split('._V')[0]
 								    for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
 								        title = stripTags(i[0]).strip().lower()
-												imdb cleanups

											
										
										
											2009-11-30 00:46:56 +00:00
+								        if title in ('genre', ):
-												cast and genre cleanup

											
										
										
											2010-04-08 09:52:44 +00:00
+								            txt = i[1].split('<a class="tn15more')[0].split('</div>')[0]
-												imdb cleanups

											
										
										
											2009-11-30 00:46:56 +00:00
+								        else:
 								            txt= i[1]
 								        txt = stripTags(txt).strip()
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        def cleanUp(k):
 								            k = decodeHtml(k).replace(u'\xa0', ' ').strip()
 								            if k.endswith('more'): k=k[:-len('more')].strip()
 								            return k
 								        txt = cleanUp(txt)
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								        if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline', 'original air date'):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								            if '|' in txt:
 								                txt = [cleanUp(k) for k in txt.split('|')]
 								            elif ', ' in txt:
 								                txt = [cleanUp(k) for k in txt.split(', ')]
-												genre is array

											
										
										
											2009-08-03 10:44:04 +00:00
+								            elif title in ('country', 'language', 'genre'):
-												country is always dict

											
										
										
											2008-07-16 18:05:30 +00:00
+								                txt = [cleanUp(txt), ]
-												add some sites, fix some bugs

											
										
										
											2008-07-29 17:04:23 +00:00
+								        if title == 'tv series':
 								            info['series_imdb'] = findRe(i[1], 'tt(\d{7})')
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								        if title == 'original air date':
-												only use season episode if its in the right place

											
										
										
											2008-07-13 13:31:16 +00:00
+								            info['series_episode_info'] = txt.split('\n')[-1].strip()
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								            txt = txt.split('\n')[0].strip()
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        if not title.startswith('moviemeter'):
 								            info[title] = txt
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								    for key in ('user comments', 'writers (wga)', 'plot keywords'):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								       if key in info:
 								          del info[key]
 								    if 'release date' in info:
-												fix release date for 0107394

											
										
										
											2008-11-11 00:34:34 +00:00
+								        if isinstance(info['release date'], list):
 								            info['release date'] = info['release date'][0]
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        info['release date'] = info['release date'].split('\n')[0]
 								    if 'plot' in info:
 								        info['plot'] = info['plot'].split('| add synopsis')[0].strip()
 								        info['plot'] = info['plot'].split('| full synopsis')[0].strip()
 								        if info['plot'] in ('add synopsis', 'full synopsis'):
 								            info['plot'] = ''
 								    #get Title
 								    title = ''
 								    year = ''
 								    html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
 								    if not html_title:
 								        html_title = findRe(data, '<title>(.*?)</title>')
-												no pro link

											
										
										
											2008-09-18 08:18:46 +00:00
+								    else:
 								        html_title = html_title.split('<span class="pro-link">')[0]
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    if html_title:
 								        html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
-												first strip, decode entities after that

											
										
										
											2009-07-14 12:49:00 +00:00
+								        title = stripTags(html_title)
 								        title = decodeHtml(title)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        year = findRe(title, '\((\d{4})\)')
 								        if not year:
 								            year = findRe(title, '\((\d{4})')
 								        _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
 								        if _y:
 								            title = title.replace(_y, '')
 								        for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
 								            title = title.replace(t, '')
 								    title = title.strip()
 								    if title.find(u'\xa0') > -1:
 								        title = title[:title.find(u'\xa0')].strip()
 								    if title.startswith('"') and title.endswith('"'):
 								        title = title[1:-1]
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								    info['title'] = normalizeTitle(title)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    info['year'] = year
-												only use season episode if its in the right place

											
										
										
											2008-07-13 13:31:16 +00:00
 								    #Series
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								    if title.startswith('"') and title.find('"',1) > 0 and \
 								        title.find('"',1) == title.rfind('"'):
 								        episode_title = title[title.rfind('"')+1:]
 								        episode_title = re.sub("\?{4}", "", episode_title).strip()
 								        episode_title = re.sub("\d{4}", "", episode_title).strip()
 								        if episode_title == '-': episode_title=''
 								        title = normalizeTitle(title[1:title.rfind('"')])
 								        if episode_title:
 								            info['episode title'] = episode_title
 								            info['series title'] = title
 								            info['title'] = "%s: %s" % (title, episode_title)
 								        else:
 								            info['title'] = title
-												only use season episode if its in the right place

											
										
										
											2008-07-13 13:31:16 +00:00
+								    se = re.compile("Season (\d*), Episode (\d*)\)").findall(info.get('series_episode_info', ''))
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								    if se:
 								        info['season'] = int(se[0][0])
 								        info['episode'] = int(se[0][1])
 								        info['title'] = "%s (S%02dE%02d) %s" % (
 								                    info['series title'], info['season'], info['episode'], info['episode title'])
 								        info['title'] = info['title'].strip()
-												only use season episode if its in the right place

											
										
										
											2008-07-13 13:31:16 +00:00
+								        del info['series_episode_info']
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
 								    #Rating
 								    rating = findRe(data, '<b>([\d\.]*?)/10</b>')
 								    if rating:
 								        info['rating'] = float(rating)
 								    else:
 								        info['rating'] = -1
 								    #Votes
-												ratings can not exist now

											
										
										
											2009-08-20 19:50:07 +00:00
+								    info['votes'] = -1
 								    if "user rating" in info:
-												use only one year parser, more relaxed about user rating

											
										
										
											2010-01-07 00:08:08 +00:00
+								        if isinstance(info['user rating'], list):
 								            info['user rating'] = ' '.join(info['user rating'])
-												ratings can not exist now

											
										
										
											2009-08-20 19:50:07 +00:00
+								        votes = findRe(info['user rating'], '([\d,]*?) votes')
 								        if votes:
 								            info['votes'] = int(votes.replace(',', ''))
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    return info
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
-												fix imdb serach and mapping

											
										
										
											2009-07-10 08:47:01 +00:00
+								def getMovieRuntimeSeconds(imdbId):
 								    info = getMovieInfo(imdbId)
-												less BeautifulSoup

											
										
										
											2009-08-06 10:10:57 +00:00
+								    if 'runtime' in info:
 								        value = info['runtime'][0]
 								        parsed_value = findRe(value, '(.*?) min')
-												fix imdb serach and mapping

											
										
										
											2009-07-10 08:47:01 +00:00
+								        parsed_value = findRe(parsed_value, '([0-9]+)')
 								        if not parsed_value:
-												less BeautifulSoup

											
										
										
											2009-08-06 10:10:57 +00:00
+								            parsed_value = findRe(value, '(.*?) sec')
 								            parsed_value = findRe(parsed_value, '([0-9]+)')
 								            if not parsed_value:
 								                parsed_value = 0
 								            else:
 								                parsed_value = int(parsed_value)
-												fix imdb serach and mapping

											
										
										
											2009-07-10 08:47:01 +00:00
+								        else:
-												less BeautifulSoup

											
										
										
											2009-08-06 10:10:57 +00:00
+								            parsed_value = int(parsed_value) * 60
-												fix imdb serach and mapping

											
										
										
											2009-07-10 08:47:01 +00:00
+								    else:
-												less BeautifulSoup

											
										
										
											2009-08-06 10:10:57 +00:00
+								        parsed_value = -1
-												fix imdb serach and mapping

											
										
										
											2009-07-10 08:47:01 +00:00
+								    return parsed_value
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
-												add one  dailymotion function

											
										
										
											2008-04-30 13:31:50 +00:00
+								def getMoviePoster(imdbId):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    info = getMovieInfo(imdbId)
 								    return info['poster']
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
-												add one  dailymotion function

											
										
										
											2008-04-30 13:31:50 +00:00
+								def getMovieYear(imdbId):
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								    '''
 								    >>> getMovieYear('0315404')
 								    u'1964'
 								    >>> getMovieYear('0734840')
 								    u'1990'
 								    >>> getMovieYear('0815352')
 								    u'1964'
 								    '''
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    info = getMovieInfo(imdbId)
 								    return info['year']
-												add one  dailymotion function

											
										
										
											2008-04-30 13:31:50 +00:00
 								def getMovieTitle(imdbId):
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								    '''
 								    >>> getMovieTitle('0306414')
 								    u'The Wire'
 								    >>> getMovieTitle('0734840')
 								    u'Twin Peaks (S01E02) Episode #1.2'
 								    >>> getMovieTitle('0734840')
 								    u'Twin Peaks (S01E02) Episode #1.2'
 								    >>> getMovieTitle('0749451')
 								    u'The Wire (S01E01) The Target'
 								    '''
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    info = getMovieInfo(imdbId)
 								    return info['title']
-												- changes to imdb.py
  * user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId


											
										
										
											2008-04-29 16:12:27 +00:00
-												add AKATitles

											
										
										
											2009-08-02 16:23:17 +00:00
+								def getMovieAKATitles(imdbId):
 								    '''
 								    >>> getMovieAKATitle('0040980')
 								    [(u'Frauen der Nacht', u'Germany'),
 								     (u'Les femmes de la nuit', u'France'),
 								     (u'Women of the Night', u'(undefined)')]
 								    '''
 								    url = "%sreleaseinfo" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												add AKATitles

											
										
										
											2009-08-02 16:23:17 +00:00
+								    titles = findRe(data, 'name="akas".*?<table.*?>(.*?)</table>')
 								    titles = re.compile("td>(.*?)</td>\n\n<td>(.*)</td>").findall(titles)
 								    return titles
-												- changes to imdb.py
  * user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId


											
										
										
											2008-04-29 16:12:27 +00:00
+								def creditList(data, section=None):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    if section == 'cast':
 								        credits_ = re.compile('''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>''').findall(data)
 								    else:
 								        credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
 								    credits = []
 								    for c_ in credits_:
-												imdb cleanups

											
										
										
											2009-11-30 00:46:56 +00:00
+								        c = [stripTags(decodeHtml(c_[0]).strip()), stripTags(decodeHtml(c_[1]).strip())]
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        if section=='writers':
 								            c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(','')
 								            if c[1].endswith(' and'): c[1] = c[1][:-4]
 								        credits.append(c)
 								    return credits
-												- changes to imdb.py
  * user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId


											
										
										
											2008-04-29 16:12:27 +00:00
-												add one  dailymotion function

											
										
										
											2008-04-30 13:31:50 +00:00
+								def getMovieCredits(imdbId):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    credits = dict()
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%sfullcredits" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    groups = data.split('<h5>')
 								    for g in groups:
-												cast and genre cleanup

											
										
										
											2010-04-08 09:52:44 +00:00
+								        #<a class="glossary" name="writers" href="/glossary/W#writer">Writing credits</a>
 								        section = re.compile('''name="(.*?)".*? href="/Glossary''', re.IGNORECASE).findall(g)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        if section:
 								            credits[section[0]] = creditList(g, section[0])
 								    return credits
-												- changes to imdb.py
  * user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId


											
										
										
											2008-04-29 16:12:27 +00:00
-												get Trailers

											
										
										
											2008-04-29 19:09:10 +00:00
+								def getMovieTrailers(imdbId):
-												only import BeautifulSoup if used in deprecated functions

											
										
										
											2010-01-22 23:03:14 +00:00
+								    from BeautifulSoup import BeautifulSoup
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%strailers" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    soup = BeautifulSoup(data)
 								    videos = soup('div', {'class':"video-gallery"})
 								    trailers = []
 								    if videos:
 								        for a in videos[0]('a'):
 								            title = stripTags(unicode(a)).strip()
 								            url = 'http://www.imdb.com' + a['href']
 								            videoId = findRe(url, '/(vi\d*?)/')
 								            iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								            iframe = readUrlUnicode(iframeUrl)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								            videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
 								            trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
 								    return trailers
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
+								def getMovieQuotes(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%squotes" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
 								    quotes = [(q[0].strip(),q[1].strip())  for q in quotes]
 								    return quotes
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
-												move plot parsing, remove use of BeautifulSoup from getMovieInfo

											
										
										
											2008-05-23 11:08:40 +00:00
+								def getMoviePlot(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%splotsummary" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												fix plot, fix dont fail in youtube

											
										
										
											2008-12-07 14:39:39 +00:00
+								    plot = findRe(data, '<p class="plotpar">(.*?)<i>').split('</p>')[0]
 								    return plot.strip()
-												move plot parsing, remove use of BeautifulSoup from getMovieInfo

											
										
										
											2008-05-23 11:08:40 +00:00
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
+								def getMovieTechnical(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%stechnical" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    results = {}
 								    for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
 								        results[t[0].strip()] = t[1].strip()
 								    return results
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
 								def getMovieCompanyCredits(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%scompanycredits" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    results = {}
 								    for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
 								        results[field.strip()] = []
 								        for company in re.compile('<li>(.*?)</li>').findall(c):
 								            results[field.strip()].append(company)
 								    return results
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
 								def getMovieLocations(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%slocations" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												less BeautifulSoup

											
										
										
											2009-08-06 10:10:57 +00:00
+								    locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
-												imdb cleanups

											
										
										
											2009-11-30 00:46:56 +00:00
+								    locations = [decodeHtml(l) for l in locations]
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    return locations
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
 								def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    photos = {}
 								    for key in keys:
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								        url = "%smediaindex?refine=%s" % (getUrlBase(imdbId), key)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								        data = readUrlUnicode(url)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        photos[key] = {}
 								        for s in  re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
 								            img= "%s.jpg" % s[1].split('._V')[0]
 								            title = s[0]
 								            if key=='still_frame':
 								                if not "_CR0" in s[1]:
 								                    photos[key][img] = title
 								            else:
 								                photos[key][img] = title
 								    return photos
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
-												get Trailers

											
										
										
											2008-04-29 19:09:10 +00:00
def getMovieStills(imdbId):
    '''Return only the 'still_frame' images of getMovieImages as {url: title}.'''
    images = getMovieImages(imdbId, ['still_frame'])
    return images['still_frame']
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
 								def getMoviePosters(imdbId):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    posters = getMovieImages(imdbId, ['poster'])['poster']
 								    poster = getMoviePoster(imdbId)
 								    if poster:
 								        posters[poster] = 'main poster'
 								    return posters
-												return unscaled poster, include main poster in getMoviePosters

											
										
										
											2008-05-10 08:29:15 +00:00
-												more raw values from imdb

											
										
										
											2008-04-29 22:15:28 +00:00
def getMovieTrivia(imdbId):
    '''
    Return the trivia items from the title's "trivia" page as a list of strings.

    Items are taken from the <li> elements of <ul class="trivia">. If that
    yields nothing, the content of every <div class="sodatext"> (up to the
    following "<span") is used instead. Each item is cleaned with clean().
    '''
    page = readUrlUnicode("%strivia" % getUrlBase(imdbId))

    def clean(t):
        # decode entities, normalise the closing quote, drop trailing <br>s
        t = decodeHtml(t)
        # NOTE(review): the character to replace was an empty literal in this
        # copy of the file, which would insert '"' between all characters.
        # It was most likely a typographic closing quote lost to an encoding
        # problem; U+201D is assumed here - confirm against upstream.
        t = t.replace(u'\u201d', '"').strip()
        for s in ('<br><br>', '<br>\n<br>', '<br>'):
            if t.endswith(s):
                t = t[:-len(s)].strip()
        return t.strip()

    listing = findRe(page, '<ul class="trivia">(.*?)</ul>')
    trivia = re.compile('<li>(.*?)</li>', re.DOTALL).findall(listing)
    if not trivia:
        # fall back to the other page layout
        trivia = re.compile('<div class="sodatext">(.*?)<span', re.DOTALL).findall(page)
    return [clean(t) for t in trivia]
-												get Trailers

											
										
										
											2008-04-29 19:09:10 +00:00
-												add one  dailymotion function

											
										
										
											2008-04-30 13:31:50 +00:00
def getMovieConnections(imdbId):
    '''
    Return the title's "movieconnections" page as {heading: [imdb ids]}.

    Each <h5> heading is mapped to the 7 digit ids of all "/title/tt.../"
    links that follow it, up to the next blank line.
    '''
    page = readUrlUnicode("%smovieconnections" % getUrlBase(imdbId))
    section_re = re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL)
    id_re = re.compile(r'''<a href="/title/tt(\d{7})/">''')
    connections = {}
    for heading, body in section_re.findall(page):
        connections[unicode(heading)] = id_re.findall(body)
    return connections
-												add one  dailymotion function

											
										
										
											2008-04-30 13:31:50 +00:00
 								def getMovieKeywords(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%skeywords" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    keywords = []
 								    for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
 								        keyword = decodeHtml(keyword)
 								        keyword = keyword.replace(u'\xa0', ' ')
 								        keywords.append(keyword)
 								    return keywords
-												add one  dailymotion function

											
										
										
											2008-04-30 13:31:50 +00:00
 								def getMovieExternalReviews(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%sexternalreviews" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												fix trivia and external reviews

											
										
										
											2009-10-23 18:53:48 +00:00
+								    data = findRe(data, '<ol>(.*?)</ol>')
 								    _reviews = re.compile('<li><a href="(http.*?)".*?>(.*?)</a></li>').findall(data)
-												less BeautifulSoup

											
										
										
											2009-08-06 10:10:57 +00:00
+								    reviews = {}
 								    for r in _reviews:
 								        reviews[r[0]] = r[1]
 								    return reviews
-												add one  dailymotion function

											
										
										
											2008-04-30 13:31:50 +00:00
-												get more things out of IMDb class

											
										
										
											2008-05-25 17:29:14 +00:00
def getMovieReleaseDate(imdbId):
    '''
    Return the smallest date among getMovieReleaseDates, or None if there
    are none.

    Dates are compared as given (second element of each release tuple); an
    empty value picked up so far is replaced by whatever date comes next.
    '''
    earliest = None
    for release in getMovieReleaseDates(imdbId):
        date = release[1]
        if earliest and not (date < earliest):
            continue
        earliest = date
    return earliest
-												get more things out of IMDb class

											
										
										
											2008-05-25 17:29:14 +00:00
-												also parse Year and month year dates

											
										
										
											2008-07-01 12:40:36 +00:00
+								def _parseDate(d):
-												fix parse date

											
										
										
											2008-10-04 15:17:13 +00:00
+								    '''
 								    >>>_parseDate('3 March 1972')
 								    '1972-03-03'
 								    '''
-												also parse Year and month year dates

											
										
										
											2008-07-01 12:40:36 +00:00
+								    try:
 								        parsed_date = time.strptime(d, "%d %B %Y")
-												dont use strftime, it does not work for years like 1897

											
										
										
											2008-10-04 13:57:23 +00:00
+								        parsed_date = '%s-%02d-%02d' % (parsed_date.tm_year, parsed_date.tm_mon, parsed_date.tm_mday)
-												also parse Year and month year dates

											
										
										
											2008-07-01 12:40:36 +00:00
+								        return parsed_date
 								    except:
 								        try:
 								            parsed_date = time.strptime(d, "%B %Y")
-												dont use strftime, it does not work for years like 1897

											
										
										
											2008-10-04 13:57:23 +00:00
+								            parsed_date = '%s-%02d-01' % (parsed_date.tm_year, parsed_date.tm_mon)
-												also parse Year and month year dates

											
										
										
											2008-07-01 12:40:36 +00:00
+								            return parsed_date
 								        except:
 								            pass
 								        try:
 								            parsed_date = time.strptime(d, "%Y")
-												dont use strftime, it does not work for years like 1897

											
										
										
											2008-10-04 13:57:23 +00:00
+								            parsed_date = '%s-01-01' % (parsed_date.tm_year)
-												also parse Year and month year dates

											
										
										
											2008-07-01 12:40:36 +00:00
+								            return parsed_date
 								        except:
 								            pass
 								    return d
-												get more things out of IMDb class

											
										
										
											2008-05-25 17:29:14 +00:00
def getMovieReleaseDates(imdbId):
    '''
    Return one tuple per table row of the title's "releaseinfo" page.

    Each tuple holds the three cells of a row with tags stripped: the first
    cell, the second cell run through _parseDate, and the third cell run
    through decodeHtml.
    '''
    page = readUrlUnicode("%sreleaseinfo" % getUrlBase(imdbId))
    row_re = re.compile(
        '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>''',
        re.DOTALL)
    return [
        (stripTags(first).strip(),
         _parseDate(stripTags(date).strip()),
         decodeHtml(stripTags(last).strip()))
        for first, date, last in row_re.findall(page)
    ]
-												get more things out of IMDb class

											
										
										
											2008-05-25 17:29:14 +00:00
 								def getMovieBusinessSum(imdbId):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    business = getMovieBusiness(imdbId)
 								    b_ = {'budget': 0, 'gross': 0, 'profit': 0}
 								    if 'budget' in business:
-												only last gross / budget, not sum

											
										
										
											2008-09-30 16:05:19 +00:00
+								        #b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']])
-												try to use $ value for budget

											
										
										
											2008-09-30 16:30:40 +00:00
+								        budget = filter(lambda x: x.startswith('$'), business['budget'])
 								        if not budget:
 								            budget = business['budget']
 								        b_['budget'] = int(intValue(budget[0].replace(',', '')))
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    if 'gross' in business:
-												imdb cleanups

											
										
										
											2009-11-30 00:46:56 +00:00
+								        gross = filter(lambda x: x.startswith('$'), business['gross'])
 								        if gross:
 								            b_['gross'] = int(intValue(gross[0].replace(',', '')))
-												only last gross / budget, not sum

											
										
										
											2008-09-30 16:05:19 +00:00
+								        #b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
 								        #if 'weekend gross' in business:
 								        #    b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    if b_['budget'] and b_['gross']:
 								        b_['profit'] = b_['gross'] - b_['budget']
 								    return b_
-												get more things out of IMDb class

											
										
										
											2008-05-25 17:29:14 +00:00
 								def getMovieFlimingDates(imdbId):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    business = getMovieBusiness(imdbId)
 								    if 'filming dates' in business and business['filming dates']:
 								        return business['filming dates'][0]
 								    return ''
-												get more things out of IMDb class

											
										
										
											2008-05-25 17:29:14 +00:00
 								def getMovieBusiness(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%sbusiness" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    business = {}
 								    for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
 								        key = stripTags(r[0]).strip().lower()
 								        value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('<br/>')]
 								        business[key] = value
 								    return business
-												get more things out of IMDb class

											
										
										
											2008-05-25 17:29:14 +00:00
 								def getMovieEpisodes(imdbId):
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								    url = "%sepisodes" % getUrlBase(imdbId)
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								    data = readUrlUnicode(url)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    episodes = {}
 								    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
 								    for r in re.compile(regexp, re.DOTALL).findall(data):
 								        try:
 								            episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
 								            episodes[episode] = {}
 								            episodes[episode]['imdb'] = r[2]
 								            episodes[episode]['title'] = r[3].strip()
 								            if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])):
 								                episodes[episode]['title'] = u''
 								            description = decodeHtml(r[5])
 								            description = stripTags(description.split('Next US airings:')[0])
 								            episodes[episode]['description'] = description.strip()
 								            episodes[episode]['date'] = ''
 								            try:
 								                d = stripTags(r[4])
 								                d = d.replace('Original Air Date: ', '')
 								                d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
 								                episodes[episode]['date'] = d
 								            except:
 								                pass
 								        except:
 								            import traceback
 								            print traceback.print_exc()
 								            pass
 								    return episodes
-												get more things out of IMDb class

											
										
										
											2008-05-25 17:29:14 +00:00
-												get Trailers

											
										
										
											2008-04-29 19:09:10 +00:00
+								'''the old code below'''
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
 								class IMDb:
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    def __init__(self, imdbId):
 								        self.imdb = imdbId
-												cleanup urls

											
										
										
											2009-06-01 13:11:22 +00:00
+								        self.pageUrl = getUrlBase(imdbId)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
 								    def getPage(self):
-												depend on ox, install as ox.web, migrate getUrl to readUrl

											
										
										
											2009-10-12 11:47:43 +00:00
+								        return readUrlUnicode(self.pageUrl)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
 								    def parse_raw_value(self, key, value):
 								        if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
-												remove more...

											
										
										
											2010-04-08 10:01:31 +00:00
+								            value = stripTags(value.split('<a class="tn15more')[0]).strip()
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        if key == 'runtime':
-												fix imdb serach and mapping

											
										
										
											2009-07-10 08:47:01 +00:00
+								            parsed_value = getMovieRuntimeSeconds(self.imdb)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        elif key in ('country', 'language'):
 								            parsed_value = value.split(' / ')
 								            if len(parsed_value) == 1:
 								                parsed_value = parsed_value[0].split(' | ')
 								            parsed_value = [v.strip() for v in parsed_value]
 								        elif key == 'genre':
 								            parsed_value = value.replace('more', '').strip().split(' / ')
 								            if len(parsed_value) == 1:
 								                parsed_value = parsed_value[0].split(' | ')
 								            parsed_value = [v.strip() for v in parsed_value]
 								        elif key == 'tagline':
 								            parsed_value = value.replace('more', '').strip()
 								        elif key == 'plot_outline':
 								            parsed_value = value.replace('(view trailer)', '').strip()
 								            if parsed_value.endswith('more'):
 								                parsed_value = parsed_value[:-4].strip()
 								        elif key == 'tv_series':
 								            m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
 								            if m:
 								                parsed_value = m[0][0]
 								            else:
 								                parsed_value = ''
 								        elif key == 'also_known_as':
 								            parsed_value = ''
 								            m = re.compile('(.*) \(International: English title').findall(value)
 								            if m:
 								                parsed_value = m[0]
 								            else:
 								                m = re.compile('(.*) \(USA').findall(value)
 								                if m:
 								                    parsed_value = m[0]
 								            parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
 								            director = self.getCredits().get('director', None)
 								            if director:
 								                director = director[0]
 								                parsed_value = parsed_value.replace(director, '')
 								            if parsed_value.startswith("'s"):
 								                parsed_value = parsed_value[2:].strip()
 								            parsed_value = decodeHtml(parsed_value.strip())
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
+								        else:
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								            print value
 								            parsed_value = value
 								        return parsed_value
 								    def parseYear(self):
-												use only one year parser, more relaxed about user rating

											
										
										
											2010-01-07 00:08:08 +00:00
+								        return getMovieYear(self.imdb)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
 								    def parse(self):
-												only import BeautifulSoup if used in depricated functions

											
										
										
											2010-01-22 23:03:14 +00:00
+								        from BeautifulSoup import BeautifulSoup
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        data = self.getPage()
 								        IMDbDict ={}
-												fix votes in IMDb

											
										
										
											2009-05-31 18:12:43 +00:00
+								        info = getMovieInfo(self.imdb)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        #Poster
 								        IMDbDict['poster'] = getMoviePoster(self.imdb)
 								        if not IMDbDict['poster']:
 								            IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
 								        #Title, Year
 								        IMDbDict['year'] = self.parseYear()
-												parse title and add some tests

											
										
										
											2008-07-05 13:35:46 +00:00
+								        IMDbDict['title'] = getMovieTitle(self.imdb)
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
 								        #Rating
-												imdb is changing

											
										
										
											2010-01-08 11:37:37 +00:00
+								        #FIXME: in the future this could be just:
-												fix, comment

											
										
										
											2010-01-08 11:52:39 +00:00
+								        #m = findRe(data, '<span id="voteuser">(.*?)</span>')
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
+								        if m:
-												imdb is changing

											
										
										
											2010-01-08 11:37:37 +00:00
+								            r = stripTags(m.group(1))
 								            if r:
 								                IMDbDict['rating'] = int(float(r) * 1000)
 								            else:
 								                IMDbDict['rating'] = -1
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        else:
 								            IMDbDict['rating'] = -1
 								        #Votes
-												fix votes in IMDb

											
										
										
											2009-05-31 18:12:43 +00:00
+								        IMDbDict['votes'] = info['votes']
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
 								        data = data.replace('\n',' ')
 								        #some values
 								        keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
 								        for key in keys:
 								            IMDbDict[key] = ''
 								        IMDbDict['runtime'] = 0
 								        soup = BeautifulSoup(data)
 								        for info in soup('div', {'class': 'info'}):
 								            key = unicode(info).split('</h5>')[0].split('<h5>')
 								            if len(key) > 1:
 								                raw_value = unicode(info).split('</h5>')[1]
 								                key = key[1][:-1].lower().replace(' ', '_')
 								                if key in keys:
 								                    IMDbDict[key] = self.parse_raw_value(key, raw_value)
 								        IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
 								        #is episode
 								        IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
 								        IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
 								        if IMDbDict['episodes']:
 								            IMDbDict['tvshow'] = True
 								        else:
 								            IMDbDict['tvshow'] = False
 								        IMDbDict['credits'] = self.getCredits()
 								        IMDbDict['plot'] = getMoviePlot(self.imdb)
 								        IMDbDict['keywords'] = getMovieKeywords(self.imdb)
 								        IMDbDict['trivia'] = getMovieTrivia(self.imdb)
 								        IMDbDict['connections'] = getMovieConnections(self.imdb)
 								        IMDbDict['locations'] = getMovieLocations(self.imdb)
 								        IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
 								        IMDbDict['business'] = getMovieBusinessSum(self.imdb)
 								        IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
 								        IMDbDict['stills'] = getMovieStills(self.imdb)
 								        #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
 								        self.IMDbDict = IMDbDict
 								        if IMDbDict['episode_of']:
-												country is allways dict

											
										
										
											2008-07-16 18:05:30 +00:00
+								            episode_of = getMovieInfo(IMDbDict['episode_of'])
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								            for key in ('country', 'language'):
 								                if not IMDbDict[key]:
 								                    IMDbDict[key] = episode_of[key]
 								        return self.IMDbDict
 								    def getCredits(self):
 								        raw_credits = getMovieCredits(self.imdb)
 								        credits = {}
 								        def getNames(creditList):
 								            return [stripTags(decodeHtml(c[0])) for c in creditList]
 								        credits['director'] = getNames(raw_credits.get('directors', ''))
 								        credits['writer'] = getNames(raw_credits.get('writers', ''))
 								        credits['producer'] = getNames(raw_credits.get('producers', ''))
-												also return cinematographer and editor

											
										
										
											2009-06-01 13:16:12 +00:00
+								        credits['cinematographer'] = getNames(raw_credits.get('cinematographers', ''))
 								        credits['editor'] = getNames(raw_credits.get('editors', ''))
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								        credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
 								        self.credits = credits
 								        return self.credits
-												- changes to imdb.py
  * user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId


											
										
										
											2008-04-29 16:12:27 +00:00
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
-												add timeout to imdb.guess

											
										
										
											2010-01-24 04:55:41 +00:00
def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
    '''
    Guess the imdb id of a movie from its title (and director).

    The title is cut at the first '-', '(' and '.'. The id is then looked
    for in the first google results for "site:imdb.com", in the imdb search
    for the title (redirect to a title page or first popular result) and
    last in the imdb aka search.

    Returns the id as string, or None if nothing was found or the first
    imdb search could not be read. Errors of the aka search are raised.
    '''
    title_url = 'http://www.imdb.com/title/tt'

    def fetch(url):
        # returns (url after redirects, page), always closes the connection
        req = urllib2.Request(url, None, oxlib.net.DEFAULT_HEADERS)
        u = urllib2.urlopen(req)
        try:
            page = u.read()
            return u.url, page
        finally:
            u.close()

    #FIXME: proper file -> title
    for separator in ('-', '(', '.'):
        title = title.split(separator)[0]
    title = title.strip()
    query = quote(title.encode('utf-8'))

    #lets first try google
    #i.e. site:imdb.com Michael Stevens Sin
    if director:
        search = 'site:imdb.com %s "%s"' % (director, title)
    else:
        search = 'site:imdb.com "%s"' % title
    for (name, url, desc) in google.find(search, 2, timeout=timeout):
        if url.startswith(title_url):
            return normalizeImdbId(int(oxlib.intValue(url)))

    try:
        return_url, data = fetch('http://www.imdb.com/find?q=%s' % query)
    except Exception:
        # imdb can not be reached, give up
        return None
    if return_url.startswith(title_url):
        # search redirected to the title, the id are the 7 chars after "tt"
        return return_url[28:35]
    if data:
        imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
        if imdb_id:
            return imdb_id

    return_url, data = fetch('http://www.imdb.com/find?q=%s;s=tt;site=aka' % query)
    if return_url.startswith(title_url):
        return return_url[28:35]
    return None
 								def getEpisodeData(title, episode, show_url = None):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    '''
 								      Collect information about an episode.
 								      Returns dict with title, show, description and episode
 								    '''
 								    episodeData = {
 								        'title': u'',
 								        'show': title,
 								        'description': u'',
 								        'episode': episode,
 								    }
 								    description = u''
 								    if not show_url:
 								        imdbid = guess(title)
 								    else:
 								        imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
 								    if imdbid:
 								        i = IMDb(imdbid).parse()
 								        episodeData['title'] = i['episodes'][episode]['title']
 								        episodeData['description'] = i['episodes'][episode]['description']
 								        episodeData['imdb'] = i['episodes'][episode]['imdb']
 								    return episodeData
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
-												add imdb.getPersonData

											
										
										
											2008-09-30 14:00:21 +00:00
def getPersonData(imdbId):
    '''
      Read the IMDb name page for imdbId and return a dict with
        'name'   -- contents of the page's <title> element
        'movies' -- dict mapping each filmography section heading to the
                    list of 7-digit title ids linked inside that section
    '''
    person_url = u'http://www.imdb.com/name/nm%s/' % normalizeImdbId(imdbId)
    page = readUrlUnicode(person_url)
    title_link = re.compile(u'href="/title/tt(\d{7})/"')

    # everything after the "Additional Details" heading is ignored
    filmography = page.split(u'<h3>Additional Details</h3>')[0]
    sections = {}
    # the text before the first "filmo" div is not a section, skip it
    for chunk in filmography.split(u'<div class="filmo"')[1:]:
        heading = decodeHtml(findRe(chunk, u'a name=".*?">(.*?):</a></h5>'))
        sections[heading] = title_link.findall(chunk)

    return {
        'name': findRe(page, u'<title>(.*?)</title>'),
        'movies': sections,
    }
-												lets start with google and imdb

											
										
										
											2008-04-28 09:52:21 +00:00
+								if __name__ == '__main__':
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:47:02 +00:00
+								    import sys
 								    #print parse(sys.argv[1])
 								    print "imdb:", guess(sys.argv[1])