parse title and add some tests

2008-07-05 15:35:46 +02:00 · 2008-07-05 15:35:46 +02:00 · cde21b0363
commit cde21b0363
parent b00befc5fc
2 changed files with 57 additions and 27 deletions
--- a/oxweb/dailymotion.py
+++ b/oxweb/dailymotion.py
@@ -7,11 +7,11 @@ from oxlib.cache import getUrl

 def getVideoUrl(url):
    '''
-    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
-    'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'
+    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?key')[0]
+    'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv'

-    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
-    'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
+    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?key')[0]
+    'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv'
    '''
    data = getUrl(url)
    video = re.compile('''video", "(.*?)"''').findall(data)
--- a/oxweb/imdb.py
+++ b/oxweb/imdb.py
@@ -74,14 +74,16 @@ def getMovieInfo(imdbId):
            if k.endswith('more'): k=k[:-len('more')].strip()
            return k
        txt = cleanUp(txt)
-        if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline'):
+        if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline', 'original air date'):
            if '|' in txt:
                txt = [cleanUp(k) for k in txt.split('|')]
            elif ', ' in txt:
                txt = [cleanUp(k) for k in txt.split(', ')]
+        if title == 'original air date':
+            txt = txt.split('\n')[0].strip()
        if not title.startswith('moviemeter'):
            info[title] = txt
-    for key in ('user comments', 'writers (wga)'):
+    for key in ('user comments', 'writers (wga)', 'plot keywords'):
       if key in info:
          del info[key]
    if 'release date' in info:
@@ -115,8 +117,30 @@ def getMovieInfo(imdbId):
        title = title[:title.find(u'\xa0')].strip()
    if title.startswith('"') and title.endswith('"'):
        title = title[1:-1]
-    info['title'] = title
+    info['title'] = normalizeTitle(title)
    info['year'] = year
+    if title.startswith('"') and title.find('"',1) > 0 and \
+        title.find('"',1) == title.rfind('"'):
+        episode_title = title[title.rfind('"')+1:]
+        episode_title = re.sub("\?{4}", "", episode_title).strip()
+        episode_title = re.sub("\d{4}", "", episode_title).strip()
+        if episode_title == '-': episode_title=''
+        title = normalizeTitle(title[1:title.rfind('"')])
+        if episode_title:
+            info['episode title'] = episode_title
+            info['series title'] = title
+            info['title'] = "%s: %s" % (title, episode_title)
+        else:
+            info['title'] = title
+
+    #Series
+    se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
+    if se:
+        info['season'] = int(se[0][0])
+        info['episode'] = int(se[0][1])
+        info['title'] = "%s (S%02dE%02d) %s" % (
+                    info['series title'], info['season'], info['episode'], info['episode title'])
+        info['title'] = info['title'].strip()

    #Rating
    rating = findRe(data, '<b>([\d\.]*?)/10</b>')
@@ -131,17 +155,42 @@ def getMovieInfo(imdbId):
        info['votes'] = int(votes.replace(',', ''))
    else:
        info['votes'] = -1
+
    return info

+
 def getMoviePoster(imdbId):
    info = getMovieInfo(imdbId)
    return info['poster']

 def getMovieYear(imdbId):
+    '''
+    >>> getMovieYear('0315404')
+    u'1964'
+
+    >>> getMovieYear('0734840')
+    u'1990'
+
+    >>> getMovieYear('0815352')
+    u'1964'
+    '''
    info = getMovieInfo(imdbId)
    return info['year']

 def getMovieTitle(imdbId):
+    '''
+    >>> getMovieTitle('0306414')
+    u'The Wire'
+
+    >>> getMovieTitle('0734840')
+    u'Twin Peaks (S01E02) Episode #1.2'
+
+    >>> getMovieTitle('0734840')
+    u'Twin Peaks (S01E02) Episode #1.2'
+
+    >>> getMovieTitle('0749451')
+    u'The Wire (S01E01) The Target'
+    '''
    info = getMovieInfo(imdbId)
    return info['title']

@@ -474,25 +523,6 @@ class IMDb:
            parsed_value = value
        return parsed_value

-    def parseTitle(self):
-        title = getMovieTitle(self.imdb)
-        title = normalizeTitle(title)
-        if title.startswith('"') and title.find('"',1) > 0 and \
-            title.find('"',1) == title.rfind('"'):
-            data = self.getPage()
-            se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
-            if se:
-                se = se[0]
-                se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
-                title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
-            else:
-                part2 = title[title.rfind('"')+1:]
-                part2 = re.sub("[\d\?-]", "", part2).strip()
-                title = normalizeTitle(title[1:title.rfind('"')])
-                if part2:
-                    title += ':' + part2
-        return normalizeTitle(title)
-
    def parseYear(self):
        year = ''
        data = self.getPage()
@@ -520,7 +550,7 @@ class IMDb:
            IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
        #Title, Year
        IMDbDict['year'] = self.parseYear()
-        IMDbDict['title'] = self.parseTitle()
+        IMDbDict['title'] = getMovieTitle(self.imdb)

        #Rating
        m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)