parse title and add some tests
This commit is contained in:
parent
b00befc5fc
commit
cde21b0363
2 changed files with 57 additions and 27 deletions
|
@ -7,11 +7,11 @@ from oxlib.cache import getUrl
|
||||||
|
|
||||||
def getVideoUrl(url):
|
def getVideoUrl(url):
|
||||||
'''
|
'''
|
||||||
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
|
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?key')[0]
|
||||||
'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'
|
'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv'
|
||||||
|
|
||||||
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
|
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?key')[0]
|
||||||
'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
|
'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv'
|
||||||
'''
|
'''
|
||||||
data = getUrl(url)
|
data = getUrl(url)
|
||||||
video = re.compile('''video", "(.*?)"''').findall(data)
|
video = re.compile('''video", "(.*?)"''').findall(data)
|
||||||
|
|
|
@ -74,14 +74,16 @@ def getMovieInfo(imdbId):
|
||||||
if k.endswith('more'): k=k[:-len('more')].strip()
|
if k.endswith('more'): k=k[:-len('more')].strip()
|
||||||
return k
|
return k
|
||||||
txt = cleanUp(txt)
|
txt = cleanUp(txt)
|
||||||
if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline'):
|
if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline', 'original air date'):
|
||||||
if '|' in txt:
|
if '|' in txt:
|
||||||
txt = [cleanUp(k) for k in txt.split('|')]
|
txt = [cleanUp(k) for k in txt.split('|')]
|
||||||
elif ', ' in txt:
|
elif ', ' in txt:
|
||||||
txt = [cleanUp(k) for k in txt.split(', ')]
|
txt = [cleanUp(k) for k in txt.split(', ')]
|
||||||
|
if title == 'original air date':
|
||||||
|
txt = txt.split('\n')[0].strip()
|
||||||
if not title.startswith('moviemeter'):
|
if not title.startswith('moviemeter'):
|
||||||
info[title] = txt
|
info[title] = txt
|
||||||
for key in ('user comments', 'writers (wga)'):
|
for key in ('user comments', 'writers (wga)', 'plot keywords'):
|
||||||
if key in info:
|
if key in info:
|
||||||
del info[key]
|
del info[key]
|
||||||
if 'release date' in info:
|
if 'release date' in info:
|
||||||
|
@ -115,8 +117,30 @@ def getMovieInfo(imdbId):
|
||||||
title = title[:title.find(u'\xa0')].strip()
|
title = title[:title.find(u'\xa0')].strip()
|
||||||
if title.startswith('"') and title.endswith('"'):
|
if title.startswith('"') and title.endswith('"'):
|
||||||
title = title[1:-1]
|
title = title[1:-1]
|
||||||
info['title'] = title
|
info['title'] = normalizeTitle(title)
|
||||||
info['year'] = year
|
info['year'] = year
|
||||||
|
if title.startswith('"') and title.find('"',1) > 0 and \
|
||||||
|
title.find('"',1) == title.rfind('"'):
|
||||||
|
episode_title = title[title.rfind('"')+1:]
|
||||||
|
episode_title = re.sub("\?{4}", "", episode_title).strip()
|
||||||
|
episode_title = re.sub("\d{4}", "", episode_title).strip()
|
||||||
|
if episode_title == '-': episode_title=''
|
||||||
|
title = normalizeTitle(title[1:title.rfind('"')])
|
||||||
|
if episode_title:
|
||||||
|
info['episode title'] = episode_title
|
||||||
|
info['series title'] = title
|
||||||
|
info['title'] = "%s: %s" % (title, episode_title)
|
||||||
|
else:
|
||||||
|
info['title'] = title
|
||||||
|
|
||||||
|
#Series
|
||||||
|
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
|
||||||
|
if se:
|
||||||
|
info['season'] = int(se[0][0])
|
||||||
|
info['episode'] = int(se[0][1])
|
||||||
|
info['title'] = "%s (S%02dE%02d) %s" % (
|
||||||
|
info['series title'], info['season'], info['episode'], info['episode title'])
|
||||||
|
info['title'] = info['title'].strip()
|
||||||
|
|
||||||
#Rating
|
#Rating
|
||||||
rating = findRe(data, '<b>([\d\.]*?)/10</b>')
|
rating = findRe(data, '<b>([\d\.]*?)/10</b>')
|
||||||
|
@ -131,17 +155,42 @@ def getMovieInfo(imdbId):
|
||||||
info['votes'] = int(votes.replace(',', ''))
|
info['votes'] = int(votes.replace(',', ''))
|
||||||
else:
|
else:
|
||||||
info['votes'] = -1
|
info['votes'] = -1
|
||||||
|
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
|
||||||
def getMoviePoster(imdbId):
|
def getMoviePoster(imdbId):
|
||||||
info = getMovieInfo(imdbId)
|
info = getMovieInfo(imdbId)
|
||||||
return info['poster']
|
return info['poster']
|
||||||
|
|
||||||
def getMovieYear(imdbId):
|
def getMovieYear(imdbId):
|
||||||
|
'''
|
||||||
|
>>> getMovieYear('0315404')
|
||||||
|
u'1964'
|
||||||
|
|
||||||
|
>>> getMovieYear('0734840')
|
||||||
|
u'1990'
|
||||||
|
|
||||||
|
>>> getMovieYear('0815352')
|
||||||
|
u'1964'
|
||||||
|
'''
|
||||||
info = getMovieInfo(imdbId)
|
info = getMovieInfo(imdbId)
|
||||||
return info['year']
|
return info['year']
|
||||||
|
|
||||||
def getMovieTitle(imdbId):
|
def getMovieTitle(imdbId):
|
||||||
|
'''
|
||||||
|
>>> getMovieTitle('0306414')
|
||||||
|
u'The Wire'
|
||||||
|
|
||||||
|
>>> getMovieTitle('0734840')
|
||||||
|
u'Twin Peaks (S01E02) Episode #1.2'
|
||||||
|
|
||||||
|
>>> getMovieTitle('0734840')
|
||||||
|
u'Twin Peaks (S01E02) Episode #1.2'
|
||||||
|
|
||||||
|
>>> getMovieTitle('0749451')
|
||||||
|
u'The Wire (S01E01) The Target'
|
||||||
|
'''
|
||||||
info = getMovieInfo(imdbId)
|
info = getMovieInfo(imdbId)
|
||||||
return info['title']
|
return info['title']
|
||||||
|
|
||||||
|
@ -474,25 +523,6 @@ class IMDb:
|
||||||
parsed_value = value
|
parsed_value = value
|
||||||
return parsed_value
|
return parsed_value
|
||||||
|
|
||||||
def parseTitle(self):
|
|
||||||
title = getMovieTitle(self.imdb)
|
|
||||||
title = normalizeTitle(title)
|
|
||||||
if title.startswith('"') and title.find('"',1) > 0 and \
|
|
||||||
title.find('"',1) == title.rfind('"'):
|
|
||||||
data = self.getPage()
|
|
||||||
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
|
|
||||||
if se:
|
|
||||||
se = se[0]
|
|
||||||
se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
|
|
||||||
title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
|
|
||||||
else:
|
|
||||||
part2 = title[title.rfind('"')+1:]
|
|
||||||
part2 = re.sub("[\d\?-]", "", part2).strip()
|
|
||||||
title = normalizeTitle(title[1:title.rfind('"')])
|
|
||||||
if part2:
|
|
||||||
title += ':' + part2
|
|
||||||
return normalizeTitle(title)
|
|
||||||
|
|
||||||
def parseYear(self):
|
def parseYear(self):
|
||||||
year = ''
|
year = ''
|
||||||
data = self.getPage()
|
data = self.getPage()
|
||||||
|
@ -520,7 +550,7 @@ class IMDb:
|
||||||
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
||||||
#Title, Year
|
#Title, Year
|
||||||
IMDbDict['year'] = self.parseYear()
|
IMDbDict['year'] = self.parseYear()
|
||||||
IMDbDict['title'] = self.parseTitle()
|
IMDbDict['title'] = getMovieTitle(self.imdb)
|
||||||
|
|
||||||
#Rating
|
#Rating
|
||||||
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
|
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
|
||||||
|
|
Loading…
Reference in a new issue