imdb cleanups
This commit is contained in:
parent
4e38581c51
commit
80641b5461
1 changed file with 15 additions and 5 deletions
|
@ -75,8 +75,8 @@ def getRawMovieData(imdbId):
|
||||||
data['release date'] = getMovieReleaseDate(imdbId)
|
data['release date'] = getMovieReleaseDate(imdbId)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getMovieInfo(imdbId):
|
def getMovieInfo(imdbId, timeout=-1):
|
||||||
data = readUrlUnicode(getUrlBase(imdbId))
|
data = readUrlUnicode(getUrlBase(imdbId), timeout=timeout)
|
||||||
info = dict()
|
info = dict()
|
||||||
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
|
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||||
if info['poster'] and '_V' in info['poster']:
|
if info['poster'] and '_V' in info['poster']:
|
||||||
|
@ -84,7 +84,12 @@ def getMovieInfo(imdbId):
|
||||||
|
|
||||||
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
|
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
|
||||||
title = stripTags(i[0]).strip().lower()
|
title = stripTags(i[0]).strip().lower()
|
||||||
txt= stripTags(i[1]).strip()
|
if title in ('genre', ):
|
||||||
|
txt = i[1].split('</div>')[0]
|
||||||
|
else:
|
||||||
|
txt= i[1]
|
||||||
|
txt = stripTags(txt).strip()
|
||||||
|
|
||||||
def cleanUp(k):
|
def cleanUp(k):
|
||||||
k = decodeHtml(k).replace(u'\xa0', ' ').strip()
|
k = decodeHtml(k).replace(u'\xa0', ' ').strip()
|
||||||
if k.endswith('more'): k=k[:-len('more')].strip()
|
if k.endswith('more'): k=k[:-len('more')].strip()
|
||||||
|
@ -258,7 +263,7 @@ def creditList(data, section=None):
|
||||||
credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
|
credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
|
||||||
credits = []
|
credits = []
|
||||||
for c_ in credits_:
|
for c_ in credits_:
|
||||||
c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()]
|
c = [stripTags(decodeHtml(c_[0]).strip()), stripTags(decodeHtml(c_[1]).strip())]
|
||||||
if section=='writers':
|
if section=='writers':
|
||||||
c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(','')
|
c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(','')
|
||||||
if c[1].endswith(' and'): c[1] = c[1][:-4]
|
if c[1].endswith(' and'): c[1] = c[1][:-4]
|
||||||
|
@ -328,6 +333,7 @@ def getMovieLocations(imdbId):
|
||||||
url = "%slocations" % getUrlBase(imdbId)
|
url = "%slocations" % getUrlBase(imdbId)
|
||||||
data = readUrlUnicode(url)
|
data = readUrlUnicode(url)
|
||||||
locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
|
locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
|
||||||
|
locations = [decodeHtml(l) for l in locations]
|
||||||
return locations
|
return locations
|
||||||
|
|
||||||
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
|
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
|
||||||
|
@ -366,6 +372,8 @@ def getMovieTrivia(imdbId):
|
||||||
t = t.replace(u'', '"')
|
t = t.replace(u'', '"')
|
||||||
if t.endswith('<br><br>'):
|
if t.endswith('<br><br>'):
|
||||||
t = t[:-8]
|
t = t[:-8]
|
||||||
|
if t.endswith('<br>\n<br>'):
|
||||||
|
t = t[:-len('<br>\n<br>')]
|
||||||
return t.strip()
|
return t.strip()
|
||||||
trivia = [clean(t) for t in trivia]
|
trivia = [clean(t) for t in trivia]
|
||||||
return trivia
|
return trivia
|
||||||
|
@ -454,7 +462,9 @@ def getMovieBusinessSum(imdbId):
|
||||||
b_['budget'] = int(intValue(budget[0].replace(',', '')))
|
b_['budget'] = int(intValue(budget[0].replace(',', '')))
|
||||||
|
|
||||||
if 'gross' in business:
|
if 'gross' in business:
|
||||||
b_['gross'] = int(intValue(business['gross'][0].replace(',', '')))
|
gross = filter(lambda x: x.startswith('$'), business['gross'])
|
||||||
|
if gross:
|
||||||
|
b_['gross'] = int(intValue(gross[0].replace(',', '')))
|
||||||
#b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
|
#b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
|
||||||
#if 'weekend gross' in business:
|
#if 'weekend gross' in business:
|
||||||
# b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
|
# b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
|
||||||
|
|
Loading…
Reference in a new issue