tune criterion

This commit is contained in:
j 2011-03-09 13:10:20 +01:00
parent 15db38b442
commit e0f7b00b86
2 changed files with 26 additions and 3 deletions

View file

@@ -36,6 +36,7 @@ def getData(id):
data["number"] = findRe(html, "<li>Spine #(\d+)") data["number"] = findRe(html, "<li>Spine #(\d+)")
data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]") data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
data["director"] = stripTags(findRe(html, "<h2 class=\"director\">(.*?)</h2>")) data["director"] = stripTags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
results = findRe(html, '<div class="left_column">(.*?)</div>') results = findRe(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results) results = re.compile("<li>(.*?)</li>").findall(results)
@@ -61,8 +62,8 @@ def getData(id):
data["stills"] = [result] data["stills"] = [result]
data["trailers"] = [] data["trailers"] = []
else: else:
data["stills"] = [findRe(html, "\"thumbnailURL\", \"(.*?)\"")] data["stills"] = filter(lambda x: x, [findRe(html, "\"thumbnailURL\", \"(.*?)\"")])
data["trailers"] = [findRe(html, "\"videoURL\", \"(.*?)\"")] data["trailers"] = filter(lambda x: x, [findRe(html, "\"videoURL\", \"(.*?)\"")])
data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year']) data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year'])
return data return data

View file

@@ -315,11 +315,28 @@ def getMovieId(title, director='', year='', timeout=-1):
>>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard') >>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
u'0179214' u'0179214'
''' '''
#print (title, director)
imdbId = {
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
(u'Wings', u'Larisa Shepitko'): '0061196',
(u'The Ascent', u'Larisa Shepitko'): '0075404',
(u'Fanny and Alexander', u'Ingmar Bergman'): '0083922',
(u'Torment', u'Alf Sj\xf6berg'): '0036914',
(u'Crisis', u'Ingmar Bergman'): '0038675',
(u'To Joy', u'Ingmar Bergman'): '0043048',
(u'Humain, trop humain', u'Louis Malle'): '0071635',
(u'Place de la R\xe9publique', u'Louis Malle'): '0071999',
(u'God\u2019s Country', u'Louis Malle'): '0091125',
}.get((title, director), None)
if imdbId:
return imdbId
params = {'s':'tt','q': title} params = {'s':'tt','q': title}
if director: if director:
params['q'] = u'"%s" %s' % (title, director) params['q'] = u'"%s" %s' % (title, director)
if year: if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director) params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q']
params['q'] = params['q'].encode('utf-8') params['q'] = params['q'].encode('utf-8')
params = urllib.urlencode(params) params = urllib.urlencode(params)
url = "http://akas.imdb.com/find?" + params url = "http://akas.imdb.com/find?" + params
@@ -333,9 +350,14 @@ def getMovieId(title, director='', year='', timeout=-1):
return results[0] return results[0]
#otherwise get first result #otherwise get first result
r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"' r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"'
results = re.compile(r).findall(data) results = re.compile(r).findall(data)
if results: if results:
return results[0] return results[0]
#print google_query
results = google.find(google_query)
if results:
return findRe(results[0][1], 'title/tt(\d{7})')
#or nothing #or nothing
return '' return ''