more imdb refinement

This commit is contained in:
j 2010-07-10 10:24:56 +02:00
parent 6afafa7355
commit 18ce4cd92d
2 changed files with 35 additions and 19 deletions

View file

@ -7,12 +7,13 @@ import os
import time import time
import ox import ox
from ox import findRe from ox import findRe, stripTags
from ox.normalize import normalizeTitle, normalizeImdbId from ox.normalize import normalizeTitle, normalizeImdbId
from siteparser import SiteParser from siteparser import SiteParser
import google import google
class Imdb(SiteParser): class Imdb(SiteParser):
regex = { regex = {
'alternative_titles': { 'alternative_titles': {
@ -25,13 +26,17 @@ class Imdb(SiteParser):
}, },
'cast': { 'cast': {
'page': 'combined', 'page': 'fullcredits',
're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>', 're': [
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
lambda ll: [stripTags(l) for l in ll]
],
'type': 'list' 'type': 'list'
}, },
'cinematographers': { 'cinematographers': {
'page': 'combined', 'page': 'fullcredits',
're': [ 're': [
lambda data: data.split('Series Crew')[0],
'Cinematography by</a>(.*?)</table>', 'Cinematography by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>' '<a href="/name/.*?/">(.*?)</a>'
], ],
@ -48,16 +53,18 @@ class Imdb(SiteParser):
'type': 'list' 'type': 'list'
}, },
'directors': { 'directors': {
'page': 'combined', 'page': 'fullcredits',
're': [ 're': [
lambda data: data.split('Series Crew')[0],
'Directed by</a>(.*?)</table>', 'Directed by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>' '<a href="/name/.*?/">(.*?)</a>'
], ],
'type': 'list' 'type': 'list'
}, },
'editors': { 'editors': {
'page': 'combined', 'page': 'fullcredits',
're': [ 're': [
lambda data: data.split('Series Crew')[0],
'Film Editing by</a>(.*?)</table>', 'Film Editing by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>' '<a href="/name/.*?/">(.*?)</a>'
], ],
@ -108,7 +115,7 @@ class Imdb(SiteParser):
}, },
'rating': { 'rating': {
'page': 'combined', 'page': 'combined',
're': '<div class="starbar-meta">.*?<b>(.*?)/10</b>', 're': '<div class="starbar-meta">.*?<b>([\d,.]?)/10</b>',
'type': 'float' 'type': 'float'
}, },
'release_date': { 'release_date': {
@ -141,12 +148,13 @@ class Imdb(SiteParser):
}, },
'votes': { 'votes': {
'page': 'combined', 'page': 'combined',
're': '<a href="ratings" class="tn15more">(.*?) votes</a>', 're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
'type': 'string' 'type': 'string'
}, },
'writers': { 'writers': {
'page': 'combined', 'page': 'fullcredits',
're': [ 're': [
lambda data: data.split('Series Crew')[0],
'Writing credits</a>(.*?)</table>', 'Writing credits</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>' '<a href="/name/.*?/">(.*?)</a>'
], ],
@ -167,8 +175,8 @@ class Imdb(SiteParser):
if 'min' in self['runtime']: base=60 if 'min' in self['runtime']: base=60
else: base=1 else: base=1
self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
else: if 'runtime' in self and not self['runtime']:
self['runtime'] = 0 del self['runtime']
if 'votes' in self: self['votes'] = self['votes'].replace(',', '') if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'connections' in self: if 'connections' in self:
cc={} cc={}
@ -179,7 +187,8 @@ class Imdb(SiteParser):
self['connections'] = cc self['connections'] = cc
for key in ('countries', 'genres'): for key in ('countries', 'genres'):
self[key] = filter(lambda x: x.lower() != 'home', self[key]) if key in self:
self[key] = filter(lambda x: x.lower() != 'home', self[key])
def guess(title, director='', timeout=google.DEFAULT_TIMEOUT): def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
@ -192,7 +201,7 @@ def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
return_url = '' return_url = ''
#let's first try google #let's first try google
#i.e. site:imdb.com Michael Stevens Sin #i.e. site:imdb.com Michael Stevens "Sin"
if director: if director:
search = 'site:imdb.com %s "%s"' % (director, title) search = 'site:imdb.com %s "%s"' % (director, title)
else: else:

View file

@ -15,7 +15,7 @@ def cleanup(key, data, data_type):
data = [decodeHtml(p).strip() for p in data] data = [decodeHtml(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple): elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data] data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1: while len(data) == 1 and not isinstance(data, basestring):
data = data[0] data = data[0]
if data_type == 'list' and isinstance(data, basestring): if data_type == 'list' and isinstance(data, basestring):
data = [data, ] data = [data, ]
@ -37,12 +37,18 @@ class SiteParser(dict):
if isinstance(self.regex[key]['re'], basestring): if isinstance(self.regex[key]['re'], basestring):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data) data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type']) data = cleanup(key, data, self.regex[key]['type'])
elif callable(self.regex[key]['re']):
data = self.regex[key]['re'](data)
else: else:
for r in self.regex[key]['re']: for r in self.regex[key]['re']:
if isinstance(data, basestring): if callable(r):
data = re.compile(r, re.DOTALL).findall(data) f = r
else: else:
data = [re.compile(r, re.DOTALL).findall(d) for d in data] f = re.compile(r, re.DOTALL).findall
if isinstance(data, basestring):
data = f(data)
else:
data = [f(d) for d in data]
data = cleanup(key, data, self.regex[key]['type']) data = cleanup(key, data, self.regex[key]['type'])
def apply_f(f, data): def apply_f(f, data):
if data and isinstance(data[0], list): if data and isinstance(data[0], list):
@ -50,12 +56,13 @@ class SiteParser(dict):
else: else:
data = f(data) data = f(data)
return data return data
if self.regex[key]['type'] == 'float': if self.regex[key]['type'] == 'float' and data:
data = apply_f(float, data) data = apply_f(float, data)
elif self.regex[key]['type'] == 'int': elif self.regex[key]['type'] == 'int':
data = apply_f(int, data) data = apply_f(int, data)
elif self.regex[key]['type'] == 'date': elif self.regex[key]['type'] == 'date':
parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d') parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
data = apply_f(parse_date, data) data = apply_f(parse_date, data)
self[key] = data if data:
self[key] = data