328 lines
12 KiB
Python
328 lines
12 KiB
Python
# -*- coding: utf-8 -*-
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
from __future__ import division
|
|
|
|
import re
|
|
import unicodedata
|
|
from urllib import quote
|
|
import hashlib
|
|
import base64
|
|
|
|
from django.db import models
|
|
from django.conf import settings
|
|
import ox
|
|
from ox.django.fields import DictField
|
|
|
|
from lookup.models import get_movie_id
|
|
from poster.models import getPosters
|
|
|
|
def normalize_value(value):
    """Return *value* as NFD-normalized unicode.

    Byte strings are first decoded as UTF-8; anything that is not a
    string passes through unchanged.  NFD normalization keeps stored
    values comparable regardless of the composed/decomposed form the
    source used.
    """
    if isinstance(value, str):
        value = value.decode('utf-8')
    if not isinstance(value, unicode):
        return value
    return unicodedata.normalize('NFD', value)
|
|
|
|
def find(info, guess=True):
    """Find the Imdb row matching the metadata in ``info``.

    ``info`` either carries an explicit IMDb ``'id'`` or any of the
    fields listed in ``Imdb.keys`` (title, director, year, ...).
    Returns a freshly ``update()``-ed Imdb instance, or None when no
    unambiguous match exists.  With ``guess=True`` an unmatched title is
    looked up via ox.web.imdb and the search is retried once.
    """
    q = Imdb.objects.all()
    if 'id' in info:
        q = q.filter(imdb=info['id'])
        # Unknown but well-formed 7-digit id: create a stub row so the
        # single-result path below can return it.
        if q.count() == 0 and len(info['id']) == 7:
            i = Imdb(imdb=info['id'])
            i.save()
            q = Imdb.objects.filter(imdb=info['id'])
    else:
        # Narrow the queryset by every non-empty metadata field supplied.
        for key in Imdb.keys:
            if key in info and info[key]:
                if isinstance(info[key], basestring):
                    # Case-insensitive match for string fields.
                    fkey = '%s__iexact'%key
                else:
                    fkey = key
                if isinstance(info[key], list):
                    # List values (e.g. several directors) are stored as
                    # newline-joined text with a trailing newline; match
                    # that storage format (see Imdb.update()).
                    fkey = '%s__iexact'%key
                    value = normalize_value(u'\n'.join(info[key]) + '\n')
                else:
                    value = normalize_value(info[key])
                q = q.filter(**{fkey:value})
    # Only an unambiguous single hit counts as a match.
    if q.count() == 1:
        m = q[0]
        m.update()
        return m
    #For now fallback to ox.web.imdb.guess and try again
    if guess and 'title' in info:
        id = ox.web.imdb.get_movie_id(info['title'])
        if id:
            i, created = Imdb.objects.get_or_create(imdb=id)
            if created:
                i.update()
            # Retry once with guessing disabled so we cannot recurse forever.
            return find(info, False)
    return None
|
|
|
|
class Imdb(models.Model):
    """Cached IMDb metadata for one title, keyed by its 7-digit IMDb id.

    Rows are filled/refreshed from ox.web.imdb and mirrored into Match
    lookup keys on every save().
    """
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    imdb = models.CharField(max_length=7, unique=True)
    title = models.CharField(max_length=1000, blank=True, default='')
    originalTitle = models.CharField(max_length=1000, blank=True, default='')
    year = models.CharField(max_length=4, blank=True, default='')
    # Newline-separated list of director names (joined in update(),
    # split back into a list in json()).
    director = models.CharField(max_length=9000, blank=True, default='')

    # Episode-level fields; only populated for TV episodes.
    season = models.IntegerField(blank=True, null=True)
    episode = models.IntegerField(blank=True, null=True)
    episodeTitle = models.CharField(max_length=1000, blank=True, default='')
    episodeYear = models.CharField(max_length=4, blank=True, default='')
    episodeDirector = models.CharField(max_length=1000, blank=True, default='')
    seriesTitle = models.CharField(max_length=1000, blank=True, default='')

    # Set when IMDb returns no data for this id; invalid rows keep no
    # Match entries (see save()).
    invalid = models.BooleanField(default=False)
    # Optional manual overrides applied on top of scraped data.
    patch = DictField(default=None, blank=True, null=True)

    def __unicode__(self):
        return u"[%s] %s%s" % (self.imdb, self.title, self.year and ' (%s)' % self.year or '')

    def save(self, *args, **kwargs):
        super(Imdb, self).save(*args, **kwargs)
        # Keep the Match lookup table in sync with this row on every save.
        if not self.invalid:
            Match.update_item(self)
        else:
            Match.objects.filter(item=self).delete()

    # Metadata fields exposed via json() and matched against in find().
    keys = ('title', 'director', 'year', 'season', 'episode',
        'originalTitle',
        'seriesTitle', 'episodeTitle', 'episodeYear', 'episodeDirector')

    def apply_patch(self, data):
        """Apply manual overrides from self.patch and, for episodes,
        derive a combined display title like 'Series (S01E02) Episode'."""
        if self.patch:
            data.update(self.patch)
        if 'seriesTitle' in data and 'episodeTitle' in data:
            if 'season' in data and 'episode' in data:
                data['title'] = "%s (S%02dE%02d) %s" % (
                    data['seriesTitle'], data['season'], data['episode'], data['episodeTitle'])
            else:
                # No season/episode numbers known; assume season 1.
                data['title'] = "%s (S01) %s" % (data['seriesTitle'], data['episodeTitle'])
            data['title'] = data['title'].strip()
        return data

    def update(self, timeout=None):
        """Refresh this row from IMDb's combined page.

        Copies the fields listed in Imdb.keys onto the model (lists
        become newline-joined text with a trailing newline, strings are
        NFD-normalized) and saves.  When the scrape returns nothing the
        row is flagged invalid instead.
        """
        if timeout != None:
            info = ox.web.imdb.ImdbCombined(self.imdb, timeout=timeout)
        else:
            info = ox.web.imdb.ImdbCombined(self.imdb)
        info = self.apply_patch(info)
        if info:
            for key in self.keys:
                if key in info:
                    value = info[key]
                    if isinstance(value, list):
                        value = u'\n'.join(value) + '\n'
                    if isinstance(value, basestring):
                        value = normalize_value(value)
                    setattr(self, key, value)
            # Negative season/episode apparently act as "unknown" markers
            # upstream; store NULL instead. -- TODO confirm against ox.web.imdb
            if self.season < 0:
                self.season = None
            if self.episode < 0:
                self.episode = None
            self.save()
        elif not self.invalid:
            self.invalid = True
            self.save()

    def data(self, request=None, timeout=ox.cache.cache_timeout):
        """Return the full, post-processed IMDb page data for this title.

        Fetches the detail page via ox.web.imdb, applies self.patch,
        rewrites IMDb-internal links to site-local ones, rescales rating
        and votes, filters reviews against settings.REVIEW_WHITELIST and
        attaches likes/downloads/links/posters.  ``request`` is only
        used to build absolute poster URLs.
        """
        data = ox.web.imdb.Imdb(self.imdb, timeout=timeout)
        data = self.apply_patch(data)

        def fix_links(t):
            # Rewrite '<a href="/name/...">' / '<a href="/title/...">'
            # anchors to site-local '/name=<quoted>' / '/title=<quoted>'
            # links, keeping the visible link text.
            def quote_string(string):
                # Site-specific quoting: '_' escapes to %09, spaces
                # become '_'.
                return quote(string).replace('_', '%09').replace('%20', '_')
            def fix_names(m):
                return '<a href="/name=%s">%s</a>' % (
                    quote_string(m.group(2).encode('utf-8')), m.group(2)
                )
            t = re.sub('<a href="(/name/.*?/)">(.*?)</a>', fix_names, t)

            def fix_titles(m):
                return '<a href="/title=%s">%s</a>' % (
                    quote_string(m.group(2).encode('utf-8')), m.group(2)
                )
            t = re.sub('<a href="(/title/.*?/)">(.*?)</a>', fix_titles, t)

            return t

        if 'trivia' in data:
            data['trivia'] = [fix_links(t) for t in data['trivia']]

        if 'summary' in data:
            data['summary'] = fix_links(data['summary'])

        if 'rating' in data:
            # IMDb rates 0-10; expose 0-100.
            data['rating'] = float(data['rating']) * 10

        if 'votes' in data:
            # Normalize votes to a percentage of the current site maximum.
            max_votes = ox.web.imdb.max_votes()
            data['votes'] = 100 * float(data['votes']) / max_votes
        else:
            data['votes'] = 0

        if 'reviews' in data:
            # Keep only reviews whose URL matches the configured
            # whitelist; drop the key entirely if none survive.
            reviews = []
            for r in data['reviews']:
                for url in settings.REVIEW_WHITELIST:
                    if url in r[0]:
                        reviews.append({
                            'source': settings.REVIEW_WHITELIST[url],
                            'url': r[0]
                        })
            data['reviews'] = reviews
            if not data['reviews']:
                del data['reviews']
        if 'posterId' in data:
            # Posters are served via self.posters() instead.
            del data['posterId']
        data['likes'] = self.info('likes')
        data['downloads'] = self.info('downloads')
        data['links'] = self.links()
        data['posters'] = self.posters(request)
        if 'title' in data:
            data['title'] = ox.escape_html(data['title'])
        if 'originalTitle' in data:
            data['originalTitle'] = ox.escape_html(data['originalTitle'])
        if 'alternativeTitles' in data:
            data['alternativeTitles'] = [(ox.escape_html(a[0]), a[1]) for a in data['alternativeTitles']]
        if 'connections' in data:
            # Sanitize titles/descriptions of connected titles and fix
            # their embedded links too.
            for type in data['connections']:
                for c in data['connections'][type]:
                    for key in ('title', 'description'):
                        if key in c:
                            c[key] = ox.sanitize_html(fix_links(c[key]))
        return data

    def info(self, key):
        """Return the lookup-app counter for ``key`` (0 if unknown id)."""
        movie_id = get_movie_id(imdb_id=self.imdb)
        return movie_id and movie_id.info(key) or 0

    def links(self):
        """Return external links for this title from the lookup app."""
        links = []
        movie_id = get_movie_id(imdb_id=self.imdb)
        if movie_id:
            links = movie_id.links()
        return links

    def posters(self, request=None):
        """Return poster URLs; absolute when a request is supplied."""
        movie_id = get_movie_id(imdb_id=self.imdb)
        return getPosters(movie_id, request and request.build_absolute_uri('/') or '')

    def json(self):
        """Serialize to a dict: 'id' plus every non-empty field from
        Imdb.keys; director fields become lists, numeric year an int."""
        j = {}
        j['id'] = self.imdb
        for key in self.keys:
            j[key] = getattr(self, key)
        for key in ('director', 'episodeDirector'):
            # Stored as newline-joined text; expose as a list of names.
            if j[key].strip():
                j[key] = j[key].strip().split('\n')
            else:
                del j[key]
        # Drop empty values.  Python 2's dict.keys() returns a copy, so
        # deleting while iterating is safe here.
        for key in j.keys():
            if not j[key]:
                del j[key]
        if 'year' in j and isinstance(j['year'], basestring) and j['year'].isdigit():
            j['year'] = int(j['year'])
        return j
|
|
|
|
def get_new_ids(timeout=-1):
|
|
known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
|
|
robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout)
|
|
sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
|
|
sitemap = ox.cache.read_url(sitemap_url, timeout=timeout)
|
|
urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
|
|
for url in sorted(urls, reverse=True):
|
|
s = ox.cache.read_url(url, timeout=timeout)
|
|
ids = re.compile('<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s)
|
|
added = 0
|
|
for i in frozenset(ids) - known_ids:
|
|
m = Imdb(imdb=i)
|
|
m.update()
|
|
print m
|
|
added += 1
|
|
if added:
|
|
print url, added
|
|
|
|
class Match(models.Model):
    """Precomputed lookup keys pointing metadata combinations at Imdb rows.

    Each row stores the base64-encoded SHA-1 of one (title/director/year)
    combination, so incoming metadata can be matched via an indexed
    lookup instead of scanning the Imdb table.  NOTE: the hashing scheme
    in get_keys() must stay stable or existing rows become unmatchable.
    """
    # Key combinations indexed per item, from most to least specific.
    keys = [
        ['title', 'director', 'year'],
        ['title', 'director'],
        ['title', 'year'],
        ['director', 'year'],
        ['title'],
        ['director']
    ]

    # base64 of a 20-byte SHA-1 digest is 28 characters.
    key = models.CharField(max_length=28, db_index=True)
    item = models.ForeignKey(Imdb, related_name='matches')

    def __unicode__(self):
        return '%s(%s)' % (self.hexdigest(), self.item.imdb)

    def json(self):
        # A match serializes as its target item.
        return self.item.json()

    def hexdigest(self):
        # Undo the base64 storage encoding and render the raw SHA-1 as hex.
        return base64.b64decode(self.key).encode('hex')

    @classmethod
    def get_keys(cls, data):
        """Return the list of match keys (base64 SHA-1 digests) for data.

        ``data`` must contain 'title'; 'year' and 'director' (a list of
        names) are optional.  Values are normalized first so equivalent
        metadata always hashes identically.
        """
        # Normalize: lower-cased title, stringified year, directors
        # sorted and ';'-joined so their order cannot change the hash.
        data = {
            'title': data['title'].lower(),
            'year': str(data.get('year', '')),
            'director': ';'.join(sorted(data.get('director', [])))
        }
        keys = []
        if not data['director']:
            # With an empty director these combinations would hash the
            # same for every directorless item - skip them.
            _keys = [k for k in cls.keys if k not in (['director', 'year'], ['director'])]
        else:
            _keys = cls.keys
        for k in _keys:
            # Hash 'field\0field...' + '\n' + 'value\0value...'.
            key = '\0'.join(k)
            value = '\0'.join([data[v] for v in k])
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            value = str(value)
            key = str(key)
            key = base64.b64encode(hashlib.sha1(key + '\n' + value).digest())
            keys.append(key)
        return keys

    @classmethod
    def find(cls, data):
        """Return item.json() for every item matching any key derived
        from ``data``, most specific keys first, each item only once."""
        matches = []
        items = []
        for key in cls.get_keys(data):
            for m in cls.objects.filter(key=key).order_by('item__imdb').select_related():
                if m.item.id not in items:
                    matches.append(m.json())
                    items.append(m.item.id)
        return matches

    @classmethod
    def update_item(cls, item):
        """Rebuild the Match rows for ``item``: create missing keys and
        delete stale ones.  Called from Imdb.save()."""
        info = item.json()
        #ignore values without title, must be invalid
        if not 'title' in info:
            return
        data = []
        # Also index the original title (if any) as an alternative
        # match target, sharing the item's year/director.
        if 'originalTitle' in info:
            data.append({
                'title': info['originalTitle'],
                'year': info.get('year', ''),
                'director': info.get('director', [])
            })
        data.append(info)
        existing_keys = [m.key for m in Match.objects.filter(item=item)]
        current_keys = []
        for d in data:
            for key in cls.get_keys(d):
                if key not in existing_keys:
                    m = Match(key=key, item=item)
                    m.save()
                current_keys.append(key)
        # Drop keys the current metadata no longer generates.
        deleted_keys = list(set(existing_keys)-set(current_keys))
        if deleted_keys:
            Match.objects.filter(item=item, key__in=deleted_keys).delete()
|