# oxdata/oxdata/movie/models.py
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division, print_function, absolute_import
import re
import os
import unicodedata
from urllib.parse import quote
import hashlib
import base64
import binascii
import json
from django.db import models
from django.conf import settings
import ox
from oxdjango.fields import DictField
from lookup.models import get_movie_id
from poster.models import getPosters
def normalize_value(value):
    """Decode bytes to text and return the NFD-normalized form of *value*.

    Values that are neither bytes nor str are returned unchanged.
    """
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    if not isinstance(value, str):
        return value
    return unicodedata.normalize('NFD', value)
def find(info, guess=True):
    """Return the single Imdb item matching *info*, or None.

    *info* either carries an explicit imdb 'id' or a combination of the
    fields listed in Imdb.keys. When *guess* is true and no unique match
    is found, the id is guessed from the title and the lookup is retried
    once with guessing disabled.
    """
    q = Imdb.objects.all()
    if 'id' in info:
        imdb_id = info['id']
        q = q.filter(imdb=imdb_id)
        if q.count() == 0 and len(imdb_id) == 7:
            # unknown but well-formed id: create a stub row for it
            Imdb(imdb=imdb_id).save()
            q = Imdb.objects.filter(imdb=imdb_id)
    else:
        for key in Imdb.keys:
            if key not in info or not info[key]:
                continue
            value = info[key]
            if isinstance(value, list):
                # list fields are stored newline-joined with a trailing newline
                fkey = '%s__iexact' % key
                value = normalize_value('\n'.join(value) + '\n')
            else:
                fkey = '%s__iexact' % key if isinstance(value, str) else key
                value = normalize_value(value)
            q = q.filter(**{fkey: value})
    if q.count() == 1:
        match = q[0]
        match.update()
        return match
    # For now fall back to guessing the imdb id from the title and retry once
    if guess and 'title' in info:
        guessed = ox.web.imdb.get_movie_id(normalize_value(info['title']))
        if guessed:
            item, created = Imdb.objects.get_or_create(imdb=guessed)
            if created:
                item.update()
            return find(info, False)
    return None
class Imdb(models.Model):
    """Cached metadata for one IMDb title (film or tv episode)."""
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)
    imdb = models.CharField(max_length=7, unique=True)
    title = models.CharField(max_length=1000, blank=True, default='')
    originalTitle = models.CharField(max_length=1000, blank=True, default='')
    year = models.CharField(max_length=4, blank=True, default='')
    # multiple directors are stored newline-separated (see update()/json())
    director = models.CharField(max_length=9000, blank=True, default='')
    season = models.IntegerField(blank=True, null=True)
    episode = models.IntegerField(blank=True, null=True)
    episodeTitle = models.CharField(max_length=1000, blank=True, default='')
    episodeYear = models.CharField(max_length=4, blank=True, default='')
    episodeDirector = models.CharField(max_length=1000, blank=True, default='')
    seriesTitle = models.CharField(max_length=1000, blank=True, default='')
    # set by update() when the id could not be fetched from imdb.com
    invalid = models.BooleanField(default=False)
    # manual corrections merged on top of scraped data (see apply_patch())
    patch = DictField(default=None, blank=True, null=True)

    def __str__(self):
        return "[%s] %s%s" % (self.imdb, self.title, self.year and ' (%s)' % self.year or '')

    def save(self, *args, **kwargs):
        """Save and keep the Match lookup table in sync with this item."""
        super(Imdb, self).save(*args, **kwargs)
        if not self.invalid:
            Match.update_item(self)
        else:
            # invalid items must not be matchable
            Match.objects.filter(item=self).delete()

    # fields copied from the scraped imdb data in update() / exported in json()
    keys = ('title', 'director', 'year', 'season', 'episode',
        'originalTitle',
        'seriesTitle', 'episodeTitle', 'episodeYear', 'episodeDirector')

    def apply_patch(self, data):
        """Merge manual corrections (self.patch) into *data*.

        For patched episodes the display title is rebuilt from series
        title, season/episode numbers and episode title.
        """
        if self.patch:
            data.update(self.patch)
            if 'seriesTitle' in data and 'episodeTitle' in data:
                if 'season' in data and 'episode' in data:
                    data['title'] = "%s (S%02dE%02d) %s" % (
                        data['seriesTitle'], data['season'], data['episode'], data['episodeTitle'])
                else:
                    data['title'] = "%s (S01) %s" % (data['seriesTitle'], data['episodeTitle'])
                data['title'] = data['title'].strip()
        return data

    def update(self, timeout=None):
        """Re-scrape metadata from imdb.com and store it on this row.

        Marks the row invalid when nothing could be fetched.
        """
        if timeout is not None:
            info = ox.web.imdb.ImdbCombined(self.imdb, timeout=timeout)
        else:
            info = ox.web.imdb.ImdbCombined(self.imdb)
        info = self.apply_patch(info)
        if info:
            for key in self.keys:
                if key in info:
                    value = info[key]
                    if isinstance(value, list):
                        value = '\n'.join(value) + '\n'
                    if isinstance(value, str):
                        value = normalize_value(value)
                    setattr(self, key, value)
            # negative season/episode numbers are scraping artifacts
            if self.season and self.season < 0:
                self.season = None
            if self.episode and self.episode < 0:
                self.episode = None
            self.save()
        elif not self.invalid:
            self.invalid = True
            self.save()

    def data(self, request=None, timeout=ox.cache.cache_timeout):
        """Return the full, cleaned-up metadata dict for this title.

        Rewrites imdb-internal name/title links into local ones, rescales
        rating and votes, filters reviews against settings.REVIEW_WHITELIST
        and adds site-specific extras (likes, downloads, links, posters).
        """
        data = ox.web.imdb.Imdb(self.imdb, timeout=timeout)
        data = self.apply_patch(data)

        def quote_string(string):
            return quote(string).replace('_', '%09').replace('%20', '_')

        def fix_names(m):
            return '<a href="/name=%s">%s</a>' % (
                quote_string(m.group(2).encode('utf-8')), m.group(2)
            )

        def fix_titles(m):
            return '<a href="/title=%s">%s</a>' % (
                quote_string(m.group(2).encode('utf-8')), m.group(2)
            )

        def fix_links(t):
            # turn imdb-style /name/... and /title/... links into local ones
            t = re.sub('<a href="(/name/.*?)">(.*?)</a>', fix_names, t)
            t = re.sub('<a href="(/title/.*?)">(.*?)</a>', fix_titles, t)
            return t

        if 'trivia' in data:
            data['trivia'] = [fix_links(t) for t in data['trivia']]
        if 'summary' in data:
            data['summary'] = fix_links(data['summary'])
        if 'rating' in data:
            # imdb rating 0-10 -> 0-100
            data['rating'] = float(data['rating']) * 10
        if 'votes' in data:
            # express votes as a percentage of the most-voted title
            max_votes = ox.web.imdb.max_votes()
            data['votes'] = 100 * float(data['votes']) / max_votes
        else:
            data['votes'] = 0
        if 'reviews' in data:
            # only keep reviews from whitelisted sources
            reviews = []
            for r in data['reviews']:
                for url in settings.REVIEW_WHITELIST:
                    if url in r[0]:
                        reviews.append({
                            'source': settings.REVIEW_WHITELIST[url],
                            'url': r[0]
                        })
            data['reviews'] = reviews
            if not data['reviews']:
                del data['reviews']
        if 'posterId' in data:
            del data['posterId']
        data['likes'] = self.info('likes')
        data['downloads'] = self.info('downloads')
        data['links'] = self.links()
        data['posters'] = self.posters(request)
        if 'title' in data:
            data['title'] = ox.escape_html(data['title'])
        if 'originalTitle' in data:
            data['originalTitle'] = ox.escape_html(data['originalTitle'])
        if 'alternativeTitles' in data:
            data['alternativeTitles'] = [(ox.escape_html(a[0]), a[1]) for a in data['alternativeTitles']]
        if 'connections' in data:
            for type in data['connections']:
                for c in data['connections'][type]:
                    for key in ('title', 'description'):
                        if key in c:
                            c[key] = ox.sanitize_html(fix_links(c[key]))
        return data

    def info(self, key):
        """Return the site statistic *key* (e.g. 'likes'), or 0 if unknown."""
        movie_id = get_movie_id(imdb_id=self.imdb)
        return movie_id and movie_id.info(key) or 0

    def links(self):
        """Return the external links known for this title."""
        links = []
        movie_id = get_movie_id(imdb_id=self.imdb)
        if movie_id:
            links = movie_id.links()
        return links

    def posters(self, request=None):
        """Return poster urls, absolute when *request* is given."""
        movie_id = get_movie_id(imdb_id=self.imdb)
        return getPosters(movie_id, request and request.build_absolute_uri('/') or '')

    def json(self):
        """Export this row as a plain dict.

        Director fields are split back into lists, empty values are
        dropped, and a numeric year is converted to int.
        """
        j = {}
        j['id'] = self.imdb
        for key in self.keys:
            j[key] = getattr(self, key)
        for key in ('director', 'episodeDirector'):
            if j[key].strip():
                j[key] = j[key].strip().split('\n')
            else:
                del j[key]
        for key in list(j):
            if not j[key]:
                del j[key]
        if 'year' in j and isinstance(j['year'], str) and j['year'].isdigit():
            j['year'] = int(j['year'])
        return j
def get_new_ids(timeout=-1):
    """Discover IMDb ids missing from the database and import them.

    Ids are collected from imdb.com's sitemap. If importing an item
    fails, the not-yet-imported ids are dumped to /tmp/missing.json so a
    rerun can resume without re-reading the sitemap; the cache file is
    removed after a fully successful run.
    """
    new_ids_cache = '/tmp/missing.json'
    if os.path.exists(new_ids_cache):
        # resume from the ids left over by a previous, failed run
        with open(new_ids_cache) as fd:
            new_ids = set(json.load(fd))
    else:
        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout).decode('utf-8')
        sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout).decode('utf-8')
        urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
        ids = set()
        for url in sorted(urls, reverse=False):
            s = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
            ids |= set(re.compile(r'<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s))
        known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
        new_ids = frozenset(ids) - known_ids
    print('adding %s new items' % len(new_ids))
    added = 0
    done = set()
    for i in sorted(new_ids):
        print(i)
        m, created = Imdb.objects.get_or_create(imdb=i)
        try:
            m.update()
        except BaseException:
            # persist the remaining ids (incl. on KeyboardInterrupt) so the
            # import can be resumed, then re-raise
            with open(new_ids_cache, 'w') as fd:
                json.dump(list(new_ids - done), fd)
            raise
        print(m)
        if created:
            added += 1
        done.add(i)
    if added:
        print(added)
    if os.path.exists(new_ids_cache):
        os.unlink(new_ids_cache)
class Match(models.Model):
    """Precomputed lookup keys mapping metadata combinations to Imdb items.

    Each row stores the base64-encoded sha1 of one (title/director/year)
    combination, so incoming metadata can be matched against known items
    with a single indexed query.
    """
    # key combinations used for matching, most to least specific
    keys = [
        ['title', 'director', 'year'],
        ['title', 'director'],
        ['title', 'year'],
        ['director', 'year'],
        ['title'],
        ['director']
    ]
    # base64 of a 20-byte sha1 digest is 28 characters
    key = models.CharField(max_length=28, db_index=True)
    # on_delete makes the pre-Django-2.0 implicit CASCADE default explicit
    item = models.ForeignKey(Imdb, related_name='matches', on_delete=models.CASCADE)

    def __str__(self):
        return '%s(%s)' % (self.hexdigest(), self.item.imdb)

    def json(self):
        """Return the matched item's exported metadata."""
        return self.item.json()

    def hexdigest(self):
        """Return the stored base64 key as a hex string (for display)."""
        key = self.key.encode()
        return binascii.hexlify(base64.b64decode(key)).decode()

    @classmethod
    def get_keys(cls, data):
        """Return the list of match keys for *data*.

        *data* must contain 'title' and may contain 'year' and 'director'
        (a list of names). Combinations involving a director are skipped
        when no director is given.
        """
        data = {
            'title': normalize_value(data['title'].lower()),
            'year': str(data.get('year', '')),
            'director': normalize_value(';'.join(sorted(data.get('director', []))))
        }
        keys = []
        if not data['director']:
            _keys = [k for k in cls.keys if k not in (['director', 'year'], ['director'])]
        else:
            _keys = cls.keys
        for k in _keys:
            key = '\0'.join(k)
            value = '\0'.join([data[v] for v in k])
            key = key.encode('utf-8')
            value = value.encode('utf-8')
            key = base64.b64encode(hashlib.sha1(key + b'\n' + value).digest()).decode()
            keys.append(key)
        return keys

    @classmethod
    def find(cls, data):
        """Return json data of all items matching *data*, deduplicated."""
        matches = []
        seen = set()  # item ids already emitted; set gives O(1) membership
        for key in cls.get_keys(data):
            for m in cls.objects.filter(key=key).order_by('item__imdb').select_related():
                if m.item.id not in seen:
                    matches.append(m.json())
                    seen.add(m.item.id)
        return matches

    @classmethod
    def update_item(cls, item):
        """Recreate the Match rows for *item* after its metadata changed."""
        info = item.json()
        # ignore values without title, must be invalid
        if 'title' not in info:
            return
        data = []
        if 'originalTitle' in info:
            # also index the original title under the same year/director
            data.append({
                'title': info['originalTitle'],
                'year': info.get('year', ''),
                'director': info.get('director', [])
            })
        data.append(info)
        existing_keys = [m.key for m in Match.objects.filter(item=item)]
        current_keys = []
        for d in data:
            for key in cls.get_keys(d):
                if key not in existing_keys:
                    m = Match(key=key, item=item)
                    m.save()
                current_keys.append(key)
        # drop keys that no longer apply to the item's current metadata
        deleted_keys = list(set(existing_keys) - set(current_keys))
        if deleted_keys:
            Match.objects.filter(item=item, key__in=deleted_keys).delete()