# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division, print_function, absolute_import
import re
import os
import unicodedata
from six.moves.urllib.parse import quote
from six import string_types
import hashlib
import base64
import binascii
import json
from six import string_types
from django.db import models
from django.conf import settings
import ox
from oxdjango.fields import DictField
from lookup.models import get_movie_id
from poster.models import getPosters
def normalize_value(value):
    """Return ``value`` UTF-8 decoded (if bytes) and NFD-normalized (if text).

    Values that are neither bytes nor text pass through unchanged, so the
    helper is safe to call on ints, lists, None, etc.
    """
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    if not isinstance(value, string_types):
        return value
    return unicodedata.normalize('NFD', value)
def find(info, guess=True):
    """Resolve a metadata dict ``info`` to a single Imdb row, or None.

    With an 'id' key, looks the row up by imdb id (creating a stub for an
    unknown, well-formed 7-digit id).  Otherwise narrows the queryset by
    every metadata field present.  If exactly one candidate remains it is
    refreshed and returned; with ``guess`` set, falls back to a title-based
    id lookup and retries once without guessing.
    """
    candidates = Imdb.objects.all()
    if 'id' in info:
        candidates = candidates.filter(imdb=info['id'])
        # unknown but well-formed 7-digit id: create a stub row on the fly
        if candidates.count() == 0 and len(info['id']) == 7:
            stub = Imdb(imdb=info['id'])
            stub.save()
            candidates = Imdb.objects.filter(imdb=info['id'])
    else:
        # narrow down by every metadata field present in info
        for field in Imdb.keys:
            if not info.get(field):
                continue
            raw = info[field]
            if isinstance(raw, list):
                # lists are stored newline-joined, matched case-insensitively
                lookup = '%s__iexact' % field
                value = normalize_value(u'\n'.join(raw) + '\n')
            elif isinstance(raw, string_types):
                lookup = '%s__iexact' % field
                value = normalize_value(raw)
            else:
                lookup = field
                value = normalize_value(raw)
            candidates = candidates.filter(**{lookup: value})
    if candidates.count() == 1:
        match = candidates[0]
        match.update()
        return match
    # Fall back to guessing the imdb id from the title and retry once.
    if guess and 'title' in info:
        imdb_id = ox.web.imdb.get_movie_id(normalize_value(info['title']))
        if imdb_id:
            item, created = Imdb.objects.get_or_create(imdb=imdb_id)
            if created:
                item.update()
            return find(info, False)
    return None
class Imdb(models.Model):
    """Locally cached IMDb metadata for a single title.

    Rows are keyed by the 7-digit IMDb id.  ``update()`` refreshes the
    cached fields from IMDb via ``ox.web.imdb``; ``data()`` assembles the
    full display-ready payload.  Saving a valid row (re)builds its
    ``Match`` lookup keys; saving an invalid one removes them.
    """
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)
    # 7-digit IMDb title id, e.g. '0133093'
    imdb = models.CharField(max_length=7, unique=True)
    title = models.CharField(max_length=1000, blank=True, default='')
    originalTitle = models.CharField(max_length=1000, blank=True, default='')
    year = models.CharField(max_length=4, blank=True, default='')
    # multi-valued fields (director, episodeDirector) are stored as
    # newline-joined strings; see update() and json()
    director = models.CharField(max_length=9000, blank=True, default='')
    season = models.IntegerField(blank=True, null=True)
    episode = models.IntegerField(blank=True, null=True)
    episodeTitle = models.CharField(max_length=1000, blank=True, default='')
    episodeYear = models.CharField(max_length=4, blank=True, default='')
    episodeDirector = models.CharField(max_length=1000, blank=True, default='')
    seriesTitle = models.CharField(max_length=1000, blank=True, default='')
    # set when IMDb no longer returns data for this id
    invalid = models.BooleanField(default=False)
    # manual corrections overlaid on scraped data; see apply_patch()
    patch = DictField(default=None, blank=True, null=True)

    def __unicode__(self):
        # Python 2 era repr, e.g. "[0133093] The Matrix (1999)"
        return u"[%s] %s%s" % (self.imdb, self.title, self.year and ' (%s)' % self.year or '')

    def save(self, *args, **kwargs):
        """Save and keep the Match lookup table in sync with validity."""
        super(Imdb, self).save(*args, **kwargs)
        if not self.invalid:
            Match.update_item(self)
        else:
            Match.objects.filter(item=self).delete()

    # metadata fields mirrored from ox.web.imdb in update()/json()
    keys = ('title', 'director', 'year', 'season', 'episode',
        'originalTitle',
        'seriesTitle', 'episodeTitle', 'episodeYear', 'episodeDirector')

    def apply_patch(self, data):
        """Overlay manual corrections and derive a combined episode title."""
        if self.patch:
            data.update(self.patch)
        if 'seriesTitle' in data and 'episodeTitle' in data:
            # episodes get a "Series (SxxEyy) Episode" style combined title
            if 'season' in data and 'episode' in data:
                data['title'] = "%s (S%02dE%02d) %s" % (
                    data['seriesTitle'], data['season'], data['episode'], data['episodeTitle'])
            else:
                data['title'] = "%s (S01) %s" % (data['seriesTitle'], data['episodeTitle'])
            data['title'] = data['title'].strip()
        return data

    def update(self, timeout=None):
        """Refresh the cached fields from IMDb.

        Marks the row invalid (which drops its Match rows via save())
        when IMDb returns no data for this id.
        """
        if timeout != None:
            info = ox.web.imdb.ImdbCombined(self.imdb, timeout=timeout)
        else:
            info = ox.web.imdb.ImdbCombined(self.imdb)
        info = self.apply_patch(info)
        if info:
            for key in self.keys:
                if key in info:
                    value = info[key]
                    # lists (e.g. director) are flattened to newline-joined text
                    if isinstance(value, list):
                        value = u'\n'.join(value) + '\n'
                    if isinstance(value, string_types):
                        value = normalize_value(value)
                    setattr(self, key, value)
            # negative season/episode values are scraper artifacts
            if self.season and self.season < 0:
                self.season = None
            if self.episode and self.episode < 0:
                self.episode = None
            self.save()
        elif not self.invalid:
            self.invalid = True
            self.save()

    def data(self, request=None, timeout=ox.cache.cache_timeout):
        """Assemble the full, display-ready metadata dict for this title."""
        data = ox.web.imdb.Imdb(self.imdb, timeout=timeout)
        data = self.apply_patch(data)

        def quote_string(string):
            # url-quote while keeping '_' distinct from encoded spaces
            return quote(string).replace('_', '%09').replace('%20', '_')

        # NOTE(review): the '%s' format strings below receive a 2-tuple,
        # which raises TypeError at runtime -- the original templates
        # (presumably '<a href=...>%s</a>'-style link markup) appear to
        # have been lost; restore them from version control before
        # relying on fix_links().
        def fix_names(m):
            return '%s' % (
                quote_string(m.group(2).encode('utf-8')), m.group(2)
            )

        def fix_titles(m):
            return '%s' % (
                quote_string(m.group(2).encode('utf-8')), m.group(2)
            )

        def fix_links(t):
            # NOTE(review): these patterns also look truncated -- '(.*?)'
            # lazily matches empty strings; confirm the intended
            # name/title link regexes.
            t = re.sub('(.*?)', fix_names, t)
            t = re.sub('(.*?)', fix_titles, t)
            return t
        if 'trivia' in data:
            data['trivia'] = [fix_links(t) for t in data['trivia']]
        if 'summary' in data:
            data['summary'] = fix_links(data['summary'])
        if 'rating' in data:
            # scale 0-10 rating to 0-100
            data['rating'] = float(data['rating']) * 10
        if 'votes' in data:
            # express votes as a percentage of the site-wide maximum
            max_votes = ox.web.imdb.max_votes()
            data['votes'] = 100 * float(data['votes']) / max_votes
        else:
            data['votes'] = 0
        if 'reviews' in data:
            # keep only reviews whose url matches a whitelisted source
            reviews = []
            for r in data['reviews']:
                for url in settings.REVIEW_WHITELIST:
                    if url in r[0]:
                        reviews.append({
                            'source': settings.REVIEW_WHITELIST[url],
                            'url': r[0]
                        })
            data['reviews'] = reviews
            if not data['reviews']:
                del data['reviews']
        if 'posterId' in data:
            del data['posterId']
        data['likes'] = self.info('likes')
        data['downloads'] = self.info('downloads')
        data['links'] = self.links()
        data['posters'] = self.posters(request)
        if 'title' in data:
            data['title'] = ox.escape_html(data['title'])
        if 'originalTitle' in data:
            data['originalTitle'] = ox.escape_html(data['originalTitle'])
        if 'alternativeTitles' in data:
            data['alternativeTitles'] = [(ox.escape_html(a[0]), a[1]) for a in data['alternativeTitles']]
        if 'connections' in data:
            for type in data['connections']:
                for c in data['connections'][type]:
                    for key in ('title', 'description'):
                        if key in c:
                            c[key] = ox.sanitize_html(fix_links(c[key]))
        return data

    def info(self, key):
        """Return a numeric stat (e.g. 'likes') from lookup, or 0 if unknown."""
        movie_id = get_movie_id(imdb_id=self.imdb)
        return movie_id and movie_id.info(key) or 0

    def links(self):
        """Return external links known for this title, [] if none."""
        links = []
        movie_id = get_movie_id(imdb_id=self.imdb)
        if movie_id:
            links = movie_id.links()
        return links

    def posters(self, request=None):
        """Return poster data, with absolute URLs when a request is given."""
        movie_id = get_movie_id(imdb_id=self.imdb)
        return getPosters(movie_id, request and request.build_absolute_uri('/') or '')

    def json(self):
        """Serialize cached metadata, dropping empty values.

        Director fields are split back into lists; an all-digit year is
        returned as int.
        """
        j = {}
        j['id'] = self.imdb
        for key in self.keys:
            j[key] = getattr(self, key)
        for key in ('director', 'episodeDirector'):
            if j[key].strip():
                j[key] = j[key].strip().split('\n')
            else:
                del j[key]
        for key in list(j):
            if not j[key]:
                del j[key]
        if 'year' in j and isinstance(j['year'], string_types) and j['year'].isdigit():
            j['year'] = int(j['year'])
        return j
def get_new_ids(timeout=-1):
    """Discover IMDb ids not yet in the database and import them.

    Walks the IMDb sitemap to collect candidate 7-digit title ids, diffs
    them against ids already stored, and creates/updates an ``Imdb`` row
    for each new id.  Progress survives crashes: on any failure the
    not-yet-processed ids are dumped to ``new_ids_cache`` and picked up
    by the next run.

    timeout: cache timeout passed to ``ox.cache.read_url``; -1 means use
    any cached copy regardless of age.
    """
    new_ids_cache = '/tmp/missing.json'
    if os.path.exists(new_ids_cache):
        # resume a previously interrupted run
        with open(new_ids_cache) as fd:
            new_ids = set(json.load(fd))
    else:
        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout)
        # raw strings: '\d' is an invalid string escape and regexes
        # should not rely on it surviving literal translation
        sitemap_url = re.compile(r'\nSitemap: (http.+)').findall(robot)[0]
        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout)
        # NOTE(review): this pattern looks truncated (sitemap markup such
        # as <loc>...</loc> was probably lost); as written it lazily
        # matches arbitrary text -- confirm against the original source.
        urls = re.compile('(.+?)').findall(sitemap)
        ids = set()
        for url in sorted(urls):
            s = ox.cache.read_url(url, timeout=timeout)
            ids |= set(re.compile(r'http://www.imdb.com/title/tt(\d{7})/combined').findall(s))
        known_ids = frozenset(i['imdb'] for i in Imdb.objects.all().values('imdb'))
        new_ids = frozenset(ids) - known_ids
    print('adding %s new items' % len(new_ids))
    added = 0
    done = set()
    for i in sorted(new_ids):
        print(i)
        m, created = Imdb.objects.get_or_create(imdb=i)
        try:
            m.update()
        except:
            # Persist the ids not yet processed so the run can resume,
            # then re-raise.  The bare except is deliberate: the progress
            # file must be written even on KeyboardInterrupt/SystemExit.
            with open(new_ids_cache, 'w') as fd:
                json.dump(list(new_ids - done), fd)
            raise
        print(m)
        if created:
            added += 1
        done.add(i)
    if added:
        print(added)
    if os.path.exists(new_ids_cache):
        os.unlink(new_ids_cache)
class Match(models.Model):
    """Precomputed hash keys for looking up Imdb items by metadata.

    Each row maps one sha1-derived key -- built from a combination of
    normalized title/director/year values -- to one Imdb item, so that
    find() can resolve partial metadata with indexed lookups instead of
    string comparisons.
    """
    # field combinations indexed, from most to least specific
    keys = [
        ['title', 'director', 'year'],
        ['title', 'director'],
        ['title', 'year'],
        ['director', 'year'],
        ['title'],
        ['director']
    ]
    # base64 of a sha1 digest (28 chars); see get_keys()
    key = models.CharField(max_length=28, db_index=True)
    item = models.ForeignKey(Imdb, related_name='matches')

    def __unicode__(self):
        return '%s(%s)' % (self.hexdigest(), self.item.imdb)

    def json(self):
        # a match serializes as its target item
        return self.item.json()

    def hexdigest(self):
        """Return the stored base64 key re-encoded as hex."""
        key = self.key.encode()
        return binascii.hexlify(base64.b64decode(key)).decode()

    @classmethod
    def get_keys(cls, data):
        """Compute all lookup keys for a metadata dict.

        data must contain 'title'; 'year' and 'director' are optional.
        Director-based combinations are skipped when no director is
        given.  Returns a list of base64-encoded sha1 keys.
        """
        data = {
            'title': normalize_value(data['title'].lower()),
            'year': str(data.get('year', '')),
            'director': normalize_value(';'.join(sorted(data.get('director', []))))
        }
        keys = []
        if not data['director']:
            _keys = [k for k in cls.keys if k not in (['director', 'year'], ['director'])]
        else:
            _keys = cls.keys
        for k in _keys:
            # hash "NUL-joined field names \n NUL-joined values" into a
            # fixed-length base64 key
            key = '\0'.join(k)
            value = '\0'.join([data[v] for v in k])
            key = key.encode('utf-8')
            value = value.encode('utf-8')
            key = base64.b64encode(hashlib.sha1(key + b'\n' + value).digest()).decode()
            keys.append(key)
        return keys

    @classmethod
    def find(cls, data):
        """Return json() data of all distinct items matching any key of data."""
        matches = []
        items = []
        for key in cls.get_keys(data):
            for m in cls.objects.filter(key=key).order_by('item__imdb').select_related():
                if m.item.id not in items:
                    matches.append(m.json())
                    items.append(m.item.id)
        return matches

    @classmethod
    def update_item(cls, item):
        """Sync the Match rows for item: add missing keys, drop stale ones."""
        info = item.json()
        #ignore values without title, must be invalid
        if 'title' not in info:
            return
        data = []
        # also index under the original title, if present
        if 'originalTitle' in info:
            data.append({
                'title': info['originalTitle'],
                'year': info.get('year', ''),
                'director': info.get('director', [])
            })
        data.append(info)
        existing_keys = [m.key for m in Match.objects.filter(item=item)]
        current_keys = []
        for d in data:
            for key in cls.get_keys(d):
                if key not in existing_keys:
                    m = Match(key=key, item=item)
                    m.save()
                # record every still-valid key so it survives the purge below
                current_keys.append(key)
        deleted_keys = list(set(existing_keys)-set(current_keys))
        if deleted_keys:
            Match.objects.filter(item=item, key__in=deleted_keys).delete()