# -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 from __future__ import division, print_function, absolute_import import re import os import unicodedata from six.moves.urllib.parse import quote from six import string_types import hashlib import base64 import binascii import json from six import string_types from django.db import models from django.conf import settings import ox from oxdjango.fields import DictField from lookup.models import get_movie_id from poster.models import getPosters def normalize_value(value): if isinstance(value, bytes): value = value.decode('utf-8') if isinstance(value, string_types): value = unicodedata.normalize('NFD', value) return value def find(info, guess=True): q = Imdb.objects.all() if 'id' in info: q = q.filter(imdb=info['id']) if q.count() == 0 and len(info['id']) == 7: i = Imdb(imdb=info['id']) i.save() q = Imdb.objects.filter(imdb=info['id']) else: for key in Imdb.keys: if key in info and info[key]: if isinstance(info[key], string_types): fkey = '%s__iexact'%key else: fkey = key if isinstance(info[key], list): fkey = '%s__iexact'%key value = normalize_value(u'\n'.join(info[key]) + '\n') else: value = normalize_value(info[key]) q = q.filter(**{fkey:value}) if q.count() == 1: m = q[0] m.update() return m #For now fallback to ox.web.imdb.guess and try again if guess and 'title' in info: id = ox.web.imdb.get_movie_id(normalize_value(info['title'])) if id: i, created = Imdb.objects.get_or_create(imdb=id) if created: i.update() return find(info, False) return None class Imdb(models.Model): created = models.DateTimeField(auto_now_add=True) modified = models.DateTimeField(auto_now=True) imdb = models.CharField(max_length=7, unique=True) title = models.CharField(max_length=1000, blank=True, default='') originalTitle = models.CharField(max_length=1000, blank=True, default='') year = models.CharField(max_length=4, blank=True, default='') director = models.CharField(max_length=9000, blank=True, default='') season = models.IntegerField(blank=True, null=True) episode = models.IntegerField(blank=True, null=True) episodeTitle = models.CharField(max_length=1000, blank=True, default='') episodeYear = models.CharField(max_length=4, blank=True, default='') episodeDirector = models.CharField(max_length=1000, blank=True, default='') seriesTitle = models.CharField(max_length=1000, blank=True, default='') invalid = models.BooleanField(default=False) patch = DictField(default=None, blank=True, null=True) def __unicode__(self): return u"[%s] %s%s" % (self.imdb, self.title, self.year and ' (%s)' % self.year or '') def save(self, *args, **kwargs): super(Imdb, self).save(*args, **kwargs) if not self.invalid: Match.update_item(self) else: Match.objects.filter(item=self).delete() keys = ('title', 'director', 'year', 'season', 'episode', 'originalTitle', 'seriesTitle', 'episodeTitle', 'episodeYear', 'episodeDirector') def apply_patch(self, data): if self.patch: data.update(self.patch) if 'seriesTitle' in data and 'episodeTitle' in data: if 'season' in data and 'episode' in data: data['title'] = "%s (S%02dE%02d) %s" % ( data['seriesTitle'], data['season'], data['episode'], data['episodeTitle']) else: data['title'] = "%s (S01) %s" % (data['seriesTitle'], data['episodeTitle']) data['title'] = data['title'].strip() return data def update(self, timeout=None): if timeout != None: info = ox.web.imdb.ImdbCombined(self.imdb, timeout=timeout) else: info = ox.web.imdb.ImdbCombined(self.imdb) info = self.apply_patch(info) if info: for key in self.keys: if key in info: value = info[key] if isinstance(value, list): value = u'\n'.join(value) + '\n' if isinstance(value, string_types): value = normalize_value(value) setattr(self, key, value) if self.season and self.season < 0: self.season = None if self.episode and self.episode < 0: self.episode = None self.save() elif not self.invalid: self.invalid = True self.save() def data(self, request=None, timeout=ox.cache.cache_timeout): data = ox.web.imdb.Imdb(self.imdb, timeout=timeout) data = self.apply_patch(data) def quote_string(string): return quote(string).replace('_', '%09').replace('%20', '_') def fix_names(m): return '%s' % ( quote_string(m.group(2).encode('utf-8')), m.group(2) ) def fix_titles(m): return '%s' % ( quote_string(m.group(2).encode('utf-8')), m.group(2) ) def fix_links(t): t = re.sub('(.*?)', fix_names, t) t = re.sub('(.*?)', fix_titles, t) return t if 'trivia' in data: data['trivia'] = [fix_links(t) for t in data['trivia']] if 'summary' in data: data['summary'] = fix_links(data['summary']) if 'rating' in data: data['rating'] = float(data['rating']) * 10 if 'votes' in data: max_votes = ox.web.imdb.max_votes() data['votes'] = 100 * float(data['votes']) / max_votes else: data['votes'] = 0 if 'reviews' in data: reviews = [] for r in data['reviews']: for url in settings.REVIEW_WHITELIST: if url in r[0]: reviews.append({ 'source': settings.REVIEW_WHITELIST[url], 'url': r[0] }) data['reviews'] = reviews if not data['reviews']: del data['reviews'] if 'posterId' in data: del data['posterId'] data['likes'] = self.info('likes') data['downloads'] = self.info('downloads') data['links'] = self.links() data['posters'] = self.posters(request) if 'title' in data: data['title'] = ox.escape_html(data['title']) if 'originalTitle' in data: data['originalTitle'] = ox.escape_html(data['originalTitle']) if 'alternativeTitles' in data: data['alternativeTitles'] = [(ox.escape_html(a[0]), a[1]) for a in data['alternativeTitles']] if 'connections' in data: for type in data['connections']: for c in data['connections'][type]: for key in ('title', 'description'): if key in c: c[key] = ox.sanitize_html(fix_links(c[key])) return data def info(self, key): movie_id = get_movie_id(imdb_id=self.imdb) return movie_id and movie_id.info(key) or 0 def links(self): links = [] movie_id = get_movie_id(imdb_id=self.imdb) if movie_id: links = movie_id.links() return links def posters(self, request=None): movie_id = get_movie_id(imdb_id=self.imdb) return getPosters(movie_id, request and request.build_absolute_uri('/') or '') def json(self): j = {} j['id'] = self.imdb for key in self.keys: j[key] = getattr(self, key) for key in ('director', 'episodeDirector'): if j[key].strip(): j[key] = j[key].strip().split('\n') else: del j[key] for key in list(j): if not j[key]: del j[key] if 'year' in j and isinstance(j['year'], string_types) and j['year'].isdigit(): j['year'] = int(j['year']) return j def get_new_ids(timeout=-1): new_ids_cache = '/tmp/missing.json' if os.path.exists(new_ids_cache): with open(new_ids_cache) as fd: new_ids = set(json.load(fd)) else: robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout) sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0] sitemap = ox.cache.read_url(sitemap_url, timeout=timeout) urls = re.compile('(.+?)').findall(sitemap) ids = set() for url in sorted(urls, reverse=False): s = ox.cache.read_url(url, timeout=timeout) ids |= set(re.compile('http://www.imdb.com/title/tt(\d{7})/combined').findall(s)) #print url, len(ids) known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')]) new_ids = frozenset(ids) - known_ids print('adding %s new items' % len(new_ids)) added = 0 done = set() for i in sorted(new_ids): print(i) m, created = Imdb.objects.get_or_create(imdb=i) try: m.update() except: with open('/tmp/missing.json', 'w') as fd: json.dump(list(new_ids-done), fd) raise print(m) if created: added += 1 done.add(i) if added: print(added) if os.path.exists(new_ids_cache): os.unlink(new_ids_cache) class Match(models.Model): keys = [ ['title', 'director', 'year'], ['title', 'director'], ['title', 'year'], ['director', 'year'], ['title'], ['director'] ] key = models.CharField(max_length=28, db_index=True) item = models.ForeignKey(Imdb, related_name='matches') def __unicode__(self): return '%s(%s)' % (self.hexdigest(), self.item.imdb) def json(self): return self.item.json() def hexdigest(self): key = self.key.encode() return binascii.hexlify(base64.b64decode(key)).decode() @classmethod def get_keys(cls, data): data = { 'title': normalize_value(data['title'].lower()), 'year': str(data.get('year', '')), 'director': normalize_value(';'.join(sorted(data.get('director', [])))) } keys = [] if not data['director']: _keys = [k for k in cls.keys if k not in (['director', 'year'], ['director'])] else: _keys = cls.keys for k in _keys: key = '\0'.join(k) value = '\0'.join([data[v] for v in k]) key = key.encode('utf-8') value = value.encode('utf-8') key = base64.b64encode(hashlib.sha1(key + b'\n' + value).digest()).decode() keys.append(key) return keys @classmethod def find(cls, data): matches = [] items = [] for key in cls.get_keys(data): for m in cls.objects.filter(key=key).order_by('item__imdb').select_related(): if m.item.id not in items: matches.append(m.json()) items.append(m.item.id) return matches @classmethod def update_item(cls, item): info = item.json() #ignore values without title, must be invalid if 'title' not in info: return data = [] if 'originalTitle' in info: data.append({ 'title': info['originalTitle'], 'year': info.get('year', ''), 'director': info.get('director', []) }) data.append(info) existing_keys = [m.key for m in Match.objects.filter(item=item)] current_keys = [] for d in data: for key in cls.get_keys(d): if key not in existing_keys: m = Match(key=key, item=item) m.save() current_keys.append(key) deleted_keys = list(set(existing_keys)-set(current_keys)) if deleted_keys: Match.objects.filter(item=item, key__in=deleted_keys).delete()