114 lines
3.9 KiB
Python
114 lines
3.9 KiB
Python
# -*- coding: utf-8 -*-
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
import re
|
|
|
|
from django.db import models
|
|
import ox
|
|
|
|
|
|
def find(info, guess=True):
    """Return the single Imdb record matching ``info``, or None.

    ``info`` is a mapping. If it contains ``'id'`` the lookup is done on the
    ``imdb`` column only; otherwise every non-empty entry whose key is listed
    in ``Imdb.keys`` is added as a filter (strings are compared
    case-insensitively, lists are compared in the newline-joined form that
    ``Imdb.update`` stores).

    When exactly one row matches, it is refreshed via ``update()`` and
    returned. Otherwise, if ``guess`` is true and a title is available,
    ``ox.web.imdb.getMovieId`` is asked for an id, a row for that id is
    created if needed, and the lookup is retried once without guessing.
    """
    # Work on a shallow copy so that substituting the series title below
    # does not modify the dict owned by the caller.
    info = dict(info)

    q = Imdb.objects.all()
    if 'id' in info:
        q = q.filter(imdb=info['id'])
    else:
        if 'seriesTitle' in info:
            info['title'] = info['seriesTitle']
        for key in Imdb.keys:
            value = info.get(key)
            if not value:
                continue
            if isinstance(value, list):
                # Lists are stored as newline-terminated lines.
                q = q.filter(**{key: '\n'.join(value) + '\n'})
            elif isinstance(value, basestring):
                q = q.filter(**{'%s__iexact' % key: value})
            else:
                q = q.filter(**{key: value})

    if q.count() == 1:
        m = q[0]
        m.update()
        return m

    # For now fall back to ox.web.imdb.getMovieId and try again.
    # Without a title there is nothing to guess from (e.g. lookup by id only),
    # so skip the fallback instead of raising KeyError.
    title = info.get('title')
    if guess and title:
        imdb_id = ox.web.imdb.getMovieId(title)
        if imdb_id:
            i, created = Imdb.objects.get_or_create(imdb=imdb_id)
            if created:
                i.update()
            return find(info, False)
    return None
|
|
|
|
class Imdb(models.Model):
    """Locally stored subset of the metadata for one IMDb title.

    Rows are keyed by the 7-character ``imdb`` id and filled in by
    ``update()`` from ``ox.web.imdb.ImdbCombined``.
    """

    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    imdb = models.CharField(max_length=7, unique=True)
    title = models.CharField(max_length=1000, blank=True, default='')
    year = models.CharField(max_length=4, blank=True, default='')
    # Multi-valued; update() stores lists as newline-terminated lines.
    director = models.CharField(max_length=9000, blank=True, default='')

    season = models.IntegerField(blank=True, null=True)
    episode = models.IntegerField(blank=True, null=True)
    episodeTitle = models.CharField(max_length=1000, blank=True, default='')
    episodeYear = models.CharField(max_length=4, blank=True, default='')
    # Multi-valued, stored like ``director``.
    episodeDirector = models.CharField(max_length=1000, blank=True, default='')

    # Set by update() when no data could be fetched for this id.
    invalid = models.BooleanField(default=False)

    def __unicode__(self):
        return u"%s (%s)" % (self.title, self.imdb)

    # Attributes copied from the fetched data by update(), exported by json()
    # and usable as search criteria in find().
    keys = ('title', 'director', 'year', 'season', 'episode',
            'episodeTitle', 'episodeYear', 'episodeDirector')

    def update(self):
        """Fetch the data for ``self.imdb`` and save the row.

        Every key in ``self.keys`` present in the fetched data is copied onto
        the instance; a ``seriesTitle`` takes precedence over ``title`` and
        list values are stored as newline-terminated lines. If nothing could
        be fetched, the row is flagged ``invalid`` (and saved) once.
        """
        info = ox.web.imdb.ImdbCombined(self.imdb)
        if info:
            for key in self.keys:
                if key in info:
                    value = info[key]
                    if key == 'title' and 'seriesTitle' in info:
                        value = info['seriesTitle']
                    if isinstance(value, list):
                        value = '\n'.join(value) + '\n'
                    setattr(self, key, value)
            # Negative numbers are treated as "not set". The explicit None
            # test avoids relying on Python 2's implicit ``None < 0``
            # ordering, which raises TypeError on Python 3.
            if self.season is not None and self.season < 0:
                self.season = None
            if self.episode is not None and self.episode < 0:
                self.episode = None
            self.save()
        elif not self.invalid:
            self.invalid = True
            self.save()

    def json(self):
        """Return the row as a dict containing only non-empty values.

        The id is exported as ``'id'``; ``director`` and ``episodeDirector``
        are split back into lists. Any falsy value is dropped from the result
        (NOTE(review): this also drops a season or episode of 0 - confirm
        that is intended).
        """
        j = {}
        j['id'] = self.imdb
        for key in self.keys:
            j[key] = getattr(self, key)
        for key in ('director', 'episodeDirector'):
            names = j[key].strip()
            if names:
                j[key] = names.split('\n')
            else:
                del j[key]
        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating over it directly is an error.
        for key in list(j):
            if not j[key]:
                del j[key]
        return j
|
|
|
|
def get_new_ids(timeout=-1):
    """Create Imdb rows for title ids found in IMDb's sitemaps.

    Reads the first ``Sitemap:`` URL from robots.txt, then every ``<loc>``
    file it lists (in reverse-sorted URL order), and calls ``update()`` on a
    new ``Imdb`` instance for each title id that is not stored yet. Prints
    the URL and the number of ids added for every sitemap file that
    contributed new ids.

    ``timeout`` is passed through unchanged to ``ox.cache.readUrl``.
    """
    known_ids = set(Imdb.objects.all().values_list('imdb', flat=True))

    robot = ox.cache.readUrl('http://www.imdb.com/robots.txt', timeout=timeout)
    sitemap_urls = re.compile('\nSitemap: (http.+)').findall(robot)
    if not sitemap_urls:
        # Nothing to crawl; report it instead of failing with IndexError.
        print('no Sitemap entry found in robots.txt')
        return
    sitemap = ox.cache.readUrl(sitemap_urls[0], timeout=timeout)
    urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)

    # Compiled once, outside the loop; same pattern as before, written as a
    # raw string so that \d is not an invalid string escape.
    id_pattern = re.compile(
        r'<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>')

    for url in sorted(urls, reverse=True):
        s = ox.cache.readUrl(url, timeout=timeout)
        new_ids = frozenset(id_pattern.findall(s)) - known_ids
        for i in new_ids:
            m = Imdb(imdb=i)
            m.update()
        # Remember what was just added so an id listed in several sitemap
        # files is not inserted twice (the imdb column is unique).
        known_ids.update(new_ids)
        if new_ids:
            print('%s %s' % (url, len(new_ids)))
|