diff --git a/oxdata/movie/models.py b/oxdata/movie/models.py
index b7263fc..3f4418c 100644
--- a/oxdata/movie/models.py
+++ b/oxdata/movie/models.py
@@ -3,12 +3,14 @@
 from __future__ import division, print_function, absolute_import
 
 import re
+import os
 import unicodedata
 from six.moves.urllib.parse import quote
 from six import string_types
 import hashlib
 import base64
 import binascii
+import json
 
 from six import string_types
 from django.db import models
@@ -233,23 +235,42 @@ class Imdb(models.Model):
         return j
 
 def get_new_ids(timeout=-1):
-    known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
-    robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout)
-    sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
-    sitemap = ox.cache.read_url(sitemap_url, timeout=timeout)
-    urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
-    for url in sorted(urls, reverse=True):
-        s = ox.cache.read_url(url, timeout=timeout)
-        ids = re.compile('http://www.imdb.com/title/tt(\d{7})/combined').findall(s)
-        added = 0
-        for i in frozenset(ids) - known_ids:
-            m, created = Imdb.objects.get_or_create(imdb=i)
+    new_ids_cache = '/tmp/missing.json'
+    if os.path.exists(new_ids_cache):
+        with open(new_ids_cache) as fd:
+            new_ids = set(json.load(fd))
+    else:
+        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout)
+        sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
+        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout)
+        urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
+        ids = set()
+        for url in sorted(urls, reverse=False):
+            s = ox.cache.read_url(url, timeout=timeout)
+            ids |= set(re.compile('http://www.imdb.com/title/tt(\d{7})/combined').findall(s))
+            #print url, len(ids)
+        known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
+        new_ids = frozenset(ids) - known_ids
+    print('adding %s new items' % len(new_ids))
+    added = 0
+    done = set()
+    for i in sorted(new_ids):
+        print(i)
+        m, created = Imdb.objects.get_or_create(imdb=i)
+        try:
             m.update()
-            print(m)
-            if created:
-                added += 1
-        if added:
-            print(url, added)
+        except:
+            with open('/tmp/missing.json', 'w') as fd:
+                json.dump(list(new_ids-done), fd)
+            raise
+        print(m)
+        if created:
+            added += 1
+        done.add(i)
+    if added:
+        print(added)
+    if os.path.exists(new_ids_cache):
+        os.unlink(new_ids_cache)
 
 class Match(models.Model):
     keys = [