diff --git a/oxdata/movie/models.py b/oxdata/movie/models.py
index b7263fc..3f4418c 100644
--- a/oxdata/movie/models.py
+++ b/oxdata/movie/models.py
@@ -3,12 +3,14 @@
 from __future__ import division, print_function, absolute_import
 import re
+import os
 import unicodedata
 from six.moves.urllib.parse import quote
 from six import string_types
 import hashlib
 import base64
 import binascii
+import json
 from six import string_types
 from django.db import models
@@ -233,23 +235,42 @@ class Imdb(models.Model):
         return j
 
 def get_new_ids(timeout=-1):
-    known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
-    robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout)
-    sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
-    sitemap = ox.cache.read_url(sitemap_url, timeout=timeout)
-    urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
-    for url in sorted(urls, reverse=True):
-        s = ox.cache.read_url(url, timeout=timeout)
-        ids = re.compile('http://www.imdb.com/title/tt(\d{7})/combined').findall(s)
-        added = 0
-        for i in frozenset(ids) - known_ids:
-            m, created = Imdb.objects.get_or_create(imdb=i)
+    new_ids_cache = '/tmp/missing.json'
+    if os.path.exists(new_ids_cache):
+        with open(new_ids_cache) as fd:
+            new_ids = set(json.load(fd))
+    else:
+        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout)
+        sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
+        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout)
+        urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
+        ids = set()
+        for url in sorted(urls, reverse=False):
+            s = ox.cache.read_url(url, timeout=timeout)
+            ids |= set(re.compile('http://www.imdb.com/title/tt(\d{7})/combined').findall(s))
+            #print url, len(ids)
+        known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
+        new_ids = frozenset(ids) - known_ids
+    print('adding %s new items' % len(new_ids))
+    added = 0
+    done = set()
+    for i in sorted(new_ids):
+        print(i)
+        m, created = Imdb.objects.get_or_create(imdb=i)
+        try:
             m.update()
-            print(m)
-            if created:
-                added += 1
-        if added:
-            print(url, added)
+        except:
+            with open('/tmp/missing.json', 'w') as fd:
+                json.dump(list(new_ids-done), fd)
+            raise
+        print(m)
+        if created:
+            added += 1
+        done.add(i)
+    if added:
+        print(added)
+    if os.path.exists(new_ids_cache):
+        os.unlink(new_ids_cache)
 
 class Match(models.Model):
     keys = [
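
Note (reviewer sketch, not part of the patch): the change above follows a resumable-batch pattern. It computes the set of pending IMDb IDs once, and if m.update() raises it dumps the IDs that have not been processed yet to /tmp/missing.json, so the next run resumes from that file instead of re-crawling the sitemap; once every ID has been handled the file is removed. The standalone sketch below illustrates the same pattern under those assumptions; run_resumable, process, checkpoint, and the example IDs are hypothetical names, not oxdata code.

    import json
    import os

    def run_resumable(pending, process, checkpoint='/tmp/missing.json'):
        # Resume from a checkpoint left behind by an earlier, interrupted run.
        if os.path.exists(checkpoint):
            with open(checkpoint) as fd:
                pending = set(json.load(fd))
        done = set()
        for item in sorted(pending):
            try:
                process(item)  # per-item work; m.update() plays this role in the patch
            except BaseException:
                # Persist whatever is still left, then re-raise (the patch's bare
                # "except:" behaves similarly and also catches KeyboardInterrupt).
                with open(checkpoint, 'w') as fd:
                    json.dump(sorted(set(pending) - done), fd)
                raise
            done.add(item)
        # Every item succeeded, so the checkpoint is no longer needed.
        if os.path.exists(checkpoint):
            os.unlink(checkpoint)

    if __name__ == '__main__':
        run_resumable({'0133093', '0110912'}, print)

Keeping the checkpoint as plain JSON makes it easy to inspect or delete by hand, which matches how the patch handles /tmp/missing.json.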