This commit is contained in:
root 2016-09-07 12:56:51 +00:00
parent 0dd6b1132e
commit 7991849d44

View file

@ -3,12 +3,14 @@
from __future__ import division, print_function, absolute_import from __future__ import division, print_function, absolute_import
import re import re
import os
import unicodedata import unicodedata
from six.moves.urllib.parse import quote from six.moves.urllib.parse import quote
from six import string_types from six import string_types
import hashlib import hashlib
import base64 import base64
import binascii import binascii
import json
from six import string_types from six import string_types
from django.db import models from django.db import models
@ -233,23 +235,42 @@ class Imdb(models.Model):
return j return j
def get_new_ids(timeout=-1):
    """Create and update ``Imdb`` rows for title ids not yet processed.

    The work list comes from the resume file ``/tmp/missing.json`` when it
    exists (it is written by a previous run that failed part-way).
    Otherwise the sitemap referenced from imdb.com's ``robots.txt`` is
    crawled for title ids and the ids already stored in ``Imdb`` are
    subtracted.

    For every id the matching ``Imdb`` row is fetched or created and its
    ``update()`` method is called.  If anything fails while processing,
    the ids that have not been completed yet are written to the resume
    file and the exception is re-raised, so the next call continues where
    this one stopped.  After a complete run the resume file is removed.

    :param timeout: passed unchanged to every ``ox.cache.read_url`` call.
    """
    new_ids_cache = '/tmp/missing.json'
    if os.path.exists(new_ids_cache):
        with open(new_ids_cache) as fd:
            new_ids = set(json.load(fd))
    else:
        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout)
        sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout)
        urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
        # Compile once instead of once per sitemap page; raw string so that
        # ``\d`` is a regex escape and not an invalid string escape.
        title_id = re.compile(
            r'<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>')
        ids = set()
        for url in sorted(urls):
            page = ox.cache.read_url(url, timeout=timeout)
            ids.update(title_id.findall(page))
        known_ids = frozenset(
            row['imdb'] for row in Imdb.objects.all().values('imdb'))
        new_ids = ids - known_ids
    print('adding %s new items' % len(new_ids))
    added = 0
    done = set()
    try:
        for imdb_id in sorted(new_ids):
            print(imdb_id)
            m, created = Imdb.objects.get_or_create(imdb=imdb_id)
            m.update()
            print(m)
            if created:
                added += 1
            done.add(imdb_id)
    except BaseException:
        # Deliberately broad (includes KeyboardInterrupt): whatever stops
        # the run, remember the unfinished ids, then let the error through.
        with open(new_ids_cache, 'w') as fd:
            json.dump(sorted(new_ids - done), fd)
        raise
    if added:
        print(added)
    # Everything was processed, so a resume file from an earlier run is stale.
    if os.path.exists(new_ids_cache):
        os.unlink(new_ids_cache)
class Match(models.Model): class Match(models.Model):
keys = [ keys = [