resume
parent 0dd6b1132e
commit 7991849d44
1 changed file with 37 additions and 16 deletions
@@ -3,12 +3,14 @@
 from __future__ import division, print_function, absolute_import
 
 import re
+import os
 import unicodedata
 from six.moves.urllib.parse import quote
 from six import string_types
 import hashlib
 import base64
 import binascii
+import json
 
 from six import string_types
 from django.db import models
@@ -233,23 +235,42 @@ class Imdb(models.Model):
         return j
 
 def get_new_ids(timeout=-1):
-    known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
-    robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout)
-    sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
-    sitemap = ox.cache.read_url(sitemap_url, timeout=timeout)
-    urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
-    for url in sorted(urls, reverse=True):
-        s = ox.cache.read_url(url, timeout=timeout)
-        ids = re.compile('<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s)
-        added = 0
-        for i in frozenset(ids) - known_ids:
-            m, created = Imdb.objects.get_or_create(imdb=i)
+    new_ids_cache = '/tmp/missing.json'
+    if os.path.exists(new_ids_cache):
+        with open(new_ids_cache) as fd:
+            new_ids = set(json.load(fd))
+    else:
+        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout)
+        sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
+        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout)
+        urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
+        ids = set()
+        for url in sorted(urls, reverse=False):
+            s = ox.cache.read_url(url, timeout=timeout)
+            ids |= set(re.compile('<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s))
+            #print url, len(ids)
+        known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
+        new_ids = frozenset(ids) - known_ids
+    print('adding %s new items' % len(new_ids))
+    added = 0
+    done = set()
+    for i in sorted(new_ids):
+        print(i)
+        m, created = Imdb.objects.get_or_create(imdb=i)
+        try:
             m.update()
-            print(m)
-            if created:
-                added += 1
-        if added:
-            print(url, added)
+        except:
+            with open('/tmp/missing.json', 'w') as fd:
+                json.dump(list(new_ids-done), fd)
+            raise
+        print(m)
+        if created:
+            added += 1
+        done.add(i)
+    if added:
+        print(added)
+    if os.path.exists(new_ids_cache):
+        os.unlink(new_ids_cache)
 
 class Match(models.Model):
     keys = [
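
The change makes get_new_ids() resumable: if Imdb.update() raises, the ids that have not been processed yet are dumped to /tmp/missing.json and the exception is re-raised; the next call loads that file instead of re-crawling the IMDb sitemap, and the file is removed once the run completes. A minimal usage sketch follows; the import path is an assumption for illustration and is not part of the commit.

# Hypothetical usage sketch; "imdb.models" is an assumed module path.
from imdb.models import get_new_ids

try:
    # First run: crawls the IMDb sitemap and starts updating new titles.
    get_new_ids(timeout=-1)
except Exception:
    # A failed update left /tmp/missing.json holding the ids not yet processed.
    pass

# Second run: resumes from /tmp/missing.json instead of re-reading the sitemap;
# the cache file is deleted once all remaining ids have been handled.
get_new_ids(timeout=-1)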