diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py
new file mode 100644
index 0000000..ce4d28f
--- /dev/null
+++ b/oxdata/movie/imdbids.py
@@ -0,0 +1,208 @@
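+"""Scrape IMDb ids by release date.
+
+Walks year -> month -> day release-date ranges, caches the number of
+titles per range in film_counts.json and the ids found in each range
+in ids/<key>.json under MEDIA_ROOT/imdb.
+"""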
+from datetime import datetime
+from glob import glob
+import calendar
+import json
+import os
+import re
+
+from django.conf import settings
+import lxml.html
+import ox
+import ox.web.imdb
+
+
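+# IMDb lists search results 50 per page; presumably only around 200
+# pages are reachable, so larger ranges have to be subdivided.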
+MAX_PER_RANGE = 200 * 50
+
+DAY = 24 * 60 * 60
+TIMEOUT = 90 * DAY
+DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
+
+def get_range(from_, to):
+ base_url = 'http://www.imdb.com'
+ url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
+ return url
+
+def get_year(year):
+ return get_range('%s-01-01' % year, '%s-12-31' % year)
+
+def get_month(year, month):
+ days = calendar.monthrange(year, month)[1]
+ month = '%s-%02d' % (year, month)
+ return get_range('%s-01' % month, '%s-%02d' % (month, days))
+
+def get_day(year, month, day):
+ day = '%s-%02d-%02d' % (year, month, day)
+ return get_range(day, day)
+
+def get_film_count(year, month=None, day=None):
+ if day is not None:
+ url = get_day(year, month, day)
+ elif month is not None:
+ url = get_month(year, month)
+ else:
+ url = get_year(year)
+ data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
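+    # The results header reads like "1-50 of 12,345 titles"; capture the total.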
+ total = re.compile('50.*?of (.*?) titles', re.DOTALL).findall(data)
+ if total:
+ return int(total[0].replace(',', ''))
+ print('no movies', url)
+ ox.web.imdb.delete_url(url)
+ return 0
+
+def get_path(name):
+ path = os.path.join(DATA_ROOT, name)
+ ox.makedirs(os.path.dirname(path))
+ return path
+
+def print_info(key, film_count, film_counts):
+ added = film_count - film_counts.get(key, 0)
+ if added != film_count:
+ extra = '(added %s)' % added
+ else:
+ extra = ''
+ print('update', key, 'now has', film_count, 'films', extra)
+
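+# Refine a year that exceeds MAX_PER_RANGE into per-month counts.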
+def update_year(year, film_counts):
+ key = '%s' % year
+ if film_counts[key] > MAX_PER_RANGE:
+ for month in range(1, 13):
+ key = '%04d-%02d' % (year, month)
+ film_count = get_film_count(year, month)
+ if film_count != film_counts.get(key):
+ print_info(key, film_count, film_counts)
+ film_counts[key] = film_count
+ update_month(year, month, film_counts)
+ else:
+ update_ids(year)
+ save_film_counts(film_counts)
+
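+# Refine a month that exceeds MAX_PER_RANGE into per-day counts and
+# check that the days add up to the month's total.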
+def update_month(year, month, film_counts):
+ key = '%04d-%02d' % (year, month)
+ if film_counts[key] > MAX_PER_RANGE:
+ month_total = film_counts[key]
+ days_total = 0
+ days = calendar.monthrange(year, month)[1]
+ for day in range(1, days + 1):
+ key = '%04d-%02d-%02d' % (year, month, day)
+ film_count = get_film_count(year, month, day)
+ days_total += film_count
+ if film_count != film_counts.get(key):
+ print_info(key, film_count, film_counts)
+ film_counts[key] = film_count
+ if film_count > MAX_PER_RANGE:
+                print(key, '!!! too many per day')
+ else:
+ update_ids(year, month, day)
+ save_film_counts(film_counts)
+ if days_total != month_total:
+ print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
+ else:
+ update_ids(year, month)
+ save_film_counts(film_counts)
+
+
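+# Fetch every result page for the given range and write all ids found
+# to ids/<key>.json.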
+def update_ids(year, month=None, day=None):
+ films = {}
+ if day is not None:
+ url = get_day(year, month, day)
+ key = '%04d-%02d-%02d' % (year, month, day)
+ elif month is not None:
+ url = get_month(year, month)
+        key = '%04d-%02d' % (year, month)
+ else:
+ url = get_year(year)
+ key = '%04d' % year
+ data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
+ n = True
+ page = 2
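+    # Keep fetching while the current page links to a "Next »" page.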
+ while n:
+ n = re.compile('Next »', re.DOTALL).findall(data)
+ if n:
+ n = '%s&page=%s' % (url, page)
+ page += 1
+ doc = lxml.html.fromstring(data)
+ article = doc.find_class('article')
+ if article:
+ article = article[0]
+ else:
+ print('no article on', '%s&page=%s' % (url, page-2))
+ break
+ for header in article.find_class('lister-item-header'):
+ a = header.xpath('.//a')[0]
+            id = re.compile(r'title/tt(\d{7})').findall(a.attrib['href'])[0]
+ title = a.text_content().strip()
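+            # The year column reads "(2014)", "(I) (2014)", "(2014– )" etc.;
+            # drop non-numeric qualifiers, then keep the leading year.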
+ try:
+ y = header.find_class('lister-item-year')[0].text_content()
+                y = re.sub(r'\([^\d]+\)', '', y)
+ y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
+ if not y:
+ y = year
+ else:
+ y = int(y)
+ except:
+ print(n)
+ print(header.find_class('lister-item-year')[0].text_content())
+ raise
+ if id not in films:
+ films[id] = {
+ 'title': title,
+ 'year': y
+ }
+ #print(key, len(films), 'films')
+ if n:
+ #print(n)
+ data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
+ path = get_path('ids/%s.json' % key)
+    with open(path, 'w', encoding='utf-8') as fd:
+ json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
+
+def save_film_counts(film_counts):
+ with open(get_path('film_counts.json'), 'w') as fd:
+ json.dump(film_counts, fd, indent=4, sort_keys=True)
+
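+# Refresh film counts for every year since 1890 and re-scrape ids for
+# any range whose count changed.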
+def update_index():
+ film_counts_json = get_path('film_counts.json')
+ if os.path.exists(film_counts_json):
+ with open(film_counts_json) as fd:
+ film_counts = json.load(fd)
+ else:
+ film_counts = {}
+
+ for year in range(1890, datetime.now().year+1):
+ film_count = get_film_count(year)
+ key = '%s' % year
+ if film_count != film_counts.get(key):
+ print_info(key, film_count, film_counts)
+ film_counts[key] = film_count
+ update_year(year, film_counts)
+ save_film_counts(film_counts)
+
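+# Return all ids found in ids/*.json that are not in known_ids.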
+def get_unknown_ids(known_ids):
+ ids = []
+ for path in glob(get_path('ids/*.json')):
+ with open(path) as fd:
+ ids += json.load(fd).keys()
+ return frozenset(ids) - known_ids
+
+if __name__ == '__main__':
+ update_index()
diff --git a/oxdata/movie/models.py b/oxdata/movie/models.py
index 36e0b1d..3c95cbe 100644
--- a/oxdata/movie/models.py
+++ b/oxdata/movie/models.py
@@ -19,6 +19,8 @@ from oxdjango.fields import DictField
from lookup.models import get_movie_id
from poster.models import getPosters
+from .imdbids import get_unknown_ids, update_index
+
def normalize_value(value):
if isinstance(value, bytes):
value = value.decode('utf-8')
@@ -260,35 +262,29 @@ def get_new_ids(timeout=-1):
with open(new_ids_cache) as fd:
new_ids = set(json.load(fd))
else:
- robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout).decode('utf-8')
- sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
- sitemap = ox.cache.read_url(sitemap_url, timeout=timeout).decode('utf-8')
-        urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
- ids = set()
- for url in sorted(urls, reverse=False):
- s = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
- ids |= set(re.compile('http://www.imdb.com/title/tt(\d{7})/combined').findall(s))
- #print url, len(ids)
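+            # Rebuild the per-date id index instead of crawling the sitemap.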
+ update_index()
known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
- new_ids = frozenset(ids) - known_ids
- print('adding %s new items' % len(new_ids))
- added = 0
- done = set()
- for i in sorted(new_ids):
- print(i)
- m, created = Imdb.objects.get_or_create(imdb=i)
- try:
- m.update()
- except:
- with open('/tmp/missing.json', 'w') as fd:
- json.dump(list(new_ids-done), fd)
- raise
- print(m)
- if created:
- added += 1
- done.add(i)
- if added:
- print(added)
+ new_ids = get_unknown_ids(known_ids)
+ if new_ids:
+ print('adding %s new items' % len(new_ids))
+ added = 0
+ done = set()
+ for i in sorted(new_ids):
+ print(i)
+ m, created = Imdb.objects.get_or_create(imdb=i)
+ try:
+ m.update()
+ except:
+ with open('/tmp/missing.json', 'w') as fd:
+ json.dump(list(new_ids-done), fd)
+ raise
+ print(m)
+ if created:
+ added += 1
+ done.add(i)
+ if added:
+ print(added)
if os.path.exists(new_ids_cache):
os.unlink(new_ids_cache)