From e8225eec58117ac207c02b0cf42f7558114cdd74 Mon Sep 17 00:00:00 2001
From: j
Date: Tue, 1 May 2018 12:08:25 +0200
Subject: [PATCH] discover imdb ids by year

---
 oxdata/movie/imdbids.py | 226 ++++++++++++++++++++++++++++++++++++++++
 oxdata/movie/models.py  |  51 +++++------
 2 files changed, 249 insertions(+), 28 deletions(-)
 create mode 100644 oxdata/movie/imdbids.py

diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py
new file mode 100644
index 0000000..ce4d28f
--- /dev/null
+++ b/oxdata/movie/imdbids.py
@@ -0,0 +1,226 @@
+from datetime import datetime
+from glob import glob
+from optparse import OptionParser
+import calendar
+import codecs
+import json
+import os
+import re
+import sys
+
+from django.conf import settings
+import lxml.html
+import ox
+import ox.web.imdb
+
+
+# Date ranges reporting more titles than this are split into smaller ranges.
+MAX_PER_RANGE = 200 * 50
+
+DAY = 24 * 60 * 60
+# Passed as timeout= to every ox.web.imdb.read_url call in this module.
+TIMEOUT = 90 * DAY
+DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
+
+def get_range(from_, to):
+    """Return the IMDb title search URL for releases dated from_ through to."""
+    base_url = 'http://www.imdb.com'
+    url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
+    return url
+
+def get_year(year):
+    """Return the search URL for January 1 through December 31 of year."""
+    return get_range('%s-01-01' % year, '%s-12-31' % year)
+
+def get_month(year, month):
+    """Return the search URL for the first through last day of the month."""
+    days = calendar.monthrange(year, month)[1]
+    month = '%s-%02d' % (year, month)
+    return get_range('%s-01' % month, '%s-%02d' % (month, days))
+
+def get_day(year, month, day):
+    """Return the search URL for a single day."""
+    day = '%s-%02d-%02d' % (year, month, day)
+    return get_range(day, day)
+
+def get_film_count(year, month=None, day=None):
+    """Return the title count the search page reports for a year, month or day.
+
+    Returns 0, after calling ox.web.imdb.delete_url on the URL, if no total
+    can be found in the page.
+    """
+    if day is not None:
+        url = get_day(year, month, day)
+    elif month is not None:
+        url = get_month(year, month)
+    else:
+        url = get_year(year)
+    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
+    total = re.compile('50.*?of (.*?) titles', re.DOTALL).findall(data)
+    if total:
+        return int(total[0].replace(',', ''))
+    print('no movies', url)
+    ox.web.imdb.delete_url(url)
+    return 0
+
+def get_path(name):
+    """Return the path of name below DATA_ROOT, calling ox.makedirs on its directory."""
+    path = os.path.join(DATA_ROOT, name)
+    ox.makedirs(os.path.dirname(path))
+    return path
+
+def print_info(key, film_count, film_counts):
+    """Print the new count for key, plus the difference to a stored count."""
+    added = film_count - film_counts.get(key, 0)
+    if added != film_count:
+        extra = '(added %s)' % added
+    else:
+        extra = ''
+    print('update', key, 'now has', film_count, 'films', extra)
+
+def update_year(year, film_counts):
+    """Fetch the ids for year, split by month if it exceeds MAX_PER_RANGE.
+
+    film_counts must already hold the count for year under the key '%s' % year.
+    """
+    key = '%s' % year
+    if film_counts[key] > MAX_PER_RANGE:
+        for month in range(1, 13):
+            key = '%04d-%02d' % (year, month)
+            film_count = get_film_count(year, month)
+            if film_count != film_counts.get(key):
+                print_info(key, film_count, film_counts)
+                film_counts[key] = film_count
+                update_month(year, month, film_counts)
+    else:
+        update_ids(year)
+        save_film_counts(film_counts)
+
+def update_month(year, month, film_counts):
+    """Fetch the ids for a month, split by day if it exceeds MAX_PER_RANGE.
+
+    film_counts must already hold the count for the month ('YYYY-MM' key).
+    """
+    key = '%04d-%02d' % (year, month)
+    if film_counts[key] > MAX_PER_RANGE:
+        month_total = film_counts[key]
+        days_total = 0
+        days = calendar.monthrange(year, month)[1]
+        for day in range(1, days + 1):
+            key = '%04d-%02d-%02d' % (year, month, day)
+            film_count = get_film_count(year, month, day)
+            days_total += film_count
+            if film_count != film_counts.get(key):
+                print_info(key, film_count, film_counts)
+                film_counts[key] = film_count
+                if film_count > MAX_PER_RANGE:
+                    print(key, '!!!to many per day')
+                else:
+                    update_ids(year, month, day)
+                save_film_counts(film_counts)
+        if days_total != month_total:
+            print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
+    else:
+        update_ids(year, month)
+        save_film_counts(film_counts)
+
+
+def update_ids(year, month=None, day=None):
+    """Collect title and year, keyed by id, of every film listed for the range.
+
+    Requests further result pages as long as the current page matches
+    'Next »' and writes the result to ids/<key>.json below DATA_ROOT.
+    """
+    films = {}
+    if day is not None:
+        url = get_day(year, month, day)
+        key = '%04d-%02d-%02d' % (year, month, day)
+    elif month is not None:
+        url = get_month(year, month)
+        key = '%04d-%02d' % (year, month)
+    else:
+        url = get_year(year)
+        key = '%04d' % year
+    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
+    n = True
+    page = 2
+    while n:
+        n = re.compile('Next »', re.DOTALL).findall(data)
+        if n:
+            # n now holds the URL of the next result page
+            n = '%s&page=%s' % (url, page)
+            page += 1
+        doc = lxml.html.fromstring(data)
+        article = doc.find_class('article')
+        if article:
+            article = article[0]
+        else:
+            print('no article on', '%s&page=%s' % (url, page-2))
+            break
+        for header in article.find_class('lister-item-header'):
+            a = header.xpath('.//a')[0]
+            imdb_id = re.compile(r'title/tt(\d{7})').findall(a.attrib['href'])[0]
+            title = a.text_content().strip()
+            try:
+                y = header.find_class('lister-item-year')[0].text_content()
+                y = re.sub(r'\([^\d]+\)', '', y)
+                y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
+                if not y:
+                    # no year in the listing, fall back to the year being searched
+                    y = year
+                else:
+                    y = int(y)
+            except:
+                print(n)
+                print(header.find_class('lister-item-year')[0].text_content())
+                raise
+            if imdb_id not in films:
+                films[imdb_id] = {
+                    'title': title,
+                    'year': y
+                }
+        #print(key, len(films), 'films')
+        if n:
+            #print(n)
+            data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
+    path = get_path('ids/%s.json' % key)
+    with open(path, 'w') as fd:
+        json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
+
+def save_film_counts(film_counts):
+    """Write film_counts as JSON to film_counts.json below DATA_ROOT."""
+    with open(get_path('film_counts.json'), 'w') as fd:
+        json.dump(film_counts, fd, indent=4, sort_keys=True)
+
+def update_index():
+    """Refresh film_counts.json and the id files for every year since 1890.
+
+    Only years whose reported film count differs from the stored count are
+    fetched again.
+    """
+    film_counts_json = get_path('film_counts.json')
+    if os.path.exists(film_counts_json):
+        with open(film_counts_json) as fd:
+            film_counts = json.load(fd)
+    else:
+        film_counts = {}
+
+    for year in range(1890, datetime.now().year+1):
+        film_count = get_film_count(year)
+        key = '%s' % year
+        if film_count != film_counts.get(key):
+            print_info(key, film_count, film_counts)
+            film_counts[key] = film_count
+            update_year(year, film_counts)
+            save_film_counts(film_counts)
+
+def get_unknown_ids(known_ids):
+    """Return the ids found in all ids/*.json files that are not in known_ids."""
+    ids = []
+    for path in glob(get_path('ids/*.json')):
+        with open(path) as fd:
+            ids += json.load(fd).keys()
+    return frozenset(ids) - known_ids
+
+if __name__ == '__main__':
+    update_index()
diff --git a/oxdata/movie/models.py b/oxdata/movie/models.py
index 36e0b1d..3c95cbe 100644
--- a/oxdata/movie/models.py
+++ b/oxdata/movie/models.py
@@ -19,6 +19,8 @@ from oxdjango.fields import DictField
 from lookup.models import get_movie_id
 from poster.models import getPosters
 
+from .imdbids import get_unknown_ids, update_index
+
 def normalize_value(value):
     if isinstance(value, bytes):
         value = value.decode('utf-8')
@@ -260,35 +262,28 @@ def get_new_ids(timeout=-1):
         with open(new_ids_cache) as fd:
             new_ids = set(json.load(fd))
     else:
-        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout).decode('utf-8')
-        sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
-        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout).decode('utf-8')
-        urls = re.compile('(.+?)').findall(sitemap)
-        ids = set()
-        for url in sorted(urls, reverse=False):
-            s = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
-            ids |= set(re.compile('http://www.imdb.com/title/tt(\d{7})/combined').findall(s))
-            #print url, len(ids)
+        update_index()
         known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
-        new_ids = frozenset(ids) - known_ids
-    print('adding %s new items' % len(new_ids))
-    added = 0
-    done = set()
-    for i in sorted(new_ids):
-        print(i)
-        m, created = Imdb.objects.get_or_create(imdb=i)
-        try:
-            m.update()
-        except:
-            with open('/tmp/missing.json', 'w') as fd:
-                json.dump(list(new_ids-done), fd)
-            raise
-        print(m)
-        if created:
-            added += 1
-        done.add(i)
-    if added:
-        print(added)
+        new_ids = get_unknown_ids(known_ids)
+    if new_ids:
+        print('adding %s new items' % len(new_ids))
+        added = 0
+        done = set()
+        for i in sorted(new_ids):
+            print(i)
+            m, created = Imdb.objects.get_or_create(imdb=i)
+            try:
+                m.update()
+            except:
+                with open('/tmp/missing.json', 'w') as fd:
+                    json.dump(list(new_ids-done), fd)
+                raise
+            print(m)
+            if created:
+                added += 1
+            done.add(i)
+        if added:
+            print(added)
     if os.path.exists(new_ids_cache):
         os.unlink(new_ids_cache)
 