from datetime import datetime
from glob import glob
from optparse import OptionParser
import calendar
import codecs
import json
import os
import re
import sys
import time

from django.conf import settings

import lxml.html
import ox
import ox.web.imdb

# IMDb's advanced search only exposes the first 200 pages of 50 results,
# so any date range with more titles must be split into smaller ranges.
MAX_PER_RANGE = 200 * 50

DAY = 24 * 60 * 60
# cached pages older than 90 days are considered stale and re-fetched
TIMEOUT = 90 * DAY

DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')

DEBUG = False


def debug(*args, **kwargs):
    """print() that is silenced unless the module-level DEBUG flag is set."""
    if DEBUG:
        print(*args, **kwargs)


def read_url(url, timeout):
    """Fetch url via the caching imdb reader, retrying while IMDb serves a 500 page.

    On error pages, waits 10 seconds and re-fetches with timeout=0 to bypass
    the cache so the bad page is not reused.
    """
    data = ox.web.imdb.read_url(url, unicode=True, timeout=timeout)
    while '>500 Error - IMDb<' in data:
        print('Error', url)
        time.sleep(10)
        data = ox.web.imdb.read_url(url, unicode=True, timeout=0)
    return data


def get_range(from_, to):
    """Return the advanced-search URL listing titles released in [from_, to]."""
    base_url = 'https://www.imdb.com'
    url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
    return url


def get_year(year):
    """Search URL covering a whole year."""
    return get_range('%s-01-01' % year, '%s-12-31' % year)


def get_month(year, month):
    """Search URL covering a whole month (handles varying month lengths)."""
    days = calendar.monthrange(year, month)[1]
    month = '%s-%02d' % (year, month)
    return get_range('%s-01' % month, '%s-%02d' % (month, days))


def get_day(year, month, day):
    """Search URL covering a single day."""
    day = '%s-%02d-%02d' % (year, month, day)
    return get_range(day, day)


def get_film_count(year, month=None, day=None):
    """Return the number of titles IMDb lists for year[, month[, day]].

    Falls back to 0 (and drops the cached page) when no total can be parsed.
    """
    if day is not None:
        url = get_day(year, month, day)
    elif month is not None:
        url = get_month(year, month)
    else:
        url = get_year(year)
    data = read_url(url, timeout=TIMEOUT)
    total = re.compile(r'1-50 of ([\d,]+?) \ntitles.').findall(data)
    if not total:
        # pages with 50 or fewer results have no "1-50 of N" prefix
        total = re.compile(r'([\d,]+) titles.', re.DOTALL).findall(data)
    if total:
        return int(total[0].replace(',', ''))
    print('no movies', url)
    ox.web.imdb.delete_url(url)
    return 0


def get_path(name):
    """Return DATA_ROOT/name, creating the parent directory if needed."""
    path = os.path.join(DATA_ROOT, name)
    ox.makedirs(os.path.dirname(path))
    return path


def print_info(key, film_count, film_counts):
    """Log the new count for key, noting the delta against the stored count."""
    added = film_count - film_counts.get(key, 0)
    if added != film_count:
        extra = '(added %s)' % added
    else:
        extra = ''
    print('update', key, 'now has', film_count, 'films', extra)


def update_year(year, film_counts):
    """Refresh the id lists for one year, splitting into months when the year
    holds more titles than a single query can return.

    film_counts is mutated in place and persisted via save_film_counts().
    """
    key = '%s' % year
    if film_counts[key] > MAX_PER_RANGE:
        for month in range(1, 13):
            month_key = '%04d-%02d' % (year, month)
            film_count = get_film_count(year, month)
            if film_count != film_counts.get(month_key):
                print_info(month_key, film_count, film_counts)
                film_counts[month_key] = film_count
                update_month(year, month, film_counts, expected=film_count)
    else:
        # BUGFIX: update_ids() used to be called here without a count; pass the
        # freshly stored total so it does not have to re-fetch it.
        r = update_ids(year, expected=film_counts[key])
        save_film_counts(film_counts)


def update_month(year, month, film_counts, expected=None):
    """Refresh the id lists for one month, splitting into days when needed.

    expected: known title count for the month (BUGFIX: this parameter was
    missing although update_year() already passed it, raising TypeError).
    Warns when per-day totals do not add up to the month total, and when a
    single day exceeds what even a two-pass alphabetic scrape can cover.
    """
    key = '%04d-%02d' % (year, month)
    if film_counts[key] > MAX_PER_RANGE:
        month_total = film_counts[key]
        days_total = 0
        days = calendar.monthrange(year, month)[1]
        for day in range(1, days + 1):
            day_key = '%04d-%02d-%02d' % (year, month, day)
            film_count = get_film_count(year, month, day)
            days_total += film_count
            if film_count != film_counts.get(day_key):
                print_info(day_key, film_count, film_counts)
                film_counts[day_key] = film_count
                if film_count > MAX_PER_RANGE and film_count < 2 * MAX_PER_RANGE:
                    # up to 2 * MAX_PER_RANGE can still be collected by
                    # scraping the day A->Z and Z->A
                    r = update_ids(year, month, day, sort='alpha', expected=film_count)
                    save_film_counts(film_counts)
                elif film_count > MAX_PER_RANGE:
                    print(day_key, '!!!to many per day', film_count, day_key)
                else:
                    r = update_ids(year, month, day, expected=film_count)
                    save_film_counts(film_counts)
        if days_total != month_total:
            print('!! \nmonth and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
    else:
        # BUGFIX: this branch referenced an undefined film_count; use the
        # caller-supplied count, falling back to the stored month total.
        if expected is None:
            expected = film_counts[key]
        r = update_ids(year, month, expected=expected)
        save_film_counts(film_counts)


def parse_cast(string):
    """Parse a "Director:\\n X | Stars:\\n A, \\n B" listing line into a dict.

    Returns e.g. {'directors': [...], 'stars': [...]}; unknown labels are
    kept lowercased as-is.
    """
    results = {}
    for part in string.split('|'):
        # pair up "label: value" fragments: [a, b, c, d] -> {a: b, c: d}
        cast = iter([t.strip() for t in part.split(':\n')])
        cast = dict(zip(cast, cast))
        for key in cast:
            rkey = key.lower()
            rkey = {
                'director': 'directors',
                'star': 'stars',
            }.get(rkey, rkey)
            results[rkey] = cast[key].split(', \n')
    return results


def update_ids(year, month=None, day=None, sort=None, expected=None):
    """Scrape all imdb ids for the given date range and write ids/<key>.json.

    With sort='alpha' the range is fetched twice (alpha asc + desc) so up to
    2 * MAX_PER_RANGE titles can be collected; pagination stops early once
    `expected` titles were seen.  `expected` is fetched if not supplied
    (BUGFIX: the parameter was misspelled `expexted` while the body and all
    callers used `expected`, raising NameError/TypeError).

    Returns the number of films found.
    """
    films = {}
    if day is not None:
        url = get_day(year, month, day)
        key = '%04d-%02d-%02d' % (year, month, day)
    elif month is not None:
        url = get_month(year, month)
        key = '%04d-%02d' % (year, month)
    else:
        url = get_year(year)
        key = '%04d' % year
    if sort == 'alpha':
        urls = [
            url.replace('sort=release_date,asc', 'sort=alpha,asc'),
            url.replace('sort=release_date,asc', 'sort=alpha,desc'),
        ]
    else:
        urls = [url]
    if not expected:
        expected = get_film_count(year, month, day)
    for url in urls:
        data = read_url(url, timeout=TIMEOUT)
        n = True
        step = 50
        start = 1
        while n:
            start += step
            # presence of the "next" link tells us whether another page exists
            next_link = 'start=%s&ref_=adv_nxt"' % (start)
            has_next = re.compile(next_link).findall(data)
            if has_next:
                n = '%s&start=%s' % (url, start)
            else:
                n = False
            doc = lxml.html.fromstring(data)
            article = doc.find_class('article')
            if article:
                article = article[0]
            else:
                # broken/empty page: drop it from the cache and give up on this url
                print('no article on', '%s&start=%s' % (url, start - 2 * step))
                ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2 * step))
                break
            for content in article.find_class('lister-item-content'):
                header = content.find_class('lister-item-header')[0]
                a = header.xpath('.//a')
                if 'Episode:' in [
                    e.text_content()
                    for e in header.xpath(".//small")
                ] and len(a) > 1:
                    # episodes are titled "Series: Episode"
                    title = a[0].text_content().strip() + ': '
                    a = a[1]
                else:
                    title = ''
                    a = a[0]
                id = re.compile(r'title/tt(\d+)').findall(a.attrib['href'])[0]
                title += a.text_content().strip()
                try:
                    y = header.find_class('lister-item-year')[0].text_content()
                    # strip qualifiers like "(TV Movie)", keep "(1999–2001)" -> 1999
                    y = re.sub(r'\([^\d]+\)', '', y)
                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
                    if not y:
                        # no usable year on the page, fall back to the range year
                        y = year
                    else:
                        y = int(y)
                except:
                    print(n)
                    print(header.find_class('lister-item-year')[0].text_content())
                    raise
                text = content.xpath(".//p[contains(@class, 'text-muted')]")
                plot = text[1].text_content().strip()
                plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
                if plot == 'Add a Plot':
                    plot = ''
                genre = content.find_class('genre')
                if genre:
                    genre = genre[0].text_content().strip().split(', ')
                else:
                    genre = []
                # the cast/director line is the <p> with an empty class attribute
                cast = content.xpath(".//p[contains(@class, '')]")
                cast = [t for t in cast if t.attrib.get('class') == '']
                if cast:
                    cast = parse_cast(cast[0].text_content())
                if id not in films:
                    films[id] = {
                        'title': title,
                        'year': y
                    }
                    if plot:
                        films[id]['plot'] = plot
                    if genre:
                        films[id]['genre'] = genre
                    if cast:
                        films[id].update(cast)
                    if expected and len(films) == expected and sort == 'alpha':
                        # both passes combined already cover the range
                        n = False
            debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
            if n:
                data = read_url(n, timeout=TIMEOUT)
    path = get_path('ids/%s.json' % key)
    with open(path, 'w') as fd:
        json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
    r = len(films)
    if r != expected:
        print('%s: got %s, expected %s' % (key, r, expected))
    return r


def save_film_counts(film_counts):
    """Persist the per-range title counts to film_counts.json."""
    with open(get_path('film_counts.json'), 'w') as fd:
        json.dump(film_counts, fd, indent=4, sort_keys=True)


def update_index(from_year=None):
    """Walk all years from from_year (default 1874, the earliest listed year)
    to now, refreshing id lists for every year whose title count changed."""
    if from_year is None:
        from_year = 1874
    film_counts_json = get_path('film_counts.json')
    if os.path.exists(film_counts_json):
        with open(film_counts_json) as fd:
            film_counts = json.load(fd)
    else:
        film_counts = {}
    for year in range(from_year, datetime.now().year + 1):
        film_count = get_film_count(year)
        key = '%s' % year
        if film_count != film_counts.get(key):
            print_info(key, film_count, film_counts)
            film_counts[key] = film_count
            update_year(year, film_counts)
            save_film_counts(film_counts)


def get_unknown_ids(known_ids):
    """Return the set of scraped ids that are not in known_ids."""
    ids = []
    for path in glob(get_path('ids/*.json')):
        with open(path) as fd:
            ids += json.load(fd).keys()
    return frozenset(ids) - known_ids


if __name__ == '__main__':
    update_index()