Discover IMDb IDs by year

This commit is contained in:
j 2018-05-01 12:08:25 +02:00
parent 1ebc53dd46
commit e8225eec58
2 changed files with 214 additions and 28 deletions

191
oxdata/movie/imdbids.py Normal file
View file

@ -0,0 +1,191 @@
from datetime import datetime
from glob import glob
from optparse import OptionParser
import calendar
import codecs
import json
import os
import re
import sys
from django.conf import settings
import lxml.html
import ox
import ox.web.imdb
MAX_PER_RANGE = 200 * 50
DAY = 24 * 60 * 60
TIMEOUT = 90 * DAY
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
def get_range(from_, to):
    """Return an IMDb advanced-search URL listing titles released in [from_, to].

    Dates are 'YYYY-MM-DD' strings; results are sorted ascending by release
    date, 50 per page, adult titles included.
    """
    base_url = 'http://www.imdb.com'
    query = 'adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (from_, to)
    return '%s/search/title?%s' % (base_url, query)
def get_year(year):
    """Search URL covering one whole calendar year."""
    first = '%s-01-01' % year
    last = '%s-12-31' % year
    return get_range(first, last)
def get_month(year, month):
    """Search URL covering one calendar month.

    calendar.monthrange supplies the month length, so leap years are
    handled correctly.
    """
    last_day = calendar.monthrange(year, month)[1]
    prefix = '%s-%02d' % (year, month)
    return get_range('%s-01' % prefix, '%s-%02d' % (prefix, last_day))
def get_day(year, month, day):
    """Search URL covering a single day (identical start and end date)."""
    date = '%s-%02d-%02d' % (year, month, day)
    return get_range(date, date)
def get_film_count(year, month=None, day=None):
    """Return the number of titles IMDb lists for a year, month, or single day.

    The count is scraped from the pagination summary of the first result
    page ("... of N titles"). Returns 0 and drops the cached page when the
    summary cannot be found (e.g. an empty range), so the next run refetches.
    """
    if day is not None:
        url = get_day(year, month, day)
    elif month is not None:
        url = get_month(year, month)
    else:
        url = get_year(year)
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    pattern = re.compile(
        r'<span class="lister-current-last-item">50</span>.*?of (.*?) titles',
        re.DOTALL)
    totals = pattern.findall(data)
    if totals:
        # IMDb formats large counts with thousands separators
        return int(totals[0].replace(',', ''))
    print('no movies', url)
    ox.web.imdb.delete_url(url)
    return 0
def get_path(name):
    """Return the absolute path for *name* below DATA_ROOT, creating parents."""
    full_path = os.path.join(DATA_ROOT, name)
    ox.makedirs(os.path.dirname(full_path))
    return full_path
def print_info(key, film_count, film_counts):
    """Log the new title count for *key* and how many were added vs the cache."""
    previous = film_counts.get(key, 0)
    delta = film_count - previous
    # only mention the delta when there was a previous non-zero count
    extra = '(added %s)' % delta if delta != film_count else ''
    print('update', key, 'now has', film_count, 'films', extra)
def update_year(year, film_counts):
    """Refresh id lists for one year.

    A single IMDb listing can only be paged through up to MAX_PER_RANGE
    titles, so a year holding more than that is split into per-month
    queries; otherwise the whole year is scraped in one go.
    """
    year_key = '%s' % year
    if film_counts[year_key] > MAX_PER_RANGE:
        for month in range(1, 13):
            month_key = '%04d-%02d' % (year, month)
            count = get_film_count(year, month)
            if count != film_counts.get(month_key):
                print_info(month_key, count, film_counts)
                film_counts[month_key] = count
                update_month(year, month, film_counts)
    else:
        update_ids(year)
        save_film_counts(film_counts)
def update_month(year, month, film_counts):
    """Refresh id lists for one month, splitting into per-day queries when
    the month holds more titles than one listing can page through.

    The film_counts cache is saved after every changed range so an
    interrupted crawl can resume without refetching finished ranges.
    """
    key = '%04d-%02d' % (year, month)
    if film_counts[key] > MAX_PER_RANGE:
        month_total = film_counts[key]
        days_total = 0
        days = calendar.monthrange(year, month)[1]
        for day in range(1, days + 1):
            key = '%04d-%02d-%02d' % (year, month, day)
            film_count = get_film_count(year, month, day)
            days_total += film_count
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                if film_count > MAX_PER_RANGE:
                    # a single day cannot be subdivided further; flag it
                    print(key, '!!!to many per day')
                else:
                    update_ids(year, month, day)
                save_film_counts(film_counts)
        # sanity check: per-day counts should sum to the month's count
        if days_total != month_total:
            print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
    else:
        update_ids(year, month)
        save_film_counts(film_counts)
def update_ids(year, month=None, day=None):
    """Scrape all imdb ids for the given range and write them to ids/<key>.json.

    Follows the listing's "Next" pagination links, collecting id, title and
    year for every title. The result maps 7-digit imdb id -> {'title', 'year'}.

    Fixes vs previous revision:
    - month key used undefined name `mont` (NameError on any month-level call)
    - year cleanup called str.split('') which raises ValueError (empty
      separator); an en dash was presumably intended, to cut "(2010\u2013 )"
      style series ranges down to the first year
    """
    films = {}
    if day is not None:
        url = get_day(year, month, day)
        key = '%04d-%02d-%02d' % (year, month, day)
    elif month is not None:
        url = get_month(year, month)
        key = '%04d-%02d' % (year, month)
    else:
        url = get_year(year)
        key = '%04d' % year
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    n = True
    page = 2
    while n:
        n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
        if n:
            # next page exists: precompute its url before parsing this page
            n = '%s&page=%s' % (url, page)
            page += 1
        doc = lxml.html.fromstring(data)
        article = doc.find_class('article')
        if article:
            article = article[0]
        else:
            # page - 2 is the page we just fetched (page was bumped above)
            print('no article on', '%s&page=%s' % (url, page - 2))
            break
        for header in article.find_class('lister-item-header'):
            a = header.xpath('.//a')[0]
            id = re.compile(r'title/tt(\d{7})').findall(a.attrib['href'])[0]
            title = a.text_content().strip()
            try:
                y = header.find_class('lister-item-year')[0].text_content()
                # drop parenthesized qualifiers without digits, e.g. "(TV Movie)"
                y = re.sub(r'\([^\d]+\)', '', y)
                # keep only the first year of "(2010\u2013 2012)" style ranges
                y = y.rsplit('(', 1)[-1].split(')')[0].split('\u2013')[0].split(' ')[0].strip()
                if not y:
                    # no usable year on the item: fall back to the range's year
                    y = year
                else:
                    y = int(y)
            except:
                print(n)
                print(header.find_class('lister-item-year')[0].text_content())
                raise
            if id not in films:
                films[id] = {
                    'title': title,
                    'year': y
                }
        if n:
            data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
    path = get_path('ids/%s.json' % key)
    with open(path, 'w') as fd:
        json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
def save_film_counts(film_counts):
    """Persist the per-range title-count cache to film_counts.json."""
    target = get_path('film_counts.json')
    with open(target, 'w') as fd:
        json.dump(film_counts, fd, indent=4, sort_keys=True)
def update_index():
    """Crawl entry point: walk every year from 1890 to the current year,
    compare IMDb's title count per year against the cached counts, and
    re-scrape ids for any year whose count changed.

    The counts cache lives at DATA_ROOT/film_counts.json and is saved after
    each changed year so an interrupted run can resume.
    """
    film_counts_json = get_path('film_counts.json')
    if os.path.exists(film_counts_json):
        with open(film_counts_json) as fd:
            film_counts = json.load(fd)
    else:
        # first run: start with an empty cache, every year counts as changed
        film_counts = {}
    for year in range(1890, datetime.now().year+1):
        film_count = get_film_count(year)
        key = '%s' % year
        if film_count != film_counts.get(key):
            print_info(key, film_count, film_counts)
            film_counts[key] = film_count
            update_year(year, film_counts)
            save_film_counts(film_counts)
def get_unknown_ids(known_ids):
    """Return all scraped ids from ids/*.json that are not in *known_ids*."""
    collected = []
    for path in glob(get_path('ids/*.json')):
        with open(path) as fd:
            # iterating the loaded dict yields its keys (the imdb ids)
            collected.extend(json.load(fd))
    return frozenset(collected) - known_ids
# Allow running the module directly as a crawl script.
if __name__ == '__main__':
    update_index()

View file

@ -19,6 +19,8 @@ from oxdjango.fields import DictField
from lookup.models import get_movie_id from lookup.models import get_movie_id
from poster.models import getPosters from poster.models import getPosters
from .imdbids import get_unknown_ids, update_index
def normalize_value(value): def normalize_value(value):
if isinstance(value, bytes): if isinstance(value, bytes):
value = value.decode('utf-8') value = value.decode('utf-8')
@ -260,35 +262,28 @@ def get_new_ids(timeout=-1):
with open(new_ids_cache) as fd: with open(new_ids_cache) as fd:
new_ids = set(json.load(fd)) new_ids = set(json.load(fd))
else: else:
robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout).decode('utf-8') update_index()
sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
sitemap = ox.cache.read_url(sitemap_url, timeout=timeout).decode('utf-8')
urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
ids = set()
for url in sorted(urls, reverse=False):
s = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
ids |= set(re.compile('<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s))
#print url, len(ids)
known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')]) known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
new_ids = frozenset(ids) - known_ids new_ids = get_unknown_ids(known_ids)
print('adding %s new items' % len(new_ids)) if new_ids:
added = 0 print('adding %s new items' % len(new_ids))
done = set() added = 0
for i in sorted(new_ids): done = set()
print(i) for i in sorted(new_ids):
m, created = Imdb.objects.get_or_create(imdb=i) print(i)
try: m, created = Imdb.objects.get_or_create(imdb=i)
m.update() try:
except: m.update()
with open('/tmp/missing.json', 'w') as fd: except:
json.dump(list(new_ids-done), fd) with open('/tmp/missing.json', 'w') as fd:
raise json.dump(list(new_ids-done), fd)
print(m) raise
if created: print(m)
added += 1 if created:
done.add(i) added += 1
if added: done.add(i)
print(added) if added:
print(added)
if os.path.exists(new_ids_cache): if os.path.exists(new_ids_cache):
os.unlink(new_ids_cache) os.unlink(new_ids_cache)