discover imdb ids by year
This commit is contained in:
parent 1ebc53dd46
commit e8225eec58

2 changed files with 214 additions and 28 deletions

oxdata/movie/imdbids.py (new file, 191 lines)

@@ -0,0 +1,191 @@
from datetime import datetime
from glob import glob
from optparse import OptionParser
import calendar
import codecs
import json
import os
import re
import sys

from django.conf import settings
import lxml.html
import ox
import ox.web.imdb

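# IMDb's search results appear to cap out at 200 pages of 50 titles,
# so any date range matching more titles than this has to be subdivided.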
MAX_PER_RANGE = 200 * 50

DAY = 24 * 60 * 60
TIMEOUT = 90 * DAY
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')

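# Build the IMDb advanced-search URL for titles released between from_ and to
# (inclusive), sorted by release date, 50 results per page; e.g.
# get_range('1994-01-01', '1994-12-31') covers all of 1994.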
def get_range(from_, to):
    base_url = 'http://www.imdb.com'
    url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
    return url

def get_year(year):
    return get_range('%s-01-01' % year, '%s-12-31' % year)

def get_month(year, month):
    days = calendar.monthrange(year, month)[1]
    month = '%s-%02d' % (year, month)
    return get_range('%s-01' % month, '%s-%02d' % (month, days))

def get_day(year, month, day):
    day = '%s-%02d-%02d' % (year, month, day)
    return get_range(day, day)

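# Scrape the total number of titles for a year, month or day range from the
# pager's "... of N titles" line; a page without that counter is treated as
# empty and its cached copy is deleted, presumably so it is re-fetched on the
# next run.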
def get_film_count(year, month=None, day=None):
    if day is not None:
        url = get_day(year, month, day)
    elif month is not None:
        url = get_month(year, month)
    else:
        url = get_year(year)
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
    if total:
        return int(total[0].replace(',', ''))
    print('no movies', url)
    ox.web.imdb.delete_url(url)
    return 0

def get_path(name):
    path = os.path.join(DATA_ROOT, name)
    ox.makedirs(os.path.dirname(path))
    return path

def print_info(key, film_count, film_counts):
    added = film_count - film_counts.get(key, 0)
    if added != film_count:
        extra = '(added %s)' % added
    else:
        extra = ''
    print('update', key, 'now has', film_count, 'films', extra)

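# If a whole year has more titles than one search can page through, recurse
# into months (update_month recurses into days the same way); otherwise the
# year's ids can be fetched directly.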
def update_year(year, film_counts):
    key = '%s' % year
    if film_counts[key] > MAX_PER_RANGE:
        for month in range(1, 13):
            key = '%04d-%02d' % (year, month)
            film_count = get_film_count(year, month)
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                update_month(year, month, film_counts)
    else:
        update_ids(year)
        save_film_counts(film_counts)

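# Same subdivision one level down: a month over the cap is crawled day by day,
# and the per-day counts are sanity-checked against the month total.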
def update_month(year, month, film_counts):
    key = '%04d-%02d' % (year, month)
    if film_counts[key] > MAX_PER_RANGE:
        month_total = film_counts[key]
        days_total = 0
        days = calendar.monthrange(year, month)[1]
        for day in range(1, days + 1):
            key = '%04d-%02d-%02d' % (year, month, day)
            film_count = get_film_count(year, month, day)
            days_total += film_count
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                if film_count > MAX_PER_RANGE:
                    print(key, '!!! too many titles per day')
                else:
                    update_ids(year, month, day)
                save_film_counts(film_counts)
        if days_total != month_total:
            print("!! month and days don't add up: %s month vs %s days total" % (month_total, days_total))
    else:
        update_ids(year, month)
        save_film_counts(film_counts)


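# Collect {imdb id: {title, year}} from every result page of the given range,
# following the "Next »" pager link until it disappears, then write the
# collected set to ids/<key>.json.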
def update_ids(year, month=None, day=None):
    films = {}
    if day is not None:
        url = get_day(year, month, day)
        key = '%04d-%02d-%02d' % (year, month, day)
    elif month is not None:
        url = get_month(year, month)
        key = '%04d-%02d' % (year, month)
    else:
        url = get_year(year)
        key = '%04d' % year
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    n = True
    page = 2
    while n:
        n = re.compile('Next »</a>', re.DOTALL).findall(data)
        if n:
            n = '%s&page=%s' % (url, page)
            page += 1
        doc = lxml.html.fromstring(data)
        article = doc.find_class('article')
        if article:
            article = article[0]
        else:
            print('no article on', '%s&page=%s' % (url, page-2))
            break
        for header in article.find_class('lister-item-header'):
            a = header.xpath('.//a')[0]
            id = re.compile(r'title/tt(\d{7})').findall(a.attrib['href'])[0]
            title = a.text_content().strip()
            try:
                y = header.find_class('lister-item-year')[0].text_content()
                y = re.sub(r'\([^\d]+\)', '', y)
                y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
                if not y:
                    y = year
                else:
                    y = int(y)
            except:
                print(n)
                print(header.find_class('lister-item-year')[0].text_content())
                raise
            if id not in films:
                films[id] = {
                    'title': title,
                    'year': y
                }
        #print(key, len(films), 'films')
        if n:
            #print(n)
            data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
    path = get_path('ids/%s.json' % key)
    with open(path, 'w') as fd:
        json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)

def save_film_counts(film_counts):
    with open(get_path('film_counts.json'), 'w') as fd:
        json.dump(film_counts, fd, indent=4, sort_keys=True)

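# Entry point: compare the live per-year counts against the cached
# film_counts.json and only re-crawl ranges whose counts have changed.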
def update_index():
    film_counts_json = get_path('film_counts.json')
    if os.path.exists(film_counts_json):
        with open(film_counts_json) as fd:
            film_counts = json.load(fd)
    else:
        film_counts = {}

    for year in range(1890, datetime.now().year + 1):
        film_count = get_film_count(year)
        key = '%s' % year
        if film_count != film_counts.get(key):
            print_info(key, film_count, film_counts)
            film_counts[key] = film_count
            update_year(year, film_counts)
            save_film_counts(film_counts)

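# Union the ids from all scraped ids/*.json files and subtract the ids the
# caller already knows about.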
def get_unknown_ids(known_ids):
    ids = []
    for path in glob(get_path('ids/*.json')):
        with open(path) as fd:
            ids += json.load(fd).keys()
    return frozenset(ids) - known_ids

if __name__ == '__main__':
    update_index()

@@ -19,6 +19,8 @@ from oxdjango.fields import DictField
 from lookup.models import get_movie_id
 from poster.models import getPosters
 
+from .imdbids import get_unknown_ids, update_index
+
 def normalize_value(value):
     if isinstance(value, bytes):
         value = value.decode('utf-8')

@@ -260,35 +262,28 @@ def get_new_ids(timeout=-1):
         with open(new_ids_cache) as fd:
             new_ids = set(json.load(fd))
     else:
-        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout).decode('utf-8')
-        sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
-        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout).decode('utf-8')
-        urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
-        ids = set()
-        for url in sorted(urls, reverse=False):
-            s = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
-            ids |= set(re.compile('<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s))
-            #print url, len(ids)
+        update_index()
         known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
-        new_ids = frozenset(ids) - known_ids
-    print('adding %s new items' % len(new_ids))
-    added = 0
-    done = set()
-    for i in sorted(new_ids):
-        print(i)
-        m, created = Imdb.objects.get_or_create(imdb=i)
-        try:
-            m.update()
-        except:
-            with open('/tmp/missing.json', 'w') as fd:
-                json.dump(list(new_ids-done), fd)
-            raise
-        print(m)
-        if created:
-            added += 1
-        done.add(i)
-    if added:
-        print(added)
+        new_ids = get_unknown_ids(known_ids)
+    if new_ids:
+        print('adding %s new items' % len(new_ids))
+        added = 0
+        done = set()
+        for i in sorted(new_ids):
+            print(i)
+            m, created = Imdb.objects.get_or_create(imdb=i)
+            try:
+                m.update()
+            except:
+                with open('/tmp/missing.json', 'w') as fd:
+                    json.dump(list(new_ids-done), fd)
+                raise
+            print(m)
+            if created:
+                added += 1
+            done.add(i)
+        if added:
+            print(added)
     if os.path.exists(new_ids_cache):
         os.unlink(new_ids_cache)
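
A minimal sketch of driving the new crawler outside the request cycle; the
settings module name and import path are assumptions, not part of this commit
(imdbids reads settings.MEDIA_ROOT at import time, so Django must be
configured first):

    # hypothetical standalone driver for the crawler added above
    import os
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings')  # assumed settings module
    import django
    django.setup()

    from oxdata.movie.imdbids import update_index  # assumed import path

    # walks 1890..current year, refreshing film_counts.json and ids/<range>.json
    update_index()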