oxdata/oxdata/movie/imdbids.py

282 lines
9.6 KiB
Python
Raw Normal View History

2018-05-01 10:08:25 +00:00
from datetime import datetime
from glob import glob
from optparse import OptionParser
import calendar
import codecs
import json
import os
import re
import sys
2018-06-03 12:04:02 +00:00
import time
2018-05-01 10:08:25 +00:00
from django.conf import settings
import lxml.html
import ox
import ox.web.imdb
# IMDb search results paginate 50 titles per page; 200 * 50 is the most
# titles one date-range query can ever enumerate.
MAX_PER_RANGE = 200 * 50
# One day, in seconds.
DAY = 24 * 60 * 60
# Cache lifetime for fetched IMDb pages, in seconds (90 days).
TIMEOUT = 90 * DAY
# Directory where the ids/*.json indexes and film_counts.json live.
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
# Flip to True for verbose progress output from update_ids().
DEBUG = False


def debug(*args, **kwargs):
    """Forward all arguments to print(), but only when DEBUG is enabled."""
    if not DEBUG:
        return
    print(*args, **kwargs)
def read_url(url, timeout):
    """Fetch *url* through ox's caching reader, retrying while IMDb serves a 500 page.

    On an error page the cached copy is bypassed (timeout=0) after a short
    sleep, so the next attempt hits the network again.
    """
    while True:
        data = ox.web.imdb.read_url(url, unicode=True, timeout=timeout)
        if '>500 Error - IMDb<' not in data:
            return data
        print('Error', url)
        time.sleep(10)
        # Force a refetch instead of reusing the cached error page.
        timeout = 0
def get_range(from_, to):
    """Return the IMDb advanced-search URL for titles released between *from_* and *to* (inclusive, YYYY-MM-DD)."""
    query = 'adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (from_, to)
    return 'https://www.imdb.com/search/title?' + query


def get_year(year):
    """Search URL covering one whole calendar year."""
    return get_range('%s-01-01' % year, '%s-12-31' % year)


def get_month(year, month):
    """Search URL covering one calendar month (handles month lengths / leap years)."""
    last_day = calendar.monthrange(year, month)[1]
    prefix = '%s-%02d' % (year, month)
    return get_range('%s-01' % prefix, '%s-%02d' % (prefix, last_day))


def get_day(year, month, day):
    """Search URL covering a single day."""
    date = '%s-%02d-%02d' % (year, month, day)
    return get_range(date, date)
def get_film_count(year, month=None, day=None):
    """Return the number of titles IMDb lists for the given date range.

    The narrowest range given wins: day > month > year.  Returns 0 (and
    uncaches the page so the next run refetches it) when no total can be
    parsed out of the result page.
    """
    if day is not None:
        url = get_day(year, month, day)
    elif month is not None:
        url = get_month(year, month)
    else:
        url = get_year(year)
    data = read_url(url, timeout=TIMEOUT)
    # Multi-page results: "<span>1-50 of 1,234 titles.</span>"
    # (raw strings: '\d' in a plain literal is an invalid escape sequence)
    total = re.compile(r'<span>1-50 of ([\d,]+?) titles.</span>').findall(data)
    if not total:
        # Single-page results: "<span>123 titles.</span>"
        total = re.compile(r'<span>([\d,]+) titles.</span>', re.DOTALL).findall(data)
    if total:
        return int(total[0].replace(',', ''))
    print('no movies', url)
    # Drop the cached page so a later run fetches a fresh copy.
    ox.web.imdb.delete_url(url)
    return 0
def get_path(name):
    """Return the absolute path for *name* under DATA_ROOT, creating parent directories."""
    full_path = os.path.join(DATA_ROOT, name)
    ox.makedirs(os.path.dirname(full_path))
    return full_path
def print_info(key, film_count, film_counts):
    """Log that *key* now has *film_count* films, noting the delta vs the cached count."""
    previous = film_counts.get(key, 0)
    delta = film_count - previous
    # Only mention the delta when there was a previous, different count.
    extra = '(added %s)' % delta if delta != film_count else ''
    print('update', key, 'now has', film_count, 'films', extra)
def update_year(year, film_counts):
    """Refresh the id index for *year*, updating *film_counts* in place.

    When the year holds more titles than a single search can page through
    (MAX_PER_RANGE), recurse into per-month updates; otherwise fetch the
    whole year in one listing.
    """
    key = '%s' % year
    if film_counts[key] > MAX_PER_RANGE:
        for month in range(1, 13):
            key = '%04d-%02d' % (year, month)
            film_count = get_film_count(year, month)
            # Only refetch months whose count changed since the last run.
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                update_month(year, month, film_counts, expected=film_count)
    else:
        update_ids(year)
        save_film_counts(film_counts)
def update_month(year, month, film_counts, expected=None):
    """Refresh the id index for one month, splitting into days when needed.

    ``expected`` is the film count the caller already fetched for this month
    (avoids refetching it when the month fits into a single range); without
    it the cached count is used.  *film_counts* is updated in place.
    """
    key = '%04d-%02d' % (year, month)
    if film_counts[key] > MAX_PER_RANGE:
        month_total = film_counts[key]
        days_total = 0
        days = calendar.monthrange(year, month)[1]
        for day in range(1, days + 1):
            key = '%04d-%02d-%02d' % (year, month, day)
            film_count = get_film_count(year, month, day)
            days_total += film_count
            # Only refetch days whose count changed since the last run.
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                if film_count > MAX_PER_RANGE and film_count < 2 * MAX_PER_RANGE:
                    # Too many titles for one listing, but two alpha-sorted
                    # passes (ascending + descending) can still cover it.
                    update_ids(year, month, day, sort='alpha', expected=film_count)
                    save_film_counts(film_counts)
                elif film_count > MAX_PER_RANGE:
                    print(key, '!!!to many per day', film_count, key)
                else:
                    update_ids(year, month, day, expected=film_count)
                    save_film_counts(film_counts)
        if days_total != month_total:
            print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
    else:
        # Whole month fits in one range; fall back to the cached count when
        # the caller did not pass one (original referenced an undefined
        # `film_count` here).
        if expected is None:
            expected = film_counts[key]
        update_ids(year, month, expected=expected)
        save_film_counts(film_counts)
def parse_cast(string):
    """Parse an IMDb lister cast paragraph into a mapping of role -> names.

    The input looks like ``"Director:\\nName | Stars:\\nA, \\nB"``; the
    singular labels are normalized to plural keys (``directors``, ``stars``)
    and each name list is split on ``", \\n"``.
    """
    results = {}
    for part in string.split('|'):
        tokens = [token.strip() for token in part.split(':\n')]
        # Pair consecutive tokens as label/value: [label, value, label, value, ...]
        pairs = iter(tokens)
        for label, value in zip(pairs, pairs):
            key = label.lower()
            if key == 'director':
                key = 'directors'
            elif key == 'star':
                key = 'stars'
            results[key] = value.split(', \n')
    return results
def update_ids(year, month=None, day=None, sort=None, expected=None):
    """Scrape all imdb ids for the given date range and write them to ids/<key>.json.

    The narrowest range given wins: day > month > year.  With
    ``sort='alpha'`` the listing is walked twice (alphabetically ascending
    and descending) so up to 2 * MAX_PER_RANGE titles can be collected.
    ``expected`` is the film count for the range; when missing it is fetched.
    Returns the number of films collected.

    Fix vs original: the keyword parameter was misspelled ``expexted`` while
    every caller — and the body itself — used ``expected``.
    """
    films = {}
    if day is not None:
        url = get_day(year, month, day)
        key = '%04d-%02d-%02d' % (year, month, day)
    elif month is not None:
        url = get_month(year, month)
        key = '%04d-%02d' % (year, month)
    else:
        url = get_year(year)
        key = '%04d' % year
    if sort == 'alpha':
        # Two passes from both ends of the alphabet to beat the pagination cap.
        urls = [
            url.replace('sort=release_date,asc', 'sort=alpha,asc'),
            url.replace('sort=release_date,asc', 'sort=alpha,desc'),
        ]
    else:
        urls = [url]
    if not expected:
        expected = get_film_count(year, month, day)
    for url in urls:
        data = read_url(url, timeout=TIMEOUT)
        n = True
        step = 50
        start = 1
        while n:
            # IMDb paginates with &start=51, 101, ...; a matching
            # 'start=N&ref_=adv_nxt"' link means another page exists.
            start += step
            next_link = 'start=%s&ref_=adv_nxt"' % (start)
            has_next = re.compile(next_link).findall(data)
            if has_next:
                n = '%s&start=%s' % (url, start)
            else:
                n = False
            doc = lxml.html.fromstring(data)
            article = doc.find_class('article')
            if article:
                article = article[0]
            else:
                # Broken page: drop it from the cache and give up on this url.
                print('no article on', '%s&start=%s' % (url, start - 2*step))
                ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
                break
            for content in article.find_class('lister-item-content'):
                header = content.find_class('lister-item-header')[0]
                a = header.xpath('.//a')
                # TV episodes list the show link first, then the episode link.
                if 'Episode:' in [
                    e.text_content()
                    for e in header.xpath(".//small")
                ] and len(a) > 1:
                    title = a[0].text_content().strip() + ': '
                    a = a[1]
                else:
                    title = ''
                    a = a[0]
                imdb_id = re.compile(r'title/tt(\d+)').findall(a.attrib['href'])[0]
                title += a.text_content().strip()
                try:
                    # Year is e.g. "(2010)", "(I) (2010)", "(2010–2012)"; strip
                    # non-numeric parentheticals, then take the first number.
                    y = header.find_class('lister-item-year')[0].text_content()
                    y = re.sub(r'\([^\d]+\)', '', y)
                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
                    if not y:
                        y = year
                    else:
                        y = int(y)
                except Exception:
                    print(n)
                    print(header.find_class('lister-item-year')[0].text_content())
                    raise
                text = content.xpath(".//p[contains(@class, 'text-muted')]")
                plot = text[1].text_content().strip()
                plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
                if plot == 'Add a Plot':
                    plot = ''
                genre = content.find_class('genre')
                if genre:
                    genre = genre[0].text_content().strip().split(', ')
                else:
                    genre = []
                # The cast paragraph is the one <p> with an empty class attribute.
                cast = content.xpath(".//p[contains(@class, '')]")
                cast = [t for t in cast if t.attrib.get('class') == '']
                if cast:
                    cast = parse_cast(cast[0].text_content())
                if imdb_id not in films:
                    films[imdb_id] = {
                        'title': title,
                        'year': y
                    }
                if plot:
                    films[imdb_id]['plot'] = plot
                if genre:
                    films[imdb_id]['genre'] = genre
                if cast:
                    films[imdb_id].update(cast)
            if expected and len(films) == expected and sort == 'alpha':
                # Both alpha passes together already cover the range; stop early.
                n = False
            debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
            if n:
                data = read_url(n, timeout=TIMEOUT)
    path = get_path('ids/%s.json' % key)
    with open(path, 'w') as fd:
        json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
    r = len(films)
    if r != expected:
        print('%s: got %s, expected %s' % (key, r, expected))
    return r
def save_film_counts(film_counts):
    """Persist the per-range film-count cache as pretty-printed, sorted JSON."""
    path = get_path('film_counts.json')
    with open(path, 'w') as fd:
        json.dump(film_counts, fd, indent=4, sort_keys=True)
def update_index(from_year=None):
    """Walk every year from *from_year* (default 1874) through the current year
    and refresh the id index for each year whose film count changed."""
    if from_year is None:
        from_year = 1874
    counts_path = get_path('film_counts.json')
    film_counts = {}
    if os.path.exists(counts_path):
        with open(counts_path) as fd:
            film_counts = json.load(fd)
    last_year = datetime.now().year
    for year in range(from_year, last_year + 1):
        film_count = get_film_count(year)
        key = '%s' % year
        # Skip years whose count matches the cache from the previous run.
        if film_count != film_counts.get(key):
            print_info(key, film_count, film_counts)
            film_counts[key] = film_count
            update_year(year, film_counts)
            save_film_counts(film_counts)
def get_unknown_ids(known_ids):
    """Return the imdb ids found in the scraped ids/*.json files that are not in *known_ids*."""
    collected = []
    for path in glob(get_path('ids/*.json')):
        with open(path) as fd:
            collected.extend(json.load(fd).keys())
    return frozenset(collected) - known_ids
if __name__ == '__main__':
    # Script entry point: rebuild the full per-year/month/day imdb id index.
    update_index()