# oxdata/oxdata/movie/imdbids.py

from datetime import datetime
from glob import glob
from optparse import OptionParser
import calendar
import codecs
import json
import os
import re
import sys
from django.conf import settings
import lxml.html
import ox
import ox.web.imdb
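
# a single search range is only paged through this many results
# (200 pages of 50 titles); larger date ranges get subdivided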
MAX_PER_RANGE = 200 * 50
DAY = 24 * 60 * 60
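# cache lifetime passed to ox.web.imdb.read_url (pages older than this are re-fetched)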
TIMEOUT = 90 * DAY
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
def get_range(from_, to):
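    # advanced title search URL for everything released between from_ and to
    # (YYYY-MM-DD), sorted by release date, 50 results per page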
    base_url = 'http://www.imdb.com'
    url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
    return url

def get_year(year):
    return get_range('%s-01-01' % year, '%s-12-31' % year)

def get_month(year, month):
    days = calendar.monthrange(year, month)[1]
    month = '%s-%02d' % (year, month)
    return get_range('%s-01' % month, '%s-%02d' % (month, days))

def get_day(year, month, day):
    day = '%s-%02d-%02d' % (year, month, day)
    return get_range(day, day)

def get_film_count(year, month=None, day=None):
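    # ask IMDb how many titles it lists for the given year (optionally month/day);
    # returns 0 and drops the cached page if the count cannot be parsed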
    if day is not None:
        url = get_day(year, month, day)
    elif month is not None:
        url = get_month(year, month)
    else:
        url = get_year(year)
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
    if not total:
        total = re.compile(r' ([\d+,]+) titles\n', re.DOTALL).findall(data)
    if total:
        return int(total[0].replace(',', ''))
    print('no movies', url)
    ox.web.imdb.delete_url(url)
    return 0

def get_path(name):
    path = os.path.join(DATA_ROOT, name)
    ox.makedirs(os.path.dirname(path))
    return path

def print_info(key, film_count, film_counts):
    added = film_count - film_counts.get(key, 0)
    if added != film_count:
        extra = '(added %s)' % added
    else:
        extra = ''
    print('update', key, 'now has', film_count, 'films', extra)

def update_year(year, film_counts):
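    # refresh the id list for one year; if the year holds more titles than a
    # single search range can return, recurse into its months instead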
    key = '%s' % year
    if film_counts[key] > MAX_PER_RANGE:
        for month in range(1, 13):
            key = '%04d-%02d' % (year, month)
            film_count = get_film_count(year, month)
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                update_month(year, month, film_counts)
    else:
        r = update_ids(year)
        if r != film_counts[key]:
            print('%s: count %s, got ids %s' % (key, film_counts[key], r))
    save_film_counts(film_counts)

def update_month(year, month, film_counts):
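    # same idea one level down: split an oversized month into days; a day that
    # still exceeds one range (but not two) is fetched in both alphabetical
    # directions so every title is reached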
    key = '%04d-%02d' % (year, month)
    if film_counts[key] > MAX_PER_RANGE:
        month_total = film_counts[key]
        days_total = 0
        days = calendar.monthrange(year, month)[1]
        for day in range(1, days + 1):
            key = '%04d-%02d-%02d' % (year, month, day)
            film_count = get_film_count(year, month, day)
            days_total += film_count
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                if film_count > MAX_PER_RANGE and film_count < 2 * MAX_PER_RANGE:
                    r = update_ids(year, month, day, sort='alpha')
                    if r != film_counts[key]:
                        print('%s: count %s, got ids %s' % (key, film_counts[key], r))
                    save_film_counts(film_counts)
                elif film_count > MAX_PER_RANGE:
                    print(key, '!!! too many per day')
                else:
                    r = update_ids(year, month, day)
                    if r != film_counts[key]:
                        print('%s: count %s, got ids %s' % (key, film_counts[key], r))
                    save_film_counts(film_counts)
        if days_total != month_total:
            print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
    else:
        r = update_ids(year, month)
        if r != film_counts[key]:
            print('%s: count %s, got ids %s' % (key, film_counts[key], r))
        save_film_counts(film_counts)

def parse_cast(string):
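    # split the "Director: ... | Stars: ..." credit block of a search result
    # into lists keyed by role, e.g. {'directors': [...], 'stars': [...]}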
    results = {}
    for part in string.split('|'):
        cast = iter([t.strip() for t in part.split(':\n')])
        cast = dict(zip(cast, cast))
        for key in cast:
            rkey = key.lower()
            rkey = {
                'director': 'directors',
                'star': 'stars',
            }.get(rkey, rkey)
            results[rkey] = cast[key].split(', \n')
    return results

def update_ids(year, month=None, day=None, sort=None):
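    # scrape every result page for the given year/month/day range, collect
    # title, year, plot, genre and credits per IMDb id, write them to
    # ids/<key>.json and return how many films were found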
    films = {}
    if day is not None:
        url = get_day(year, month, day)
        key = '%04d-%02d-%02d' % (year, month, day)
    elif month is not None:
        url = get_month(year, month)
        key = '%04d-%02d' % (year, month)
    else:
        url = get_year(year)
        key = '%04d' % year
    if sort == 'alpha':
        urls = [
            url.replace('sort=release_date,asc', 'sort=alpha,asc'),
            url.replace('sort=release_date,asc', 'sort=alpha,desc'),
        ]
    else:
        urls = [url]
    for url in urls:
        data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
        n = True
        page = 2
        while n:
            n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
            if n:
                n = '%s&page=%s' % (url, page)
                page += 1
            doc = lxml.html.fromstring(data)
            article = doc.find_class('article')
            if article:
                article = article[0]
            else:
                print('no article on', '%s&page=%s' % (url, page-2))
                break
            for content in article.find_class('lister-item-content'):
                header = content.find_class('lister-item-header')[0]
                a = header.xpath('.//a')
                if 'Episode:' in [
                    e.text_content()
                    for e in header.xpath(".//small")
                ] and len(a) > 1:
                    title = a[0].text_content().strip() + ': '
                    a = a[1]
                else:
                    title = ''
                    a = a[0]
                id = re.compile(r'title/tt(\d{7})').findall(a.attrib['href'])[0]
                title += a.text_content().strip()
                try:
                    y = header.find_class('lister-item-year')[0].text_content()
                    y = re.sub(r'\([^\d]+\)', '', y)
                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
                    if not y:
                        y = year
                    else:
                        y = int(y)
                except:
                    print(n)
                    print(header.find_class('lister-item-year')[0].text_content())
                    raise
                text = content.xpath(".//p[contains(@class, 'text-muted')]")
                plot = text[1].text_content().strip()
                plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
                if plot == 'Add a Plot':
                    plot = ''
                genre = content.find_class('genre')
                if genre:
                    genre = genre[0].text_content().strip().split(', ')
                else:
                    genre = []
                cast = content.xpath(".//p[contains(@class, '')]")
                cast = [t for t in cast if t.attrib.get('class') == '']
                if cast:
                    cast = parse_cast(cast[0].text_content())
                if id not in films:
                    films[id] = {
                        'title': title,
                        'year': y
                    }
                    if plot:
                        films[id]['plot'] = plot
                    if genre:
                        films[id]['genre'] = genre
                    if cast:
                        films[id].update(cast)
            #print(key, len(films), 'films')
            if n:
                #print(n)
                data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
    path = get_path('ids/%s.json' % key)
    with open(path, 'w') as fd:
        json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
    return len(films)

def save_film_counts(film_counts):
    with open(get_path('film_counts.json'), 'w') as fd:
        json.dump(film_counts, fd, indent=4, sort_keys=True)

def update_index():
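    # walk every year from 1894 to the current one, refreshing per-range film
    # counts and id lists wherever the reported count has changed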
    film_counts_json = get_path('film_counts.json')
    if os.path.exists(film_counts_json):
        with open(film_counts_json) as fd:
            film_counts = json.load(fd)
    else:
        film_counts = {}
    for year in range(1894, datetime.now().year+1):
        film_count = get_film_count(year)
        key = '%s' % year
        if film_count != film_counts.get(key):
            print_info(key, film_count, film_counts)
            film_counts[key] = film_count
            update_year(year, film_counts)
            save_film_counts(film_counts)

def get_unknown_ids(known_ids):
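    # ids that appear in the scraped ids/*.json files but not in known_ids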
    ids = []
    for path in glob(get_path('ids/*.json')):
        with open(path) as fd:
            ids += json.load(fd).keys()
    return frozenset(ids) - known_ids

if __name__ == '__main__':
    update_index()