# oxdata/oxdata/movie/imdbids.py

from datetime import datetime
from glob import glob
from optparse import OptionParser
import calendar
import codecs
import json
import os
import re
import sys

from django.conf import settings
import lxml.html
import ox
import ox.web.imdb


# Largest number of titles a date range may report before update_year /
# update_month split it into smaller ranges. Presumably 200 result pages of
# 50 titles each (the search URL requests count=50) -- TODO confirm against
# IMDb's paging limit.
MAX_PER_RANGE = 200 * 50

# Number of seconds in one day.
DAY = 24 * 60 * 60
# Passed as ``timeout`` to ox.web.imdb.read_url; presumably the maximum age
# (90 days) of a cached response -- confirm against the ox cache semantics.
TIMEOUT = 90 * DAY
# Directory below MEDIA_ROOT that get_path() resolves file names against.
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')

def get_range(from_, to):
    """Return the IMDb title-search URL for a release-date range.

    ``from_`` and ``to`` are inserted verbatim as the two ends of the
    ``release_date`` query parameter. The URL includes adult titles, sorts
    by ascending release date and asks for 50 results per page.
    """
    pieces = [
        'http://www.imdb.com',
        '/search/title?adult=include&release_date=',
        str(from_),
        ',',
        str(to),
        '&sort=release_date,asc&count=50',
    ]
    return ''.join(pieces)

def get_year(year):
    """Return the search URL covering January 1st to December 31st of ``year``."""
    first_day = '%s-01-01' % year
    last_day = '%s-12-31' % year
    return get_range(first_day, last_day)

def get_month(year, month):
    """Return the search URL covering the whole of one calendar month.

    The last day of the month is taken from ``calendar.monthrange`` so leap
    years are handled.
    """
    _, last_day = calendar.monthrange(year, month)
    prefix = '%s-%02d' % (year, month)
    start = '%s-01' % prefix
    end = '%s-%02d' % (prefix, last_day)
    return get_range(start, end)

def get_day(year, month, day):
    """Return the search URL for a single day (range start and end are equal)."""
    date = '-'.join(['%s' % year, '%02d' % month, '%02d' % day])
    return get_range(date, date)

def get_film_count(year, month=None, day=None):
    """Return the number of titles IMDb reports for a year, month or day.

    The most specific range is used: a day if ``day`` is given, otherwise a
    month if ``month`` is given, otherwise the whole year. The count is read
    from the "... of N titles" text on the first result page. When that text
    is not found, the URL is removed via ``ox.web.imdb.delete_url`` and 0 is
    returned.

    NOTE(review): the pattern only matches when the last listed item is
    number 50, so a range with fewer results presumably ends up in the
    "no movies" branch -- confirm against the actual page markup.
    """
    if day is not None:
        url = get_day(year, month, day)
    elif month is not None:
        url = get_month(year, month)
    else:
        url = get_year(year)

    html = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    found = re.search(
        '<span class="lister-current-last-item">50</span>.*?of (.*?) titles',
        html,
        re.DOTALL,
    )
    if found is None:
        print('no movies', url)
        ox.web.imdb.delete_url(url)
        return 0
    # Totals are written with thousands separators, e.g. "12,345".
    return int(found.group(1).replace(',', ''))

def get_path(name):
    """Return ``name`` resolved below DATA_ROOT.

    ``ox.makedirs`` is called on the parent directory of the result before
    it is returned.
    """
    full_path = os.path.join(DATA_ROOT, name)
    parent = os.path.dirname(full_path)
    ox.makedirs(parent)
    return full_path

def print_info(key, film_count, film_counts):
    """Print the new film count for ``key`` and how many films were added.

    The "(added N)" suffix is left out when N equals ``film_count``, i.e.
    when the previously stored count for ``key`` was missing or zero.
    """
    previous = film_counts.get(key, 0)
    added = film_count - previous
    extra = '(added %s)' % added if added != film_count else ''
    print('update', key, 'now has', film_count, 'films', extra)

def update_year(year, film_counts):
    """Refresh the stored ids for ``year``.

    ``film_counts`` must already hold the current count for the year. If
    that count fits into one range, the ids are fetched for the whole year
    and the counts are saved. Otherwise every month is checked, and each
    month whose count differs from the stored one is recorded and handed to
    ``update_month``.
    """
    if film_counts['%s' % year] <= MAX_PER_RANGE:
        update_ids(year)
        save_film_counts(film_counts)
        return

    for month in range(1, 13):
        month_key = '%04d-%02d' % (year, month)
        count = get_film_count(year, month)
        if count == film_counts.get(month_key):
            # Nothing changed for this month.
            continue
        print_info(month_key, count, film_counts)
        film_counts[month_key] = count
        update_month(year, month, film_counts)

def update_month(year, month, film_counts):
    """Refresh the stored ids for one month.

    ``film_counts`` must already hold the current count for the month. If
    that count fits into one range, the ids are fetched for the whole month
    and the counts are saved. Otherwise each day is checked, and days whose
    count changed are recorded and fetched individually. A day that still
    exceeds MAX_PER_RANGE is only reported, not fetched. A warning is printed
    if the per-day counts do not sum to the month's count.
    """
    month_total = film_counts['%04d-%02d' % (year, month)]
    if month_total <= MAX_PER_RANGE:
        update_ids(year, month)
        save_film_counts(film_counts)
        return

    days_total = 0
    last_day = calendar.monthrange(year, month)[1]
    for day in range(1, last_day + 1):
        day_key = '%04d-%02d-%02d' % (year, month, day)
        count = get_film_count(year, month, day)
        days_total += count
        if count == film_counts.get(day_key):
            # Nothing changed for this day.
            continue
        print_info(day_key, count, film_counts)
        film_counts[day_key] = count
        if count > MAX_PER_RANGE:
            # A single day cannot be split any further.
            print(day_key, '!!!to many per day')
            continue
        update_ids(year, month, day)
        save_film_counts(film_counts)

    if days_total != month_total:
        print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))


# IMDb title ids have at least seven digits; newer titles use eight or more,
# so the whole run of digits must be captured instead of exactly seven.
_TITLE_ID_RE = re.compile(r'title/tt(\d{7,})')
# Link text of the pager's "Next" link as it appears in the page source.
_NEXT_PAGE_RE = re.compile('Next &#187;</a>')
# Parenthesised groups without any digit, e.g. the "(I)" disambiguation marker.
_NON_NUMERIC_PARENS_RE = re.compile(r'\([^\d]+\)')


def _imdb_id_from_href(href):
    """Return the digits of the IMDb title id in ``href``.

    Raises IndexError if ``href`` does not contain a title id.
    """
    return _TITLE_ID_RE.findall(href)[0]


def _parse_list_year(text, default):
    """Return the (first) year from a result list's year label.

    Handles labels such as "(2019)", "(I) (2019)", "(2010–2014)" and
    "(2019 TV Movie)". ``default`` is returned when the label holds no year.
    Raises ValueError if the remaining text is not a number.
    """
    text = _NON_NUMERIC_PARENS_RE.sub('', text)
    text = text.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
    return int(text) if text else default


def update_ids(year, month=None, day=None):
    """Fetch all titles of a date range and store them as JSON.

    The most specific range is used: a day if ``day`` is given, otherwise a
    month if ``month`` is given, otherwise the whole year. Result pages are
    followed for as long as a "Next" link is present. The collected mapping
    ``{imdb_id: {'title': ..., 'year': ...}}`` is written to
    ``ids/<range>.json`` (resolved by ``get_path``). If an id shows up more
    than once, its first occurrence is kept. Titles without a year label get
    ``year`` as their year.

    A page without an ``article`` element ends the crawl early; whatever was
    collected up to that point is still written.
    """
    films = {}
    if day is not None:
        url = get_day(year, month, day)
        key = '%04d-%02d-%02d' % (year, month, day)
    elif month is not None:
        url = get_month(year, month)
        key = '%04d-%02d' % (year, month)
    else:
        url = get_year(year)
        key = '%04d' % year

    page = 1
    page_url = url
    while page_url:
        data = ox.web.imdb.read_url(page_url, unicode=True, timeout=TIMEOUT)
        has_next = bool(_NEXT_PAGE_RE.search(data))

        doc = lxml.html.fromstring(data)
        articles = doc.find_class('article')
        if not articles:
            print('no article on', page_url)
            break

        for header in articles[0].find_class('lister-item-header'):
            link = header.xpath('.//a')[0]
            imdb_id = _imdb_id_from_href(link.attrib['href'])
            title = link.text_content().strip()
            year_text = ''
            try:
                year_text = header.find_class('lister-item-year')[0].text_content()
                film_year = _parse_list_year(year_text, year)
            except Exception:
                # Show which page and label could not be parsed, then fail.
                print(page_url)
                print(year_text)
                raise
            if imdb_id not in films:
                films[imdb_id] = {
                    'title': title,
                    'year': film_year
                }

        page += 1
        page_url = '%s&page=%s' % (url, page) if has_next else None

    path = get_path('ids/%s.json' % key)
    with open(path, 'w') as fd:
        json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)

def save_film_counts(film_counts):
    """Write ``film_counts`` to ``film_counts.json`` (resolved by ``get_path``).

    The file is overwritten and written with sorted keys and a four-space
    indent.
    """
    target = get_path('film_counts.json')
    with open(target, 'w') as out:
        json.dump(film_counts, out, indent=4, sort_keys=True)

def update_index():
    """Bring the stored film counts and id lists up to date.

    Loads ``film_counts.json`` if it exists, then checks every year from 1894
    up to and including the current one. A year whose reported count differs
    from the stored count is recorded and passed to ``update_year``. The
    counts are saved once more at the end.
    """
    counts_path = get_path('film_counts.json')
    film_counts = {}
    if os.path.exists(counts_path):
        with open(counts_path) as fd:
            film_counts = json.load(fd)

    current_year = datetime.now().year
    for year in range(1894, current_year + 1):
        count = get_film_count(year)
        year_key = '%s' % year
        if count == film_counts.get(year_key):
            continue
        print_info(year_key, count, film_counts)
        film_counts[year_key] = count
        update_year(year, film_counts)

    save_film_counts(film_counts)

def get_unknown_ids(known_ids):
    """Return the stored ids that are not part of ``known_ids``.

    Collects the keys of every ``ids/*.json`` file and returns them, minus
    ``known_ids``, as a frozenset. ``known_ids`` must be a set or frozenset,
    since it is used as the right-hand side of a set difference.
    """
    stored = set()
    for json_path in glob(get_path('ids/*.json')):
        with open(json_path) as fd:
            stored.update(json.load(fd).keys())
    return frozenset(stored) - known_ids

# When run as a script, refresh the film counts and id lists for all years.
if __name__ == '__main__':
    update_index()