discover imdb ids by year
This commit is contained in:
parent 1ebc53dd46
commit e8225eec58

2 changed files with 214 additions and 28 deletions

oxdata/movie/imdbids.py (new file, 191 lines)

@@ -0,0 +1,191 @@
from datetime import datetime
from glob import glob
from optparse import OptionParser
import calendar
import codecs
import json
import os
import re
import sys

from django.conf import settings
import lxml.html
import ox
import ox.web.imdb

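# IMDb's search results appear to cap out at 200 pages of 50 titles,
# so any date range matching more titles than this has to be subdivided.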
MAX_PER_RANGE = 200 * 50

DAY = 24 * 60 * 60
TIMEOUT = 90 * DAY
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')

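# Build the IMDb advanced-search URL for titles released between from_ and to
# (inclusive), sorted by release date, 50 results per page; e.g.
# get_range('1994-01-01', '1994-12-31') covers all of 1994.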
def get_range(from_, to):
    base_url = 'http://www.imdb.com'
    url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
    return url

def get_year(year):
    return get_range('%s-01-01' % year, '%s-12-31' % year)

def get_month(year, month):
    days = calendar.monthrange(year, month)[1]
    month = '%s-%02d' % (year, month)
    return get_range('%s-01' % month, '%s-%02d' % (month, days))

def get_day(year, month, day):
    day = '%s-%02d-%02d' % (year, month, day)
    return get_range(day, day)

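# Scrape the total number of titles for a year, month or day range from the
# pager's "... of N titles" line; a page without that counter is treated as
# empty and its cached copy is deleted, presumably so it is re-fetched on the
# next run.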
def get_film_count(year, month=None, day=None):
    if day is not None:
        url = get_day(year, month, day)
    elif month is not None:
        url = get_month(year, month)
    else:
        url = get_year(year)
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
    if total:
        return int(total[0].replace(',', ''))
    print('no movies', url)
    ox.web.imdb.delete_url(url)
    return 0

def get_path(name):
    path = os.path.join(DATA_ROOT, name)
    ox.makedirs(os.path.dirname(path))
    return path

def print_info(key, film_count, film_counts):
    added = film_count - film_counts.get(key, 0)
    if added != film_count:
        extra = '(added %s)' % added
    else:
        extra = ''
    print('update', key, 'now has', film_count, 'films', extra)

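# If a whole year has more titles than one search can page through, recurse
# into months (update_month recurses into days the same way); otherwise the
# year's ids can be fetched directly.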
def update_year(year, film_counts):
    key = '%s' % year
    if film_counts[key] > MAX_PER_RANGE:
        for month in range(1, 13):
            key = '%04d-%02d' % (year, month)
            film_count = get_film_count(year, month)
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                update_month(year, month, film_counts)
    else:
        update_ids(year)
        save_film_counts(film_counts)

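# Same subdivision one level down: a month over the cap is crawled day by day,
# and the per-day counts are sanity-checked against the month total.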
def update_month(year, month, film_counts):
    key = '%04d-%02d' % (year, month)
    if film_counts[key] > MAX_PER_RANGE:
        month_total = film_counts[key]
        days_total = 0
        days = calendar.monthrange(year, month)[1]
        for day in range(1, days + 1):
            key = '%04d-%02d-%02d' % (year, month, day)
            film_count = get_film_count(year, month, day)
            days_total += film_count
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                if film_count > MAX_PER_RANGE:
                    print(key, '!!! too many titles per day')
                else:
                    update_ids(year, month, day)
                save_film_counts(film_counts)
        if days_total != month_total:
            print("!! month and days don't add up: %s month vs %s days total" % (month_total, days_total))
    else:
        update_ids(year, month)
        save_film_counts(film_counts)


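# Collect {imdb id: {title, year}} from every result page of the given range,
# following the "Next »" pager link until it disappears, then write the
# collected set to ids/<key>.json.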
def update_ids(year, month=None, day=None):
    films = {}
    if day is not None:
        url = get_day(year, month, day)
        key = '%04d-%02d-%02d' % (year, month, day)
    elif month is not None:
        url = get_month(year, month)
        key = '%04d-%02d' % (year, month)
    else:
        url = get_year(year)
        key = '%04d' % year
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    n = True
    page = 2
    while n:
        n = re.compile('Next »</a>', re.DOTALL).findall(data)
        if n:
            n = '%s&page=%s' % (url, page)
            page += 1
        doc = lxml.html.fromstring(data)
        article = doc.find_class('article')
        if article:
            article = article[0]
        else:
            print('no article on', '%s&page=%s' % (url, page-2))
            break
        for header in article.find_class('lister-item-header'):
            a = header.xpath('.//a')[0]
            id = re.compile(r'title/tt(\d{7})').findall(a.attrib['href'])[0]
            title = a.text_content().strip()
            try:
                y = header.find_class('lister-item-year')[0].text_content()
                y = re.sub(r'\([^\d]+\)', '', y)
                y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
                if not y:
                    y = year
                else:
                    y = int(y)
            except:
                print(n)
                print(header.find_class('lister-item-year')[0].text_content())
                raise
            if id not in films:
                films[id] = {
                    'title': title,
                    'year': y
                }
        #print(key, len(films), 'films')
        if n:
            #print(n)
            data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
    path = get_path('ids/%s.json' % key)
    with open(path, 'w') as fd:
        json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)

def save_film_counts(film_counts):
    with open(get_path('film_counts.json'), 'w') as fd:
        json.dump(film_counts, fd, indent=4, sort_keys=True)

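# Entry point: compare the live per-year counts against the cached
# film_counts.json and only re-crawl ranges whose counts have changed.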
def update_index():
    film_counts_json = get_path('film_counts.json')
    if os.path.exists(film_counts_json):
        with open(film_counts_json) as fd:
            film_counts = json.load(fd)
    else:
        film_counts = {}

    for year in range(1890, datetime.now().year + 1):
        film_count = get_film_count(year)
        key = '%s' % year
        if film_count != film_counts.get(key):
            print_info(key, film_count, film_counts)
            film_counts[key] = film_count
            update_year(year, film_counts)
            save_film_counts(film_counts)

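# Union the ids from all scraped ids/*.json files and subtract the ids the
# caller already knows about.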
def get_unknown_ids(known_ids):
    ids = []
    for path in glob(get_path('ids/*.json')):
        with open(path) as fd:
            ids += json.load(fd).keys()
    return frozenset(ids) - known_ids

if __name__ == '__main__':
    update_index()

@@ -19,6 +19,8 @@ from oxdjango.fields import DictField
 from lookup.models import get_movie_id
 from poster.models import getPosters
 
+from .imdbids import get_unknown_ids, update_index
+
 def normalize_value(value):
     if isinstance(value, bytes):
         value = value.decode('utf-8')

@@ -260,35 +262,28 @@ def get_new_ids(timeout=-1):
         with open(new_ids_cache) as fd:
             new_ids = set(json.load(fd))
     else:
-        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout).decode('utf-8')
-        sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
-        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout).decode('utf-8')
-        urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
-        ids = set()
-        for url in sorted(urls, reverse=False):
-            s = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
-            ids |= set(re.compile('<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s))
-            #print url, len(ids)
+        update_index()
         known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
-        new_ids = frozenset(ids) - known_ids
-    print('adding %s new items' % len(new_ids))
-    added = 0
-    done = set()
-    for i in sorted(new_ids):
-        print(i)
-        m, created = Imdb.objects.get_or_create(imdb=i)
-        try:
-            m.update()
-        except:
-            with open('/tmp/missing.json', 'w') as fd:
-                json.dump(list(new_ids-done), fd)
-            raise
-        print(m)
-        if created:
-            added += 1
-        done.add(i)
-    if added:
-        print(added)
+        new_ids = get_unknown_ids(known_ids)
+    if new_ids:
+        print('adding %s new items' % len(new_ids))
+        added = 0
+        done = set()
+        for i in sorted(new_ids):
+            print(i)
+            m, created = Imdb.objects.get_or_create(imdb=i)
+            try:
+                m.update()
+            except:
+                with open('/tmp/missing.json', 'w') as fd:
+                    json.dump(list(new_ids-done), fd)
+                raise
+            print(m)
+            if created:
+                added += 1
+            done.add(i)
+        if added:
+            print(added)
     if os.path.exists(new_ids_cache):
         os.unlink(new_ids_cache)
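
A minimal sketch of driving the new crawler outside the request cycle; the
settings module name and import path are assumptions, not part of this commit
(imdbids reads settings.MEDIA_ROOT at import time, so Django must be
configured first):

    # hypothetical standalone driver for the crawler added above
    import os
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings')  # assumed settings module
    import django
    django.setup()

    from oxdata.movie.imdbids import update_index  # assumed import path

    # walks 1890..current year, refreshing film_counts.json and ids/<range>.json
    update_index()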