discover imdb ids by year
parent 1ebc53dd46, commit e8225eec58
2 changed files with 214 additions and 28 deletions
oxdata/movie/imdbids.py (new file, 191 lines)
@@ -0,0 +1,191 @@
from datetime import datetime
from glob import glob
from optparse import OptionParser
import calendar
import codecs
import json
import os
import re
import sys

from django.conf import settings
import lxml.html
import ox
import ox.web.imdb


# ranges with more titles than one search can page through (200 pages of
# 50 results) get split into smaller date ranges
MAX_PER_RANGE = 200 * 50

DAY = 24 * 60 * 60
# re-use cached IMDb pages for up to 90 days
TIMEOUT = 90 * DAY
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')


def get_range(from_, to):
    # IMDb advanced title search for a release date range, 50 results per page
    base_url = 'http://www.imdb.com'
    url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
    return url

def get_year(year):
    return get_range('%s-01-01' % year, '%s-12-31' % year)

def get_month(year, month):
    days = calendar.monthrange(year, month)[1]
    month = '%s-%02d' % (year, month)
    return get_range('%s-01' % month, '%s-%02d' % (month, days))

def get_day(year, month, day):
    day = '%s-%02d-%02d' % (year, month, day)
    return get_range(day, day)

def get_film_count(year, month=None, day=None):
    # number of titles IMDb lists for the given year, month or day
    if day is not None:
        url = get_day(year, month, day)
    elif month is not None:
        url = get_month(year, month)
    else:
        url = get_year(year)
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
    if total:
        return int(total[0].replace(',', ''))
    print('no movies', url)
    ox.web.imdb.delete_url(url)
    return 0

def get_path(name):
    path = os.path.join(DATA_ROOT, name)
    ox.makedirs(os.path.dirname(path))
    return path

def print_info(key, film_count, film_counts):
    added = film_count - film_counts.get(key, 0)
    if added != film_count:
        extra = '(added %s)' % added
    else:
        extra = ''
    print('update', key, 'now has', film_count, 'films', extra)

def update_year(year, film_counts):
    # if a year has more titles than one search can page through, drill down to months
    key = '%s' % year
    if film_counts[key] > MAX_PER_RANGE:
        for month in range(1, 13):
            key = '%04d-%02d' % (year, month)
            film_count = get_film_count(year, month)
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                update_month(year, month, film_counts)
    else:
        update_ids(year)
        save_film_counts(film_counts)

def update_month(year, month, film_counts):
    # if a month has more titles than one search can page through, drill down to days
    key = '%04d-%02d' % (year, month)
    if film_counts[key] > MAX_PER_RANGE:
        month_total = film_counts[key]
        days_total = 0
        days = calendar.monthrange(year, month)[1]
        for day in range(1, days + 1):
            key = '%04d-%02d-%02d' % (year, month, day)
            film_count = get_film_count(year, month, day)
            days_total += film_count
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
                if film_count > MAX_PER_RANGE:
                    print(key, '!!! too many per day')
                else:
                    update_ids(year, month, day)
                save_film_counts(film_counts)
        if days_total != month_total:
            print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
    else:
        update_ids(year, month)
        save_film_counts(film_counts)


def update_ids(year, month=None, day=None):
    # collect imdb id, title and year for every title in the given range
    # and write them to ids/<key>.json
    films = {}
    if day is not None:
        url = get_day(year, month, day)
        key = '%04d-%02d-%02d' % (year, month, day)
    elif month is not None:
        url = get_month(year, month)
        key = '%04d-%02d' % (year, month)
    else:
        url = get_year(year)
        key = '%04d' % year
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    n = True
    page = 2
    while n:
        # keep paginating as long as the result page links to a next page
        n = re.compile('Next »</a>', re.DOTALL).findall(data)
        if n:
            n = '%s&page=%s' % (url, page)
            page += 1
        doc = lxml.html.fromstring(data)
        article = doc.find_class('article')
        if article:
            article = article[0]
        else:
            print('no article on', '%s&page=%s' % (url, page-2))
            break
        for header in article.find_class('lister-item-header'):
            a = header.xpath('.//a')[0]
            id = re.compile(r'title/tt(\d{7})').findall(a.attrib['href'])[0]
            title = a.text_content().strip()
            try:
                y = header.find_class('lister-item-year')[0].text_content()
                y = re.sub(r'\([^\d]+\)', '', y)
                y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
                if not y:
                    y = year
                else:
                    y = int(y)
            except:
                print(n)
                print(header.find_class('lister-item-year')[0].text_content())
                raise
            if id not in films:
                films[id] = {
                    'title': title,
                    'year': y
                }
        #print(key, len(films), 'films')
        if n:
            #print(n)
            data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
    path = get_path('ids/%s.json' % key)
    with open(path, 'w') as fd:
        json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)

def save_film_counts(film_counts):
    with open(get_path('film_counts.json'), 'w') as fd:
        json.dump(film_counts, fd, indent=4, sort_keys=True)

def update_index():
    film_counts_json = get_path('film_counts.json')
    if os.path.exists(film_counts_json):
        with open(film_counts_json) as fd:
            film_counts = json.load(fd)
    else:
        film_counts = {}

    for year in range(1890, datetime.now().year+1):
        film_count = get_film_count(year)
        key = '%s' % year
        if film_count != film_counts.get(key):
            print_info(key, film_count, film_counts)
            film_counts[key] = film_count
            update_year(year, film_counts)
    save_film_counts(film_counts)

def get_unknown_ids(known_ids):
    # ids collected in ids/*.json that are not already in known_ids
    ids = []
    for path in glob(get_path('ids/*.json')):
        with open(path) as fd:
            ids += json.load(fd).keys()
    return frozenset(ids) - known_ids

if __name__ == '__main__':
    update_index()
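A minimal usage sketch for the new module (not part of the commit): it assumes Django settings with MEDIA_ROOT are configured, that the ox library is available, and that the module is importable as oxdata.movie.imdbids; the empty known_ids set is a placeholder for the ids already stored in the database.

    from oxdata.movie.imdbids import update_index, get_unknown_ids

    # crawl per-year (and, where counts are too large, per-month and per-day)
    # title counts and write ids/<key>.json files under MEDIA_ROOT/imdb
    update_index()

    # placeholder: in practice this is the set of imdb ids already in the database
    known_ids = frozenset()
    new_ids = get_unknown_ids(known_ids)
    print('%s ids discovered' % len(new_ids))

The hunks below wire this module into get_new_ids(), replacing the old sitemap crawl.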
@@ -19,6 +19,8 @@ from oxdjango.fields import DictField
 from lookup.models import get_movie_id
 from poster.models import getPosters
 
+from .imdbids import get_unknown_ids, update_index
+
 def normalize_value(value):
     if isinstance(value, bytes):
         value = value.decode('utf-8')
@@ -260,17 +262,10 @@ def get_new_ids(timeout=-1):
         with open(new_ids_cache) as fd:
             new_ids = set(json.load(fd))
     else:
-        robot = ox.cache.read_url('http://www.imdb.com/robots.txt', timeout=timeout).decode('utf-8')
-        sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
-        sitemap = ox.cache.read_url(sitemap_url, timeout=timeout).decode('utf-8')
-        urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
-        ids = set()
-        for url in sorted(urls, reverse=False):
-            s = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
-            ids |= set(re.compile('<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s))
-            #print url, len(ids)
+        update_index()
         known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
-        new_ids = frozenset(ids) - known_ids
+        new_ids = get_unknown_ids(known_ids)
         if new_ids:
             print('adding %s new items' % len(new_ids))
             added = 0
+            done = set()
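For reference, a sketch of the on-disk layout the new module maintains under MEDIA_ROOT/imdb, which get_new_ids() now reads via get_unknown_ids() instead of crawling the IMDb sitemap (the keys and counts shown are illustrative, not real data):

    film_counts.json       {"2017": 123456, "2017-07": 12345, "2017-07-01": 456, ...}
    ids/2016.json          {"1234567": {"title": "Example Title", "year": 2016}, ...}
    ids/2017-07-01.json    one file per year, month or day range that was fetched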