diff --git a/README b/README new file mode 100644 index 0000000..9efe036 --- /dev/null +++ b/README @@ -0,0 +1 @@ +collection tools to create *cine.ma sites diff --git a/README.md b/README.md deleted file mode 100644 index 5e4154f..0000000 --- a/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# collection tools to create *cine.ma sites - - COUNTRY=in - NAME=India - python films_by_country.py $COUNTRY films_${COUNTRY}.json - python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json diff --git a/add_metadata.py b/add_metadata.py index c60c3c0..3857566 100755 --- a/add_metadata.py +++ b/add_metadata.py @@ -1,17 +1,15 @@ -#!/usr/bin/python3 +#!/usr/bin/python from optparse import OptionParser import json import codecs import sys import os -from datetime import datetime import ox def add_metadata(films, country, output): meta = [] api = ox.API('https://indiancine.ma/api/') - current_year = datetime.now().year if os.path.exists(output): with open(output) as fd: @@ -27,12 +25,13 @@ def add_metadata(films, country, output): if info['imdbId'] in known_ids: continue skip = False - for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special', 'Video Game'): + for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special'): if key in info['title']: skip = True if skip: continue - keys = [ + + extra = api.getMetadata(id=info['imdbId'], keys=[ 'language', 'productionCompany', 'director', 'runtime', 'alternativeTitles', 'color', 'sound', @@ -40,25 +39,12 @@ def add_metadata(films, country, output): 'isSeries', 'title', 'originalTitle', 'year' - ] - extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] - print(info) - print(extra) - if not extra: - save() - print('lets try again') - extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] - print(extra) - y = extra.get('year') - if y: - y = int(y) - if '(????)' in info.get('title', '') or not y or y >= current_year: + ])['data'] + print info + print extra + if 'isSeries' in extra or ('country' in extra and not country in extra['country']): info['delete'] = True - print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year')) - continue - if 'isSeries' in extra or ('country' in extra and country not in extra['country']): - info['delete'] = True - print('deleting', info['imdbId'], info.get('title')) + print 'deleting', info['imdbId'], info.get('title') continue if 'originalTitle' in extra: info['alternativeTitles'] = [[info['title'], '']] @@ -68,14 +54,13 @@ def add_metadata(films, country, output): for key in extra: if key not in info: info[key] = extra[key] - print(info['imdbId'], info['title']) + print info['imdbId'], info['title'] meta.append(info) if len(meta) % 100 == 0: save() save() return meta - if __name__ == '__main__': usage = "usage: %prog [options] country films.json films_with_metadata.json" parser = OptionParser(usage=usage) diff --git a/films_by_country.py b/films_by_country.py index f4fdbec..ceec04f 100755 --- a/films_by_country.py +++ b/films_by_country.py @@ -1,15 +1,11 @@ -#!/usr/bin/python3 +#!/usr/bin/python import ox.web.imdb import re import json import sys import codecs - -from datetime import datetime from optparse import OptionParser -import lxml.html - ''' python allofcountry.py in idsofindia.json python allofcountry.py tr idsofturkey.json @@ -18,19 +14,6 @@ python allofcountry.py tr idsofturkey.json def reset_url(url): x = ox.web.imdb.read_url(url, timeout=0) - -def write(films, filename): - data = [] - for id, title in films.items(): - data.append({ - 'imdbId': id, - 'title': title - }) - - with codecs.open(filename, 'w', encoding='utf-8') as fd: - json.dump(data, fd, indent=1, ensure_ascii=False) - - if __name__ == '__main__': usage = "usage: %prog [options] countrycode output.json" parser = OptionParser(usage=usage) @@ -40,56 +23,40 @@ if __name__ == '__main__': parser.print_help() sys.exit(1) - films = {} + films = [] country, filename = args - current_year = datetime.now().strftime('Y') if opts.reset: reset_url(opts.reset) + + base_url = 'http://akas.imdb.com' + url = '%s/search/title?countries=%s&sort=year' % (base_url, country) + data = ox.web.imdb.read_url(url) + n = True + while n: + n = re.compile('Next »').findall(data) + if n: + n = '%s%s' % (base_url, n[0].split('href="')[-1]) - base_url = 'http://www.imdb.com' - #url = '%s/search/title?countries=%s&sort=year' % (base_url, country) - year = 1880 + results = re.compile('(.*?)
', re.DOTALL).findall(data) + if results: + films += re.compile('href="/title/tt(\d{7})/" title="(.*?)"').findall(results[0]) + print n + print len(films), 'films' + if n: + data = ox.web.imdb.read_url(n) + else: + with open('last.html', 'w') as f: + f.write(data) + if len(films) % 1000 == 0: + with codecs.open(filename, 'w', encoding='utf-8') as fd: + json.dump([{ + 'imdbId': f[0], + 'title': ox.decode_html(f[1]) + } for f in films], fd, indent=1, ensure_ascii=False) - added = 0 - - while year < datetime.now().year: - url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country) - - data = ox.web.imdb.read_url(url, unicode=True) - n = True - page = 1 - while n: - n = re.compile('Next »', re.DOTALL).findall(data) - if n: - n = '%s&page=%s' % (url, page) - page += 1 - doc = lxml.html.fromstring(data) - article = doc.find_class('article') - if article: - article = article[0] - else: - n = None - for a in article.xpath('.//a'): - if '/title/tt' in a.attrib['href']: - img = a.xpath('.//img') - if img: - id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] - if id not in films: - title = img[0].attrib['alt'] - title = ox.decode_html(title) - films[id] = title - added += 1 - print(len(films), 'films') - if n: - data = ox.web.imdb.read_url(n, unicode=True) - else: - with open('last.html', 'w') as f: - f.write(data) - if added > 1000: - added = 0 - write(films, filename) - year += 1 - print('>> year', year) - - write(films, filename) + with codecs.open(filename, 'w', encoding='utf-8') as fd: + json.dump([{ + 'imdbId': f[0], + 'title': ox.decode_html(f[1]) + } for f in films], fd, indent=1, ensure_ascii=False) diff --git a/import_json.py b/import_json.py index 014e518..3479c1d 100644 --- a/import_json.py +++ b/import_json.py @@ -21,7 +21,7 @@ def load(data_json): reset_table(archive.models.Volume._meta.db_table) reset_table(models.Item._meta.db_table) transaction.commit_unless_managed() - os.system('rm -r /srv/pandora/data/media') + os.system('rm -r /srv/pandora/data/files') os.system('rm -r /srv/pandora/data/items') films = json.load(open(data_json)) diff --git a/stats.py b/stats.py index 3a3d055..12d691b 100644 --- a/stats.py +++ b/stats.py @@ -12,7 +12,6 @@ mini_series = filter(lambda x: 'Mini-Series' in x['title'], data) tv_series = filter(lambda x: 'TV Series' in x['title'], data) tv_movies = filter(lambda x: 'TV Movie' in x['title'], data) tv_special = filter(lambda x: 'TV Special' in x['title'], data) -documentary = filter(lambda x: 'Documentary' in x['title'], data) #cinema = set(data) - set(mini_series) - set(tv_series) - set(tv_movies) @@ -20,9 +19,8 @@ print len(tv_special), 'TV Specials' print len(tv_series), 'TV Series' print len(tv_movies), 'TV Movies' print len(mini_series), 'Mini-Series' -print len(documentary), 'Documentaries' #print len(cinema), 'Cinema' -print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special) - len(documentary), 'Movies' +print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special), 'Movies' print len(data), 'total'