From acfd8184c23e401ce94175beef38af09ef25c455 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Thu, 8 Aug 2013 10:08:17 +0200 Subject: [PATCH] script to add metadata to json; general cleanup --- add_metadata.py | 49 +++++++++++++++++++++++++++++++++++++++++++++ films_by_country.py | 22 ++++++++++++++------ import_json.py | 2 +- 3 files changed, 66 insertions(+), 7 deletions(-) create mode 100755 add_metadata.py diff --git a/add_metadata.py b/add_metadata.py new file mode 100755 index 0000000..c379cfc --- /dev/null +++ b/add_metadata.py @@ -0,0 +1,49 @@ +#!/usr/bin/python +from optparse import OptionParser +import json +import codecs +import sys + +import ox + +def add_metadata(films, country): + api = ox.API('https://indiancine.ma/api/') + for info in films: + extra = api.getMetadata(id=info['imdbId'], keys=[ + 'language', 'productionCompany', 'director', + 'runtime', 'alternativeTitles', + 'color', 'sound', + 'summary', 'country', + 'isSeries', + 'title', + 'originalTitle', 'year' + ])['data'] + if 'isSeries' in extra or ('country' in extra and not country in extra['country']): + info['delete'] = True + print 'deleting', info['imdbId'], info.get('title') + continue + if 'originalTitle' in extra: + info['alternativeTitles'] = [[info['title'], '']] + info['title'] = extra.pop('originalTitle') + else: + info['title'] = extra['title'] + for key in extra: + if key not in info: + info[key] = extra[key] + print info['imdbId'], info['title'] + return filter(lambda f: not f.get('delete', False), films) + +if __name__ == '__main__': + usage = "usage: %prog [options] country films.json" + parser = OptionParser(usage=usage) + (opts, args) = parser.parse_args() + if len(args) != 2: + parser.print_help() + sys.exit(1) + country, filename = args + with open(filename) as fd: + films = json.load(fd) + films = add_metadata(films, country) + + with codecs.open(filename, 'w', encoding='utf-8') as fd: + json.dump(films, fd, indent=1, ensure_ascii=False) diff --git a/films_by_country.py b/films_by_country.py index 7affa5b..ceec04f 100755 --- a/films_by_country.py +++ b/films_by_country.py @@ -3,6 +3,7 @@ import ox.web.imdb import re import json import sys +import codecs from optparse import OptionParser ''' @@ -14,7 +15,7 @@ def reset_url(url): x = ox.web.imdb.read_url(url, timeout=0) if __name__ == '__main__': - usage = "usage: %prog [options] country output.json" + usage = "usage: %prog [options] countrycode output.json" parser = OptionParser(usage=usage) parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url") (opts, args) = parser.parse_args() @@ -23,7 +24,10 @@ if __name__ == '__main__': sys.exit(1) films = [] - country, output = args + country, filename = args + + if opts.reset: + reset_url(opts.reset) base_url = 'http://akas.imdb.com' url = '%s/search/title?countries=%s&sort=year' % (base_url, country) @@ -45,8 +49,14 @@ if __name__ == '__main__': with open('last.html', 'w') as f: f.write(data) if len(films) % 1000 == 0: - with open(filename, 'w') as f: - json.dump(films, f, indent=2) + with codecs.open(filename, 'w', encoding='utf-8') as fd: + json.dump([{ + 'imdbId': f[0], + 'title': ox.decode_html(f[1]) + } for f in films], fd, indent=1, ensure_ascii=False) - with open(filename, 'w') as f: - json.dump(films, f, indent=2) + with codecs.open(filename, 'w', encoding='utf-8') as fd: + json.dump([{ + 'imdbId': f[0], + 'title': ox.decode_html(f[1]) + } for f in films], fd, indent=1, ensure_ascii=False) diff --git a/import_json.py b/import_json.py index 61a1703..3479c1d 100644 --- a/import_json.py +++ b/import_json.py @@ -36,5 +36,5 @@ def load(data_json): print item if __name__ == '__main__': - print 'please import from ./manage.py annd run import_json.load(path_to_json)' + print 'please import from ./manage.py and run import_json.load(path_to_json)'