script to add metadata to json; general cleanup

2013-08-08 10:08:17 +02:00 · 2013-08-08 10:08:17 +02:00 · acfd8184c2
commit acfd8184c2
parent c8f1a42142
3 changed files with 66 additions and 7 deletions
--- a/add_metadata.py
+++ b/add_metadata.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+from optparse import OptionParser
+import json
+import codecs
+import sys
+
+import ox
+
+def add_metadata(films, country):
+    api = ox.API('https://indiancine.ma/api/')
+    for info in films:
+        extra = api.getMetadata(id=info['imdbId'], keys=[
+            'language', 'productionCompany', 'director',
+            'runtime', 'alternativeTitles',
+            'color', 'sound',
+            'summary', 'country',
+            'isSeries',
+            'title',
+            'originalTitle', 'year'
+        ])['data']
+        if 'isSeries' in extra or ('country' in extra and not country in extra['country']):
+            info['delete'] = True
+            print 'deleting', info['imdbId'], info.get('title')
+            continue
+        if 'originalTitle' in extra:
+            info['alternativeTitles'] = [[info['title'], '']]
+            info['title'] = extra.pop('originalTitle')
+        else:
+            info['title'] = extra['title']
+        for key in extra:
+            if key not in info:
+                info[key] = extra[key]
+        print info['imdbId'], info['title']
+    return filter(lambda f: not f.get('delete', False), films)
+
+if __name__ == '__main__':
+    usage = "usage: %prog [options] country films.json"
+    parser = OptionParser(usage=usage)
+    (opts, args) = parser.parse_args()
+    if len(args) != 2:
+        parser.print_help()
+        sys.exit(1)
+    country, filename = args
+    with open(filename) as fd:
+        films = json.load(fd)
+    films = add_metadata(films, country)
+
+    with codecs.open(filename, 'w', encoding='utf-8') as fd:
+        json.dump(films, fd, indent=1, ensure_ascii=False)
--- a/films_by_country.py
+++ b/films_by_country.py
@@ -3,6 +3,7 @@ import ox.web.imdb
 import re
 import json
 import sys
+import codecs
 from optparse import OptionParser

 '''
@@ -14,7 +15,7 @@ def reset_url(url):
    x = ox.web.imdb.read_url(url, timeout=0)

 if __name__ == '__main__':
-    usage = "usage: %prog [options] country output.json"
+    usage = "usage: %prog [options] countrycode output.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
    (opts, args) = parser.parse_args()
@@ -23,7 +24,10 @@ if __name__ == '__main__':
        sys.exit(1)

    films = []
-    country, output = args
+    country, filename = args
+
+    if opts.reset:
+        reset_url(opts.reset)
    
    base_url = 'http://akas.imdb.com'
    url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
@@ -45,8 +49,14 @@ if __name__ == '__main__':
            with open('last.html', 'w') as f:
                f.write(data)
        if len(films) % 1000 == 0:
-            with open(filename, 'w') as f:
-                json.dump(films, f, indent=2)
+            with codecs.open(filename, 'w', encoding='utf-8') as fd:
+                json.dump([{
+                    'imdbId': f[0],
+                    'title': ox.decode_html(f[1])
+                } for f in films], fd, indent=1, ensure_ascii=False)

-    with open(filename, 'w') as f:
-        json.dump(films, f, indent=2)
+    with codecs.open(filename, 'w', encoding='utf-8') as fd:
+        json.dump([{
+            'imdbId': f[0],
+            'title': ox.decode_html(f[1])
+        } for f in films], fd, indent=1, ensure_ascii=False)
--- a/import_json.py
+++ b/import_json.py
@@ -36,5 +36,5 @@ def load(data_json):
        print item

 if __name__ == '__main__':
-    print 'please import from ./manage.py annd run import_json.load(path_to_json)'
+    print 'please import from ./manage.py and run import_json.load(path_to_json)'