script to add metadata to json; general cleanup
This commit is contained in:
parent
c8f1a42142
commit
acfd8184c2
3 changed files with 66 additions and 7 deletions
49
add_metadata.py
Executable file
49
add_metadata.py
Executable file
|
@ -0,0 +1,49 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
from optparse import OptionParser
|
||||||
|
import json
|
||||||
|
import codecs
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import ox
|
||||||
|
|
||||||
|
def add_metadata(films, country):
|
||||||
|
api = ox.API('https://indiancine.ma/api/')
|
||||||
|
for info in films:
|
||||||
|
extra = api.getMetadata(id=info['imdbId'], keys=[
|
||||||
|
'language', 'productionCompany', 'director',
|
||||||
|
'runtime', 'alternativeTitles',
|
||||||
|
'color', 'sound',
|
||||||
|
'summary', 'country',
|
||||||
|
'isSeries',
|
||||||
|
'title',
|
||||||
|
'originalTitle', 'year'
|
||||||
|
])['data']
|
||||||
|
if 'isSeries' in extra or ('country' in extra and not country in extra['country']):
|
||||||
|
info['delete'] = True
|
||||||
|
print 'deleting', info['imdbId'], info.get('title')
|
||||||
|
continue
|
||||||
|
if 'originalTitle' in extra:
|
||||||
|
info['alternativeTitles'] = [[info['title'], '']]
|
||||||
|
info['title'] = extra.pop('originalTitle')
|
||||||
|
else:
|
||||||
|
info['title'] = extra['title']
|
||||||
|
for key in extra:
|
||||||
|
if key not in info:
|
||||||
|
info[key] = extra[key]
|
||||||
|
print info['imdbId'], info['title']
|
||||||
|
return filter(lambda f: not f.get('delete', False), films)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: read films.json, enrich and filter it with
    # add_metadata() for the given country, then write the result back to
    # the same file.
    parser = OptionParser(usage="usage: %prog [options] country films.json")
    options, positional = parser.parse_args()
    if len(positional) != 2:
        # Exactly two positional arguments are required.
        parser.print_help()
        sys.exit(1)
    country, filename = positional

    with open(filename) as source:
        loaded = json.load(source)
    updated = add_metadata(loaded, country)

    # Written as UTF-8 with non-ASCII characters left unescaped.
    with codecs.open(filename, 'w', encoding='utf-8') as target:
        json.dump(updated, target, indent=1, ensure_ascii=False)
|
|
@ -3,6 +3,7 @@ import ox.web.imdb
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
|
import codecs
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
@ -14,7 +15,7 @@ def reset_url(url):
|
||||||
x = ox.web.imdb.read_url(url, timeout=0)
|
x = ox.web.imdb.read_url(url, timeout=0)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
usage = "usage: %prog [options] country output.json"
|
usage = "usage: %prog [options] countrycode output.json"
|
||||||
parser = OptionParser(usage=usage)
|
parser = OptionParser(usage=usage)
|
||||||
parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
|
parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
|
||||||
(opts, args) = parser.parse_args()
|
(opts, args) = parser.parse_args()
|
||||||
|
@ -23,7 +24,10 @@ if __name__ == '__main__':
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
films = []
|
films = []
|
||||||
country, output = args
|
country, filename = args
|
||||||
|
|
||||||
|
if opts.reset:
|
||||||
|
reset_url(opts.reset)
|
||||||
|
|
||||||
base_url = 'http://akas.imdb.com'
|
base_url = 'http://akas.imdb.com'
|
||||||
url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
|
url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
|
||||||
|
@ -45,8 +49,14 @@ if __name__ == '__main__':
|
||||||
with open('last.html', 'w') as f:
|
with open('last.html', 'w') as f:
|
||||||
f.write(data)
|
f.write(data)
|
||||||
if len(films) % 1000 == 0:
|
if len(films) % 1000 == 0:
|
||||||
with open(filename, 'w') as f:
|
with codecs.open(filename, 'w', encoding='utf-8') as fd:
|
||||||
json.dump(films, f, indent=2)
|
json.dump([{
|
||||||
|
'imdbId': f[0],
|
||||||
|
'title': ox.decode_html(f[1])
|
||||||
|
} for f in films], fd, indent=1, ensure_ascii=False)
|
||||||
|
|
||||||
with open(filename, 'w') as f:
|
with codecs.open(filename, 'w', encoding='utf-8') as fd:
|
||||||
json.dump(films, f, indent=2)
|
json.dump([{
|
||||||
|
'imdbId': f[0],
|
||||||
|
'title': ox.decode_html(f[1])
|
||||||
|
} for f in films], fd, indent=1, ensure_ascii=False)
|
||||||
|
|
|
@ -36,5 +36,5 @@ def load(data_json):
|
||||||
print item
|
print item
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print 'please import from ./manage.py annd run import_json.load(path_to_json)'
|
print 'please import from ./manage.py and run import_json.load(path_to_json)'
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue