diff --git a/README b/README new file mode 100644 index 0000000..9efe036 --- /dev/null +++ b/README @@ -0,0 +1 @@ +collection tools to create *cine.ma sites diff --git a/README.md b/README.md deleted file mode 100644 index 5e4154f..0000000 --- a/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# collection tools to create *cine.ma sites - - COUNTRY=in - NAME=India - python films_by_country.py $COUNTRY films_${COUNTRY}.json - python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json diff --git a/add_metadata.py b/add_metadata.py index c60c3c0..3857566 100755 --- a/add_metadata.py +++ b/add_metadata.py @@ -1,17 +1,15 @@ -#!/usr/bin/python3 +#!/usr/bin/python from optparse import OptionParser import json import codecs import sys import os -from datetime import datetime import ox def add_metadata(films, country, output): meta = [] api = ox.API('https://indiancine.ma/api/') - current_year = datetime.now().year if os.path.exists(output): with open(output) as fd: @@ -27,12 +25,13 @@ def add_metadata(films, country, output): if info['imdbId'] in known_ids: continue skip = False - for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special', 'Video Game'): + for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special'): if key in info['title']: skip = True if skip: continue - keys = [ + + extra = api.getMetadata(id=info['imdbId'], keys=[ 'language', 'productionCompany', 'director', 'runtime', 'alternativeTitles', 'color', 'sound', @@ -40,25 +39,12 @@ def add_metadata(films, country, output): 'isSeries', 'title', 'originalTitle', 'year' - ] - extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] - print(info) - print(extra) - if not extra: - save() - print('lets try again') - extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] - print(extra) - y = extra.get('year') - if y: - y = int(y) - if '(????)' in info.get('title', '') or not y or y >= current_year: + ])['data'] + print info + print extra + if 'isSeries' in extra or ('country' in extra and not country in extra['country']): info['delete'] = True - print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year')) - continue - if 'isSeries' in extra or ('country' in extra and country not in extra['country']): - info['delete'] = True - print('deleting', info['imdbId'], info.get('title')) + print 'deleting', info['imdbId'], info.get('title') continue if 'originalTitle' in extra: info['alternativeTitles'] = [[info['title'], '']] @@ -68,14 +54,13 @@ def add_metadata(films, country, output): for key in extra: if key not in info: info[key] = extra[key] - print(info['imdbId'], info['title']) + print info['imdbId'], info['title'] meta.append(info) if len(meta) % 100 == 0: save() save() return meta - if __name__ == '__main__': usage = "usage: %prog [options] country films.json films_with_metadata.json" parser = OptionParser(usage=usage) diff --git a/films_by_country.py b/films_by_country.py index f4fdbec..ceec04f 100755 --- a/films_by_country.py +++ b/films_by_country.py @@ -1,15 +1,11 @@ -#!/usr/bin/python3 +#!/usr/bin/python import ox.web.imdb import re import json import sys import codecs - -from datetime import datetime from optparse import OptionParser -import lxml.html - ''' python allofcountry.py in idsofindia.json python allofcountry.py tr idsofturkey.json @@ -18,19 +14,6 @@ python allofcountry.py tr idsofturkey.json def reset_url(url): x = ox.web.imdb.read_url(url, timeout=0) - -def write(films, filename): - data = [] - for id, title in films.items(): - data.append({ - 'imdbId': id, - 'title': title - }) - - with codecs.open(filename, 'w', encoding='utf-8') as fd: - json.dump(data, fd, indent=1, ensure_ascii=False) - - if __name__ == '__main__': usage = "usage: %prog [options] countrycode output.json" parser = OptionParser(usage=usage) @@ -40,56 +23,40 @@ if __name__ == '__main__': parser.print_help() sys.exit(1) - films = {} + films = [] country, filename = args - current_year = datetime.now().strftime('Y') if opts.reset: reset_url(opts.reset) + + base_url = 'http://akas.imdb.com' + url = '%s/search/title?countries=%s&sort=year' % (base_url, country) + data = ox.web.imdb.read_url(url) + n = True + while n: + n = re.compile('Next »').findall(data) + if n: + n = '%s%s' % (base_url, n[0].split('href="')[-1]) - base_url = 'http://www.imdb.com' - #url = '%s/search/title?countries=%s&sort=year' % (base_url, country) - year = 1880 + results = re.compile('