From ccb54122ccc0bbd235ed917ca42084e287646af0 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 29 Apr 2016 13:02:16 +0200 Subject: [PATCH 1/3] stats --- stats.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stats.py b/stats.py index 12d691b..3a3d055 100644 --- a/stats.py +++ b/stats.py @@ -12,6 +12,7 @@ mini_series = filter(lambda x: 'Mini-Series' in x['title'], data) tv_series = filter(lambda x: 'TV Series' in x['title'], data) tv_movies = filter(lambda x: 'TV Movie' in x['title'], data) tv_special = filter(lambda x: 'TV Special' in x['title'], data) +documentary = filter(lambda x: 'Documentary' in x['title'], data) #cinema = set(data) - set(mini_series) - set(tv_series) - set(tv_movies) @@ -19,8 +20,9 @@ print len(tv_special), 'TV Specials' print len(tv_series), 'TV Series' print len(tv_movies), 'TV Movies' print len(mini_series), 'Mini-Series' +print len(documentary), 'Documentaries' #print len(cinema), 'Cinema' -print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special), 'Movies' +print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special) - len(documentary), 'Movies' print len(data), 'total' From f6ed20d87b753ef97ef0ea934806ef48fe5b6732 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 29 Apr 2016 13:12:05 +0200 Subject: [PATCH 2/3] cleanup --- README | 1 - README.md | 5 +++++ add_metadata.py | 13 +++++++++---- import_json.py | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) delete mode 100644 README create mode 100644 README.md diff --git a/README b/README deleted file mode 100644 index 9efe036..0000000 --- a/README +++ /dev/null @@ -1 +0,0 @@ -collection tools to create *cine.ma sites diff --git a/README.md b/README.md new file mode 100644 index 0000000..515128b --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# collection tools to create *cine.ma sites + + COUNTRY=in + python films_by_country.py $COUNTRY films_$COUNTRY.json + python add_metadata.py $COUNTRY films_$COUNTRY.json films_$COUNTRY_metadata.json diff --git a/add_metadata.py b/add_metadata.py index 3857566..a4d7fa6 100755 --- a/add_metadata.py +++ b/add_metadata.py @@ -25,13 +25,12 @@ def add_metadata(films, country, output): if info['imdbId'] in known_ids: continue skip = False - for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special'): + for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special', 'Video Game'): if key in info['title']: skip = True if skip: continue - - extra = api.getMetadata(id=info['imdbId'], keys=[ + keys = [ 'language', 'productionCompany', 'director', 'runtime', 'alternativeTitles', 'color', 'sound', @@ -39,9 +38,15 @@ def add_metadata(films, country, output): 'isSeries', 'title', 'originalTitle', 'year' - ])['data'] + ] + extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] print info print extra + if not extra: + save() + print 'lets try again' + extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] + print extra if 'isSeries' in extra or ('country' in extra and not country in extra['country']): info['delete'] = True print 'deleting', info['imdbId'], info.get('title') diff --git a/import_json.py b/import_json.py index 3479c1d..014e518 100644 --- a/import_json.py +++ b/import_json.py @@ -21,7 +21,7 @@ def load(data_json): reset_table(archive.models.Volume._meta.db_table) reset_table(models.Item._meta.db_table) transaction.commit_unless_managed() - os.system('rm -r /srv/pandora/data/files') + os.system('rm -r /srv/pandora/data/media') os.system('rm -r /srv/pandora/data/items') films = json.load(open(data_json)) From 322d63f2348e15bd3095d9fbef09d1d70c697065 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 2 Apr 2018 20:27:36 +0530 Subject: [PATCH 3/3] update parser --- README.md | 5 ++- add_metadata.py | 26 ++++++++---- films_by_country.py | 97 ++++++++++++++++++++++++++++++--------------- 3 files changed, 86 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 515128b..5e4154f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # collection tools to create *cine.ma sites COUNTRY=in - python films_by_country.py $COUNTRY films_$COUNTRY.json - python add_metadata.py $COUNTRY films_$COUNTRY.json films_$COUNTRY_metadata.json + NAME=India + python films_by_country.py $COUNTRY films_${COUNTRY}.json + python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json diff --git a/add_metadata.py b/add_metadata.py index a4d7fa6..c60c3c0 100755 --- a/add_metadata.py +++ b/add_metadata.py @@ -1,15 +1,17 @@ -#!/usr/bin/python +#!/usr/bin/python3 from optparse import OptionParser import json import codecs import sys import os +from datetime import datetime import ox def add_metadata(films, country, output): meta = [] api = ox.API('https://indiancine.ma/api/') + current_year = datetime.now().year if os.path.exists(output): with open(output) as fd: @@ -40,16 +42,23 @@ def add_metadata(films, country, output): 'originalTitle', 'year' ] extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] - print info - print extra + print(info) + print(extra) if not extra: save() - print 'lets try again' + print('lets try again') extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] - print extra - if 'isSeries' in extra or ('country' in extra and not country in extra['country']): + print(extra) + y = extra.get('year') + if y: + y = int(y) + if '(????)' in info.get('title', '') or not y or y >= current_year: info['delete'] = True - print 'deleting', info['imdbId'], info.get('title') + print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year')) + continue + if 'isSeries' in extra or ('country' in extra and country not in extra['country']): + info['delete'] = True + print('deleting', info['imdbId'], info.get('title')) continue if 'originalTitle' in extra: info['alternativeTitles'] = [[info['title'], '']] @@ -59,13 +68,14 @@ def add_metadata(films, country, output): for key in extra: if key not in info: info[key] = extra[key] - print info['imdbId'], info['title'] + print(info['imdbId'], info['title']) meta.append(info) if len(meta) % 100 == 0: save() save() return meta + if __name__ == '__main__': usage = "usage: %prog [options] country films.json films_with_metadata.json" parser = OptionParser(usage=usage) diff --git a/films_by_country.py b/films_by_country.py index ceec04f..f4fdbec 100755 --- a/films_by_country.py +++ b/films_by_country.py @@ -1,11 +1,15 @@ -#!/usr/bin/python +#!/usr/bin/python3 import ox.web.imdb import re import json import sys import codecs + +from datetime import datetime from optparse import OptionParser +import lxml.html + ''' python allofcountry.py in idsofindia.json python allofcountry.py tr idsofturkey.json @@ -14,6 +18,19 @@ python allofcountry.py tr idsofturkey.json def reset_url(url): x = ox.web.imdb.read_url(url, timeout=0) + +def write(films, filename): + data = [] + for id, title in films.items(): + data.append({ + 'imdbId': id, + 'title': title + }) + + with codecs.open(filename, 'w', encoding='utf-8') as fd: + json.dump(data, fd, indent=1, ensure_ascii=False) + + if __name__ == '__main__': usage = "usage: %prog [options] countrycode output.json" parser = OptionParser(usage=usage) @@ -23,40 +40,56 @@ if __name__ == '__main__': parser.print_help() sys.exit(1) - films = [] + films = {} country, filename = args + current_year = datetime.now().strftime('Y') if opts.reset: reset_url(opts.reset) - - base_url = 'http://akas.imdb.com' - url = '%s/search/title?countries=%s&sort=year' % (base_url, country) - data = ox.web.imdb.read_url(url) - n = True - while n: - n = re.compile('Next »').findall(data) - if n: - n = '%s%s' % (base_url, n[0].split('href="')[-1]) - results = re.compile('(.*?)
', re.DOTALL).findall(data) - if results: - films += re.compile('href="/title/tt(\d{7})/" title="(.*?)"').findall(results[0]) - print n - print len(films), 'films' - if n: - data = ox.web.imdb.read_url(n) - else: - with open('last.html', 'w') as f: - f.write(data) - if len(films) % 1000 == 0: - with codecs.open(filename, 'w', encoding='utf-8') as fd: - json.dump([{ - 'imdbId': f[0], - 'title': ox.decode_html(f[1]) - } for f in films], fd, indent=1, ensure_ascii=False) + base_url = 'http://www.imdb.com' + #url = '%s/search/title?countries=%s&sort=year' % (base_url, country) + year = 1880 - with codecs.open(filename, 'w', encoding='utf-8') as fd: - json.dump([{ - 'imdbId': f[0], - 'title': ox.decode_html(f[1]) - } for f in films], fd, indent=1, ensure_ascii=False) + added = 0 + + while year < datetime.now().year: + url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country) + + data = ox.web.imdb.read_url(url, unicode=True) + n = True + page = 1 + while n: + n = re.compile('Next »', re.DOTALL).findall(data) + if n: + n = '%s&page=%s' % (url, page) + page += 1 + doc = lxml.html.fromstring(data) + article = doc.find_class('article') + if article: + article = article[0] + else: + n = None + for a in article.xpath('.//a'): + if '/title/tt' in a.attrib['href']: + img = a.xpath('.//img') + if img: + id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] + if id not in films: + title = img[0].attrib['alt'] + title = ox.decode_html(title) + films[id] = title + added += 1 + print(len(films), 'films') + if n: + data = ox.web.imdb.read_url(n, unicode=True) + else: + with open('last.html', 'w') as f: + f.write(data) + if added > 1000: + added = 0 + write(films, filename) + year += 1 + print('>> year', year) + + write(films, filename)