From 322d63f2348e15bd3095d9fbef09d1d70c697065 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 2 Apr 2018 20:27:36 +0530 Subject: [PATCH] update parser --- README.md | 5 ++- add_metadata.py | 26 ++++++++---- films_by_country.py | 97 ++++++++++++++++++++++++++++++--------------- 3 files changed, 86 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 515128b..5e4154f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # collection tools to create *cine.ma sites COUNTRY=in - python films_by_country.py $COUNTRY films_$COUNTRY.json - python add_metadata.py $COUNTRY films_$COUNTRY.json films_$COUNTRY_metadata.json + NAME=India + python films_by_country.py $COUNTRY films_${COUNTRY}.json + python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json diff --git a/add_metadata.py b/add_metadata.py index a4d7fa6..c60c3c0 100755 --- a/add_metadata.py +++ b/add_metadata.py @@ -1,15 +1,17 @@ -#!/usr/bin/python +#!/usr/bin/python3 from optparse import OptionParser import json import codecs import sys import os +from datetime import datetime import ox def add_metadata(films, country, output): meta = [] api = ox.API('https://indiancine.ma/api/') + current_year = datetime.now().year if os.path.exists(output): with open(output) as fd: @@ -40,16 +42,23 @@ def add_metadata(films, country, output): 'originalTitle', 'year' ] extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] - print info - print extra + print(info) + print(extra) if not extra: save() - print 'lets try again' + print('lets try again') extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] - print extra - if 'isSeries' in extra or ('country' in extra and not country in extra['country']): + print(extra) + y = extra.get('year') + if y: + y = int(y) + if '(????)' in info.get('title', '') or not y or y >= current_year: info['delete'] = True - print 'deleting', info['imdbId'], info.get('title') + print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year')) + continue + if 'isSeries' in extra or ('country' in extra and country not in extra['country']): + info['delete'] = True + print('deleting', info['imdbId'], info.get('title')) continue if 'originalTitle' in extra: info['alternativeTitles'] = [[info['title'], '']] @@ -59,13 +68,14 @@ def add_metadata(films, country, output): for key in extra: if key not in info: info[key] = extra[key] - print info['imdbId'], info['title'] + print(info['imdbId'], info['title']) meta.append(info) if len(meta) % 100 == 0: save() save() return meta + if __name__ == '__main__': usage = "usage: %prog [options] country films.json films_with_metadata.json" parser = OptionParser(usage=usage) diff --git a/films_by_country.py b/films_by_country.py index ceec04f..f4fdbec 100755 --- a/films_by_country.py +++ b/films_by_country.py @@ -1,11 +1,15 @@ -#!/usr/bin/python +#!/usr/bin/python3 import ox.web.imdb import re import json import sys import codecs + +from datetime import datetime from optparse import OptionParser +import lxml.html + ''' python allofcountry.py in idsofindia.json python allofcountry.py tr idsofturkey.json @@ -14,6 +18,19 @@ python allofcountry.py tr idsofturkey.json def reset_url(url): x = ox.web.imdb.read_url(url, timeout=0) + +def write(films, filename): + data = [] + for id, title in films.items(): + data.append({ + 'imdbId': id, + 'title': title + }) + + with codecs.open(filename, 'w', encoding='utf-8') as fd: + json.dump(data, fd, indent=1, ensure_ascii=False) + + if __name__ == '__main__': usage = "usage: %prog [options] countrycode output.json" parser = OptionParser(usage=usage) @@ -23,40 +40,56 @@ if __name__ == '__main__': parser.print_help() sys.exit(1) - films = [] + films = {} country, filename = args + current_year = datetime.now().strftime('Y') if opts.reset: reset_url(opts.reset) - - base_url = 'http://akas.imdb.com' - url = '%s/search/title?countries=%s&sort=year' % (base_url, country) - data = ox.web.imdb.read_url(url) - n = True - while n: - n = re.compile('Next »').findall(data) - if n: - n = '%s%s' % (base_url, n[0].split('href="')[-1]) - results = re.compile('(.*?)
', re.DOTALL).findall(data) - if results: - films += re.compile('href="/title/tt(\d{7})/" title="(.*?)"').findall(results[0]) - print n - print len(films), 'films' - if n: - data = ox.web.imdb.read_url(n) - else: - with open('last.html', 'w') as f: - f.write(data) - if len(films) % 1000 == 0: - with codecs.open(filename, 'w', encoding='utf-8') as fd: - json.dump([{ - 'imdbId': f[0], - 'title': ox.decode_html(f[1]) - } for f in films], fd, indent=1, ensure_ascii=False) + base_url = 'http://www.imdb.com' + #url = '%s/search/title?countries=%s&sort=year' % (base_url, country) + year = 1880 - with codecs.open(filename, 'w', encoding='utf-8') as fd: - json.dump([{ - 'imdbId': f[0], - 'title': ox.decode_html(f[1]) - } for f in films], fd, indent=1, ensure_ascii=False) + added = 0 + + while year < datetime.now().year: + url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country) + + data = ox.web.imdb.read_url(url, unicode=True) + n = True + page = 1 + while n: + n = re.compile('Next »', re.DOTALL).findall(data) + if n: + n = '%s&page=%s' % (url, page) + page += 1 + doc = lxml.html.fromstring(data) + article = doc.find_class('article') + if article: + article = article[0] + else: + n = None + for a in article.xpath('.//a'): + if '/title/tt' in a.attrib['href']: + img = a.xpath('.//img') + if img: + id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] + if id not in films: + title = img[0].attrib['alt'] + title = ox.decode_html(title) + films[id] = title + added += 1 + print(len(films), 'films') + if n: + data = ox.web.imdb.read_url(n, unicode=True) + else: + with open('last.html', 'w') as f: + f.write(data) + if added > 1000: + added = 0 + write(films, filename) + year += 1 + print('>> year', year) + + write(films, filename)