diff --git a/add_metadata.py b/add_metadata.py
index c60c3c0..25b6f94 100755
--- a/add_metadata.py
+++ b/add_metadata.py
@@ -17,11 +17,20 @@ def add_metadata(films, country, output):
         with open(output) as fd:
             meta = json.load(fd)
 
-    known_ids = set([f['imdbId'] for f in meta])
+    ignore = output + '.ignored'
+    if os.path.exists(ignore):
+        with open(ignore) as fd:
+            ignored = fd.read().strip().split('\n')
+    else:
+        ignored = []
+
+    known_ids = set([f['imdbId'] for f in meta] + ignored)
 
     def save():
         with codecs.open(output, 'w', encoding='utf-8') as fd:
             json.dump(meta, fd, indent=1, ensure_ascii=False)
+        with open(ignore, 'w') as fd:
+            fd.write('\n'.join(ignored))
 
     for info in films:
         if info['imdbId'] in known_ids:
@@ -53,12 +62,12 @@ def add_metadata(films, country, output):
         if y:
             y = int(y)
         if '(????)' in info.get('title', '') or not y or y >= current_year:
-            info['delete'] = True
+            ignored.append(info['imdbId'])
             print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
             continue
         if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
-            info['delete'] = True
-            print('deleting', info['imdbId'], info.get('title'))
+            ignored.append(info['imdbId'])
+            print('ignoring', info['imdbId'], info.get('title'))
             continue
         if 'originalTitle' in extra:
             info['alternativeTitles'] = [[info['title'], '']]
diff --git a/films_by_country.py b/films_by_country.py
index f4fdbec..58c063c 100755
--- a/films_by_country.py
+++ b/films_by_country.py
@@ -21,10 +21,11 @@ def reset_url(url):
 
 def write(films, filename):
     data = []
-    for id, title in films.items():
+    for id, film in films.items():
         data.append({
             'imdbId': id,
-            'title': title
+            'title': film[0],
+            'year': film[1],
         })
 
     with codecs.open(filename, 'w', encoding='utf-8') as fd:
@@ -54,6 +55,7 @@ if __name__ == '__main__':
     added = 0
 
     while year < datetime.now().year:
+        print('<<', year)
         url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
         data = ox.web.imdb.read_url(url, unicode=True)
 
@@ -70,6 +72,25 @@ if __name__ == '__main__':
             article = article[0]
         else:
             n = None
+        for header in article.find_class('lister-item-header'):
+            a = header.xpath('.//a')[0]
+            id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
+            title = a.text_content()
+            try:
+                fully = y = header.find_class('lister-item-year')[0].text_content()
+                y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
+                if not y:
+                    y = year
+                else:
+                    y = int(y)
+            except:
+                print(n)
+                print(header.find_class('lister-item-year')[0].text_content())
+                raise
+            if id not in films:
+                films[id] = (title, y)
+                added += 1
+        '''
         for a in article.xpath('.//a'):
             if '/title/tt' in a.attrib['href']:
                 img = a.xpath('.//img')
@@ -80,6 +101,7 @@ if __name__ == '__main__':
                     title = ox.decode_html(title)
                     films[id] = title
                     added += 1
+        '''
         print(len(films), 'films')
         if n:
             data = ox.web.imdb.read_url(n, unicode=True)
@@ -89,7 +111,7 @@ if __name__ == '__main__':
         if added > 1000:
            added = 0
            write(films, filename)
-        year += 1
         print('>> year', year)
+        year += 1
 
     write(films, filename)
diff --git a/import_json.py b/import_json.py
index 014e518..fd70ee7 100644
--- a/import_json.py
+++ b/import_json.py
@@ -12,29 +12,30 @@ def load(data_json):
     import item.models as models
     import archive.models
     import os
-    archive.models.File.objects.all().delete()
-    archive.models.Instance.objects.all().delete()
-    archive.models.Volume.objects.all().delete()
-    models.Item.objects.all().delete()
-    reset_table(archive.models.File._meta.db_table)
-    reset_table(archive.models.Instance._meta.db_table)
-    reset_table(archive.models.Volume._meta.db_table)
-    reset_table(models.Item._meta.db_table)
-    transaction.commit_unless_managed()
-    os.system('rm -r /srv/pandora/data/media')
-    os.system('rm -r /srv/pandora/data/items')
+    with transaction.atomic():
+        archive.models.File.objects.all().delete()
+        archive.models.Instance.objects.all().delete()
+        archive.models.Volume.objects.all().delete()
+        models.Item.objects.all().delete()
+        reset_table(archive.models.File._meta.db_table)
+        reset_table(archive.models.Instance._meta.db_table)
+        reset_table(archive.models.Volume._meta.db_table)
+        reset_table(models.Item._meta.db_table)
+    with transaction.atomic():
+        os.system('rm -r /srv/pandora/data/media')
+        os.system('rm -r /srv/pandora/data/items')
+
+    films = json.load(open(data_json))
+    for data in sorted(films, key=lambda f: (f['year'], f['title'], f.get('director', []))):
+        item = models.Item()
+        item.data = data
+        item.save()
+        item.make_poster()
+        item.make_icon()
+        item.level = 2
+        item.save()
+        print(item)
-    films = json.load(open(data_json))
-    for data in films:
-        item = models.Item()
-        item.data = data
-        item.save()
-        item.make_poster(True)
-        item.make_icon()
-        item.level = 2
-        item.save()
-        print item
 
 
 if __name__ == '__main__':
-    print 'please import from ./manage.py and run import_json.load(path_to_json)'
-
+    print('please import from ./manage.py and run import_json.load(path_to_json)')