parse year from index, fix import

This commit is contained in:
j 2018-04-06 16:00:45 +05:30
parent 322d63f234
commit b876eef0d0
3 changed files with 62 additions and 30 deletions

View file

@ -17,11 +17,20 @@ def add_metadata(films, country, output):
with open(output) as fd: with open(output) as fd:
meta = json.load(fd) meta = json.load(fd)
known_ids = set([f['imdbId'] for f in meta]) ignore = output + '.ignored'
if os.path.exists(ignore):
with open(ignore) as fd:
ignored = fd.read().strip().split('\n')
else:
ignored = []
known_ids = set([f['imdbId'] for f in meta] + ignored)
def save(): def save():
with codecs.open(output, 'w', encoding='utf-8') as fd: with codecs.open(output, 'w', encoding='utf-8') as fd:
json.dump(meta, fd, indent=1, ensure_ascii=False) json.dump(meta, fd, indent=1, ensure_ascii=False)
with open(ignore, 'w') as fd:
fd.write('\n'.join(ignored))
for info in films: for info in films:
if info['imdbId'] in known_ids: if info['imdbId'] in known_ids:
@ -53,12 +62,12 @@ def add_metadata(films, country, output):
if y: if y:
y = int(y) y = int(y)
if '(????)' in info.get('title', '') or not y or y >= current_year: if '(????)' in info.get('title', '') or not y or y >= current_year:
info['delete'] = True ignored.append(info['imdbId'])
print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year')) print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
continue continue
if 'isSeries' in extra or ('country' in extra and country not in extra['country']): if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
info['delete'] = True ignored.append(info['imdbId'])
print('deleting', info['imdbId'], info.get('title')) print('ignoring', info['imdbId'], info.get('title'))
continue continue
if 'originalTitle' in extra: if 'originalTitle' in extra:
info['alternativeTitles'] = [[info['title'], '']] info['alternativeTitles'] = [[info['title'], '']]

View file

@ -21,10 +21,11 @@ def reset_url(url):
def write(films, filename): def write(films, filename):
data = [] data = []
for id, title in films.items(): for id, film in films.items():
data.append({ data.append({
'imdbId': id, 'imdbId': id,
'title': title 'title': film[0],
'year': film[1],
}) })
with codecs.open(filename, 'w', encoding='utf-8') as fd: with codecs.open(filename, 'w', encoding='utf-8') as fd:
@ -54,6 +55,7 @@ if __name__ == '__main__':
added = 0 added = 0
while year < datetime.now().year: while year < datetime.now().year:
print('<<', year)
url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country) url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
data = ox.web.imdb.read_url(url, unicode=True) data = ox.web.imdb.read_url(url, unicode=True)
@ -70,6 +72,25 @@ if __name__ == '__main__':
article = article[0] article = article[0]
else: else:
n = None n = None
for header in article.find_class('lister-item-header'):
a = header.xpath('.//a')[0]
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
title = a.text_content()
try:
fully = y = header.find_class('lister-item-year')[0].text_content()
y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
if not y:
y = year
else:
y = int(y)
except:
print(n)
print(header.find_class('lister-item-year')[0].text_content())
raise
if id not in films:
films[id] = (title, y)
added += 1
'''
for a in article.xpath('.//a'): for a in article.xpath('.//a'):
if '/title/tt' in a.attrib['href']: if '/title/tt' in a.attrib['href']:
img = a.xpath('.//img') img = a.xpath('.//img')
@ -80,6 +101,7 @@ if __name__ == '__main__':
title = ox.decode_html(title) title = ox.decode_html(title)
films[id] = title films[id] = title
added += 1 added += 1
'''
print(len(films), 'films') print(len(films), 'films')
if n: if n:
data = ox.web.imdb.read_url(n, unicode=True) data = ox.web.imdb.read_url(n, unicode=True)
@ -89,7 +111,7 @@ if __name__ == '__main__':
if added > 1000: if added > 1000:
added = 0 added = 0
write(films, filename) write(films, filename)
year += 1
print('>> year', year) print('>> year', year)
year += 1
write(films, filename) write(films, filename)

View file

@ -12,29 +12,30 @@ def load(data_json):
import item.models as models import item.models as models
import archive.models import archive.models
import os import os
archive.models.File.objects.all().delete() with transaction.atomic():
archive.models.Instance.objects.all().delete() archive.models.File.objects.all().delete()
archive.models.Volume.objects.all().delete() archive.models.Instance.objects.all().delete()
models.Item.objects.all().delete() archive.models.Volume.objects.all().delete()
reset_table(archive.models.File._meta.db_table) models.Item.objects.all().delete()
reset_table(archive.models.Instance._meta.db_table) reset_table(archive.models.File._meta.db_table)
reset_table(archive.models.Volume._meta.db_table) reset_table(archive.models.Instance._meta.db_table)
reset_table(models.Item._meta.db_table) reset_table(archive.models.Volume._meta.db_table)
transaction.commit_unless_managed() reset_table(models.Item._meta.db_table)
os.system('rm -r /srv/pandora/data/media') with transaction.atomic():
os.system('rm -r /srv/pandora/data/items') os.system('rm -r /srv/pandora/data/media')
os.system('rm -r /srv/pandora/data/items')
films = json.load(open(data_json))
for data in sorted(films, key=lambda f: (f['year'], f['title'], f.get('director', []))):
item = models.Item()
item.data = data
item.save()
item.make_poster()
item.make_icon()
item.level = 2
item.save()
print(item)
films = json.load(open(data_json))
for data in films:
item = models.Item()
item.data = data
item.save()
item.make_poster(True)
item.make_icon()
item.level = 2
item.save()
print item
if __name__ == '__main__': if __name__ == '__main__':
print 'please import from ./manage.py and run import_json.load(path_to_json)' print('please import from ./manage.py and run import_json.load(path_to_json)')