parse year from index, fix import
This commit is contained in:
parent
322d63f234
commit
b876eef0d0
3 changed files with 62 additions and 30 deletions
|
@ -17,11 +17,20 @@ def add_metadata(films, country, output):
|
||||||
with open(output) as fd:
|
with open(output) as fd:
|
||||||
meta = json.load(fd)
|
meta = json.load(fd)
|
||||||
|
|
||||||
known_ids = set([f['imdbId'] for f in meta])
|
ignore = output + '.ignored'
|
||||||
|
if os.path.exists(ignore):
|
||||||
|
with open(ignore) as fd:
|
||||||
|
ignored = fd.read().strip().split('\n')
|
||||||
|
else:
|
||||||
|
ignored = []
|
||||||
|
|
||||||
|
known_ids = set([f['imdbId'] for f in meta] + ignored)
|
||||||
|
|
||||||
def save():
|
def save():
|
||||||
with codecs.open(output, 'w', encoding='utf-8') as fd:
|
with codecs.open(output, 'w', encoding='utf-8') as fd:
|
||||||
json.dump(meta, fd, indent=1, ensure_ascii=False)
|
json.dump(meta, fd, indent=1, ensure_ascii=False)
|
||||||
|
with open(ignore, 'w') as fd:
|
||||||
|
fd.write('\n'.join(ignored))
|
||||||
|
|
||||||
for info in films:
|
for info in films:
|
||||||
if info['imdbId'] in known_ids:
|
if info['imdbId'] in known_ids:
|
||||||
|
@ -53,12 +62,12 @@ def add_metadata(films, country, output):
|
||||||
if y:
|
if y:
|
||||||
y = int(y)
|
y = int(y)
|
||||||
if '(????)' in info.get('title', '') or not y or y >= current_year:
|
if '(????)' in info.get('title', '') or not y or y >= current_year:
|
||||||
info['delete'] = True
|
ignored.append(info['imdbId'])
|
||||||
print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
|
print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
|
||||||
continue
|
continue
|
||||||
if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
|
if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
|
||||||
info['delete'] = True
|
ignored.append(info['imdbId'])
|
||||||
print('deleting', info['imdbId'], info.get('title'))
|
print('ignoring', info['imdbId'], info.get('title'))
|
||||||
continue
|
continue
|
||||||
if 'originalTitle' in extra:
|
if 'originalTitle' in extra:
|
||||||
info['alternativeTitles'] = [[info['title'], '']]
|
info['alternativeTitles'] = [[info['title'], '']]
|
||||||
|
|
|
@ -21,10 +21,11 @@ def reset_url(url):
|
||||||
|
|
||||||
def write(films, filename):
|
def write(films, filename):
|
||||||
data = []
|
data = []
|
||||||
for id, title in films.items():
|
for id, film in films.items():
|
||||||
data.append({
|
data.append({
|
||||||
'imdbId': id,
|
'imdbId': id,
|
||||||
'title': title
|
'title': film[0],
|
||||||
|
'year': film[1],
|
||||||
})
|
})
|
||||||
|
|
||||||
with codecs.open(filename, 'w', encoding='utf-8') as fd:
|
with codecs.open(filename, 'w', encoding='utf-8') as fd:
|
||||||
|
@ -54,6 +55,7 @@ if __name__ == '__main__':
|
||||||
added = 0
|
added = 0
|
||||||
|
|
||||||
while year < datetime.now().year:
|
while year < datetime.now().year:
|
||||||
|
print('<<', year)
|
||||||
url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
|
url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
|
||||||
|
|
||||||
data = ox.web.imdb.read_url(url, unicode=True)
|
data = ox.web.imdb.read_url(url, unicode=True)
|
||||||
|
@ -70,6 +72,25 @@ if __name__ == '__main__':
|
||||||
article = article[0]
|
article = article[0]
|
||||||
else:
|
else:
|
||||||
n = None
|
n = None
|
||||||
|
for header in article.find_class('lister-item-header'):
|
||||||
|
a = header.xpath('.//a')[0]
|
||||||
|
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
|
||||||
|
title = a.text_content()
|
||||||
|
try:
|
||||||
|
fully = y = header.find_class('lister-item-year')[0].text_content()
|
||||||
|
y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
|
||||||
|
if not y:
|
||||||
|
y = year
|
||||||
|
else:
|
||||||
|
y = int(y)
|
||||||
|
except:
|
||||||
|
print(n)
|
||||||
|
print(header.find_class('lister-item-year')[0].text_content())
|
||||||
|
raise
|
||||||
|
if id not in films:
|
||||||
|
films[id] = (title, y)
|
||||||
|
added += 1
|
||||||
|
'''
|
||||||
for a in article.xpath('.//a'):
|
for a in article.xpath('.//a'):
|
||||||
if '/title/tt' in a.attrib['href']:
|
if '/title/tt' in a.attrib['href']:
|
||||||
img = a.xpath('.//img')
|
img = a.xpath('.//img')
|
||||||
|
@ -80,6 +101,7 @@ if __name__ == '__main__':
|
||||||
title = ox.decode_html(title)
|
title = ox.decode_html(title)
|
||||||
films[id] = title
|
films[id] = title
|
||||||
added += 1
|
added += 1
|
||||||
|
'''
|
||||||
print(len(films), 'films')
|
print(len(films), 'films')
|
||||||
if n:
|
if n:
|
||||||
data = ox.web.imdb.read_url(n, unicode=True)
|
data = ox.web.imdb.read_url(n, unicode=True)
|
||||||
|
@ -89,7 +111,7 @@ if __name__ == '__main__':
|
||||||
if added > 1000:
|
if added > 1000:
|
||||||
added = 0
|
added = 0
|
||||||
write(films, filename)
|
write(films, filename)
|
||||||
year += 1
|
|
||||||
print('>> year', year)
|
print('>> year', year)
|
||||||
|
year += 1
|
||||||
|
|
||||||
write(films, filename)
|
write(films, filename)
|
||||||
|
|
|
@ -12,29 +12,30 @@ def load(data_json):
|
||||||
import item.models as models
|
import item.models as models
|
||||||
import archive.models
|
import archive.models
|
||||||
import os
|
import os
|
||||||
archive.models.File.objects.all().delete()
|
with transaction.atomic():
|
||||||
archive.models.Instance.objects.all().delete()
|
archive.models.File.objects.all().delete()
|
||||||
archive.models.Volume.objects.all().delete()
|
archive.models.Instance.objects.all().delete()
|
||||||
models.Item.objects.all().delete()
|
archive.models.Volume.objects.all().delete()
|
||||||
reset_table(archive.models.File._meta.db_table)
|
models.Item.objects.all().delete()
|
||||||
reset_table(archive.models.Instance._meta.db_table)
|
reset_table(archive.models.File._meta.db_table)
|
||||||
reset_table(archive.models.Volume._meta.db_table)
|
reset_table(archive.models.Instance._meta.db_table)
|
||||||
reset_table(models.Item._meta.db_table)
|
reset_table(archive.models.Volume._meta.db_table)
|
||||||
transaction.commit_unless_managed()
|
reset_table(models.Item._meta.db_table)
|
||||||
os.system('rm -r /srv/pandora/data/media')
|
with transaction.atomic():
|
||||||
os.system('rm -r /srv/pandora/data/items')
|
os.system('rm -r /srv/pandora/data/media')
|
||||||
|
os.system('rm -r /srv/pandora/data/items')
|
||||||
|
|
||||||
|
films = json.load(open(data_json))
|
||||||
|
for data in sorted(films, key=lambda f: (f['year'], f['title'], f.get('director', []))):
|
||||||
|
item = models.Item()
|
||||||
|
item.data = data
|
||||||
|
item.save()
|
||||||
|
item.make_poster()
|
||||||
|
item.make_icon()
|
||||||
|
item.level = 2
|
||||||
|
item.save()
|
||||||
|
print(item)
|
||||||
|
|
||||||
films = json.load(open(data_json))
|
|
||||||
for data in films:
|
|
||||||
item = models.Item()
|
|
||||||
item.data = data
|
|
||||||
item.save()
|
|
||||||
item.make_poster(True)
|
|
||||||
item.make_icon()
|
|
||||||
item.level = 2
|
|
||||||
item.save()
|
|
||||||
print item
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print 'please import from ./manage.py and run import_json.load(path_to_json)'
|
print('please import from ./manage.py and run import_json.load(path_to_json)')
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue