pandora_prelinger/get_data.py

44 lines
1.6 KiB
Python

# Requires the third-party "internetarchive" package: pip install internetarchive
import json
import internetarchive
# Crawl the archive.org "prelinger" collection and write one JSON record per
# item that offers an MP4 derivative to prelinger.json.  Each record holds the
# item id, the download URL and size of the chosen MP4 file, plus whichever of
# the METADATA_KEYS fields the item defines.

# Derivative formats accepted as the MP4 source, tried in this order; the
# first format that matches at least one file of the item wins.
VIDEO_FORMATS = ('h.264', 'MPEG4', '512Kb MPEG4', 'HiRes MPEG4')

# Item metadata fields copied into the exported record when present and
# non-empty.  Every entry needs its own trailing comma: adjacent string
# literals are silently concatenated by Python, which previously turned
# 'language' and 'date' into the single bogus key 'languagedate'.
METADATA_KEYS = (
    'title', 'description', 'year',
    'publisher', 'addeddate', 'sound',
    'creator', 'color', 'credits',
    'sponsor', 'uploader', 'licenseurl', 'subject',
    'language',
    # needed?
    'date',
)

data = {}
# NOTE(review): newer releases of the internetarchive package appear to expose
# searching as internetarchive.search_items(query) and require a session for
# search.Search -- confirm against the installed version before changing this.
for result in internetarchive.search.Search('collection:prelinger'):
    item = internetarchive.get_item(result['identifier'])

    # Pick the files of the first preferred format that the item provides.
    files = []
    for fmt in VIDEO_FORMATS:
        files = [f for f in item.files if f.get('format') == fmt]
        if files:
            break

    if files:
        # .get() so that an item without a title does not abort the crawl.
        print(item.metadata.get('title'))
        print('https://archive.org/details/%s' % item.identifier)
        url = 'https://archive.org/download/%s/%s' % (item.identifier, files[0]['name'])
        print(url)

        record = {
            'id': item.identifier,
            'mp4': url,
            'mp4_size': files[0].get('size'),
        }
        for key in METADATA_KEYS:
            value = item.metadata.get(key)
            if not value:
                continue
            # Drop one pair of enclosing square brackets, e.g. '[1950]'.
            if value[0] == '[' and value[-1] == ']':
                value = value[1:-1]
            record[key] = value
        data[item.identifier] = record
    else:
        # No usable MP4: report which formats the item does offer, for debugging.
        formats = sorted({f['format'] for f in item.files if 'format' in f})
        if formats:
            print(item.identifier, item.files)
            print(formats)

# Written once, after the whole collection has been processed.
with open('prelinger.json', 'w') as f:
    json.dump(list(data.values()), f, indent=2)