26 lines
787 B
Python
26 lines
787 B
Python
import json
|
|
import re
|
|
import ox
|
|
import ox.web.ubu
|
|
import codecs
|
|
|
|
# Export script: fetch every record id from ox.web.ubu, clean up the records
# that carry an 'mp4' entry, and write them to ubu.json as a JSON list.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost; in particular the "no 'mp4'" branch is
# assumed to pair with the 'mp4' check -- confirm against the original.

# Patterns are compiled once, outside the loop.
# First pattern: spaces directly around a line break.
# Second pattern: any run of consecutive line breaks.
_SPACES_AROUND_NEWLINE = re.compile(r' *\n *')
_NEWLINE_RUN = re.compile(r'\n+')

ids = ox.web.ubu.get_ids()

# Maps record id -> cleaned info dict; only the values are written out.
data = {}
for item_id in ids:  # not named `id`, to avoid shadowing the builtin
    info = ox.web.ubu.get_data(item_id)
    if 'mp4' not in info:
        # Records without an 'mp4' entry are printed and left out of the export.
        print(info)
        continue
    if 'description' in info:
        description = info['description']
        # Decode byte strings before any regex work so the text is handled
        # as unicode throughout. `bytes` is `str` on Python 2, so this is the
        # same check as before there, and it is also valid on Python 3.
        if isinstance(description, bytes):
            description = description.decode('latin-1')
        # Strip spaces around line breaks, then turn every run of line breaks
        # into exactly one blank line.
        description = _SPACES_AROUND_NEWLINE.sub('\n', description)
        description = _NEWLINE_RUN.sub('\n\n', description)
        info['description'] = ox.fix_bad_unicode(description)
    # Percent-encode spaces in the 'mp4' value.
    info['mp4'] = info['mp4'].replace(' ', '%20')
    data[item_id] = info

with codecs.open('ubu.json', 'w', 'utf-8') as f:
    # list(...) because a dict view is not JSON-serializable on Python 3.
    json.dump(list(data.values()), f, indent=2, ensure_ascii=False)
|