# pandora_ubu/get_data.py
#
# 26 lines
# 787 B
# Python

import json
import re
import ox
import ox.web.ubu
import codecs
def _clean_description(text):
    """Return *text* decoded, with tidied line breaks and repaired unicode.

    Byte strings are decoded as latin-1. Spaces directly around a line
    break are removed, every run of line breaks is replaced by exactly
    two, and the result is passed through ``ox.fix_bad_unicode``.
    """
    # Decode first so the regexes below always run on text: a text pattern
    # applied to a bytes subject is a TypeError on Python 3. `bytes` is an
    # alias of `str` on Python 2, so this is the same check as before there.
    if isinstance(text, bytes):
        text = text.decode('latin-1')
    text = re.sub(' *\n *', '\n', text)
    text = re.sub('\n+', '\n\n', text)
    return ox.fix_bad_unicode(text)


# Collect the record of every id that has an 'mp4' entry and dump the
# records to ubu.json as a JSON list.
# NOTE(review): the nesting below was reconstructed from a flattened paste;
# the `print` branch is assumed to belong to the 'mp4' test -- confirm.
ids = ox.web.ubu.get_ids()
data = {}
for item_id in ids:
    info = ox.web.ubu.get_data(item_id)
    if 'mp4' not in info:
        # Records without an 'mp4' entry are printed for inspection and
        # left out of the output file.
        print(info)
        continue
    if 'description' in info:
        info['description'] = _clean_description(info['description'])
    # Percent-encode spaces in the mp4 value.
    info['mp4'] = info['mp4'].replace(' ', '%20')
    data[item_id] = info

with codecs.open('ubu.json', 'w', 'utf-8') as f:
    # list(...) because a Python 3 dict view is not JSON serialisable;
    # on Python 2 values() already is a list, so the output is unchanged.
    json.dump(list(data.values()), f, indent=2, ensure_ascii=False)