"""Dump the entries served by ``ox.web.ubu`` that carry an mp4 to ``ubu.json``.

For every id returned by ``ox.web.ubu.get_ids()`` the record from
``ox.web.ubu.get_data()`` is fetched.  Records with an ``'mp4'`` key are
cleaned up and collected; records without one are printed to stdout and
left out of the export.  The collected records are written as a JSON list
(UTF-8, 2-space indent, non-ASCII characters kept as-is).

The script runs unchanged on Python 2 and Python 3.
"""
import codecs
import json
import re

import ox
import ox.web.ubu


def _clean_description(description):
    """Return *description* as normalised unicode text.

    Byte strings are decoded as latin-1 first so that the regular
    expressions below always operate on text (mixing a text pattern with a
    bytes subject raises ``TypeError`` on Python 3).  latin-1 maps every
    byte to the code point of the same value, so decoding before the
    substitutions yields the same result as decoding after them.
    """
    if isinstance(description, bytes):
        description = description.decode('latin-1')
    # Drop the spaces that directly surround a line break.
    description = re.sub(' *\n *', '\n', description)
    # Turn every run of line breaks into exactly one blank line.
    description = re.sub('\n+', '\n\n', description)
    return ox.fix_bad_unicode(description)


# NOTE(review): the original source was collapsed onto a single line, so the
# nesting below is reconstructed from the statement order -- the `else` is
# taken to pair with the 'mp4' check, and the 'mp4' rewrite is taken to
# apply whether or not a description is present.  Confirm against the
# author's copy.
data = {}
for ubu_id in ox.web.ubu.get_ids():
    info = ox.web.ubu.get_data(ubu_id)
    if 'mp4' not in info:
        # Records without an mp4 are only reported, never exported.
        print(info)
        continue
    if 'description' in info:
        info['description'] = _clean_description(info['description'])
    # Percent-encode spaces in the mp4 value (presumably a URL -- verify).
    info['mp4'] = info['mp4'].replace(' ', '%20')
    data[ubu_id] = info

with codecs.open('ubu.json', 'w', 'utf-8') as f:
    # list(...) because a Python 3 dict view is not JSON-serialisable.
    json.dump(list(data.values()), f, indent=2, ensure_ascii=False)