pandora_cdosea/subtitles.py

156 lines
4.9 KiB
Python
Raw Normal View History

2017-05-16 12:59:32 +00:00
#!/usr/bin/python3
import os
import sys
import json
2017-08-25 09:12:32 +00:00
import re
2017-05-16 12:59:32 +00:00
import subprocess
from collections import defaultdict
import string
from glob import glob
from copy import deepcopy
import ox
import ox.web.auth
# Root URL of the server; get_api() appends '/api/' to reach the API endpoint.
base_url = 'http://127.0.0.1:2620'

# NOTE(review): presumably the length of one frame at 60 fps, in seconds --
# no reader of this constant is visible in this part of the file; confirm.
FRAME_DURATION = 1 / 60

# NOTE(review): unit and purpose are not visible here; confirm against users.
MAX_DURATION = 40

# NOTE(review): presumably tags that should not be shown; no reader of this
# list is visible in this part of the file.
HIDDEN_TAGS = [
    "women with white males",
    "gene z hanrahan",
]

# items to not use at all
BLACKLIST = [
    'XN',
]
# API client shared by the functions in this module; created by get_api().
api = None


def get_api():
    """Create and sign in the shared API client on first use.

    Leaves the module-level ``api`` untouched when it already holds a truthy
    value. Otherwise builds an ``ox.API`` for ``base_url + '/api/'`` and signs
    in with the credentials returned by ``ox.web.auth.get('cdosea')``.

    Returns nothing; callers read the ``api`` global afterwards.
    """
    global api
    if api:
        return
    api = ox.API(base_url + '/api/')
    api.signin(**ox.web.auth.get('cdosea'))
2017-05-20 15:32:51 +00:00
def update_subtitles():
    """Fetch every item tagged 'Vocal' together with its subtitle layer.

    Signs in through get_api(), asks the API for the id and title of the
    first 1000 items whose tags equal 'Vocal', then requests each item's
    layers and stores the 'subtitles' layer on the item.

    Returns:
        The items returned by the API, each extended in place with a
        'subtitles' entry.
    """
    get_api()
    query = {
        'query': {
            'conditions': [{'key': 'tags', 'value': 'Vocal', 'operator': '=='}]
        },
        'keys': ['id', 'title'],
        'range': [0, 1000],
    }
    found = api.find(query)['data']['items']
    for entry in found:
        # A findMedia lookup for the file extension used to sit here but was
        # disabled; only the subtitle layer is fetched per item.
        layers = api.get({'id': entry['id'], 'keys': ['layers']})['data']['layers']
        entry['subtitles'] = layers['subtitles']
    return found
def get_subtitles(items, id):
    """Return a private copy of the subtitles of the first matching item.

    Args:
        items: iterable of dicts carrying 'title' and 'subtitles' keys.
        id: prefix that the wanted item's title must start with.

    Returns:
        A deep copy of the 'subtitles' value of the first item whose title
        starts with ``id``, so callers may mutate it freely, or None when no
        title matches.
    """
    match = next(
        (entry for entry in items if entry['title'].startswith(id)),
        None,
    )
    if match is None:
        return None
    return deepcopy(match['subtitles'])
2017-08-25 09:12:32 +00:00
# Captures the two-character code of a 'span lang=".."' language wrapper.
_SPAN_LANG_RE = re.compile('span lang="(..)"')


def _split_language(raw):
    """Return ``(language, text)`` for one raw subtitle value.

    '<br>' markers are removed. When the value contains a 'span lang=".."'
    wrapper, the first such code is the language and that opening tag plus
    every '</span>' are stripped from the text; otherwise the language
    defaults to 'en'.
    """
    text = raw.replace('<br>', '').strip()
    found = _SPAN_LANG_RE.search(raw)
    if found is None:
        return 'en', text
    code = found.group(1)
    # just use strip_tags?
    # value = ox.strip_tags(ox.decode_html(sub['value']))
    text = text.replace('<span lang="' + code + '">', '').replace('</span>', '').strip()
    return code, text


def render_subtitles(item_json, output_json, output_srt, lang):
    """Lay the vocal clips' subtitles on one timeline and write them out.

    Reads the module-level ``items`` (assigned in the ``__main__`` section)
    to look up each clip's subtitles through get_subtitles().

    Args:
        item_json: path of a JSON file with a 'vocals' list. Every clip has
            a 'duration'; clips without a truthy 'blank' also have a 'path'
            whose file name starts with the two-character clip id.
        output_json: path the merged subtitles are written to as JSON.
        output_srt: path for an SRT rendering, or a falsy value to skip it.
        lang: a single language code, or a list of codes. With a list, lines
            in the first listed language are placed on top of each cue and
            the cue also records the contributing 'ids' and 'languages'.

    Subtitles sharing the same (in, out), rounded to milliseconds, are merged
    into one cue whose lines are joined with newlines. With a list ``lang``,
    a warning is printed when any cue does not carry exactly that set of
    languages.

    Raises:
        AttributeError: if get_subtitles() finds no item for a clip and
            returns None.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(item_json, encoding='utf-8') as fd:
        item = json.load(fd)

    multilingual = isinstance(lang, list)
    cues = {}
    position = 0
    for clip in item['vocals']:
        if not clip.get('blank'):
            # e.g. vocals/A/A4_chaton.wav -> 'A4'
            clip_id = clip['path'].split('/')[-1][:2]
            clip_subtitles = get_subtitles(items, clip_id)
            clip_subtitles.sort(key=lambda c: (c['in'], c['out'], c['id']))
            for sub in clip_subtitles:
                # Shift onto the global timeline and round to milliseconds so
                # equal spans produce the same key.
                sub_in = float('%0.3f' % (sub['in'] + position))
                sub_out = float('%0.3f' % (sub['out'] + position))
                sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                if sub_id not in cues:
                    cues[sub_id] = {
                        'in': sub_in,
                        'out': sub_out,
                        'value': [],
                    }
                    if multilingual:
                        cues[sub_id]['ids'] = []
                        cues[sub_id]['languages'] = []
                cue = cues[sub_id]
                slang, value = _split_language(sub['value'])
                if multilingual and slang in lang:
                    if lang.index(slang) == 0:
                        # The primary language always goes on the first line.
                        cue['value'].insert(0, value)
                    else:
                        cue['value'].append(value)
                    cue['ids'].append(sub['id'])
                    cue['languages'].append(slang)
                elif slang == lang:
                    cue['value'].append(value)
        # Blank clips carry no subtitles but still take up time.
        position += clip['duration']

    subtitles = sorted(cues.values(), key=lambda c: (c['in'], c['out']))
    for cue in subtitles:
        cue['value'] = '\n'.join(cue['value'])

    if multilingual:
        expected = set(lang)
        if any(set(cue['languages']) != expected for cue in subtitles):
            print('split/merge overlaps', output_srt, output_json)

    if output_srt:
        with open(output_srt, 'wb') as fd:
            fd.write(ox.srt.encode(subtitles))
    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
    # pinned or the write fails under a non-UTF-8 locale.
    with open(output_json, 'w', encoding='utf-8') as fd:
        json.dump(subtitles, fd, indent=4, ensure_ascii=False, sort_keys=True)
2017-05-16 12:59:32 +00:00
2017-05-21 20:48:54 +00:00
2017-05-16 12:59:32 +00:00
if __name__ == '__main__':
    # Load the cached subtitles if present, otherwise fetch and cache them.
    # `items` must stay a module-level name: render_subtitles() reads it.
    # The cache is written with ensure_ascii=False, so both directions use
    # UTF-8 explicitly instead of the locale default.
    if os.path.exists('subtitles.json'):
        with open('subtitles.json', encoding='utf-8') as fd:
            items = json.load(fd)
    else:
        items = update_subtitles()
        with open('subtitles.json', 'w', encoding='utf-8') as fd:
            json.dump(items, fd, indent=4, ensure_ascii=False, sort_keys=True)

    # Paths given on the command line win; otherwise render every output.
    files = sys.argv[1:] or glob('output/*/*.json')
    for item_json in files:
        parts = item_json.split('/')
        # public/<lowercased first letter of the file name><directory name>.
        prefix = 'public/' + parts[-1][0].lower() + parts[-2] + '.'
        # Bilingual renderings, written as both JSON and SRT.
        render_subtitles(item_json, prefix + '1080p.json', prefix + '1080p.srt', ['ko', 'en'])
        render_subtitles(item_json, prefix + 'no-en.json', prefix + 'no-en.srt', ['no', 'en'])
        # Single-language renderings, JSON only.
        for lang in ('en', 'ko', 'no'):
            render_subtitles(item_json, prefix + lang + '.json', None, lang)