pandora_cdosea/subtitles.py

250 lines
9.1 KiB
Python
Executable file

#!/usr/bin/python3
from argparse import ArgumentParser
from collections import defaultdict
from copy import deepcopy
from glob import glob
import json
import os
import re
import string
import subprocess
import sys
import ox
import ox.web.auth
# Base URL of the server; get_api() appends '/api/' to it to build the API endpoint.
base_url = 'http://127.0.0.1:2620'
# NOTE(review): FRAME_DURATION and MAX_DURATION are not referenced in the code
# visible in this file; presumably seconds per frame and a duration cap in
# seconds -- confirm units and usage before relying on them.
FRAME_DURATION = 1/60
MAX_DURATION = 40
# NOTE(review): not referenced in the code visible in this file; presumably
# tags that should not be shown -- confirm against the consumers.
HIDDEN_TAGS = [
    "women with white males",
    "gene z hanrahan"
]
# items to not use at all
# NOTE(review): not referenced in the code visible in this file.
BLACKLIST = [
    'XN'
]
# Shared API client; stays None until get_api() creates and signs it in.
api = None
def get_api():
    """Create and sign in the shared module-level ``api`` client on first use.

    Calls made while ``api`` already holds a truthy client do nothing.
    Nothing is returned; callers read the ``api`` global directly.
    """
    global api
    if api:
        return
    # Bind the global before signing in, so the client exists even while
    # the credentials are being looked up.
    api = ox.API(base_url + '/api/')
    credentials = ox.web.auth.get('cdosea')
    api.signin(**credentials)
def update_subtitles():
    """Fetch the items tagged 'Vocal' and attach each one's subtitles layer.

    Ensures the shared API client exists, requests up to 1000 items
    (keys ``id`` and ``title``), then issues one ``get`` per item to read
    its ``layers`` and stores ``layers['subtitles']`` under
    ``item['subtitles']``.

    Returns:
        The list of item dicts as returned by the API, each extended with
        a ``subtitles`` entry.
    """
    get_api()
    request = {
        'query': {
            'conditions': [{'key': 'tags', 'value': 'Vocal', 'operator': '=='}]
        },
        'keys': ['id', 'title'],
        'range': [0, 1000],
    }
    items = api.find(request)['data']['items']
    for item in items:
        # A per-item api.findMedia lookup (keys 'id' and 'extension',
        # range [0, 1]) used to sit here; it is disabled.
        response = api.get({'id': item['id'], 'keys': ['layers']})
        item['subtitles'] = response['data']['layers']['subtitles']
    return items
def get_subtitles(items, id):
    """Return a private copy of the subtitles of the first matching item.

    Args:
        items: Sequence of dicts carrying ``title`` and ``subtitles`` keys.
        id: Title prefix to look for.

    Returns:
        A deep copy of ``subtitles`` from the first item whose title starts
        with ``id``, or ``None`` when no title matches.
    """
    found = next(
        (entry for entry in items if entry['title'].startswith(id)),
        None,
    )
    if found is None:
        return None
    # Deep copy so callers can sort or mutate without touching the cache.
    return deepcopy(found['subtitles'])
# Matches the two-letter language code of a <span lang="xx"> wrapper.
# Compiled once instead of once per subtitle.
_SPAN_LANG_RE = re.compile('span lang="(..)"')


def render_subtitles(item_json, output_json, output_srt, lang):
    """Build the subtitle track for one rendered item and write it to disk.

    The item's ``vocals`` clips are walked in order; every non-blank clip
    contributes the subtitles of the source item whose title starts with the
    first two characters of the clip's file name, shifted by the running
    playback position. Subtitles are looked up in the module-level ``items``
    list (populated by the ``__main__`` block), so that global must exist
    before this function is called.

    Args:
        item_json: Path of the JSON file describing the rendered item; it
            must contain a ``vocals`` list of clips with ``duration`` and,
            for non-blank clips, ``path``.
        output_json: Path the resulting subtitle list is written to as JSON.
        output_srt: Path for the SRT rendering; falsy to skip SRT output.
        lang: Either a single language code (only cues in that language are
            kept) or a list of two codes (cues are combined per time span,
            with ``lang[0]`` placed first).

    Cues without a ``<span lang="..">`` wrapper are treated as ``'en'``.
    """
    with open(item_json, encoding='utf-8') as fd:
        item = json.load(fd)

    multilingual = isinstance(lang, list)
    subtitles = []
    position = 0
    # Cues keyed by 'in-out' (3 decimals) so the same span in different
    # languages lands in one entry.
    subs = {}
    for clip in item['vocals']:
        if not clip.get('blank'):
            # vocals/A/A4_chaton.wav
            clip_id = clip['path'].split('/')[-1][:2]
            clip_subtitles = get_subtitles(items, clip_id)
            if clip_subtitles is None:
                # No source item matches this clip: report it instead of
                # failing on None.sort() and keep the timeline advancing.
                print('no subtitles found for %s' % clip['path'], file=sys.stderr)
                clip_subtitles = []
            clip_subtitles.sort(key=lambda c: (c['in'], c['out'], c['id']))

            for sub in clip_subtitles:
                sub_in = float('%0.3f' % (sub['in'] + position))
                sub_out = float('%0.3f' % (sub['out'] + position))
                sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                if sub_id not in subs:
                    subs[sub_id] = {
                        'in': sub_in,
                        'out': sub_out,
                        'value': [],
                    }
                    if multilingual:
                        subs[sub_id]['ids'] = []
                        subs[sub_id]['languages'] = []
                slang = _SPAN_LANG_RE.findall(sub['value'])
                value = sub['value'].replace('<br>', '').strip()
                if slang:
                    slang = slang[0]
                    value = value.replace('<span lang="' + slang + '">', '').replace('</span>', '').strip()
                else:
                    slang = 'en'
                # just use strip_tags?
                # value = ox.strip_tags(ox.decode_html(sub['value']))
                if multilingual and slang in lang:
                    # The primary language always goes on the first line.
                    if lang.index(slang) == 0:
                        subs[sub_id]['value'].insert(0, value)
                    else:
                        subs[sub_id]['value'].append(value)
                    subs[sub_id]['ids'].append(sub['id'])
                    subs[sub_id]['languages'].append(slang)
                elif slang == lang:
                    subs[sub_id]['value'].append(value)
        # Blank clips carry no subtitles but still take up playback time.
        position += clip['duration']

    if multilingual:
        # Only split cues when some non-empty cue lacks one of the languages,
        # i.e. the languages were timed differently.
        needs_split = any(
            set(sub['languages']) != set(lang) and sub['value']
            for sub in subs.values()
        )
        if needs_split:
            remove = []
            # Iterate over a snapshot: split segments are added to subs below.
            for key, sub in list(subs.items()):
                intersections = []
                for s in list(subs.values()):
                    intersections += overlaps(sub, s)
                if intersections:
                    # Cut this cue at every boundary of an overlapping cue
                    # and distribute its text over the resulting segments.
                    points = sorted(set([sub['in'], sub['out']] + intersections))
                    sub_in = points[0]
                    for sub_out in points[1:]:
                        sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                        if sub_id not in subs:
                            subs[sub_id] = {
                                'in': sub_in,
                                'out': sub_out,
                                'value': [],
                                'ids': [],
                                'languages': []
                            }
                        if not sub['value']:
                            continue
                        if set(subs[sub_id]['languages']) != set(lang):
                            if not subs[sub_id]['languages']:
                                subs[sub_id]['value'] += sub['value']
                                subs[sub_id]['languages'] += sub['languages']
                                subs[sub_id]['ids'] += sub['ids']
                            elif subs[sub_id]['languages'] == [lang[0]] \
                                    and sub['languages'][0] not in subs[sub_id]['languages']:
                                subs[sub_id]['value'].append(sub['value'][0])
                                subs[sub_id]['languages'].append(sub['languages'][0])
                                subs[sub_id]['ids'] += sub['ids']
                            elif subs[sub_id]['languages'] == [lang[1]] \
                                    and sub['languages'][0] not in subs[sub_id]['languages']:
                                subs[sub_id]['value'].insert(0, sub['value'][0])
                                subs[sub_id]['languages'].insert(0, sub['languages'][0])
                                subs[sub_id]['ids'] += sub['ids']
                        sub_in = sub_out
                    remove.append(key)
            # A split cue that still has only one language has been replaced
            # by its segments; drop the original.
            for key in remove:
                if len(subs[key]['languages']) == 1:
                    del subs[key]

    # Drop cues of (about) 40 ms or less.
    # NOTE(review): applied in every language mode -- confirm it was not
    # meant for the multilingual branch only.
    for key, sub in list(subs.items()):
        if abs(sub['out'] - sub['in']) <= 0.040001:
            del subs[key]

    subs = sorted(subs.values(), key=lambda c: (c['in'], c['out']))
    for sub in subs:
        sub['value'] = '\n'.join(sub['value'])
        if sub['value'].strip():
            subtitles.append(sub)

    # Resolve remaining overlaps between consecutive cues: identical text is
    # merged into one cue, different text cuts the earlier cue short.
    merged = []
    p = None
    for sub in subtitles:
        if not p:
            merged.append(sub)
            p = sub
        elif p['out'] > sub['in']:
            if p['value'] == sub['value']:
                p['out'] = max(p['out'], sub['out'])
            else:
                p['out'] = sub['in']
                merged.append(sub)
                p = sub
        else:
            merged.append(sub)
            p = sub
    subtitles = merged

    if output_srt:
        with open(output_srt, 'wb') as fd:
            fd.write(ox.srt.encode(subtitles))
    # ensure_ascii=False emits raw non-ASCII text, so pin the encoding
    # rather than depend on the locale default.
    with open(output_json, 'w', encoding='utf-8') as fd:
        json.dump(subtitles, fd, indent=4, ensure_ascii=False, sort_keys=True)
def overlaps(src, other):
    """Return the boundaries of ``other`` that fall inside ``src``.

    Both spans are dicts with ``in`` and ``out`` keys. Comparisons are made
    on values rounded to two decimals, but the returned points are the
    unrounded values taken from ``other``.

    Returns:
        An empty list when both spans are equal after rounding; otherwise a
        list holding ``other['in']`` if it lies in ``[src in, src out)`` and
        ``other['out']`` if it lies in ``(src in, src out]``.
    """
    src_in, src_out, other_in, other_out = (
        float('%0.2f' % span[edge])
        for span in (src, other)
        for edge in ('in', 'out')
    )
    if src_in == other_in and src_out == other_out:
        return []
    points = []
    if src_in <= other_in < src_out:
        points.append(other['in'])
    if src_in < other_out <= src_out:
        points.append(other['out'])
    return points
# Script entry point: load (or fetch and cache) the subtitle items, then
# render every requested item in each language combination.
if __name__ == '__main__':
    usage = "usage: %(prog)s [options] json"
    parser = ArgumentParser(usage=usage)
    parser.add_argument('-p', '--prefix', dest='prefix',
                        help='version prefix', default='.')
    parser.add_argument('files', metavar='path', type=str, nargs='*', help='json files')
    opts = parser.parse_args()

    # subtitles.json in the working directory caches the API result;
    # render_subtitles() reads the module-level `items` set here.
    if os.path.exists('subtitles.json'):
        with open('subtitles.json', encoding='utf-8') as fd:
            items = json.load(fd)
    else:
        items = update_subtitles()
        # ensure_ascii=False writes raw non-ASCII text, so pin the encoding.
        with open('subtitles.json', 'w', encoding='utf-8') as fd:
            json.dump(items, fd, indent=4, ensure_ascii=False, sort_keys=True)

    files = opts.files
    if not files:
        files = glob(os.path.join(opts.prefix, 'output/*/*.json'))
        # NOTE(review): the 'gong' filter is applied to globbed files only;
        # confirm explicitly passed paths are meant to bypass it.
        files = [f for f in files if 'gong' not in f]

    # Output suffix -> language selection, rendered in this order.
    variants = [
        ('1080p', ['ko', 'en']),
        ('no-en', ['no', 'en']),
        ('en', 'en'),
        ('ko', 'ko'),
        ('no', 'no'),
    ]
    for item_json in files:
        # public/<first letter of file name, lowercased><parent directory>.
        parts = item_json.split('/')
        prefix = 'public/' + parts[-1][0].lower() + parts[-2] + '.'
        prefix = os.path.join(opts.prefix, prefix)
        for suffix, lang in variants:
            output_json = prefix + suffix + '.json'
            output_srt = prefix + suffix + '.srt'
            render_subtitles(item_json, output_json, output_srt, lang)