#!/usr/bin/python3
"""Render per-language and bilingual subtitle files for rendered items.

For every item JSON (as found under ``<prefix>/output/*/*.json``) the vocal
clips are looked up in a cached copy of the subtitle annotations
(``subtitles.json``), shifted to the clip position on the timeline and
written out as ``.json`` and ``.srt`` files.
"""
from argparse import ArgumentParser
from collections import defaultdict
from copy import deepcopy
from glob import glob
import json
import os
import re
import string
import subprocess
import sys

import ox
import ox.web.auth


base_url = 'http://127.0.0.1:2620'

FRAME_DURATION = 1/60
MAX_DURATION = 40

HIDDEN_TAGS = [
    "women with white males",
    "gene z hanrahan"
]

# items to not use at all
BLACKLIST = [
    'XN'
]

api = None

# Extracts the two-letter language code from a `<span lang="xx">` wrapper.
_LANG_RE = re.compile('span lang="(..)"')
# Matches opening and closing span tags so the wrapper can be removed.
_SPAN_TAG_RE = re.compile(r'</?span[^>]*>')


def get_api():
    """Create and sign in the shared API client on first use.

    The client is stored in the module-level ``api`` and also returned.
    """
    global api
    if not api:
        api = ox.API(base_url + '/api/')
        api.signin(**ox.web.auth.get('cdosea'))
    return api


def update_subtitles():
    """Fetch all items tagged 'Vocal' together with their subtitle layer.

    Returns a list of dicts with the keys ``id``, ``title`` and
    ``subtitles``.
    """
    get_api()
    items = api.find({
        'query': {
            'conditions': [{'key': 'tags', 'value': 'Vocal', 'operator': '=='}]
        },
        'keys': ['id', 'title'],
        'range': [0, 1000]
    })['data']['items']
    for item in items:
        # Disabled lookup of the media extension, kept for reference:
        # info = api.findMedia({
        #     'query': {
        #         'conditions': [
        #             {'key': 'id', 'operator': '==', 'value': item['id']}
        #         ]
        #     },
        #     'keys': ['id', 'extension'],
        #     'range': [0, 1]
        # })['data']['items'][0]
        item['subtitles'] = api.get({
            'id': item['id'],
            'keys': ['layers']
        })['data']['layers']['subtitles']
    return items


def get_subtitles(items, id):
    """Return a deep copy of the subtitles of the first item whose title
    starts with ``id``, or None if no item matches.

    A copy is returned so callers can sort and mutate the list freely.
    """
    for item in items:
        if item['title'].startswith(id):
            return deepcopy(item['subtitles'])
    return None


def load_items(path='subtitles.json'):
    """Load the cached subtitle items from ``path``.

    If the cache file does not exist, the items are fetched through the API
    and the cache file is written.
    """
    if os.path.exists(path):
        with open(path) as fd:
            return json.load(fd)
    items = update_subtitles()
    with open(path, 'w') as fd:
        json.dump(items, fd, indent=4, ensure_ascii=False, sort_keys=True)
    return items


def render_subtitles(item_json, output_json, output_srt, lang, items=None):
    """Render the subtitles of one item to JSON and (optionally) SRT.

    item_json:   path of the item description; its ``vocals`` list is read.
    output_json: path of the JSON file to write.
    output_srt:  path of the SRT file to write; skipped when falsy.
    lang:        a language code (single-language output) or a list of two
                 language codes (bilingual output, ``lang[0]`` first).
    items:       subtitle items as returned by ``update_subtitles``;
                 loaded via ``load_items()`` when omitted.

    Raises ValueError if a non-blank vocal clip has no matching item.
    """
    if items is None:
        items = load_items()
    with open(item_json) as fd:
        item = json.load(fd)

    bilingual = isinstance(lang, list)
    subtitles = []
    position = 0
    subs = {}
    for clip in item['vocals']:
        if not clip.get('blank'):
            # vocals/A/A4_chaton.wav -> 'A4'
            clip_id = clip['path'].split('/')[-1][:2]
            clip_subtitles = get_subtitles(items, clip_id)
            if clip_subtitles is None:
                raise ValueError(
                    'no subtitles found for clip %r (title prefix %r)'
                    % (clip['path'], clip_id)
                )
            clip_subtitles.sort(key=lambda c: (c['in'], c['out'], c['id']))
            for sub in clip_subtitles:
                sub_in = float('%0.3f' % (sub['in'] + position))
                sub_out = float('%0.3f' % (sub['out'] + position))
                sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                if sub_id not in subs:
                    subs[sub_id] = {
                        'in': sub_in,
                        'out': sub_out,
                        'value': [],
                    }
                    if bilingual:
                        subs[sub_id]['ids'] = []
                        subs[sub_id]['languages'] = []

                slang = _LANG_RE.findall(sub['value'])
                # NOTE(review): only newline characters are removed here;
                # confirm whether HTML line breaks need stripping too.
                value = sub['value'].replace('\n', '').strip()
                if slang:
                    slang = slang[0]
                    # Drop the <span lang=".."> wrapper so no markup ends
                    # up in the rendered subtitle text.
                    value = _SPAN_TAG_RE.sub('', value).strip()
                else:
                    # Untagged subtitles are treated as English.
                    slang = 'en'
                # just use strip_tags?
                # value = ox.strip_tags(ox.decode_html(sub['value']))

                if bilingual and slang in lang:
                    # The first requested language goes on the first line.
                    if lang.index(slang) == 0:
                        subs[sub_id]['value'].insert(0, value)
                    else:
                        subs[sub_id]['value'].append(value)
                    subs[sub_id]['ids'].append(sub['id'])
                    subs[sub_id]['languages'].append(slang)
                elif slang == lang:
                    subs[sub_id]['value'].append(value)
        # Blank clips carry no subtitles but still take up time.
        position += clip['duration']

    if bilingual:
        # Entries that have text but lack one of the requested languages:
        # split overlapping entries at each other's boundaries so both
        # languages can be combined per segment.
        fixme = [
            sub for sub in subs.values()
            if set(sub['languages']) != set(lang) and sub['value']
        ]
        if fixme:
            remove = []
            for key, sub in list(subs.items()):
                intersections = []
                for s in list(subs.values()):
                    intersections += overlaps(sub, s)
                if intersections:
                    points = sorted(set([sub['in'], sub['out']] + intersections))
                    sub_in = points[0]
                    for sub_out in points[1:]:
                        sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                        if sub_id not in subs:
                            subs[sub_id] = {
                                'in': sub_in,
                                'out': sub_out,
                                'value': [],
                                'ids': [],
                                'languages': []
                            }
                        # NOTE(review): skipping here also skips the
                        # sub_in advance below — confirm this is intended.
                        if not sub['value']:
                            continue
                        segment = subs[sub_id]
                        if set(segment['languages']) != set(lang):
                            if not segment['languages']:
                                segment['value'] += sub['value']
                                segment['languages'] += sub['languages']
                                segment['ids'] += sub['ids']
                            elif segment['languages'] == [lang[0]] \
                                    and sub['languages'][0] not in segment['languages']:
                                segment['value'].append(sub['value'][0])
                                segment['languages'].append(sub['languages'][0])
                                segment['ids'] += sub['ids']
                            elif segment['languages'] == [lang[1]] \
                                    and sub['languages'][0] not in segment['languages']:
                                segment['value'].insert(0, sub['value'][0])
                                segment['languages'].insert(0, sub['languages'][0])
                                segment['ids'] += sub['ids']
                        sub_in = sub_out
                    # NOTE(review): block structure reconstructed; only
                    # entries that were split are candidates for removal.
                    remove.append(key)
            # Split entries that still carry a single language are dropped
            # in favour of their segments.
            for key in remove:
                if len(subs[key]['languages']) == 1:
                    del subs[key]
            # Drop segments of 40 ms or less (with float tolerance).
            for key, sub in list(subs.items()):
                if abs(sub['out'] - sub['in']) <= 0.040001:
                    del subs[key]

    subs = sorted(subs.values(), key=lambda c: (c['in'], c['out']))
    for sub in subs:
        sub['value'] = '\n'.join(sub['value'])
        if sub['value'].strip():
            subtitles.append(sub)

    # Resolve remaining overlaps: identical consecutive texts are merged,
    # otherwise the earlier subtitle is cut at the start of the next one.
    merged = []
    previous = None
    for sub in subtitles:
        if previous and previous['out'] > sub['in']:
            if previous['value'] == sub['value']:
                previous['out'] = max(previous['out'], sub['out'])
                continue
            previous['out'] = sub['in']
        merged.append(sub)
        previous = sub
    subtitles = merged

    if output_srt:
        with open(output_srt, 'wb') as fd:
            fd.write(ox.srt.encode(subtitles))
    with open(output_json, 'w') as fd:
        json.dump(subtitles, fd, indent=4, ensure_ascii=False, sort_keys=True)


def overlaps(src, other):
    """Return the boundaries of ``other`` that fall inside ``src``.

    Both ranges are compared after rounding to two decimals; ranges that are
    equal after rounding yield no points. The returned points are the
    unrounded ``in``/``out`` values of ``other``.
    """
    src_in = float('%0.2f' % src['in'])
    src_out = float('%0.2f' % src['out'])
    other_in = float('%0.2f' % other['in'])
    other_out = float('%0.2f' % other['out'])
    points = []
    if src_in != other_in or src_out != other_out:
        # other starts inside src
        if src_in <= other_in < src_out:
            points.append(other['in'])
        # other ends inside src
        if src_in < other_out <= src_out:
            points.append(other['out'])
    return points


def main():
    """Render all subtitle variants for the given (or discovered) items."""
    usage = "usage: %(prog)s [options] json"
    parser = ArgumentParser(usage=usage)
    parser.add_argument('-p', '--prefix', dest='prefix',
                        help='version prefix', default='.')
    parser.add_argument('files', metavar='path', type=str, nargs='*',
                        help='json files')
    opts = parser.parse_args()

    items = load_items()

    files = opts.files
    if not files:
        files = glob(os.path.join(opts.prefix, 'output/*/*.json'))
        files = [f for f in files if 'gong' not in f]

    for item_json in files:
        parts = item_json.split('/')
        prefix = 'public/' + parts[-1][0].lower() + parts[-2] + '.'
        prefix = os.path.join(opts.prefix, prefix)

        # Bilingual variants; the first language is rendered on top.
        for name, langs in (
            ('1080p', ['ko', 'en']),
            ('no-en', ['no', 'en']),
            ('ch-en', ['ch', 'en']),
        ):
            render_subtitles(item_json, prefix + name + '.json',
                             prefix + name + '.srt', langs, items=items)

        # Single-language variants.
        for lang in ('en', 'ko', 'no', 'ch'):
            render_subtitles(item_json, prefix + lang + '.json',
                             prefix + lang + '.srt', lang, items=items)


if __name__ == '__main__':
    main()