#!/usr/bin/python3
"""Render subtitle files (JSON and optionally SRT) for item JSON files.

Subtitle annotations are fetched through the ``ox`` API client from
``base_url`` and cached in ``subtitles.json``.  For every item JSON file a
set of bilingual and single-language subtitle files is written below
``public/``.
"""
import os
import sys
import json
import re
import subprocess
from collections import defaultdict
import string
from glob import glob
from copy import deepcopy

import ox
import ox.web.auth

base_url = 'http://127.0.0.1:2620'

# NOTE(review): the following constants are not referenced in this file;
# presumably they are shared with sibling scripts -- confirm before removing.
FRAME_DURATION = 1/60
MAX_DURATION = 40

HIDDEN_TAGS = [
    "women with white males",
    "gene z hanrahan"
]

# items to not use at all
BLACKLIST = [
    'XN'
]

# Lazily created API client, see get_api().
api = None

# Detects the two-letter language code of a ``span lang="xx"`` wrapper.
_SPAN_LANG = re.compile('span lang="(..)"')
# Matches opening and closing <span> tags so they can be stripped.
_SPAN_TAG = re.compile(r'</?span[^>]*>')


def get_api():
    """Create and sign in the module-level ``api`` client on first use.

    The client is stored in the global ``api``; nothing is returned.
    """
    global api
    if api is None:
        api = ox.API(base_url + '/api/')
        api.signin(**ox.web.auth.get('cdosea'))


def update_subtitles():
    """Fetch all items tagged 'Vocal' together with their subtitle layer.

    Returns:
        A list of item dicts with the keys ``id``, ``title`` and
        ``subtitles`` (the ``subtitles`` layer returned by ``api.get``).
    """
    get_api()
    items = api.find({
        'query': {
            'conditions': [{'key': 'tags', 'value': 'Vocal', 'operator': '=='}]
        },
        'keys': ['id', 'title'],
        'range': [0, 1000]
    })['data']['items']
    for item in items:
        # Disabled media lookup, kept for reference:
        # info = api.findMedia({
        #     'query': {
        #         'conditions': [
        #             {'key': 'id', 'operator': '==', 'value': item['id']}
        #         ]
        #     },
        #     'keys': ['id', 'extension'],
        #     'range': [0, 1]
        # })['data']['items'][0]
        item['subtitles'] = api.get({
            'id': item['id'],
            'keys': ['layers']
        })['data']['layers']['subtitles']
    return items


def get_subtitles(items, id):
    """Return a deep copy of the subtitles of the first matching item.

    Args:
        items: list of item dicts as returned by update_subtitles().
        id: title prefix to look for.

    Returns:
        A deep copy of the ``subtitles`` list of the first item whose title
        starts with ``id``, or ``None`` if no item matches.
    """
    for item in items:
        if item['title'].startswith(id):
            return deepcopy(item['subtitles'])
    return None


def _parse_value(raw):
    """Return ``(language, text)`` for one raw subtitle annotation value."""
    found = _SPAN_LANG.findall(raw)
    # NOTE(review): the literal removed here appears in the source as a bare
    # line break; confirm whether '<br>' markup was meant instead.
    text = raw.replace('\n', '').strip()
    if found:
        # The original chained two ``replace('', '')`` calls here, which are
        # no-ops; strip the <span ...> wrapper so markup does not end up in
        # the rendered subtitles.
        return found[0], _SPAN_TAG.sub('', text).strip()
    # Values without a language span are treated as English.
    return 'en', text


def _collect_subs(item, subtitle_items, lang):
    """Collect the subtitles of all vocal clips, shifted to item time.

    Returns a dict keyed by ``'<in>-<out>'`` (3 decimals).  Each entry holds
    ``in``, ``out`` and a list ``value``; when ``lang`` is a list the entry
    additionally tracks ``ids`` and ``languages``.
    """
    bilingual = isinstance(lang, list)
    subs = {}
    position = 0
    for clip in item['vocals']:
        if not clip.get('blank'):
            # vocals/A/A4_chaton.wav -> 'A4'
            clip_prefix = clip['path'].split('/')[-1][:2]
            clip_subtitles = get_subtitles(subtitle_items, clip_prefix)
            if clip_subtitles is None:
                # Previously this failed with an opaque AttributeError on
                # ``None.sort``; name the clip that could not be resolved.
                raise ValueError(
                    'no subtitles found for clip %r (prefix %r)'
                    % (clip['path'], clip_prefix)
                )
            clip_subtitles.sort(key=lambda c: (c['in'], c['out'], c['id']))

            for sub in clip_subtitles:
                sub_in = float('%0.3f' % (sub['in'] + position))
                sub_out = float('%0.3f' % (sub['out'] + position))
                sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                if sub_id not in subs:
                    subs[sub_id] = {
                        'in': sub_in,
                        'out': sub_out,
                        'value': [],
                    }
                    if bilingual:
                        subs[sub_id]['ids'] = []
                        subs[sub_id]['languages'] = []
                slang, value = _parse_value(sub['value'])
                # just use strip_tags?
                # value = ox.strip_tags(ox.decode_html(sub['value']))
                if bilingual and slang in lang:
                    # The first requested language always goes on top.
                    if lang.index(slang) == 0:
                        subs[sub_id]['value'].insert(0, value)
                    else:
                        subs[sub_id]['value'].append(value)
                    subs[sub_id]['ids'].append(sub['id'])
                    subs[sub_id]['languages'].append(slang)
                elif slang == lang:
                    subs[sub_id]['value'].append(value)
        # NOTE(review): nesting reconstructed from flattened source; blank
        # clips are assumed to advance the timeline as well -- confirm.
        position += clip['duration']
    return subs


def _split_overlapping(subs, lang):
    """Split overlapping bilingual subtitles so both languages line up.

    Only runs if at least one non-empty subtitle does not carry exactly the
    languages in ``lang``.  Every subtitle that intersects another one is cut
    at the intersection points; the resulting segments are filled with the
    values of the subtitles covering them.  Afterwards the processed source
    subtitles that still hold a single language are removed.  ``subs`` is
    modified in place.
    """
    wanted = set(lang)
    needs_fix = any(
        set(sub['languages']) != wanted and sub['value']
        for sub in subs.values()
    )
    if not needs_fix:
        return

    remove = []
    # Iterate over a snapshot: new segments are added to subs while looping.
    for key, sub in list(subs.items()):
        intersections = []
        for other in list(subs.values()):
            intersections += overlaps(sub, other)
        if not intersections:
            continue
        points = sorted(set([sub['in'], sub['out']] + intersections))
        sub_in = points[0]
        for sub_out in points[1:]:
            sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
            if sub_id not in subs:
                subs[sub_id] = {
                    'in': sub_in,
                    'out': sub_out,
                    'value': [],
                    'ids': [],
                    'languages': []
                }
            if not sub['value']:
                continue
            segment = subs[sub_id]
            if set(segment['languages']) != wanted:
                if not segment['languages']:
                    segment['value'] += sub['value']
                    segment['languages'] += sub['languages']
                    segment['ids'] += sub['ids']
                elif segment['languages'] == [lang[0]] \
                        and sub['languages'][0] not in segment['languages']:
                    # Segment has the first language: add the other below.
                    segment['value'].append(sub['value'][0])
                    segment['languages'].append(sub['languages'][0])
                    segment['ids'] += sub['ids']
                elif segment['languages'] == [lang[1]] \
                        and sub['languages'][0] not in segment['languages']:
                    # Segment has the second language: add the other on top.
                    segment['value'].insert(0, sub['value'][0])
                    segment['languages'].insert(0, sub['languages'][0])
                    segment['ids'] += sub['ids']
            sub_in = sub_out
        remove.append(key)

    for key in remove:
        if len(subs[key]['languages']) == 1:
            del subs[key]


def _merge_consecutive(subtitles):
    """Merge or trim consecutive subtitles that overlap in time.

    An overlapping subtitle with the same value extends the previous one;
    with a different value the previous one is cut at the start of the next.
    """
    merged = []
    previous = None
    for sub in subtitles:
        if previous and previous['out'] > sub['in']:
            if previous['value'] == sub['value']:
                previous['out'] = max(previous['out'], sub['out'])
                continue
            previous['out'] = sub['in']
        merged.append(sub)
        previous = sub
    return merged


def render_subtitles(item_json, output_json, output_srt, lang,
                     subtitle_items=None):
    """Render the subtitles of one item to JSON and optionally SRT.

    Args:
        item_json: path of the item JSON file; its ``vocals`` list is read.
        output_json: path of the JSON file to write.
        output_srt: path of the SRT file to write, or a falsy value to skip.
        lang: a language code (only that language is rendered) or a list of
            two language codes (both are rendered, the first one on top).
        subtitle_items: items as returned by update_subtitles().  Defaults
            to the module-level ``items`` set up by the ``__main__`` block.
    """
    if subtitle_items is None:
        subtitle_items = items

    with open(item_json, encoding='utf-8') as fd:
        item = json.load(fd)

    subs = _collect_subs(item, subtitle_items, lang)
    if isinstance(lang, list):
        _split_overlapping(subs, lang)

    # Drop subtitles that are at most ~40 ms long.
    # NOTE(review): nesting reconstructed from flattened source; this filter
    # is assumed to apply to single-language output as well -- confirm.
    for key, sub in list(subs.items()):
        if abs(sub['out'] - sub['in']) <= 0.040001:
            del subs[key]

    subtitles = []
    for sub in sorted(subs.values(), key=lambda c: (c['in'], c['out'])):
        sub['value'] = '\n'.join(sub['value'])
        if sub['value'].strip():
            subtitles.append(sub)

    subtitles = _merge_consecutive(subtitles)

    if output_srt:
        with open(output_srt, 'wb') as fd:
            fd.write(ox.srt.encode(subtitles))
    # ensure_ascii=False emits non-ASCII text verbatim, so the encoding must
    # not depend on the platform default.
    with open(output_json, 'w', encoding='utf-8') as fd:
        json.dump(subtitles, fd, indent=4, ensure_ascii=False, sort_keys=True)


def overlaps(src, other):
    """Return the in/out points of ``other`` that fall inside ``src``.

    Both ranges are compared after rounding to two decimals; ranges that are
    equal after rounding yield no points.  The returned points are the
    unrounded ``other['in']`` / ``other['out']`` values.
    """
    src_in = float('%0.2f' % src['in'])
    src_out = float('%0.2f' % src['out'])
    other_in = float('%0.2f' % other['in'])
    other_out = float('%0.2f' % other['out'])
    points = []
    if src_in != other_in or src_out != other_out:
        # src inside
        if other_in >= src_in and other_in < src_out:
            points += [other['in']]
        if other_out > src_in and other_out <= src_out:
            points += [other['out']]
    return points


if __name__ == '__main__':
    # Use the cached subtitles if present, otherwise fetch and cache them.
    if os.path.exists('subtitles.json'):
        with open('subtitles.json', encoding='utf-8') as fd:
            items = json.load(fd)
    else:
        items = update_subtitles()
        with open('subtitles.json', 'w', encoding='utf-8') as fd:
            json.dump(items, fd, indent=4, ensure_ascii=False, sort_keys=True)

    if len(sys.argv) > 1:
        files = sys.argv[1:]
    else:
        files = glob('output/*/*.json')
    for item_json in files:
        # output/<dir>/<name>.json -> public/<first letter of name><dir>.
        prefix = 'public/' + item_json.split('/')[-1][0].lower() + item_json.split('/')[-2] + '.'
        output_json = prefix + '1080p.json'
        output_srt = prefix + '1080p.srt'
        render_subtitles(item_json, output_json, output_srt, ['ko', 'en'], items)
        output_json = prefix + 'no-en.json'
        output_srt = prefix + 'no-en.srt'
        render_subtitles(item_json, output_json, output_srt, ['no', 'en'], items)
        for lang in ('en', 'ko', 'no'):
            output_json = prefix + lang + '.json'
            render_subtitles(item_json, output_json, None, lang, items)