"""Align voice-over transcripts with their audio and store them as subtitles.

The alignment is done by a Gentle forced-aligner server that must be
listening on localhost:8765, for example:

    apt-get install -y podman
    podman run -P -p 8765:8765 lowerquality/gentle
"""
import json
import os
import subprocess

# Pages holding the transcripts; each page contains '##'-separated sections.
SUBTITLE_URLS = (
    'https://textb.org/r/t_for_time_subtitles_1_melodic/',
    'https://textb.org/r/t_for_time_subtitles_2_whispered/',
    'https://textb.org/r/t_for_time_subtitles_3_free/',
    'https://textb.org/r/t_for_time_subtitles_4_read/',
    'https://textb.org/r/t_for_time_subtitles_5_ashley/',
)

# Path prefix under which the per-section transcript files are written.
VO_PREFIX = '/srv/t_for_time/vo/'

# Synchronous transcription endpoint of the local Gentle server.
GENTLE_URL = 'http://localhost:8765/transcriptions?async=false'


def load_subs(urls=SUBTITLE_URLS, prefix=VO_PREFIX):
    """Download the transcript pages and write one text file per section.

    Each page is split on '##'. Within a section the first line is the
    title and the remaining lines are the transcript. A section is written
    to ``<prefix><name>_<title>.txt`` where ``<name>`` is whatever follows
    'subtitles_' in the second-to-last path component of the URL. Sections
    without any transcript text are skipped.

    Args:
        urls: iterable of page URLs to fetch.
        prefix: string prepended to every output file name.

    Raises:
        requests.RequestException: if a page cannot be fetched, times out,
            or answers with an HTTP error status.
    """
    # Imported here so the pure helpers of this module stay importable
    # without the HTTP client being installed.
    import requests

    for url in urls:
        response = requests.get(url, timeout=60)
        # Fail loudly instead of splitting an error page into transcripts.
        response.raise_for_status()
        print(url)
        name = url.split('/')[-2].split('subtitles_')[-1]
        for section in response.text.strip().split('##'):
            lines = section.strip().split('\n')
            # The title ends up in a file name: drop a trailing '\r' (CRLF
            # pages) or stray spaces that would make the file unfindable.
            title = lines[0].strip()
            text = '\n'.join(lines[1:]).strip()
            if not text:
                continue
            fname = '%s%s_%s.txt' % (prefix, name, title)
            with open(fname, 'w', encoding='utf-8') as fd:
                fd.write(text)


def _block_timing(words, start, end):
    """Return ``(in, out)`` times for the transcript span ``[start, end]``.

    Words that begin before ``start`` or that are marked
    'not-found-in-audio' are ignored. Both values are -1 when no usable
    word is found.

    NOTE(review): when none of the block's own words were aligned, the
    first aligned word *after* the block supplies both times. This matches
    the previous behaviour; confirm it is the intended fallback.
    """
    in_ = out_ = -1
    for word in words:
        if word['startOffset'] < start:
            continue
        if word.get('case') == 'not-found-in-audio':
            continue
        if in_ == -1:
            in_ = word['start']
            out_ = word['end']
        if word['endOffset'] > end:
            # First word that reaches past the block: stop scanning.
            break
        if 'end' in word:
            out_ = word['end']
        if word['endOffset'] == end:
            # Last word of the block.
            break
    return in_, out_


def gentle2subtitles(align):
    """Convert a Gentle alignment result into subtitle annotations.

    The transcript is split into blocks on blank lines ('\\r\\n\\r\\n' when
    the transcript contains it, otherwise '\\n\\n'). Every block that is not
    pure whitespace becomes one subtitle spanning the words whose offsets
    fall inside the block.

    Args:
        align: dict with a 'transcript' string and a 'words' list. Each
            word needs 'startOffset' and 'endOffset'; words that are not
            marked 'not-found-in-audio' also need 'start' and 'end'.

    Returns:
        A list of dicts with keys 'in', 'out' and 'value'; 'value' is the
        block text with '\\r\\n' normalised to '\\n'.
    """
    transcript = align['transcript']
    separator = '\r\n\r\n' if '\r\n\r\n' in transcript else '\n\n'
    words = align['words']

    subtitles = []
    offset = 0
    for block in transcript.split(separator):
        start = offset
        end = start + len(block)
        # Advance past this block and its separator even when the block is
        # skipped below; otherwise every later block is matched against a
        # window that lies too early in the transcript.
        offset = end + len(separator)
        if not block.strip():
            continue
        in_, out_ = _block_timing(words, start, end)
        subtitles.append({
            'in': in_, 'out': out_, 'value': block.replace('\r\n', '\n')
        })
    return subtitles


def align_text(txt, wav, url=GENTLE_URL):
    """Send a transcript and its audio to Gentle and return the parsed result.

    Args:
        txt: path of the transcript file.
        wav: path of the audio file.
        url: transcription endpoint to post to.

    Returns:
        The decoded JSON response.

    Raises:
        subprocess.CalledProcessError: if curl exits with a non-zero status.
        json.JSONDecodeError: if the response body is not valid JSON.
    """
    cmd = [
        'curl', '-s',
        '-F', 'audio=@' + wav,
        '-F', 'transcript=@' + txt,
        url,
    ]
    response = subprocess.check_output(cmd).decode()
    return json.loads(response)


def update_subtitles():
    """Refresh the transcripts and re-align every 'Voice Over' item.

    For each matching item the transcript file name is derived from the
    item's batch and title. Items without a transcript file, or without a
    selected file to align against, are skipped. The resulting subtitles
    are passed to ``add_annotations`` for the 'subtitles' layer.
    """
    # Project imports are deferred until the function is actually run.
    import item.models
    from annotation.tasks import add_annotations

    load_subs()
    for i in item.models.Item.objects.filter(data__type=['Voice Over']):
        title_id = i.get('title').split('_')[0]
        # Drops the first five characters of the batch name (presumably a
        # fixed prefix - confirm) and normalises the rest to match the
        # names written by load_subs().
        batch = i.get('batch')[0][5:].lower().replace('-', '_').replace(' ', '')
        txt = '%s%s_%s.txt' % (VO_PREFIX, batch, title_id)
        if not os.path.exists(txt):
            continue
        try:
            wav = i.files.filter(selected=True)[0].data.path
        except IndexError:
            # Nothing to align against; keep processing the other items.
            print(i, 'no selected file, skipping', txt)
            continue
        print(i, wav, txt)
        subtitles = gentle2subtitles(align_text(txt, wav))
        add_annotations({
            'item': i.public_id,
            'layer': 'subtitles',
            'user': 'j',
            'annotations': subtitles
        })