pandora_t_for_time/align_subtitles.py
2023-11-14 20:04:33 +01:00

98 lines
2.9 KiB
Python

import requests
import json
import os
import subprocess
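'''
Setup: align_text() expects a Gentle alignment server listening on
http://localhost:8765. One way to start it:

    apt-get install -y podman
    podman run -P -p 8765:8765 lowerquality/gentle
'''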
def load_subs():
    """Download the voice-over transcripts and write one text file per section.

    Every page listed below is fetched and its text is split on ``##``.
    Within each section the first line is used as the title and the
    remaining lines as the transcript. Sections with a non-empty transcript
    are written to ``/srv/t_for_time/vo/<batch>_<title>.txt`` where
    ``<batch>`` is the part of the page name after ``subtitles_``
    (for example ``1_melodic``).

    Raises:
        requests.RequestException: if a page cannot be fetched or the server
            answers with an HTTP error status.
        OSError: if a transcript file cannot be written.
    """
    urls = (
        'https://textb.org/r/t_for_time_subtitles_1_melodic/',
        'https://textb.org/r/t_for_time_subtitles_2_whispered/',
        'https://textb.org/r/t_for_time_subtitles_3_free/',
        'https://textb.org/r/t_for_time_subtitles_4_read/',
        'https://textb.org/r/t_for_time_subtitles_5_ashley/',
    )
    for url in urls:
        response = requests.get(url, timeout=30)
        # Fail loudly instead of writing an error page out as transcripts.
        response.raise_for_status()
        print(url)
        # '.../t_for_time_subtitles_1_melodic/' -> '1_melodic'
        batch = url.split('/')[-2].split('subtitles_')[-1]
        prefix = '/srv/t_for_time/vo/' + batch
        for part in response.text.strip().split('##'):
            lines = part.strip().split('\n')
            # Strip the title so a trailing '\r' or space cannot leak into
            # the file name.
            title = lines[0].strip()
            text = "\n".join(lines[1:]).strip()
            if not text:
                continue
            fname = '%s_%s.txt' % (prefix, title)
            with open(fname, 'w', encoding='utf-8') as fd:
                fd.write(text)
def gentle2subtitles(align):
    """Turn an alignment result into one timed subtitle per transcript block.

    The transcript is split into blocks on blank lines (CRLF CRLF when the
    transcript contains it, LF LF otherwise). For every block that is not
    empty, the words whose ``startOffset``/``endOffset`` fall inside the
    block are used to derive the block's in and out time. The offsets are
    treated as character positions in ``align['transcript']``.

    Args:
        align: dict with a ``'transcript'`` string and a ``'words'`` list.
            Each word is a dict with ``'startOffset'`` and ``'endOffset'``;
            words that are used also need ``'start'`` (and normally
            ``'end'``). Words whose ``'case'`` is ``'not-found-in-audio'``
            are ignored.

    Returns:
        A list of ``{'in': ..., 'out': ..., 'value': ...}`` dicts, one per
        non-empty block, in transcript order. ``'value'`` is the block text
        with CRLF normalised to LF. ``'in'`` and ``'out'`` stay ``-1`` when
        no usable word was found for the block.
    """
    transcript = align['transcript']
    new_block = '\r\n\r\n'
    if new_block not in transcript:
        new_block = '\n\n'
    data = []
    end = 0
    for block in transcript.split(new_block):
        start = end
        end = start + len(block)
        if block.strip():
            in_ = -1
            out_ = -1
            for word in align['words']:
                # Words that belong to earlier blocks.
                if word['startOffset'] < start:
                    continue
                if word.get('case') == 'not-found-in-audio':
                    continue
                if in_ == -1:
                    # First usable word fixes the in point and a first
                    # guess for the out point.
                    in_ = word['start']
                    out_ = word['end']
                if word['endOffset'] > end:
                    break
                if 'end' in word:
                    out_ = word['end']
                if word['endOffset'] == end:
                    break
            data.append({
                'in': in_, 'out': out_, 'value': block.replace('\r\n', '\n')
            })
        # Always account for the block and its separator, even when the
        # block is empty; otherwise every later block is matched against
        # offsets that are too small and picks up earlier words.
        end += len(new_block)
    return data
def align_text(txt, wav):
    """Align a transcript file with an audio file via the local server.

    Runs ``curl`` to upload both files as multipart form fields (``audio``
    and ``transcript``) to ``http://localhost:8765/transcriptions`` with
    ``async=false`` and parses the response body as JSON.

    Args:
        txt: path of the transcript text file.
        wav: path of the audio file.

    Returns:
        The decoded JSON response.

    Raises:
        subprocess.CalledProcessError: if curl exits with a non-zero status.
        json.JSONDecodeError: if the response is not valid JSON.
    """
    endpoint = 'http://localhost:8765/transcriptions?async=false'
    form_fields = ['-F', 'audio=@' + wav, '-F', 'transcript=@%s' % txt]
    raw = subprocess.check_output(['curl', '-s'] + form_fields + [endpoint])
    return json.loads(raw.decode())
def update_subtitles():
    """Refresh the transcripts and rebuild subtitles for all voice-over items.

    Calls load_subs() first, then walks every item whose type is
    'Voice Over'. For each item the transcript path is derived from the
    leading part of its title (up to the first '_') and from its first
    batch value (first five characters dropped, lower-cased, '-' replaced
    by '_', spaces removed). When that transcript file exists, the item's
    selected audio file is aligned against it and the result is stored in
    the 'subtitles' layer as user 'j'.

    NOTE(review): the five dropped characters are presumably a fixed batch
    prefix -- confirm against the batch values in use.
    """
    import item.models
    from annotation.tasks import add_annotations

    load_subs()
    voice_overs = item.models.Item.objects.filter(data__type=['Voice Over'])
    for vo in voice_overs:
        wav = vo.files.filter(selected=True)[0].data.path
        clip = vo.get('title').split('_')[0]
        batch = vo.get('batch')[0][5:].lower().replace('-', '_').replace(' ', '')
        txt = '/srv/t_for_time/vo/%s_%s.txt' % (batch, clip)
        if not os.path.exists(txt):
            # No transcript for this item; leave it untouched.
            continue
        print(vo, wav, txt)
        alignment = align_text(txt, wav)
        add_annotations({
            'item': vo.public_id,
            'layer': 'subtitles',
            'user': 'j',
            'annotations': gentle2subtitles(alignment)
        })