98 lines
2.9 KiB
Python
98 lines
2.9 KiB
Python
import requests
|
|
import json
|
|
import os
|
|
import subprocess
|
|
|
|
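'''
Setup: align_text() expects a Gentle forced-alignment server listening on
localhost:8765. To install podman and start one:

apt-get install -y podman
podman run -P -p 8765:8765 lowerquality/gentle
'''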
|
|
|
|
|
|
def load_subs():
    """Download the voice-over transcripts and write one text file per section.

    Each listed textb.org page is fetched and split into sections at ``##``.
    The first line of a section is used as its title, the remaining lines as
    its transcript. Every section with a non-empty transcript is written to
    ``/srv/t_for_time/vo/<batch>_<title>.txt``, where ``<batch>`` is the part
    of the page name that follows ``subtitles_`` (e.g. ``1_melodic``).

    Raises:
        requests.RequestException: if a page cannot be fetched in time or the
            server answers with an HTTP error status.
        OSError: if a transcript file cannot be written.
    """
    urls = (
        'https://textb.org/r/t_for_time_subtitles_1_melodic/',
        'https://textb.org/r/t_for_time_subtitles_2_whispered/',
        'https://textb.org/r/t_for_time_subtitles_3_free/',
        'https://textb.org/r/t_for_time_subtitles_4_read/',
        'https://textb.org/r/t_for_time_subtitles_5_ashley/',
    )
    for url in urls:
        # A timeout keeps the job from hanging forever on a stalled server.
        response = requests.get(url, timeout=60)
        # Fail loudly instead of splitting an error page into bogus transcripts.
        response.raise_for_status()
        print(url)
        prefix = '/srv/t_for_time/vo/' + url.split('/')[-2].split('subtitles_')[-1]
        for part in response.text.strip().split('##'):
            lines = part.strip().split('\n')
            # strip() drops a trailing '\r' (CRLF pages) or spaces so the
            # title can be used verbatim in the file name.
            title = lines[0].strip()
            text = "\n".join(lines[1:]).strip()
            if not text:
                # Title-only or empty section: nothing to align.
                continue
            fname = '%s_%s.txt' % (prefix, title)
            # Explicit encoding: do not depend on the server's locale.
            with open(fname, 'w', encoding='utf-8') as fd:
                fd.write(text)
|
|
|
|
|
|
def gentle2subtitles(align):
    """Convert an alignment result into one subtitle entry per text block.

    The transcript is split into blocks at blank lines (CRLF blank lines if
    the transcript contains any, LF blank lines otherwise). For every
    non-empty block, the words whose character offsets fall inside the block
    are scanned to find the time of the first and of the last aligned word.

    Args:
        align: dict with a ``'transcript'`` string and a ``'words'`` list.
            Each word needs ``'startOffset'`` and ``'endOffset'`` (positions
            in the transcript); words not marked ``'not-found-in-audio'``
            also need ``'start'`` and ``'end'`` (presumably seconds, as
            reported by the aligner - confirm against its documentation).

    Returns:
        A list of ``{'in': ..., 'out': ..., 'value': ...}`` dicts in
        transcript order. ``'value'`` is the block text with CRLF normalised
        to LF. ``'in'`` and ``'out'`` stay ``-1`` when no usable word was
        found at or after the block.
    """
    transcript = align['transcript']
    new_block = '\r\n\r\n'
    if new_block not in transcript:
        new_block = '\n\n'

    data = []
    offset = 0
    for block in transcript.split(new_block):
        start = offset
        end = start + len(block)
        # Advance past the block and its separator *before* deciding to skip
        # it: empty blocks (three or more consecutive newlines) still occupy
        # characters in the transcript, and ignoring them would shift the
        # offset window of every later block.
        offset = end + len(new_block)
        if not block.strip():
            continue

        in_ = -1
        out_ = -1
        for word in align['words']:
            if word['startOffset'] < start:
                # Word belongs to an earlier block.
                continue
            if word.get('case') == 'not-found-in-audio':
                # No timing information available for this word.
                continue
            if in_ == -1:
                # NOTE: this runs before the range check below, so a block
                # without any aligned word inherits the times of the first
                # aligned word that follows it.
                in_ = word['start']
                out_ = word['end']
            if word['endOffset'] > end:
                # Reached the next block.
                break
            if 'end' in word:
                out_ = word['end']
            if word['endOffset'] == end:
                # Last word of this block.
                break

        data.append({
            'in': in_, 'out': out_, 'value': block.replace('\r\n', '\n')
        })
    return data
|
|
|
|
|
|
def align_text(txt, wav):
    """Align a transcript file with an audio file and return the parsed result.

    Both files are uploaded as multipart form fields by ``curl`` to the
    ``/transcriptions`` endpoint on localhost:8765 (with ``async=false``);
    the response body is decoded and parsed as JSON.

    Args:
        txt: path of the transcript file (sent as form field ``transcript``).
        wav: path of the audio file (sent as form field ``audio``).

    Returns:
        The decoded JSON response.

    Raises:
        subprocess.CalledProcessError: if curl exits with a non-zero status.
        ValueError: if the response body is not valid JSON.
    """
    endpoint = 'http://localhost:8765/transcriptions?async=false'
    audio_field = 'audio=@' + wav
    transcript_field = 'transcript=@%s' % txt
    command = ['curl', '-s', '-F', audio_field, '-F', transcript_field, endpoint]
    raw = subprocess.check_output(command)
    return json.loads(raw.decode())
|
|
|
|
|
|
def update_subtitles():
    """Refresh the transcripts and rebuild subtitles for all voice-over items.

    Calls ``load_subs()`` to re-download the transcript files, then walks
    every item whose type is ``Voice Over``. For each item that has a
    matching transcript file and a selected media file, the transcript is
    aligned against the audio and the resulting blocks are passed to
    ``add_annotations`` for the ``subtitles`` layer as user ``j``.

    Items without a transcript file are skipped silently; items without a
    selected file are skipped with a message instead of aborting the run.
    """
    # Imported inside the function, as in the original; presumably these need
    # the configured project environment - TODO confirm.
    import item.models
    from annotation.tasks import add_annotations

    load_subs()
    for i in item.models.Item.objects.filter(data__type=['Voice Over']):
        # The transcript id is the part of the title before the first '_'.
        vo_id = i.get('title').split('_')[0]
        # Drop the first five characters of the batch name and normalise the
        # rest to match the file names written by load_subs().
        batch = i.get('batch')[0][5:].lower().replace('-', '_').replace(' ', '')
        txt = '/srv/t_for_time/vo/%s_%s.txt' % (batch, vo_id)
        if not os.path.exists(txt):
            continue
        # Look up the audio only once a transcript is known to exist.
        try:
            wav = i.files.filter(selected=True)[0].data.path
        except IndexError:
            print('no selected file, skipping', i)
            continue
        print(i, wav, txt)
        subtitles = gentle2subtitles(align_text(txt, wav))
        add_annotations({
            'item': i.public_id,
            'layer': 'subtitles',
            'user': 'j',
            'annotations': subtitles
        })
|