subtitle alignment
This commit is contained in:
parent
4de39b926a
commit
6823002315
1 changed files with 98 additions and 0 deletions
98
align_subtitles.py
Normal file
98
align_subtitles.py
Normal file
|
@ -0,0 +1,98 @@
|
|||
import requests
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
'''
|
||||
apt-get install -y podman
|
||||
podman run -P -p 8765:8765 lowerquality/gentle
|
||||
'''
|
||||
|
||||
|
||||
def load_subs():
    """Download the voice-over transcripts and write one text file per part.

    Every page listed below is fetched and split on ``##`` into parts. The
    first line of a part is its title, the remaining lines are its text.
    Each part with non-empty text is written to
    ``/srv/t_for_time/vo/<batch>_<title>.txt``, where ``<batch>`` is the
    piece of the URL slug that follows ``subtitles_``.

    Raises:
        requests.RequestException: if a page cannot be fetched, times out,
            or answers with an HTTP error status.
        OSError: if a transcript file cannot be written.
    """
    urls = (
        'https://textb.org/r/t_for_time_subtitles_1_melodic/',
        'https://textb.org/r/t_for_time_subtitles_2_whispered/',
        'https://textb.org/r/t_for_time_subtitles_3_free/',
        'https://textb.org/r/t_for_time_subtitles_4_read/',
        'https://textb.org/r/t_for_time_subtitles_5_ashley/',
    )
    for url in urls:
        response = requests.get(url, timeout=60)
        # Fail loudly instead of saving an error page as a transcript.
        response.raise_for_status()
        print(url)
        # ".../t_for_time_subtitles_1_melodic/" -> "1_melodic"
        prefix = '/srv/t_for_time/vo/' + url.split('/')[-2].split('subtitles_')[-1]
        for part in response.text.strip().split('##'):
            lines = part.strip().split('\n')
            # Strip the title so a trailing space or CR never ends up in
            # the file name.
            title = lines[0].strip()
            text = '\n'.join(lines[1:]).strip()
            if not text:
                # Heading without a body (or an empty part): nothing to save.
                continue
            fname = '%s_%s.txt' % (prefix, title)
            # Explicit encoding: do not depend on the process locale.
            with open(fname, 'w', encoding='utf-8') as fd:
                fd.write(text)
|
||||
|
||||
|
||||
def gentle2subtitles(align):
    """Convert a Gentle alignment result into timed subtitle blocks.

    The transcript is split into blocks on blank lines (CRLF CRLF when the
    transcript contains it, otherwise LF LF). For every non-blank block the
    words whose ``startOffset`` is at or after the block's start position
    are scanned, and the block receives the start time of its first usable
    word and the end time of its last one.

    Args:
        align: dict with a ``'transcript'`` string and a ``'words'`` list.
            Each word carries ``'startOffset'`` / ``'endOffset'`` (compared
            here against character positions in the transcript) and, when
            it was located in the audio, ``'start'`` / ``'end'`` times.

    Returns:
        A list of ``{'in': ..., 'out': ..., 'value': ...}`` dicts, one per
        non-blank block, in transcript order. ``'in'`` and ``'out'`` stay
        ``-1`` when no usable word is found at or after the block. CRLF
        line breaks inside ``'value'`` are normalised to LF.
    """
    transcript = align['transcript']
    new_block = '\r\n\r\n'
    if new_block not in transcript:
        new_block = '\n\n'

    data = []
    end = 0
    for block in transcript.split(new_block):
        start = end
        end = start + len(block)
        if block.strip():
            in_ = -1
            out_ = -1
            for word in align['words']:
                if word['startOffset'] < start:
                    # Word belongs to an earlier block.
                    continue
                if word.get('case') == 'not-found-in-audio':
                    # No timing information for this word.
                    continue
                if in_ == -1:
                    # The first usable word seeds both times. This happens
                    # before the end-of-block check, so a block without any
                    # aligned word borrows the times of the next aligned
                    # word.
                    in_ = word['start']
                    out_ = word['end']
                if word['endOffset'] > end:
                    break
                if 'end' in word:
                    out_ = word['end']
                if word['endOffset'] == end:
                    break
            data.append({
                'in': in_,
                'out': out_,
                'value': block.replace('\r\n', '\n'),
            })
        # Always step over the separator, also for blank blocks that are
        # skipped. Otherwise every later block would be matched against
        # offsets that are too small.
        end += len(new_block)
    return data
|
||||
|
||||
|
||||
def align_text(txt, wav):
    """Send a transcript and its audio to the local Gentle server.

    Both files are uploaded with ``curl`` as multipart form fields to the
    synchronous transcription endpoint on ``localhost:8765``.

    Args:
        txt: path of the transcript text file.
        wav: path of the audio file.

    Returns:
        The server's response, parsed from JSON.

    Raises:
        subprocess.CalledProcessError: if ``curl`` exits with a non-zero
            status.
        json.JSONDecodeError: if the response is not valid JSON.
    """
    audio_field = 'audio=@' + wav
    transcript_field = 'transcript=@%s' % txt
    endpoint = 'http://localhost:8765/transcriptions?async=false'
    response = subprocess.check_output([
        'curl', '-s',
        '-F', audio_field,
        '-F', transcript_field,
        endpoint,
    ])
    return json.loads(response.decode())
|
||||
|
||||
|
||||
def update_subtitles():
    """Regenerate the subtitle annotations of all voice-over items.

    The transcripts are downloaded again first. Then, for every item whose
    type is 'Voice Over', the matching transcript file is looked up. Its
    name is built from the item's batch and from the part of the title
    before the first underscore. Items without a transcript file are
    skipped; the others are aligned against their selected audio file and
    the result is handed to ``add_annotations`` for the 'subtitles' layer.
    """
    # Project imports stay local, as in the original.
    import item.models
    from annotation.tasks import add_annotations

    load_subs()
    voice_overs = item.models.Item.objects.filter(data__type=['Voice Over'])
    for voice_over in voice_overs:
        wav = voice_over.files.filter(selected=True)[0].data.path
        number = voice_over.get('title').split('_')[0]
        batch_name = voice_over.get('batch')[0][5:]
        batch = batch_name.lower().replace('-', '_').replace(' ', '')
        txt = '/srv/t_for_time/vo/%s_%s.txt' % (batch, number)
        if not os.path.exists(txt):
            # No transcript for this item.
            continue
        print(voice_over, wav, txt)
        alignment = align_text(txt, wav)
        subtitles = gentle2subtitles(alignment)
        add_annotations({
            'item': voice_over.public_id,
            'layer': 'subtitles',
            'user': 'j',
            'annotations': subtitles
        })
|
Loading…
Reference in a new issue