pandora_transcribe/transcribe.py

113 lines
3.2 KiB
Python
Raw Normal View History

2024-07-07 14:32:37 +00:00
import logging
import os
import shutil
import signal
import subprocess
import tempfile
2024-07-07 14:46:41 +00:00
import time
2024-07-07 14:32:37 +00:00
import ox
import ox.iso
from django.conf import settings
from annotation import tasks
from item import utils
2024-07-07 14:46:41 +00:00
from itemlist.models import List
from item.models import Item
from user.models import User
2024-07-07 14:32:37 +00:00
logger = logging.getLogger(__name__)
def extract_subtitles(item, user, layer, translate, gpu=False):
    """Transcribe an item's first stream and queue the result as annotations.

    Runs ``whisper_timestamped`` on the media file of the item's first
    stream, loads the generated ``.srt`` file and hands the subtitles to
    ``tasks.add_annotations``.

    Args:
        item: item to transcribe; its ``data["language"]`` list supplies the
            spoken language and ``streams()`` supplies the media file.
        user: user the annotations are attributed to (``user.username``).
        layer: id of the annotation layer that receives the subtitles.
        translate: container of 2-letter language codes (or a falsy value).
            If the item's language is in it, whisper is run with
            ``--task translate`` and the result is treated as English.
        gpu: when falsy, whisper is told to run with ``--fp16 False``.

    Returns:
        True if the item was processed (including the case where no
        subtitles were detected), False if it was skipped or whisper failed.
    """
    # An empty language list is treated the same as a missing one.
    if "language" not in item.data or not item.data["language"]:
        logger.error("skip item without language %s", item.public_id)
        return False
    language = ox.iso.langTo2Code(item.data["language"][0])
    if not language:
        logger.error("skip item with unknown language %s: %s", item.public_id, item.data["language"])
        return False
    streams = item.streams()
    if not streams:
        logger.error("skip item without media %s", item.public_id)
        return False
    src = streams[0].media.path

    cmd = [
        "/opt/whisper-timestamped/bin/whisper_timestamped",
        "--language", language,
    ]
    if translate and language in translate:
        cmd += ["--task", "translate"]
        # The spoken language was already passed to whisper above; from here
        # on `language` describes the language of the produced subtitles.
        language = "en"
    if not gpu:
        cmd += ["--fp16", "False"]
    cmd += [
        "-f", "srt",
        "--accurate",
        "--output_dir", tmp_placeholder(),
        src,
    ] if False else []

    tmp = tempfile.mkdtemp()
    cmd += [
        "-f", "srt",
        "--accurate",
        "--output_dir", tmp,
        src,
    ]
    try:
        try:
            subprocess.check_output(cmd)
        except Exception:
            # Best effort: one failing item must not stop the worker, but
            # KeyboardInterrupt/SystemExit are allowed to propagate.
            logger.exception("failed to extract subtitles from item %s\n%s", item.public_id, cmd)
            return False

        annotations = []
        for name in os.listdir(tmp):
            # Skip the word-level output, only the sentence-level file is used.
            if name.endswith(".srt") and "words.srt" not in name:
                annotations = ox.srt.load(os.path.join(tmp, name))
        if not annotations:
            logger.error("no subtitles detected %s", item.public_id)
            return True

        if language != "en":
            for annotation in annotations:
                annotation["value"] = '<span lang="%s">%s</span>' % (language, annotation["value"])
        tasks.add_annotations.delay({
            'item': item.public_id,
            'layer': layer,
            'user': user.username,
            'annotations': annotations
        })
        return True
    finally:
        # Remove the scratch directory on every exit path.
        shutil.rmtree(tmp, ignore_errors=True)
def main(**kwargs):
    """Process the transcription queue until interrupted.

    Keyword Args:
        user: username owning the lists and used as annotation author.
        queue: name of the user's list holding items to transcribe.
        done: name of the user's list that receives processed items.
        layer: optional annotation layer id; defaults to the configured
            layer flagged ``isSubtitles``.
        translate: optional comma separated ``source:target`` pairs,
            e.g. ``"de:en,fr:en"``.
        gpu: optional flag forwarded to ``extract_subtitles``.

    Loops forever: every item in the queue is passed to
    ``extract_subtitles``; items that were processed are moved from the
    queue list to the done list. If a pass processed nothing, sleeps for
    five minutes before checking again. Returns on KeyboardInterrupt, or
    immediately if no subtitle layer can be determined.
    """
    user = User.objects.get(username=kwargs['user'])
    queue = List.objects.get(user=user, name=kwargs['queue'])
    done = List.objects.get(user=user, name=kwargs['done'])

    layer = kwargs.get("layer")
    translate = kwargs.get("translate")
    if translate:
        translate = dict(pair.split(':') for pair in translate.split(','))
    if not layer:
        layer = utils.get_by_key(settings.CONFIG['layers'], 'isSubtitles', True)
        if layer:
            layer = layer["id"]
        else:
            logger.error("no layer defined and config has no subtitle layer")
            return

    gpu = kwargs.get("gpu")
    try:
        while True:
            wait = True
            for item in queue.get_items(queue.user).all():
                if extract_subtitles(item, user, layer, translate, gpu):
                    # Move the item from the queue to the done list.
                    # NOTE(review): uses the related manager like the
                    # removal above; confirm the list through-model accepts
                    # a plain add().
                    queue.items.remove(item)
                    done.items.add(item)
                    wait = False
            if wait:
                # Nothing was processed in this pass; poll again later.
                time.sleep(5*60)
    except KeyboardInterrupt:
        pass