112 lines
3.2 KiB
Python
112 lines
3.2 KiB
Python
import logging
|
|
import os
|
|
import shutil
|
|
import signal
|
|
import subprocess
|
|
import tempfile
|
|
import time
|
|
|
|
import ox
|
|
import ox.iso
|
|
|
|
from django.conf import settings
|
|
|
|
from annotation import tasks
|
|
from item import utils
|
|
from itemlist.models import List
|
|
from item.models import Item
|
|
from user.models import User
|
|
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
def extract_subtitles(item, user, layer, translate, gpu=False):
    """Run whisper_timestamped on an item and queue the result as annotations.

    The first stream of ``item`` is transcribed to SRT in a temporary
    directory, the SRT is loaded with ``ox.srt.load`` and the resulting
    annotations are handed to the ``tasks.add_annotations`` task.

    Args:
        item: item to process; ``item.data["language"]``, ``item.streams()``
            and ``item.public_id`` are read.
        user: user the annotations are attributed to (``user.username``).
        layer: id of the annotation layer to add the subtitles to.
        translate: optional container of 2-letter language codes; if the
            item's language is in it, whisper is run with ``--task translate``
            and the annotations are treated as English.
        gpu: when falsy, ``--fp16 False`` is passed to whisper.

    Returns:
        True if the item was processed (including the case where no
        subtitles were detected), False if it was skipped or whisper failed.
    """
    if "language" not in item.data or not item.data["language"]:
        logger.error("skip item without language %s", item.public_id)
        return False
    language = ox.iso.langTo2Code(item.data["language"][0])
    if not language:
        logger.error("skip item with unknown language %s: %s", item.public_id, item.data["language"])
        return False
    streams = item.streams()
    if not streams:
        logger.error("skip item without media %s", item.public_id)
        return False
    src = streams[0].media.path

    cmd = [
        "/opt/whisper-timestamped/bin/whisper_timestamped",
        "--language", language,
    ]
    if translate and language in translate:
        cmd += [
            '--task', 'translate'
        ]
        # --language above keeps the source language; the output is English.
        language = 'en'
    if not gpu:
        cmd += [
            "--fp16", "False",
        ]

    # The context manager removes the directory on every exit path,
    # including early returns and exceptions.
    with tempfile.TemporaryDirectory() as tmp:
        cmd += [
            "-f", "srt",
            "--accurate",
            "--output_dir", tmp,
            src,
        ]
        try:
            subprocess.check_output(cmd)
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still reach the caller.
            logger.exception("failed to extract subtitles from item %s\n%s", item.public_id, cmd)
            return False

        annotations = []
        for name in os.listdir(tmp):
            # Names containing "words.srt" are skipped (presumably the
            # word-level output); if several files match, the last one wins.
            if name.endswith(".srt") and "words.srt" not in name:
                annotations = ox.srt.load(os.path.join(tmp, name))

    if not annotations:
        logger.error("no subtitles detected %s", item.public_id)
        return True
    if language != "en":
        for annotation in annotations:
            annotation["value"] = '<span lang="%s">%s</span>' % (language, annotation["value"])

    tasks.add_annotations.delay({
        'item': item.public_id,
        'layer': layer,
        'user': user.username,
        'annotations': annotations
    })
    return True
|
|
|
|
|
|
def main(**kwargs):
    """Process a user's queue list forever, extracting subtitles per item.

    Keyword Args:
        user: username owning both lists.
        queue: name of the list holding items waiting to be processed.
        done: name of the list that receives successfully processed items.
        layer: optional annotation layer id; when missing, the layer flagged
            ``isSubtitles`` in ``settings.CONFIG['layers']`` is used.
        translate: optional string of comma-separated ``key:value`` pairs,
            parsed into a dict and passed on to ``extract_subtitles``.
        gpu: passed on to ``extract_subtitles``.

    Returns immediately (after logging an error) if no layer can be
    determined. Otherwise loops until interrupted with Ctrl-C, sleeping five
    minutes whenever a pass over the queue processed nothing.
    """
    user = User.objects.get(username=kwargs['user'])
    queue = List.objects.get(user=user, name=kwargs['queue'])
    done = List.objects.get(user=user, name=kwargs['done'])
    layer = kwargs.get("layer")
    translate = kwargs.get("translate")
    if translate:
        translate = dict(tt.split(':') for tt in translate.split(','))
    if not layer:
        layer = utils.get_by_key(settings.CONFIG['layers'], 'isSubtitles', True)
        if layer:
            layer = layer["id"]
        else:
            logger.error("no layer defined and config has no subtitle layer")
            return
    try:
        while True:
            wait = True
            for item in queue.get_items(queue.user).all():
                if extract_subtitles(item, user, layer, translate, kwargs.get("gpu")):
                    # Move the processed item from the queue to the done list.
                    queue.items.remove(item)
                    done.items.add(item)
                    wait = False
            if wait:
                # Nothing was processed in this pass; poll again in 5 minutes.
                time.sleep(5*60)
    except KeyboardInterrupt:
        pass
|
|
|