pandora_transcribe

2024-07-07 15:32:37 +01:00 · 2024-07-07 15:32:37 +01:00 · 1b5de1882a
commit 1b5de1882a
6 changed files with 175 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+*.pyc
--- a/README.md
+++ b/README.md
@ -0,0 +1,31 @@
+# pandora_transcribe
+
+use whisper_timestamped to add automatic transcriptions to pan.do/ra
+
+## installation
+
+    cd /srv/pandora/pandora
+    git clone https://code.0x2620.org/0x2620/pandora_transcribe transcribe
+
+add "transcribe" to LOCAL_APPS in local_settings.py
+
+
+## configuration
+
+add a user called subtitles and create 2 lists for that user: Queue and Transcribed.
+Alternatively see the options of `./manage.py transcribe` to use another user or list name.
+
+## run
+
+in a terminal run
+
+    ./manage.py transcribe
+
+
+## install a service
+
+copy systemd/service/pandora-transcribe.service to /etc/systemd/system/pandora-transcribe.service and
+
+    systemctl enable --now pandora-transcribe.service
+
+
+
--- a/__init__.py
+++ b/__init__.py
--- a/apps.py
+++ b/apps.py
@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class WhisperConfig(AppConfig):
+    """Django application configuration for the ``transcribe`` app."""
+
+    # Name of the app package as Django imports it.
+    name = "transcribe"
+    # Primary-key field type for models that do not declare one themselves.
+    default_auto_field = "django.db.models.BigAutoField"
--- a/management/commands/transcribe.py
+++ b/management/commands/transcribe.py
@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+
+from django.core.management.base import BaseCommand
+from django.conf import settings
+from django.db import transaction
+
+import app.monkey_patch
+from ... import transcribe
+
+class Command(BaseCommand):
+    """Management command ``transcribe``: run the subtitle transcription loop."""
+
+    help = 'transcribe items with whisper_timestamped'
+
+    def add_arguments(self, parser):
+        """Register the command line options; every option has a default."""
+        parser.add_argument('--user', type=str, dest='user',
+                            default='subtitles', help='user for subtitles (default: subtitles)')
+        parser.add_argument('--queue', type=str, dest='queue',
+                            default='Queue', help='name of incoming list (default: Queue)')
+        # This is the list finished items end up in, not the incoming one.
+        parser.add_argument('--done', type=str, dest='done',
+                            default='Transcribed', help='name of done list (default: Transcribed)')
+        parser.add_argument('--layer', type=str, dest='layer',
+                            default=None, help='import into layer (default subtitle layer)')
+        # Comma separated "source:target" pairs, parsed by transcribe.main.
+        parser.add_argument('--translate', type=str, dest='translate',
+                            default='', help='list of languages to translate (i.e. hi:en,de:en)')
+        parser.add_argument('--gpu', action='store_true', dest='gpu',
+                            default=False, help='use GPU (default: disabled)')
+
+    def handle(self, **kwargs):
+        """Forward all parsed options unchanged to ``transcribe.main``."""
+        transcribe.main(**kwargs)
--- a/transcribe.py
+++ b/transcribe.py
@ -0,0 +1,109 @@
+import logging
+import os
+import shutil
+import signal
+import subprocess
+import tempfile
+
+import ox
+import ox.iso
+
+from django.conf import settings
+
+from annotation import tasks
+from item.models import Item
+from itemlist.models import List
+from item import utils
+
+
+logger = logging.getLogger(__name__)
+
+def extract_subtitles(item, user, layer, translate, gpu=False):
+    """Transcribe one item with whisper_timestamped and queue the subtitles for import.
+
+    The first stream of the item is passed to the whisper_timestamped command
+    line tool, the resulting srt file is loaded and handed to the
+    ``add_annotations`` task.
+
+    Parameters:
+        item: item to transcribe; its ``data["language"]`` selects the language.
+        user: user the annotations are created for (only ``username`` is used).
+        layer: id of the annotation layer to import into.
+        translate: container of source language codes that should be
+            translated instead of transcribed; a falsy value disables it.
+        gpu: if false, whisper is run with ``--fp16 False``.
+
+    Returns:
+        False if the item was skipped (no language, unknown language, no
+        media) or the transcription failed, True if the item was processed,
+        including the case that no subtitles were detected.
+    """
+    if "language" not in item.data:
+        logger.error("skip item without language %s", item.public_id)
+        return False
+    language = ox.iso.langTo2Code(item.data["language"][0])
+    if not language:
+        logger.error("skip item with unknown language %s: %s", item.public_id, item.data["language"])
+        return False
+    streams = item.streams()
+    if not streams:
+        # the message used to have two placeholders but only one argument
+        logger.error("skip item without media %s", item.public_id)
+        return False
+    src = streams[0].media.path
+
+    cmd = [
+        "/opt/whisper-timestamped/bin/whisper_timestamped",
+        "--language", language,
+    ]
+    translating = bool(translate) and language in translate
+    if translating:
+        cmd += [
+            '--task', 'translate'
+        ]
+    if not gpu:
+        cmd += [
+            "--fp16", "False",
+        ]
+
+    tmp = tempfile.mkdtemp()
+    cmd += [
+        "-f", "srt",
+        "--accurate",
+        "--output_dir", tmp,
+        src,
+    ]
+    annotations = []
+    try:
+        try:
+            subprocess.check_output(cmd)
+        except Exception:
+            # Exception instead of a bare except: KeyboardInterrupt and
+            # SystemExit have to reach the caller so the loop can be stopped.
+            logger.error("failed to extract subtitles from item %s\n%s", item.public_id, cmd)
+            return False
+        for f in os.listdir(tmp):
+            # skip the per-word srt file that is written next to the main one
+            if f.endswith(".srt") and "words.srt" not in f:
+                annotations = ox.srt.load(os.path.join(tmp, f))
+    finally:
+        # remove the temporary output on every exit path
+        shutil.rmtree(tmp, ignore_errors=True)
+
+    if not annotations:
+        logger.error("no subtitles detected %s", item.public_id)
+        return True
+
+    # whisper's translate task outputs English, so translated text must not
+    # be tagged with the language of the source audio
+    annotation_language = "en" if translating else language
+    if annotation_language != "en":
+        for annotation in annotations:
+            annotation["value"] = '<span lang="%s">%s</span>' % (annotation_language, annotation["value"])
+
+    tasks.add_annotations.delay({
+        'item': item.public_id,
+        'layer': layer,
+        'user': user.username,
+        'annotations': annotations
+    })
+    return True
+
+
+def main(**kwargs):
+    """Poll the queue list forever and transcribe every item found in it.
+
+    Items for which ``extract_subtitles`` returns True are moved from the
+    queue list to the done list. If a pass over the queue processed nothing,
+    the loop sleeps for five minutes. ``KeyboardInterrupt`` ends the loop.
+
+    Keyword arguments (as provided by the ``transcribe`` management command):
+        user: username owning the lists and the created annotations.
+        queue: name of the list with items waiting for transcription.
+        done: name of the list that receives processed items.
+        layer: annotation layer id; if empty the layer flagged
+            ``isSubtitles`` in ``settings.CONFIG['layers']`` is used.
+        translate: comma separated ``source:target`` language pairs.
+        gpu: passed on to ``extract_subtitles``.
+    """
+    # imported here since the module does not import them at the top
+    import time
+    from django.contrib.auth import get_user_model
+
+    User = get_user_model()
+    # get() needs keyword lookups, a bare string is not a valid query.
+    # NOTE(review): assumes List has a ``name`` field next to ``user`` - confirm
+    user = User.objects.get(username=kwargs['user'])
+    queue = List.objects.get(user=user, name=kwargs['queue'])
+    done = List.objects.get(user=user, name=kwargs['done'])
+
+    layer = kwargs.get("layer")
+    translate = kwargs.get("translate")
+    if translate:
+        translate = dict(pair.split(':') for pair in translate.split(','))
+    if not layer:
+        layer = utils.get_by_key(settings.CONFIG['layers'], 'isSubtitles', True)
+        if layer:
+            layer = layer["id"]
+        else:
+            logger.error("no layer defined and config has no subtitle layer")
+            return
+    try:
+        while True:
+            wait = True
+            for item in queue.get_items(queue.user):
+                if extract_subtitles(item, user, layer, translate, kwargs.get("gpu")):
+                    queue.items.remove(item)
+                    # processed items go into the done list (was remove())
+                    done.items.add(item)
+                    wait = False
+            if wait:
+                time.sleep(5 * 60)
+    except KeyboardInterrupt:
+        pass
+