pandora_transcribe/transcribe.py

import json
import logging
import os
import shutil
import signal
import subprocess
import tempfile
import time

import ox
import ox.iso

from django.conf import settings

from annotation import tasks
from item import utils
from itemlist.models import List
from item.models import Item
from user.models import User


logger = logging.getLogger(__name__)

TARGET_LENGTH = getattr(settings, 'TRANSCRIBE_TARGET_LENGTH', 200)

def prepare_annotations(result, join_sentences=False):
    if join_sentences:
        return prepare_joint_annotations(result)
    annotations = []
    for segment in result["segments"]:
        annotations.append(
            {
                "in": segment["start"],
                "out": segment["end"] + 0.3,
                "value": segment["text"].strip(),
            }
        )
    return annotations


def prepare_joint_annotations(result, target_length=TARGET_LENGTH):
    abbrevs = ["Mr.", "Mrs.", "Dr."]
    ignore = []
    phrase_sounds = []
    segments = result["segments"]
    all_words = []
    for s in segments:
        all_words.extend(s["words"])
    new_segs = []
    sentence = ""
    for w in all_words:
        if not w == all_words[-1]:
            next_w = all_words[all_words.index(w) + 1]
        else:
            # w is last word
            if sentence == "" and w["text"] in ignore:
                continue
        if sentence == "":
            in_ = w["start"]
            # 0th word of a sentence
            if w["text"] in ignore and next_w["text"][0].isupper():
                continue

        if sentence == "The music " and next_w["text"][0] == "The":
            sentence = ""
            continue
        sentence += w["text"] + " "

        # if this is a short sentence and next word starts less than 1 sec away
        # and not last word of entire text
        if (
            w["text"].endswith(".")
            and w != all_words[-1]
            and (next_w["start"] - w["end"]) < 0.8
            and len(sentence) < target_length
            and next_w["text"] not in ignore
        ):
            # then do not end this sentence yet
            continue
        if (
            w["text"].endswith(".") and w["text"] not in abbrevs and len(w["text"]) > 2
        ) or (
            w["text"] in ignore
            and sentence.strip() == w["text"]
            and (w == all_words[-1] or next_w["text"][0].isupper())
        ):
            # end the sentence, delay end a bit
            out_ = w["end"] + 0.3
            sentence_dict = {"in": in_, "out": out_, "value": sentence.strip()}
            new_segs.append(sentence_dict)
            sentence = ""
    annotations = list(filter(lambda i: i["value"].strip() not in ignore, new_segs))
    return annotations


def run_demucs(src, output):
    cmd = [
        "/opt/whisper-timestamped/bin/demucs"
        "--two-stems", "vocals",
        "-o", output,
        src
    ]
    subprocess.check_call(cmd)
    wav = glob("%s/htdemucs/*/vocals.wav" % output)[0]
    return wav


def run_whisper(src, language=None, translate=False, gpu=False, model="small", demucs=False):
    tmp = tempfile.mkdtemp()

    if demucs:
        try:
            src = run_demucs(src, tmp)
        except:
            logger.error("failed to run demucs for %s", src)
            shutil.rmtree(tmp)
            return None

    output = os.path.join(tmp, "output.json")
    run_py = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run_whisper.py")
    cmd = ["/opt/whisper-timestamped/bin/python", run_py]
    cmd += ["--model", model]
    if language:
        cmd += ["--language", language]
        if translate and language in translate:
            cmd += ["--translate"]
            language = "en"

    cmd += [src, output]

    try:
        subprocess.check_call(cmd)
    except:
        logger.error("failed to run: %s", cmd)
        shutil.rmtree(tmp)
        return None

    with open(output) as fd:
        response = json.load(fd)

    #shutil.rmtree(tmp)
    return response

def extract_subtitles(item, user, layer, translate, gpu=False, join_sentences=False, model="small"):
    language = None
    if "language" not in item.data:
        language = None
    else:
        language = ox.iso.langTo2Code(item.data["language"][0])
    if not item.streams():
        logger.error("skip item without media %s: %s", item.public_id)
        return False
    src = item.streams()[0].media.path
    response = run_whisper(src, language, translate, gpu, model)
    if not response:
        logger.error("extract failed for  %s", item.public_id)
        return False
    annotations = prepare_annotations(response, join_sentences=join_sentences)
    if not annotations:
        return False

    if language and language != "en":
        for annotation in annotations:
            annotation["value"] = '<span lang="%s">%s</span>' % (
                language,
                annotation["value"],
            )

    tasks.add_annotations.delay(
        {
            "item": item.public_id,
            "layer": layer,
            "user": user.username,
            "annotations": annotations,
        }
    )
    return True


def extract_subtitles_cmd(item, user, layer, translate, gpu=False, model="small"):
    if "language" not in item.data:
        language = None
    else:
        language = ox.iso.langTo2Code(item.data["language"][0])
    if not item.streams():
        logger.error("skip item without media %s: %s", item.public_id)
        return False
    src = item.streams()[0].media.path

    tmp = tempfile.mkdtemp()
    cmd = ["/opt/whisper-timestamped/bin/whisper_timestamped", "--model", model]
    if language:
        cmd += ["--language", language]
        if translate and language in translate:
            cmd += ["--task", "translate"]
            language = "en"
    if not gpu:
        cmd += [
            "--fp16",
            "False",
        ]

    cmd += [
        "-f",
        "srt",
        "--accurate",
        "--output_dir",
        tmp,
        src,
    ]
    try:
        subprocess.check_output(cmd)
    except:
        logger.error(
            "failed to extract subtitles from item %s\n%s", item.public_id, cmd
        )
        shutil.rmtree(tmp)
        return False
    annotations = []
    for f in os.listdir(tmp):
        if f.endswith(".srt") and "words.srt" not in f:
            srt = os.path.join(tmp, f)
            annotations = ox.srt.load(srt)
    if not annotations:
        logger.error("no subtitles detected %s", item.public_id)
        return True
    if language and language != "en":
        for annotation in annotations:
            annotation["value"] = '<span lang="%s">%s</span>' % (
                language,
                annotation["value"],
            )

    tasks.add_annotations.delay(
        {
            "item": item.public_id,
            "layer": layer,
            "user": user.username,
            "annotations": annotations,
        }
    )
    shutil.rmtree(tmp)
    return True


def main(**kwargs):
    user = User.objects.get(username=kwargs["user"])
    queue = List.objects.get(user=user, name=kwargs["queue"])
    done = List.objects.get(user=user, name=kwargs["done"])
    layer = kwargs.get("layer")
    translate = kwargs.get("translate")
    if translate:
        translate = dict([tt.split(":") for tt in translate.split(",")])
    if not layer:
        layer = utils.get_by_key(settings.CONFIG["layers"], "isSubtitles", True)
        if layer:
            layer = layer["id"]
        else:
            logger.error("no layer defined and config has no subtitle layer")
            return
    try:
        while True:
            wait = True
            for item in queue.get_items(queue.user).all():
                if extract_subtitles(
                    item,
                    user,
                    layer,
                    translate,
                    kwargs.get("gpu"),
                    join_sentences=kwargs.get("join_sentences"),
                    model=kwargs.get("model", "small")
                ):
                    done.items.add(item)
                    queue.items.remove(item)
                    wait = False
            if wait:
                time.sleep(5 * 60)
    except KeyboardInterrupt:
        pass
re-enable language/translate 2025-01-24 11:51:00 +05:30			`import json`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`import logging`
			`import os`
			`import shutil`
			`import signal`
			`import subprocess`
			`import tempfile`
make it run 2024-07-07 15:46:41 +01:00			`import time`
pandora_transcribe 2024-07-07 15:32:37 +01:00
			`import ox`
			`import ox.iso`

			`from django.conf import settings`

			`from annotation import tasks`
			`from item import utils`
make it run 2024-07-07 15:46:41 +01:00			`from itemlist.models import List`
			`from item.models import Item`
			`from user.models import User`
pandora_transcribe 2024-07-07 15:32:37 +01:00

			`logger = logging.getLogger(__name__)`

add TRANSCRIBE_TARGET_LENGTH setting 2025-01-25 11:43:27 +05:30			`TARGET_LENGTH = getattr(settings, 'TRANSCRIBE_TARGET_LENGTH', 200)`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30
			`def prepare_annotations(result, join_sentences=False):`
			`if join_sentences:`
			`return prepare_joint_annotations(result)`
			`annotations = []`
cleanups, add demucs option 2025-01-24 18:00:58 +05:30			`for segment in result["segments"]:`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`annotations.append(`
			`{`
			`"in": segment["start"],`
cleanups, add demucs option 2025-01-24 18:00:58 +05:30			`"out": segment["end"] + 0.3,`
			`"value": segment["text"].strip(),`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`}`
			`)`
			`return annotations`


add TRANSCRIBE_TARGET_LENGTH setting 2025-01-25 11:43:27 +05:30			`def prepare_joint_annotations(result, target_length=TARGET_LENGTH):`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`abbrevs = ["Mr.", "Mrs.", "Dr."]`
			`ignore = []`
			`phrase_sounds = []`
			`segments = result["segments"]`
			`all_words = []`
			`for s in segments:`
			`all_words.extend(s["words"])`
			`new_segs = []`
			`sentence = ""`
			`for w in all_words:`
			`if not w == all_words[-1]:`
			`next_w = all_words[all_words.index(w) + 1]`
			`else:`
			`# w is last word`
			`if sentence == "" and w["text"] in ignore:`
			`continue`
			`if sentence == "":`
			`in_ = w["start"]`
			`# 0th word of a sentence`
			`if w["text"] in ignore and next_w["text"][0].isupper():`
			`continue`

			`if sentence == "The music " and next_w["text"][0] == "The":`
			`sentence = ""`
			`continue`
			`sentence += w["text"] + " "`

			`# if this is a short sentence and next word starts less than 1 sec away`
			`# and not last word of entire text`
			`if (`
			`w["text"].endswith(".")`
			`and w != all_words[-1]`
			`and (next_w["start"] - w["end"]) < 0.8`
			`and len(sentence) < target_length`
			`and next_w["text"] not in ignore`
			`):`
			`# then do not end this sentence yet`
			`continue`
			`if (`
			`w["text"].endswith(".") and w["text"] not in abbrevs and len(w["text"]) > 2`
			`) or (`
			`w["text"] in ignore`
			`and sentence.strip() == w["text"]`
			`and (w == all_words[-1] or next_w["text"][0].isupper())`
			`):`
			`# end the sentence, delay end a bit`
			`out_ = w["end"] + 0.3`
			`sentence_dict = {"in": in_, "out": out_, "value": sentence.strip()}`
			`new_segs.append(sentence_dict)`
			`sentence = ""`
			`annotations = list(filter(lambda i: i["value"].strip() not in ignore, new_segs))`
			`return annotations`


cleanups, add demucs option 2025-01-24 18:00:58 +05:30			`def run_demucs(src, output):`
			`cmd = [`
			`"/opt/whisper-timestamped/bin/demucs"`
			`"--two-stems", "vocals",`
			`"-o", output,`
			`src`
			`]`
			`subprocess.check_call(cmd)`
			`wav = glob("%s/htdemucs/*/vocals.wav" % output)[0]`
			`return wav`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30
cleanups, add demucs option 2025-01-24 18:00:58 +05:30
			`def run_whisper(src, language=None, translate=False, gpu=False, model="small", demucs=False):`
other parts write to stdout, write output to tmp file 2025-01-24 09:54:39 +05:30			`tmp = tempfile.mkdtemp()`

cleanups, add demucs option 2025-01-24 18:00:58 +05:30			`if demucs:`
			`try:`
			`src = run_demucs(src, tmp)`
			`except:`
			`logger.error("failed to run demucs for %s", src)`
			`shutil.rmtree(tmp)`
			`return None`

			`output = os.path.join(tmp, "output.json")`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`run_py = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run_whisper.py")`
			`cmd = ["/opt/whisper-timestamped/bin/python", run_py]`
pass model 2025-01-24 11:52:29 +05:30			`cmd += ["--model", model]`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`if language:`
			`cmd += ["--language", language]`
			`if translate and language in translate:`
			`cmd += ["--translate"]`
			`language = "en"`

other parts write to stdout, write output to tmp file 2025-01-24 09:54:39 +05:30			`cmd += [src, output]`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30
			`try:`
use large-v3-turbo 2025-01-24 10:18:25 +05:30			`subprocess.check_call(cmd)`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`except:`
cleanups, add demucs option 2025-01-24 18:00:58 +05:30			`logger.error("failed to run: %s", cmd)`
			`shutil.rmtree(tmp)`
			`return None`

other parts write to stdout, write output to tmp file 2025-01-24 09:54:39 +05:30			`with open(output) as fd:`
			`response = json.load(fd)`
cleanups, add demucs option 2025-01-24 18:00:58 +05:30
			`#shutil.rmtree(tmp)`
			`return response`

			`def extract_subtitles(item, user, layer, translate, gpu=False, join_sentences=False, model="small"):`
			`language = None`
			`if "language" not in item.data:`
			`language = None`
			`else:`
			`language = ox.iso.langTo2Code(item.data["language"][0])`
			`if not item.streams():`
			`logger.error("skip item without media %s: %s", item.public_id)`
			`return False`
			`src = item.streams()[0].media.path`
			`response = run_whisper(src, language, translate, gpu, model)`
			`if not response:`
			`logger.error("extract failed for %s", item.public_id)`
			`return False`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`annotations = prepare_annotations(response, join_sentences=join_sentences)`
			`if not annotations:`
			`return False`

			`if language and language != "en":`
			`for annotation in annotations:`
			`annotation["value"] = '<span lang="%s">%s</span>' % (`
			`language,`
			`annotation["value"],`
			`)`

			`tasks.add_annotations.delay(`
			`{`
			`"item": item.public_id,`
			`"layer": layer,`
			`"user": user.username,`
			`"annotations": annotations,`
			`}`
			`)`
			`return True`


pass model 2025-01-24 11:52:29 +05:30			`def extract_subtitles_cmd(item, user, layer, translate, gpu=False, model="small"):`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`if "language" not in item.data:`
if no language is set, try without 2024-07-08 15:33:03 +01:00			`language = None`
			`else:`
			`language = ox.iso.langTo2Code(item.data["language"][0])`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`if not item.streams():`
			`logger.error("skip item without media %s: %s", item.public_id)`
			`return False`
			`src = item.streams()[0].media.path`

			`tmp = tempfile.mkdtemp()`
pass model 2025-01-24 11:52:29 +05:30			`cmd = ["/opt/whisper-timestamped/bin/whisper_timestamped", "--model", model]`
if no language is set, try without 2024-07-08 15:33:03 +01:00			`if language:`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`cmd += ["--language", language]`
if no language is set, try without 2024-07-08 15:33:03 +01:00			`if translate and language in translate:`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`cmd += ["--task", "translate"]`
			`language = "en"`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`if not gpu:`
			`cmd += [`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`"--fp16",`
			`"False",`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`]`

			`cmd += [`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`"-f",`
			`"srt",`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`"--accurate",`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`"--output_dir",`
			`tmp,`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`src,`
			`]`
			`try:`
			`subprocess.check_output(cmd)`
			`except:`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`logger.error(`
			`"failed to extract subtitles from item %s\n%s", item.public_id, cmd`
			`)`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`shutil.rmtree(tmp)`
			`return False`
			`annotations = []`
			`for f in os.listdir(tmp):`
			`if f.endswith(".srt") and "words.srt" not in f:`
			`srt = os.path.join(tmp, f)`
			`annotations = ox.srt.load(srt)`
			`if not annotations:`
			`logger.error("no subtitles detected %s", item.public_id)`
			`return True`
dont wrap in lang="None" 2024-07-08 15:35:02 +01:00			`if language and language != "en":`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`for annotation in annotations:`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`annotation["value"] = '<span lang="%s">%s</span>' % (`
			`language,`
			`annotation["value"],`
			`)`

			`tasks.add_annotations.delay(`
			`{`
			`"item": item.public_id,`
			`"layer": layer,`
			`"user": user.username,`
			`"annotations": annotations,`
			`}`
			`)`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`shutil.rmtree(tmp)`
			`return True`


			`def main(**kwargs):`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`user = User.objects.get(username=kwargs["user"])`
			`queue = List.objects.get(user=user, name=kwargs["queue"])`
			`done = List.objects.get(user=user, name=kwargs["done"])`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`layer = kwargs.get("layer")`
			`translate = kwargs.get("translate")`
			`if translate:`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`translate = dict([tt.split(":") for tt in translate.split(",")])`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`if not layer:`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`layer = utils.get_by_key(settings.CONFIG["layers"], "isSubtitles", True)`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`if layer:`
			`layer = layer["id"]`
			`else:`
			`logger.error("no layer defined and config has no subtitle layer")`
			`return`
			`try:`
			`while True:`
			`wait = True`
make it run 2024-07-07 15:46:41 +01:00			`for item in queue.get_items(queue.user).all():`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`if extract_subtitles(`
			`item,`
			`user,`
			`layer,`
			`translate,`
			`kwargs.get("gpu"),`
			`join_sentences=kwargs.get("join_sentences"),`
expose model to manage command 2025-01-24 12:10:35 +05:30			`model=kwargs.get("model", "small")`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`):`
add to done 2024-07-08 15:43:23 +01:00			`done.items.add(item)`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`queue.items.remove(item)`
			`wait = False`
			`if wait:`
use python api, add option to join translations, use turbo model 2025-01-24 08:56:51 +05:30			`time.sleep(5 * 60)`
pandora_transcribe 2024-07-07 15:32:37 +01:00			`except KeyboardInterrupt:`
			`pass`