From f28d37e33e3662072eac44e3dcb09750bad87968 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 25 Jan 2025 11:43:27 +0530 Subject: [PATCH 1/2] add TRANSCRIBE_TARGET_LENGTH setting --- transcribe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transcribe.py b/transcribe.py index 748a80c..622feb3 100644 --- a/transcribe.py +++ b/transcribe.py @@ -21,6 +21,7 @@ from user.models import User logger = logging.getLogger(__name__) +TARGET_LENGTH = getattr(settings, 'TRANSCRIBE_TARGET_LENGTH', 200) def prepare_annotations(result, join_sentences=False): if join_sentences: @@ -37,7 +38,7 @@ def prepare_annotations(result, join_sentences=False): return annotations -def prepare_joint_annotations(result, target_length=200): +def prepare_joint_annotations(result, target_length=TARGET_LENGTH): abbrevs = ["Mr.", "Mrs.", "Dr."] ignore = [] phrase_sounds = [] From eb2d12a905464ffe3b67361279c4ef89c77c5a98 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 25 Jan 2025 14:36:15 +0530 Subject: [PATCH 2/2] support multipart items --- transcribe.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/transcribe.py b/transcribe.py index 622feb3..58520bd 100644 --- a/transcribe.py +++ b/transcribe.py @@ -150,15 +150,23 @@ def extract_subtitles(item, user, layer, translate, gpu=False, join_sentences=Fa if not item.streams(): logger.error("skip item without media %s: %s", item.public_id) return False - src = item.streams()[0].media.path - response = run_whisper(src, language, translate, gpu, model) - if not response: - logger.error("extract failed for %s", item.public_id) - return False - annotations = prepare_annotations(response, join_sentences=join_sentences) + + offset = 0 + annotations = [] + for stream in item.streams(): + src = stream.media.path + response = run_whisper(src, language, translate, gpu, model) + if not response: + logger.error("extract failed for %s", item.public_id) + return False + for annotation in prepare_annotations(response, join_sentences=join_sentences): + if offset: + annotation['in'] += offset + annotation['out'] += offset + annotations.append(annotation) + offset += stream.duration if not annotations: return False - if language and language != "en": for annotation in annotations: annotation["value"] = '%s' % (