diff --git a/transcribe.py b/transcribe.py
index 748a80c..58520bd 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -21,6 +21,7 @@ from user.models import User
 
 logger = logging.getLogger(__name__)
 
+TARGET_LENGTH = getattr(settings, 'TRANSCRIBE_TARGET_LENGTH', 200)
 
 def prepare_annotations(result, join_sentences=False):
     if join_sentences:
@@ -37,7 +38,7 @@ def prepare_annotations(result, join_sentences=False):
     return annotations
 
 
-def prepare_joint_annotations(result, target_length=200):
+def prepare_joint_annotations(result, target_length=TARGET_LENGTH):
     abbrevs = ["Mr.", "Mrs.", "Dr."]
     ignore = []
     phrase_sounds = []
@@ -149,15 +150,23 @@ def extract_subtitles(item, user, layer, translate, gpu=False, join_sentences=Fa
     if not item.streams():
         logger.error("skip item without media %s: %s", item.public_id)
         return False
-    src = item.streams()[0].media.path
-    response = run_whisper(src, language, translate, gpu, model)
-    if not response:
-        logger.error("extract failed for %s", item.public_id)
-        return False
-    annotations = prepare_annotations(response, join_sentences=join_sentences)
+
+    offset = 0
+    annotations = []
+    for stream in item.streams():
+        src = stream.media.path
+        response = run_whisper(src, language, translate, gpu, model)
+        if not response:
+            logger.error("extract failed for %s", item.public_id)
+            return False
+        for annotation in prepare_annotations(response, join_sentences=join_sentences):
+            if offset:
+                annotation['in'] += offset
+                annotation['out'] += offset
+            annotations.append(annotation)
+        offset += stream.duration
     if not annotations:
         return False
-
     if language and language != "en":
         for annotation in annotations:
             annotation["value"] = '%s' % (