From f8cbbd55c78df2353d600e68cb6464ecd41964c9 Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Mon, 26 Jan 2026 18:35:29 +0100
Subject: [PATCH] various duration issues, prepare for double vo special case

---
 render.py | 113 +++++++++++++++++++++++++++++++++++++++++-------------
 utils.py  |   7 +++-
 2 files changed, 92 insertions(+), 28 deletions(-)

diff --git a/render.py b/render.py
index 57925ef..2229bff 100644
--- a/render.py
+++ b/render.py
@@ -61,6 +61,7 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
         },
         'audio-center': {
             'A1': [],
+            'A2': [],
         },
         'audio-front': {
             'A1': [],
@@ -132,7 +133,9 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
         next_length = length + clip['duration'] 
         if target - next_length < -target*0.1:
             break
-        clip_duration = int(clip['duration'] * fps) / fps
+        clip_duration = format_duration(clip['duration'], fps)
+        if clip['duration'] != clip_duration:
+            print("WTF", clip, clip['duration'], clip_duration)
         length += clip_duration
 
         # 50/50 source or ai
@@ -145,13 +148,13 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
         print('%07.3f-%07.3f %07.3f %s (%s)' % (
             length-clip_duration,
             length,
-            clip['duration'],
+            clip_duration,
             os.path.basename(clip['source']),
             src.split('/')[-2]
         ))
 
         scene['front']['V2'].append({
-            'duration': clip['duration'],
+            'duration': clip_duration,
             'src': src,
             "filter": {
             }
@@ -181,10 +184,21 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
             'fadein': '00:00:00.125'
         }
         scene['audio-front']['A2'].append({
-            'duration': clip['duration'],
+            'duration': clip_duration,
             'src': audio,
             'filter': audio_filter.copy()
         })
+
+        length = format_duration(length, fps)
+        ad = get_scene_duration(scene, track='audio-front:A2')
+        vd = get_scene_duration(scene, track='front:V2')
+        if ad == vd and abs(ad-length) > 1/48:
+            print('v: ', vd, 'ad', ad, 'length:', length, 'fixup')
+            length = ad
+        if abs(length -vd) > 1/48 or abs(length - ad) > 1/48 or ad != vd:
+            print('vd: ', vd, 'ad', ad, 'length:', length)
+            print(clip)
+            sys.exit(-1)
         used.append(clip)
         if not clips and target - length > 0:
             print("not enough clips, need to reset")
@@ -200,7 +214,7 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
                 if "ai" in clip:
                     clip["use_ai"] = True
 
-    scene_duration = int(get_scene_duration(scene) * fps)
+    scene_duration = int(round(get_scene_duration(scene) * fps))
     voice_overs = []
     sub_offset = 0
     subs = []
@@ -214,22 +228,28 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
             else:
                 gap = (2 * fps + random_int(seq, 5 * fps)) / fps
             gap = format_duration(gap, fps)
-            if int((sub_offset + gap)* fps) > scene_duration:
+            if int((sub_offset + gap) * fps) > scene_duration:
                 gap = format_duration((scene_duration - int(sub_offset * fps)) / fps, fps)
-            scene['audio-center']['A1'].append({
-                'blank': True,
-                'duration': gap
-            })
-            scene['audio-rear']['A1'].append({
-                'blank': True,
-                'duration': gap
-            })
+            for tl, track in (
+                ('audio-center', 'A1'),
+                ('audio-center', 'A2'),
+                ('audio-rear', 'A1'),
+                ('audio-rear', 'A2'),
+            ):
+                scene[tl][track].append({
+                    'blank': True,
+                    'duration': gap
+                })
             print('%07.3f-%07.3f %07.3f' % (sub_offset, sub_offset+gap, gap), 'silence')
             sub_offset += gap
 
             vo_key = random_choice(seq, vo_keys, pop=True)
             variant = random_int(seq, len(voice_over[vo_key]))
             vo = voice_over[vo_key][variant]
+            if isinstance(vo, list):
+                vo, vo_b = vo
+            else:
+                vo_b = None
             while int((vo['duration'] + sub_offset) * fps) > scene_duration:
                 if not vo_keys:
                     vo = None
@@ -237,6 +257,10 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
                 vo_key = random_choice(seq, vo_keys, pop=True)
                 variant = random_int(seq, len(voice_over[vo_key]))
                 vo = voice_over[vo_key][variant]
+                if isinstance(vo, list):
+                    vo, vo_b = vo
+                else:
+                    vo_b = None
             if vo is None:
                 break
             print('%07.3f-%07.3f %07.3f' % (sub_offset, sub_offset+vo["duration"], vo["duration"]), vo["src"].split('/')[-1])
@@ -255,13 +279,29 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
                 sub["in"] += sub_offset
                 sub["out"] += sub_offset
                 subs.append(sub)
+            # A second voice (vo_b) is placed on the A2 tracks; when there is
+            # none, A2 gets a blank of voc's duration on center and rear.
+            if vo_b:
+                vo_b = vo_b.copy()
+                vo_b['filter'] = {'volume': a}
+                scene['audio-center']['A2'].append(vo_b)
+                vo_b = vo_b.copy()
+                vo_b['filter'] = {'volume': b}
+                # rear copy belongs on A2 too; A1 would leave rear A2 unpadded
+                scene['audio-rear']['A2'].append(vo_b)
+            else:
+                for tl in ('audio-center', 'audio-rear'):
+                    scene[tl]['A2'].append({
+                        'blank': True,
+                        'duration': voc["duration"]
+                    })
             sub_offset += voc["duration"]
         if subs:
             scene["subtitles"] = subs
+        sub_offset = format_duration(sub_offset, fps)
 
-    sub_offset = int(sub_offset * fps)
-    if sub_offset < scene_duration:
-        gap = format_duration((scene_duration - sub_offset) / fps, fps)
+    if sub_offset < scene_duration/fps:
+        gap = scene_duration/fps - sub_offset
         print('%07.3f-%07.3f %07.3f' % (sub_offset, sub_offset+gap, gap), 'silence')
         scene['audio-center']['A1'].append({
             'blank': True,
@@ -272,7 +312,18 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
             'duration': gap
         })
         sub_offset += gap
-    print("scene duration %0.3f (target: %0.3f)" % (length, target))
+    '''
+    print("scene duration: %0.3f vo: %0.3f (length: %0.3f, target: %0.3f)" % (
+        get_scene_duration(scene),
+        sub_offset,
+        length,
+        target
+    ))
+    '''
+    print("scene duration: %0.3f (target: %0.3f)" % (
+        get_scene_duration(scene),
+        target
+    ))
     return scene, used
 
 def write_subtitles(data, folder, options):
@@ -560,10 +611,14 @@ def render_all(options):
 
             for a, b in (
                 ("front-mixed.mp4", "front.mp4"),
+                ("audio-center.wav", "front.mp4"),
+                ("audio-rear.wav", "front.mp4"),
+                ("audio-front.wav", "front.mp4"),
+                ("audio-5.1.mp4", "front.mp4"),
             ):
                 duration_a = ox.avinfo(str(fragment_prefix / a))['duration']
                 duration_b = ox.avinfo(str(fragment_prefix / b))['duration']
-                if duration_a != duration_b:
+                if abs(duration_a - duration_b) > 1/48:
                     print('!!', duration_a, fragment_prefix / a)
                     print('!!', duration_b, fragment_prefix / b)
                     sys.exit(-1)
@@ -854,6 +909,7 @@ def generate_clips(options):
     import item.models
     import itemlist.models
 
+    fps = 24
     options = load_defaults(options)
     prefix = options['prefix']
     lang, tlang = parse_lang(options["lang"])
@@ -873,7 +929,8 @@ def generate_clips(options):
                     continue
                 if not e.files.filter(selected=True).exists():
                     continue
-                source = e.files.filter(selected=True)[0].data.path
+                selected = e.files.filter(selected=True)[0]
+                source = selected.data.path
                 ext = os.path.splitext(source)[1]
                 type_ = e.data['type'][0].lower()
                 if type_.startswith('ai:'):
@@ -885,7 +942,7 @@ def generate_clips(options):
                         ai_type = '%s-%s' % (type_[3:], n)
                         n += 1
                     clip['ai'][ai_type] = target
-                    type_ = ai_type
+                    type_ = 'ai:' + ai_type
                 target = os.path.join(prefix, 'video', type_, i.data['title'] + ext)
                 if type_ == "source":
                     source_target = target
@@ -898,19 +955,22 @@ def generate_clips(options):
                 if os.path.islink(target):
                     os.unlink(target)
                 os.symlink(source, target)
-                durations.append(e.files.filter(selected=True)[0].duration)
+                durations.append(selected.duration)
             if not durations:
                 print(i.public_id, 'no duration!', clip)
                 continue
-            clip["duration"] = min(durations)
+            if len(set(durations)) > 1:
+                print(clip, durations)
+
+            clip["duration"] = min(durations) - 1/24
             # trim to a multiple of the output fps
-            d1 = int(clip["duration"] * 24) / 24
+            d1 = format_duration(clip["duration"], fps)
             if d1 != clip["duration"]:
                 clip["duration"] = d1
             if not clip["duration"]:
                 print('!!', durations, clip)
                 continue
-            cd = format_duration(clip["duration"], 24)
+            cd = format_duration(clip["duration"], fps)
             clip["duration"] = cd
             clip['tags'] = i.data.get('tags', [])
             adjust_volume = i.data.get('adjustvolume', '')
@@ -971,7 +1031,8 @@ def generate_clips(options):
         voice_over[fragment][type].append({
             "variant": variant,
             "src": target,
-            "duration": format_duration(source.duration, 24),
+            #"duration": format_duration(source.duration, fps, True),
+            "duration": source.duration,
             "subs": subs
         })
     with open(os.path.join(prefix, 'voice_over.json'), 'w') as fd:
diff --git a/utils.py b/utils.py
index 8824d2b..1cdd826 100644
--- a/utils.py
+++ b/utils.py
@@ -57,6 +57,9 @@ def write_if_new(path, data, mode=''):
         with open(path, write_mode) as fd:
             fd.write(data)
 
-def format_duration(duration, fps):
-    return float('%0.5f' % (round(duration * fps) / fps))
+def format_duration(duration, fps, audio=False):
+    """Snap `duration` to a multiple of 1/fps, formatted to 5 decimals."""
+    # audio=True truncates (int) to whole frames; the default rounds.
+    frames = int(duration * fps) if audio else round(duration * fps)
+    return float('%0.5f' % (frames / fps))