various duration issues, prepare for double vo special case

source material might not be in same fps, import with profile
fix scene duration
2026-01-26 18:35:29 +01:00 · 2026-01-26 18:34:55 +01:00 · 2026-01-26 18:34:27 +01:00
4 changed files with 108 additions and 35 deletions
--- a/render.py
+++ b/render.py
@@ -61,6 +61,7 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
        },
        'audio-center': {
            'A1': [],
            'A2': [],
        },
        'audio-front': {
            'A1': [],
@@ -132,7 +133,9 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
        next_length = length + clip['duration'] 
        if target - next_length < -target*0.1:
            break
-        clip_duration = int(clip['duration'] * fps) / fps
+        clip_duration = format_duration(clip['duration'], fps)
        if clip['duration'] != clip_duration:
            print("WTF", clip, clip['duration'], clip_duration)
        length += clip_duration
        # 50/50 source or ai
@@ -145,13 +148,13 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
        print('%07.3f-%07.3f %07.3f %s (%s)' % (
            length-clip_duration,
            length,
-            clip['duration'],
+            clip_duration,
            os.path.basename(clip['source']),
            src.split('/')[-2]
        ))
        scene['front']['V2'].append({
-            'duration': clip['duration'],
+            'duration': clip_duration,
            'src': src,
            "filter": {
            }
@@ -181,10 +184,21 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
            'fadein': '00:00:00.125'
        }
        scene['audio-front']['A2'].append({
-            'duration': clip['duration'],
+            'duration': clip_duration,
            'src': audio,
            'filter': audio_filter.copy()
        })
        length = format_duration(length, fps)
        ad = get_scene_duration(scene, track='audio-front:A2')
        vd = get_scene_duration(scene, track='front:V2')
        if ad == vd and abs(ad-length) > 1/48:
            print('v: ', vd, 'ad', ad, 'length:', length, 'fixup')
            length = ad
        if abs(length -vd) > 1/48 or abs(length - ad) > 1/48 or ad != vd:
            print('vd: ', vd, 'ad', ad, 'length:', length)
            print(clip)
            sys.exit(-1)
        used.append(clip)
        if not clips and target - length > 0:
            print("not enough clips, need to reset")
@@ -200,7 +214,7 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
                if "ai" in clip:
                    clip["use_ai"] = True
-    scene_duration = int(get_scene_duration(scene) * fps)
+    scene_duration = int(round(get_scene_duration(scene) * fps))
    voice_overs = []
    sub_offset = 0
    subs = []
@@ -214,22 +228,28 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
            else:
                gap = (2 * fps + random_int(seq, 5 * fps)) / fps
            gap = format_duration(gap, fps)
-            if int((sub_offset + gap)* fps) > scene_duration:
+            if int((sub_offset + gap) * fps) > scene_duration:
                gap = format_duration((scene_duration - int(sub_offset * fps)) / fps, fps)
-            scene['audio-center']['A1'].append({
+            for tl, track in (
-                'blank': True,
+                ('audio-center', 'A1'),
-                'duration': gap
+                ('audio-center', 'A2'),
-            })
+                ('audio-rear', 'A1'),
-            scene['audio-rear']['A1'].append({
+                ('audio-rear', 'A2'),
-                'blank': True,
+            ):
-                'duration': gap
+                scene[tl][track].append({
-            })
+                    'blank': True,
                    'duration': gap
                })
            print('%07.3f-%07.3f %07.3f' % (sub_offset, sub_offset+gap, gap), 'silence')
            sub_offset += gap
            vo_key = random_choice(seq, vo_keys, pop=True)
            variant = random_int(seq, len(voice_over[vo_key]))
            vo = voice_over[vo_key][variant]
            if isinstance(vo, list):
                vo, vo_b = vo
            else:
                vo_b = None
            while int((vo['duration'] + sub_offset) * fps) > scene_duration:
                if not vo_keys:
                    vo = None
@@ -237,6 +257,10 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
                vo_key = random_choice(seq, vo_keys, pop=True)
                variant = random_int(seq, len(voice_over[vo_key]))
                vo = voice_over[vo_key][variant]
                if isinstance(vo, list):
                    vo, vo_b = vo
                else:
                    vo_b = None
            if vo is None:
                break
            print('%07.3f-%07.3f %07.3f' % (sub_offset, sub_offset+vo["duration"], vo["duration"]), vo["src"].split('/')[-1])
@@ -255,13 +279,29 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
                sub["in"] += sub_offset
                sub["out"] += sub_offset
                subs.append(sub)
            if vo_b:
                vo_b = vo_b.copy()
                vo_b['filter'] = {'volume': a}
                scene['audio-center']['A2'].append(vo_b)
                vo_b = vo_b.copy()
                vo_b['filter'] = {'volume': b}
                scene['audio-rear']['A1'].append(vo_b)
            else:
                for tl, track in (
                    ('audio-center', 'A2'),
                    ('audio-rear', 'A2'),
                ):
                    scene[tl][track].append({
                        'blank': True,
                        'duration': voc["duration"]
                    })
            sub_offset += voc["duration"]
        if subs:
            scene["subtitles"] = subs
        sub_offset = format_duration(sub_offset, fps)
-    sub_offset = int(sub_offset * fps)
+    if sub_offset < scene_duration/fps:
-    if sub_offset < scene_duration:
+        gap = scene_duration/fps - sub_offset
        gap = format_duration((scene_duration - sub_offset) / fps, fps)
        print('%07.3f-%07.3f %07.3f' % (sub_offset, sub_offset+gap, gap), 'silence')
        scene['audio-center']['A1'].append({
            'blank': True,
@@ -272,7 +312,18 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
            'duration': gap
        })
        sub_offset += gap
-    print("scene duration %0.3f (target: %0.3f)" % (length, target))
+    '''
    print("scene duration: %0.3f vo: %0.3f (length: %0.3f, target: %0.3f)" % (
        get_scene_duration(scene),
        sub_offset,
        length,
        target
    ))
    '''
    print("scene duration: %0.3f (target: %0.3f)" % (
        get_scene_duration(scene),
        target
    ))
    return scene, used
 def write_subtitles(data, folder, options):
@@ -560,10 +611,14 @@ def render_all(options):
            for a, b in (
                ("front-mixed.mp4", "front.mp4"),
                ("audio-center.wav", "front.mp4"),
                ("audio-rear.wav", "front.mp4"),
                ("audio-front.wav", "front.mp4"),
                ("audio-5.1.mp4", "front.mp4"),
            ):
                duration_a = ox.avinfo(str(fragment_prefix / a))['duration']
                duration_b = ox.avinfo(str(fragment_prefix / b))['duration']
-                if duration_a != duration_b:
+                if abs(duration_a - duration_b) > 1/48:
                    print('!!', duration_a, fragment_prefix / a)
                    print('!!', duration_b, fragment_prefix / b)
                    sys.exit(-1)
@@ -854,6 +909,7 @@ def generate_clips(options):
    import item.models
    import itemlist.models
    fps = 24
    options = load_defaults(options)
    prefix = options['prefix']
    lang, tlang = parse_lang(options["lang"])
@@ -873,7 +929,8 @@ def generate_clips(options):
                    continue
                if not e.files.filter(selected=True).exists():
                    continue
-                source = e.files.filter(selected=True)[0].data.path
+                selected = e.files.filter(selected=True)[0]
                source = selected.data.path
                ext = os.path.splitext(source)[1]
                type_ = e.data['type'][0].lower()
                if type_.startswith('ai:'):
@@ -885,7 +942,7 @@ def generate_clips(options):
                        ai_type = '%s-%s' % (type_[3:], n)
                        n += 1
                    clip['ai'][ai_type] = target
-                    type_ = ai_type
+                    type_ = 'ai:' + ai_type
                target = os.path.join(prefix, 'video', type_, i.data['title'] + ext)
                if type_ == "source":
                    source_target = target
@@ -898,19 +955,22 @@ def generate_clips(options):
                if os.path.islink(target):
                    os.unlink(target)
                os.symlink(source, target)
-                durations.append(e.files.filter(selected=True)[0].duration)
+                durations.append(selected.duration)
            if not durations:
                print(i.public_id, 'no duration!', clip)
                continue
-            clip["duration"] = min(durations)
+            if len(set(durations)) > 1:
                print(clip, durations)
            clip["duration"] = min(durations) - 1/24
            # trim to a multiple of the output fps
-            d1 = int(clip["duration"] * 24) / 24
+            d1 = format_duration(clip["duration"], fps)
            if d1 != clip["duration"]:
                clip["duration"] = d1
            if not clip["duration"]:
                print('!!', durations, clip)
                continue
-            cd = format_duration(clip["duration"], 24)
+            cd = format_duration(clip["duration"], fps)
            clip["duration"] = cd
            clip['tags'] = i.data.get('tags', [])
            adjust_volume = i.data.get('adjustvolume', '')
@@ -971,7 +1031,8 @@ def generate_clips(options):
        voice_over[fragment][type].append({
            "variant": variant,
            "src": target,
-            "duration": format_duration(source.duration, 24),
+            #"duration": format_duration(source.duration, fps, True),
            "duration": source.duration,
            "subs": subs
        })
    with open(os.path.join(prefix, 'voice_over.json'), 'w') as fd:
--- a/render_kdenlive.py
+++ b/render_kdenlive.py
@@ -21,7 +21,7 @@ def get_melt():
        cmd = ['xvfb-run', '-a'] + cmd
    return cmd
-def melt_xml(file):
+def melt_xml(file, profile='atsc_1080p_24'):
    out = None
    real_path = os.path.realpath(file)
    if file in _CACHE and isinstance(_CACHE[file], list):
@@ -29,7 +29,7 @@ def melt_xml(file):
        if os.stat(real_path).st_mtime != ts:
            out = None
    if not out:
-        cmd = get_melt() + [file, '-consumer', 'xml']
+        cmd = get_melt() + [file, '-profile', profile, '-consumer', 'xml']
        out = subprocess.check_output(cmd).decode()
        _CACHE[file] = [os.stat(real_path).st_mtime, out]
    return out
@@ -73,6 +73,7 @@ class KDEnliveProject:
        self._width = int(width)
        self._height = int(height)
        self._fps = int(frame_rate_num) / int(frame_rate_den)
        self.profile = 'atsc_1080p_24'
        self._tree = self.get_element("mlt", attrib={
            "LC_NUMERIC": "C",
@@ -444,7 +445,7 @@ class KDEnliveProject:
        return prefix + self.get_counter(prefix)
    def get_chain(self, file, kdenlive_id=None):
-        out = melt_xml(file)
+        out = melt_xml(file, self.profile)
        chain = lxml.etree.fromstring(out).xpath('producer')[0]
        chain.tag = 'chain'
        chain.attrib['id'] = self.get_id('chain')
@@ -597,7 +598,11 @@ class KDEnliveProject:
        else:
            print('!!', track_id)
-        frames = int(self._fps * clip['duration'])
+        frames = int(round(self._fps * clip['duration']))
        if track_id[0] == "V":
            if abs(self._fps * clip['duration'] - frames) > 1/48:
                delta = abs(self._fps * clip['duration'] - frames) * 24
                print("Track alignment issues", self._fps * clip['duration'], frames, clip.get('src', clip), delta)
        self._duration[track_id] += frames
        if clip.get("blank"):
--- a/render_utils.py
+++ b/render_utils.py
@@ -5,6 +5,7 @@ import lxml.etree
 import ox
 from .render_kdenlive import melt_xml
 from .utils import format_duration
 def parse_lang(lang):
@@ -57,17 +58,20 @@ def get_clip_by_seqid(clips, seqid):
    return None
-def get_scene_duration(scene):
+def get_scene_duration(scene, fps=24, track=None):
    if isinstance(scene, str):
        with open(scene) as fd:
            scene = json.load(fd)
    duration = 0
    for key, value in scene.items():
        for name, clips in value.items():
            if track and '%s:%s' % (key, name) != track:
                continue
            if clips:
                for clip in clips:
-                    duration += int(clip["duration"] * 24)
+                    duration += round(clip["duration"] * fps)
-                return duration / 24
+                #print("scene duration based on %s:%s is %s %s" % (key, name, duration / fps, format_duration(duration / fps, fps)))
                return duration / fps
 def get_offset_duration(prefix):
--- a/utils.py
+++ b/utils.py
@@ -57,6 +57,9 @@ def write_if_new(path, data, mode=''):
        with open(path, write_mode) as fd:
            fd.write(data)
-def format_duration(duration, fps):
+def format_duration(duration, fps, audio=False):
-    return float('%0.5f' % (round(duration * fps) / fps))
+    if audio:
        return float('%0.5f' % (int(duration * fps) / fps))
    else:
        return float('%0.5f' % (round(duration * fps) / fps))
Author	SHA1	Message	Date
j	f8cbbd55c7	various duration issues, prepare for double vo special case	2026-01-26 18:35:29 +01:00
j	c47e6a5e15	source material might not be in same fps, import with profile	2026-01-26 18:34:55 +01:00
j	0c4f55006d	fix scene duration	2026-01-26 18:34:27 +01:00