From 26988acd500c5ec6a686ae86e7e08ed6a212241d Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Sun, 1 Feb 2026 13:08:21 +0100
Subject: [PATCH] vo/video alignment

---
 generate.py        | 98 +++++++++++++++++++++++++++++++++++++---------
 render.py          | 47 ++++++++++++++++++----
 render_kdenlive.py | 36 ++++++++---------
 3 files changed, 138 insertions(+), 43 deletions(-)

diff --git a/generate.py b/generate.py
index 3d940c7..2ed5c83 100644
--- a/generate.py
+++ b/generate.py
@@ -122,6 +122,41 @@ def make_single_character_image(character):
     file.oshash = ox.oshash(file.file.path)
     file.save()
     file.update_sort()
+    file.update_find()
+    return file
+
+def age_character_image(character, age):  # generate an aged variant of a character image and store it as a new Document
+    character = get_character_document(character)  # "P1".."P5" resolve to the stored Document; other values pass through
+    character_url = public_document_url(character)
+    data = {
+        "model": "seedream-4-5-251128",
+        "size": "2K",
+        "watermark": False,
+        'image': character_url,
+        "prompt": "use character from image 1, but make older, change the body, face and appearance to match that of a %d year old person, replace clothing, shoes to match the age, adjust hair style to match the age, keep the full body including feet visible. photo realistic picture of a real person in high detail, studio light" % age
+    }
+    url = bytedance_image_generation(data)  # result is used below as the download URL of the generated image
+    extension = url.split(".")[-1].split("?")[0]  # text after the last "." with any query string dropped
+    if extension == "jpeg":
+        extension = "jpg"
+    file = Document(user=character.user)  # new document is owned by the source character's user
+    file.rightslevel = 2
+    file.data["title"] = character.data['title'] + ' (Age %s)' % age  # same suffix get_character_document(age=...) looks up
+    file.extension = extension
+    file.width = -1
+    file.pages = -1
+    file.uploading = True
+    file.save()
+    file.uploading = True  # NOTE(review): repeats the assignment made before save() - confirm this is intentional
+    name = "data.%s" % file.extension
+    file.file.name = file.path(name)
+    ox.net.save_url(url, file.file.path, overwrite=True)  # download the generated image to the document's file path
+    file.get_info()
+    file.get_ratio()
+    file.oshash = ox.oshash(file.file.path)  # hash is computed from the downloaded file
+    file.save()
+    file.update_sort()
+    file.update_find()
     return file
 
 def make_single_character_image_flux(character):
@@ -149,6 +184,7 @@ def make_single_character_image_flux(character):
     file.oshash = ox.oshash(file.file.path)
     file.save()
     file.update_sort()
+    file.update_find()
     return file
 
 
@@ -456,16 +492,19 @@ def process_frame(item, prompt, character=None, position=0, seed=None):
     img.update_find()
     return img
 
-def get_character_document(character, type="Single Character"):
+def get_character_document(character, type="Single Character", age=None):  # map "P1".."P5" to its stored Document; anything else is returned unchanged
     if character in ("P1", "P2", "P3", "P4", "P5"):
-        return Document.objects.get(data__title=type + " " + character)
+        title = type + " " + character  # e.g. "Single Character P1" with the default type
+        if age:  # None (or 0) selects the un-aged document
+            title += ' (Age %d)' % age
+        return Document.objects.get(data__title=title)  # lookup is by exact title match
     return character
 
 """
 REPLACE_CHARACTER_PROMPT = "Replace the foreground character in image 1 with the character in image 2, keep the posture, clothing, background, light, atmosphere from image 1, but take the facial features and personality from image 2. Make sure the size of the character is adjusted since the new character is a child and make sure the size of the head matches the body. The quality of the image should be the same between foreground and background, adjust the quality of the character to match the background. Use the style of image 1 for the character: if image 1 is a photo make the character a real person, if image 1 is a drawing make the character a drawn character, if image 1 is a comic use a comic character and so on"
 """
 
-REPLACE_CHARACTER_PROMPT = "Replace the foreground character in image 1 with the character in image 2, keep the posture, clothing, background, light, atmosphere from image 1, but take the facial features and personality from image 2. Make sure the size of the character is adjusted since the new character is a child and make sure the size of the head matches the body. The quality of the image should be the same between foreground and background, adjust the quality of the character to match the background. Use the style of image 1 for the character: if image 1 is a photo make the character a real person, if image 1 is a drawing make the character a drawn character, if image 1 is a comic use a comic character"
+REPLACE_CHARACTER_PROMPT = "Replace the foreground character in image 1 with the character in image 2, keep the posture, clothing, background, light, atmosphere from image 1, but take the facial features and personality from image 2. Make sure the size of the character is adjusted since the new character is a child and make sure the size of the head matches the body. The quality of the image should be the same between foreground and background, adjust the quality of the character to match the background. Use the style of image 1 for the character: if image 1 is a photo make the character a real person, if image 1 is a drawing make the character a drawn character, if image 1 is a comic use a comic character, restore any blured out regions of the image"
 
 
 def fal_replace_character(item, character, position=0):
@@ -501,13 +540,15 @@ def fal_replace_character(item, character, position=0):
     return img
 
 
-def replace_character(item, character, position=0, seed=None, extra=None):
+def replace_character(item, character, position=0, seed=None, extra=None, age=None):
     prompt = REPLACE_CHARACTER_PROMPT
-    if character == "P5":
+    if age:
+        prompt = prompt.replace("child", "person")
+    elif character == "P5":
         prompt = prompt.replace("child", "teenager")
     if extra:
         prompt += " " + extra
-    character = get_character_document(character)
+    character = get_character_document(character, age=age)
     if isinstance(character, Document):
         character_url = public_document_url(character)
     else:
@@ -518,6 +559,8 @@ def replace_character(item, character, position=0, seed=None, extra=None):
     else:
         frame.data["character"] = character
     frame.data["position"] = position
+    if age:
+        frame.data["title"] += " (Age %d)" % age
     frame.save()
     return frame
 
@@ -741,7 +784,7 @@ def wan_animate_replace(item, character, keep=False):
         shutil.rmtree(os.path.dirname(output))
     return ai
 
-def ltx_a2v(item, character, prompt=None, first_frame=None, keep=False):
+def ltx_a2v(item, character, prompt=None, first_frame=None, keep=False, expand_prompt=False):
     video_url = public_video_url(item)
     audio_path = item.streams()[0].file.data.path
     if first_frame is None:
@@ -757,10 +800,14 @@ def ltx_a2v(item, character, prompt=None, first_frame=None, keep=False):
     image_url = public_document_url(first_frame)
     prefix = "/srv/pandora/static/power/cache/%s_a2v" % (item.public_id)
     os.makedirs(prefix, exist_ok=True)
-    output = prefix + '/audio.m4a'
-    if not os.path.exists(output):
-        cmd = ['ffmpeg', '-hide_banner', '-nostats', '-i', audio_path, '-vn', '-c:a', 'copy', output]
-        subprocess.call(cmd)
+    if audio_path.endswith('.mp3'):
+        output = prefix + '/audio.mp3'
+        shutil.copy(audio_path, output)
+    else:
+        output = prefix + '/audio.m4a'
+        if not os.path.exists(output):
+            cmd = ['ffmpeg', '-hide_banner', '-nostats', '-i', audio_path, '-vn', '-c:a', 'copy', output]
+            subprocess.call(cmd)
     if not os.path.exists(output):
         raise Exception
     audio_url = public_url(output)
@@ -776,6 +823,7 @@ def ltx_a2v(item, character, prompt=None, first_frame=None, keep=False):
         "fps": 24,
         "prompt": prompt,
         "enable_safety_checker": False,
+        "enable_prompt_expansion": expand_prompt,
     }
     print(data)
     handler = fal_client.submit(model, arguments=data)
@@ -797,6 +845,15 @@ def ltx_a2v(item, character, prompt=None, first_frame=None, keep=False):
         shutil.rmtree(prefix)
     return ai
 
+
+def vo2video(vo, item, character, position=0, prompt=None, expand_prompt=False):  # build a video of `character` in `item`'s frame, driven by the audio of `vo`
+    first_frame = replace_character(item, character, position)  # frame of `item` at `position` with the character swapped in
+    if prompt is None:
+        # the painting becomes animated and the girl looks into the camera and speaks
+        prompt = "the scene and character become animated, the character looks into the camera and speaks"
+    return ltx_a2v(vo, character=character, prompt=prompt, first_frame=first_frame, expand_prompt=expand_prompt)  # ltx_a2v reads the audio stream from its first argument, so pass the voice-over item
+
+
 def ltx_v2v(item, character, prompt=None, keep=False):
     video_url = public_video_url(item)
     character = get_character_document(character)
@@ -1430,8 +1487,8 @@ def add_ai_variant(item, video_path, type):
     file.oshash = ox.oshash(video_path)
     file.item = ai
     file.path = "%s.mp4" % type
-    file.extension = "mp4"
     file.info = ox.avinfo(video_path)
+    file.info['extension'] = "mp4"
     del file.info["path"]
     file.parse_info()
     file.data.name = file.get_path("data." + video_path.split(".")[-1])
@@ -1469,6 +1526,7 @@ def add_ai_image(item, position, url, extension=None):
     file.oshash = ox.oshash(file.file.path)
     file.save()
     file.update_sort()
+    file.update_find()
     file.add(item)
     return file
 
@@ -1490,9 +1548,10 @@ def extract_firstframe(character='P1'):
                 item.refresh_from_db()
                 add_tag(item, 'ai-failed')
 
-def process_reshoot_firstframe(character='P1'):
+def process_reshoot_firstframe(character='P1', age=None, l=None):
     position = 0
-    l = itemlist.models.List.objects.get(name='Reshoot-Firstframe')
+    if l is None:
+        l = itemlist.models.List.objects.get(name='Reshoot-Firstframe')
     for item in l.items.all():
         if 'ai-failed' in item.data.get('tags', []):
             print('>> skip', item)
@@ -1501,13 +1560,16 @@ def process_reshoot_firstframe(character='P1'):
             pass
             #reshoot_item_segments(item, character)
         else:
-            cid = get_character_document(character).get_id()
-            first_frame = item.documents.filter(
+            cid = get_character_document(character, age=age).get_id()
+            qs = item.documents.filter(
                 data__character=cid, data__position=position
-            ).order_by('-created').first()
+            )
+            if age:
+                qs = qs.filter(data__title__contains='(Age %d)' % age)
+            first_frame = qs.order_by('-created').first()
             if not first_frame:
                 try:
-                    first_frame = replace_character(item, character, position)
+                    first_frame = replace_character(item, character, position, age=age)
                 except:
                     item.refresh_from_db()
                     add_tag(item, 'ai-failed')
diff --git a/render.py b/render.py
index 15ab713..7c9d2c9 100644
--- a/render.py
+++ b/render.py
@@ -153,7 +153,7 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
             src.split('/')[-2]
         ))
 
-        scene['front']['V2'].append({
+        scene['front']['V1'].append({
             'duration': clip_duration,
             'id': clip['id'],
             'src': src,
@@ -193,11 +193,11 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
 
         length = format_duration(length, fps)
         ad = get_scene_duration(scene, track='audio-front:A2')
-        vd = get_scene_duration(scene, track='front:V2')
+        vd = get_scene_duration(scene, track='front:V1')
         if ad == vd and abs(ad-length) > 1/48:
             print('v: ', vd, 'ad', ad, 'length:', length, 'fixup')
             length = ad
-        if abs(length -vd) > 1/48 or abs(length - ad) > 1/48 or ad != vd:
+        if abs(length - vd) > 1/48 or abs(length - ad) > 1/48 or ad != vd:
             print('vd: ', vd, 'ad', ad, 'length:', length)
             print(clip)
             sys.exit(-1)
@@ -237,6 +237,7 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
                 ('audio-center', 'A2'),
                 ('audio-rear', 'A1'),
                 ('audio-rear', 'A2'),
+                ('front', 'V2'),
             ):
                 scene[tl][track].append({
                     'blank': True,
@@ -299,6 +300,19 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
                         'blank': True,
                         'duration': voc["duration"]
                     })
+            if 'ai' in vo:
+                scene['front']['V2'].append({
+                    'duration': vo['duration'],
+                    'id': vo['id'],
+                    'src': vo['ai'],
+                    "filter": {
+                    }
+                })
+            else:
+                scene['front']['V2'].append({
+                    'blank': True,
+                    'duration': vo["duration"]
+                })
             sub_offset += voc["duration"]
         if subs:
             scene["subtitles"] = subs
@@ -315,6 +329,10 @@ def compose(clips, fragment, target=150, base=1024, voice_over=None, options=Non
             'blank': True,
             'duration': gap
         })
+        scene['front']['V2'].append({
+            'blank': True,
+            'duration': gap
+        })
         sub_offset += gap
     '''
     print("scene duration: %0.3f vo: %0.3f (length: %0.3f, target: %0.3f)" % (
@@ -1063,6 +1081,19 @@ def generate_clips(options):
             "duration": source.duration,
             "subs": subs
         }
+        ai = item.models.Item.objects.filter(
+            data__title=vo.data['title'],
+            data__type__contains='ai:audio-to-video'
+        ).first()
+        if ai:
+            ai_source = ai.files.filter(selected=True)[0]
+            ai_src = ai_source.data.path
+            ai_target = os.path.join(prefix, 'voice_video', fragment, '%s-%s.%s' % (type, variant, 'mp4'))
+            os.makedirs(os.path.dirname(ai_target), exist_ok=True)
+            if os.path.islink(ai_target):
+                os.unlink(ai_target)
+            os.symlink(ai_src, ai_target)
+            vo_variant['ai'] = ai_target
         done = False
         if type == 'quote':
             if '-a-t' in variant:
@@ -1149,17 +1180,19 @@ def unused_tags():
 
     used_tags = set(tags)
     used_anti_tags = set(anti_tags)
-    all_tags = {t.value.strip().lower() for t in item.models.Facet.objects.filter(key='tags').distinct()}
+    skip_tags = {'ai-failed', 'ai-fail', 'skip'}
+    all_tags = {t.value.strip().lower() for t in item.models.Facet.objects.filter(key='tags').distinct() if t.value.strip().lower() not in skip_tags}
     unused_tags = all_tags - used_tags - used_anti_tags
     unused_items = itemlist.models.List.objects.get(name='Unused Material').items.all()
 
     unused = []
     for tag in sorted(unused_tags):
+        total = item.models.Item.objects.filter(data__type__contains='source').filter(data__tags__icontains=tag).count()
         count = unused_items.filter(data__tags__icontains=tag).count()
-        unused.append([count, tag])
+        unused.append([count, tag, total])
     with open("/srv/pandora/static/power/unused-tags.txt", "w") as fd:
-        for count, tag in reversed(sorted(unused)):
-            fd.write("%s (%d unused video clips)\n" % (tag, count))
+        for count, tag, total in reversed(sorted(unused)):
+            fd.write("%s (%d unused video clips of %s)\n" % (tag, count, total))
 
 def fragment_statistics():
     import itemlist.models
diff --git a/render_kdenlive.py b/render_kdenlive.py
index efcd480..26f9016 100644
--- a/render_kdenlive.py
+++ b/render_kdenlive.py
@@ -375,7 +375,7 @@ class KDEnliveProject:
                     ["sum", "1"],
                 ]),
                 self.get_element("transition", [
-                    ["a_track", "0"],
+                    ["a_track", "5"],
                     ["b_track", "6"],
                     ["compositing", "0"],
                     ["distort", "0"],
@@ -601,10 +601,10 @@ class KDEnliveProject:
             print('!!', track_id)
 
         frames = int(round(self._fps * clip['duration']))
-        if track_id[0] == "V":
+        if track_id[0] == "V" and not clip.get("blank"):
             if abs(self._fps * clip['duration'] - frames) > 1/48:
-                delta = abs(self._fps * clip['duration'] - frames) * 24
-                print("Track alignment issues", self._fps * clip['duration'], frames, clip.get('src', clip), delta)
+                delta = abs((self._fps * clip['duration']) - frames)
+                print("Track alignment issues", track_id, self._fps * clip['duration'], frames, clip.get('src', clip), delta)
         self._duration[track_id] += frames
 
         if clip.get("blank"):
@@ -648,20 +648,20 @@ class KDEnliveProject:
                         height = target_height
                         print("scale to fill %s %sx%s" % (path, width, height))
                 rect = "00:00:00.000=%s %s %s %s 1.000000" % (x, y, width, height)
-                filters_.append(
-                    self.get_element("filter", [
-                        ["mlt_service", "qtblend"],
-                        ["kdenlive_id", "qtblend"],
-                        ["rotate_center", "1"],
-                        ["rect", rect],
-                        ["rotation", "00:00:00.000=0"],
-                        ["compositing", "0"],
-                        ["distort", "0"],
-                        ["kdenlive:collapsed", "0"],
-                        ["disable", "0"],
-                    ])
-                )
-
+                if width != self._width or height != self._height or x or y:
+                    filters_.append(
+                        self.get_element("filter", [
+                            ["mlt_service", "qtblend"],
+                            ["kdenlive_id", "qtblend"],
+                            ["rotate_center", "1"],
+                            ["rect", rect],
+                            ["rotation", "00:00:00.000=0"],
+                            ["compositing", "0"],
+                            ["distort", "0"],
+                            ["kdenlive:collapsed", "0"],
+                            ["disable", "0"],
+                        ])
+                    )
         for ft in filters.items():
             filters_ += self.get_filter(*ft)
         if track_id[0] == 'A' and not has_audio: