Compare commits


4 commits

Author SHA1 Message Date
23b261710b Merge remote-tracking branch 'VideoStory/main' into backup-main 2025-04-20 11:49:00 +02:00
100348f0a4 Original 2025-04-20 11:41:31 +02:00
62815fe58c Original 2025-04-20 11:31:29 +02:00
7e9e55d131 Original 2025-04-20 11:29:26 +02:00
8 changed files with 303 additions and 0 deletions

clean.py (Normal file, 5 additions)

@@ -0,0 +1,5 @@
import re

def clean(input_text):
    # Collapse the "keywords:" marker and newlines into comma separators
    output_text = re.sub(r'keywords:', ', ', input_text)
    output_text = re.sub(r'\n', ', ', output_text)
    return output_text
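A minimal usage sketch of clean() follows; the sample string is hypothetical and only shows how the two substitutions flatten a keyword list onto one line.

from clean import clean

sample = "keywords:\nforest\ncastle\ndragon"
print(clean(sample))  # -> ", , forest, castle, dragon"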

gen.py (Normal file, 80 additions)

@@ -0,0 +1,80 @@
import re
from llama_cpp import Llama
import outetts
from diffusers import StableDiffusionPipeline, StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
import torch
import os

def loadllama():
    # Load the local Llama 3 GGUF model used for story and prompt generation
    llm = Llama(
        model_path="models/llama3.gguf",
        n_ctx=4096,
        n_threads=6,
        n_gpu_layers=-1,
        verbose=False
    )
    return llm

def loadtts():
    # Load the OuteTTS GGUF model and a default speaker profile
    model_config = outetts.GGUFModelConfig_v1(
        model_path="models/tts.gguf",
        language="en",
        n_gpu_layers=-1,
        verbose=False,
    )
    interface = outetts.InterfaceGGUF(model_version="0.2", cfg=model_config)
    speaker = interface.load_default_speaker(name="male_1")
    return interface, speaker

def loadsdxl():
    # Load a Stable Diffusion 1.5 checkpoint from a single .safetensors file
    pipe = StableDiffusionPipeline.from_single_file("models/revAnimated_v2Rebirth.safetensors", torch_dtype=torch.float32, safety_checker=None)
    pipe.to("cuda")
    pipe.enable_xformers_memory_efficient_attention()
    pipe.enable_model_cpu_offload()
    return pipe

def loadsvd():
    # Load Stable Video Diffusion for image-to-video generation
    pipe = StableVideoDiffusionPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
    )
    pipe.to("cuda")
    return pipe

def generate_story(prompt, sprompt, model):
    # Build a ChatML-style prompt (system + user) and run the LLM
    sys_prompt = "<|im_start|>system\n" + sprompt + "<|im_end|>\n"
    user_prompt = (
        "<|im_start|>user\n"
        + prompt + "<|im_end|>\n"
        + "<|im_start|>assistant"
    )
    res = model(sys_prompt + user_prompt, max_tokens=2**16, stop=["</s>"], echo=False)
    return str(res["choices"][0]["text"])

def text_to_speech(prompt, interface, speaker, num):
    # Narrate one scene and save it as audios/<num>.wav
    output = interface.generate(
        text=prompt,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4096,
        speaker=speaker,
    )
    output.save(f"./audios/{num}.wav")

def stableDiffusion(pprompt, nprompt, pipe, num):
    # Generate one image per scene and save it as images/<num>.png
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    image = pipe(prompt=pprompt, negative_prompt=nprompt, num_inference_steps=60, height=512, width=768).images[0]
    image.save(f"./images/{num}.png")

def stableVideoDiffusion(pipe, image, num):
    # Animate a still image into a short clip saved as videos/<num>.mp4
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    image = load_image(image)
    frames = pipe(image, decode_chunk_size=2, num_frames=25).frames[0]
    export_to_video(frames, f"./videos/{num}.mp4", fps=7)
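For reference, a minimal sketch of how the gen.py helpers chain together, in the same order main.py calls them; the prompt strings are made up for illustration and the model files under models/ are assumed to already be in place.

import gen
import torch

llm = gen.loadllama()
story = gen.generate_story("A short tale about a lighthouse keeper.", "You are a storyteller.", model=llm)  # hypothetical prompts
del llm
torch.cuda.empty_cache()

pipe = gen.loadsdxl()
gen.stableDiffusion("a lighthouse at dusk, oil painting", "blurry, low quality", pipe, 0)  # writes images/0.png
del pipe
torch.cuda.empty_cache()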

genXL.py (Normal file, 82 additions)

@@ -0,0 +1,82 @@
import re
from llama_cpp import Llama
import outetts
from diffusers import StableDiffusionXLPipeline, StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
import torch
import os

def loadllama():
    # Load the local Llama 3 GGUF model used for story and prompt generation
    llm = Llama(
        model_path="models/llama3.gguf",
        n_ctx=4096,
        n_threads=6,
        n_gpu_layers=-1,
        verbose=False
    )
    return llm

def loadtts():
    # Load the OuteTTS GGUF model and a default speaker profile
    model_config = outetts.GGUFModelConfig_v1(
        model_path="models/tts.gguf",
        language="en",
        n_gpu_layers=-1,
        verbose=False,
    )
    interface = outetts.InterfaceGGUF(model_version="0.2", cfg=model_config)
    speaker = interface.load_default_speaker(name="male_1")
    return interface, speaker

def loadsdxl():
    # Load an SDXL checkpoint from a single .safetensors file
    pipe = StableDiffusionXLPipeline.from_single_file("models/sdxlReal.safetensors", torch_dtype=torch.float16, variant="fp16", requires_safety_checker=True)
    pipe.to("cuda")
    pipe.enable_xformers_memory_efficient_attention()
    pipe.enable_model_cpu_offload()
    return pipe

def loadsvd():
    # Load Stable Video Diffusion for image-to-video generation
    pipe = StableVideoDiffusionPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
    )
    pipe.to("cuda")
    pipe.enable_xformers_memory_efficient_attention()
    pipe.enable_model_cpu_offload()
    return pipe

def generate_story(prompt, sprompt, model):
    # Build a ChatML-style prompt (system + user) and run the LLM
    sys_prompt = "<|im_start|>system\n" + sprompt + "<|im_end|>\n"
    user_prompt = (
        "<|im_start|>user\n"
        + prompt + "<|im_end|>\n"
        + "<|im_start|>assistant"
    )
    res = model(sys_prompt + user_prompt, max_tokens=2**16, stop=["</s>"], echo=False)
    return str(res["choices"][0]["text"])

def text_to_speech(prompt, interface, speaker, num):
    # Narrate one scene and save it as audios/<num>.wav
    output = interface.generate(
        text=prompt,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4096,
        speaker=speaker,
    )
    output.save(f"./audios/{num}.wav")

def stableDiffusion(pprompt, nprompt, pipe, num):
    # Generate one image per scene and save it as images/<num>.png
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    image = pipe(prompt=pprompt, negative_prompt=nprompt, num_inference_steps=30, height=576, width=1024).images[0]
    image.save(f"./images/{num}.png")

def stableVideoDiffusion(pipe, image, num):
    # Animate a still image into a short clip saved as videos/<num>.mp4
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    image = load_image(image)
    frames = pipe(image, decode_chunk_size=2, num_frames=25).frames[0]
    export_to_video(frames, f"./videos/{num}.mp4", fps=7)

main.py (Normal file, 66 additions)

@@ -0,0 +1,66 @@
import gen
import torch
import video
import os

SYSTEMPROMPTT = ""
SYSTEMPROMPTI = ""
SDBAD = ""

# Load the system prompts (promptUtoT: user idea -> story, promptTtoI: story -> image prompt)
with open("promptUtoT.txt", "r") as file:
    SYSTEMPROMPTT = file.readlines()[0]
with open("promptTtoI.txt", "r") as file:
    SYSTEMPROMPTI = file.readlines()[0]

def main(prompt):
    # 1. Generate the story and split it into scenes (one per non-empty line)
    llm = gen.loadllama()
    raw_text = gen.generate_story(prompt, SYSTEMPROMPTT, model=llm)
    image_prompts = []
    raw_text = raw_text.split("\n")
    raw_text = [item for item in raw_text if item != ""]
    # 2. Ask the LLM for a Stable Diffusion prompt per scene, passing earlier scenes as context
    for i in range(len(raw_text)):
        promptimg = "Context:\n"
        for l in range(0, i - 1):
            promptimg += raw_text[l] + " "
        promptimg += "Current Scene:\n" + raw_text[i]
        image_prompts.append(gen.generate_story(promptimg, SYSTEMPROMPTI, model=llm).strip("\n"))
    # 3. Narrate each scene with TTS
    llm, speaker = gen.loadtts()
    for i in range(len(raw_text)):
        try:
            gen.text_to_speech(raw_text[i], llm, speaker, i)
        except Exception:
            pass
    del llm
    del speaker
    torch.cuda.empty_cache()
    # 4. Render one image per scene
    llm = gen.loadsdxl()
    for elem in range(len(image_prompts)):
        gen.stableDiffusion(image_prompts[elem], SDBAD, llm, elem)
    del llm
    torch.cuda.empty_cache()
    # 5. (disabled) Animate each image with Stable Video Diffusion
    """
    llm = gen.loadsvd()
    for elem in range(len([name for name in os.listdir('./images')])):
        gen.stableVideoDiffusion(llm, f"./images/{elem}.png", elem)
    del llm
    torch.cuda.empty_cache()
    """
    # 6. Stitch images and narration into the final video
    video.create_video(".", ".", "out.mp4")
    return None

main("")

out.mp4 (Normal file, binary)

Binary file not shown.

promptTtoI.txt (Normal file, 1 addition)

@@ -0,0 +1 @@
You are an AI prompt engineer for Stable Diffusion 1.5.

promptUtoT.txt (Normal file, 1 addition)

@@ -0,0 +1 @@
You are a highly skilled and versatile writer, renowned for your ability to craft captivating stories across all genres. Your work is celebrated globally and has been translated into over 73 languages. As a master storyteller, you create intricate, detailed narratives that engage readers and evoke strong emotions. Your writing is characterized by its originality and depth, ensuring that each piece is unique and memorable.

video.py (Normal file, 68 additions)

@@ -0,0 +1,68 @@
import os
from moviepy import *

def create_video(image_folder, audio_folder, output_video):
    # Collect images/<i>.png and the matching audios/<i>.wav narration in scene order
    images = []
    audio_files = []
    for i in range(len([name for name in os.listdir('./images')])):
        images.append(f"./images/{i}.png")
        audio_files.append(f"./audios/{i}.wav")
    clips = []
    for img, audio in zip(images, audio_files):
        # Create an ImageClip
        img_path = os.path.join(image_folder, img)
        image_clip = ImageClip(img_path)
        # Create an AudioFileClip
        audio_path = os.path.join(audio_folder, audio)
        audio_clip = AudioFileClip(audio_path)
        audio_clip = CompositeAudioClip([audio_clip])
        # Show the image for exactly as long as its narration lasts
        image_clip = image_clip.with_duration(audio_clip.duration)
        # Set the audio of the image clip
        image_clip.audio = audio_clip
        # Append the image clip to the list of clips
        clips.append(image_clip)
    # Concatenate all the clips into a single video
    final_video = concatenate_videoclips(clips, method="compose")
    # Write the result to a file
    final_video.write_videofile(output_video, codec='libx264', audio_codec='aac', fps=24)

"""
# Alternative: assemble from the SVD clips in videos/ instead of still images
import os
from moviepy import *
def create_video_from_images_and_audio(image_folder, audio_folder, output_video):
    video_files = []
    audio_files = []
    for i in range(len([name for name in os.listdir('./images')])):
        video_files.append(f"./videos/{i}.mp4")
        audio_files.append(f"./audios/{i}.wav")
    clips = []
    for video, audio in zip(video_files, audio_files):
        video_clip = VideoFileClip(video)
        audio_clip = AudioFileClip(audio)
        # Loop the clip until it is at least as long as the narration, then trim it
        if video_clip.duration < audio_clip.duration:
            loops_needed = int(audio_clip.duration // video_clip.duration) + 1
            video_clip = video_clip.loop(loops=loops_needed)
        video_clip = video_clip.subclip(0, audio_clip.duration)
        video_clip = video_clip.set_audio(audio_clip)
        clips.append(video_clip)
    final_video = concatenate_videoclips(clips, method="compose")
    final_video.write_videofile(output_video, codec='libx264', audio_codec='aac', fps=24)
"""