Compare commits


4 commits

Author SHA1 Message Date
23b261710b Merge remote-tracking branch 'VideoStory/main' into backup-main 2025-04-20 11:49:00 +02:00
100348f0a4 Original 2025-04-20 11:41:31 +02:00
62815fe58c Original 2025-04-20 11:31:29 +02:00
7e9e55d131 Original 2025-04-20 11:29:26 +02:00
8 changed files with 303 additions and 0 deletions

clean.py (Normal file, 5 additions)

@@ -0,0 +1,5 @@
import re

def clean(input_text):
    # Collapse the "keywords:" marker and newlines into comma separators
    output_text = re.sub(r'keywords:', ', ', input_text)
    output_text = re.sub(r'\n', ', ', output_text)
    return output_text
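A minimal usage sketch of clean() follows; the sample string is hypothetical and only shows how the two substitutions flatten a keyword list onto one line.

from clean import clean

sample = "keywords:\nforest\ncastle\ndragon"
print(clean(sample))  # -> ", , forest, castle, dragon"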

gen.py (Normal file, 80 additions)

@@ -0,0 +1,80 @@
import re
from llama_cpp import Llama
import outetts
from diffusers import StableDiffusionPipeline, StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
import torch
import os

def loadllama():
    # Load the local Llama 3 GGUF model used for story and prompt generation
    llm = Llama(
        model_path="models/llama3.gguf",
        n_ctx=4096,
        n_threads=6,
        n_gpu_layers=-1,
        verbose=False
    )
    return llm

def loadtts():
    # Load the OuteTTS GGUF model and a default speaker profile
    model_config = outetts.GGUFModelConfig_v1(
        model_path="models/tts.gguf",
        language="en",
        n_gpu_layers=-1,
        verbose=False,
    )
    interface = outetts.InterfaceGGUF(model_version="0.2", cfg=model_config)
    speaker = interface.load_default_speaker(name="male_1")
    return interface, speaker

def loadsdxl():
    # Load a Stable Diffusion 1.5 checkpoint from a single .safetensors file
    pipe = StableDiffusionPipeline.from_single_file("models/revAnimated_v2Rebirth.safetensors", torch_dtype=torch.float32, safety_checker=None)
    pipe.to("cuda")
    pipe.enable_xformers_memory_efficient_attention()
    pipe.enable_model_cpu_offload()
    return pipe

def loadsvd():
    # Load Stable Video Diffusion for image-to-video generation
    pipe = StableVideoDiffusionPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
    )
    pipe.to("cuda")
    return pipe

def generate_story(prompt, sprompt, model):
    # Build a ChatML-style prompt (system + user) and run the LLM
    sys_prompt = "<|im_start|>system\n" + sprompt + "<|im_end|>\n"
    user_prompt = (
        "<|im_start|>user\n"
        + prompt + "<|im_end|>\n"
        + "<|im_start|>assistant"
    )
    res = model(sys_prompt + user_prompt, max_tokens=2**16, stop=["</s>"], echo=False)
    return str(res["choices"][0]["text"])

def text_to_speech(prompt, interface, speaker, num):
    # Narrate one scene and save it as audios/<num>.wav
    output = interface.generate(
        text=prompt,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4096,
        speaker=speaker,
    )
    output.save(f"./audios/{num}.wav")

def stableDiffusion(pprompt, nprompt, pipe, num):
    # Generate one image per scene and save it as images/<num>.png
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    image = pipe(prompt=pprompt, negative_prompt=nprompt, num_inference_steps=60, height=512, width=768).images[0]
    image.save(f"./images/{num}.png")

def stableVideoDiffusion(pipe, image, num):
    # Animate a still image into a short clip saved as videos/<num>.mp4
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    image = load_image(image)
    frames = pipe(image, decode_chunk_size=2, num_frames=25).frames[0]
    export_to_video(frames, f"./videos/{num}.mp4", fps=7)
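For reference, a minimal sketch of how the gen.py helpers chain together, in the same order main.py calls them; the prompt strings are made up for illustration and the model files under models/ are assumed to already be in place.

import gen
import torch

llm = gen.loadllama()
story = gen.generate_story("A short tale about a lighthouse keeper.", "You are a storyteller.", model=llm)  # hypothetical prompts
del llm
torch.cuda.empty_cache()

pipe = gen.loadsdxl()
gen.stableDiffusion("a lighthouse at dusk, oil painting", "blurry, low quality", pipe, 0)  # writes images/0.png
del pipe
torch.cuda.empty_cache()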

genXL.py (Normal file, 82 additions)

@@ -0,0 +1,82 @@
import re
from llama_cpp import Llama
import outetts
from diffusers import StableDiffusionXLPipeline, StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
import torch
import os

def loadllama():
    # Load the local Llama 3 GGUF model used for story and prompt generation
    llm = Llama(
        model_path="models/llama3.gguf",
        n_ctx=4096,
        n_threads=6,
        n_gpu_layers=-1,
        verbose=False
    )
    return llm

def loadtts():
    # Load the OuteTTS GGUF model and a default speaker profile
    model_config = outetts.GGUFModelConfig_v1(
        model_path="models/tts.gguf",
        language="en",
        n_gpu_layers=-1,
        verbose=False,
    )
    interface = outetts.InterfaceGGUF(model_version="0.2", cfg=model_config)
    speaker = interface.load_default_speaker(name="male_1")
    return interface, speaker

def loadsdxl():
    # Load an SDXL checkpoint from a single .safetensors file
    pipe = StableDiffusionXLPipeline.from_single_file("models/sdxlReal.safetensors", torch_dtype=torch.float16, variant="fp16", requires_safety_checker=True)
    pipe.to("cuda")
    pipe.enable_xformers_memory_efficient_attention()
    pipe.enable_model_cpu_offload()
    return pipe

def loadsvd():
    # Load Stable Video Diffusion for image-to-video generation
    pipe = StableVideoDiffusionPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
    )
    pipe.to("cuda")
    pipe.enable_xformers_memory_efficient_attention()
    pipe.enable_model_cpu_offload()
    return pipe

def generate_story(prompt, sprompt, model):
    # Build a ChatML-style prompt (system + user) and run the LLM
    sys_prompt = "<|im_start|>system\n" + sprompt + "<|im_end|>\n"
    user_prompt = (
        "<|im_start|>user\n"
        + prompt + "<|im_end|>\n"
        + "<|im_start|>assistant"
    )
    res = model(sys_prompt + user_prompt, max_tokens=2**16, stop=["</s>"], echo=False)
    return str(res["choices"][0]["text"])

def text_to_speech(prompt, interface, speaker, num):
    # Narrate one scene and save it as audios/<num>.wav
    output = interface.generate(
        text=prompt,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4096,
        speaker=speaker,
    )
    output.save(f"./audios/{num}.wav")

def stableDiffusion(pprompt, nprompt, pipe, num):
    # Generate one image per scene and save it as images/<num>.png
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    image = pipe(prompt=pprompt, negative_prompt=nprompt, num_inference_steps=30, height=576, width=1024).images[0]
    image.save(f"./images/{num}.png")

def stableVideoDiffusion(pipe, image, num):
    # Animate a still image into a short clip saved as videos/<num>.mp4
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    image = load_image(image)
    frames = pipe(image, decode_chunk_size=2, num_frames=25).frames[0]
    export_to_video(frames, f"./videos/{num}.mp4", fps=7)

main.py (Normal file, 66 additions)

@@ -0,0 +1,66 @@
import gen
import torch
import video
import os

SYSTEMPROMPTT = ""
SYSTEMPROMPTI = ""
SDBAD = ""

# Load the system prompts (promptUtoT: user idea -> story, promptTtoI: story -> image prompt)
with open("promptUtoT.txt", "r") as file:
    SYSTEMPROMPTT = file.readlines()[0]
with open("promptTtoI.txt", "r") as file:
    SYSTEMPROMPTI = file.readlines()[0]

def main(prompt):
    # 1. Generate the story and split it into scenes (one per non-empty line)
    llm = gen.loadllama()
    raw_text = gen.generate_story(prompt, SYSTEMPROMPTT, model=llm)
    image_prompts = []
    raw_text = raw_text.split("\n")
    raw_text = [item for item in raw_text if item != ""]
    # 2. Ask the LLM for a Stable Diffusion prompt per scene, passing earlier scenes as context
    for i in range(len(raw_text)):
        promptimg = "Context:\n"
        for l in range(0, i - 1):
            promptimg += raw_text[l] + " "
        promptimg += "Current Scene:\n" + raw_text[i]
        image_prompts.append(gen.generate_story(promptimg, SYSTEMPROMPTI, model=llm).strip("\n"))
    # 3. Narrate each scene with TTS
    llm, speaker = gen.loadtts()
    for i in range(len(raw_text)):
        try:
            gen.text_to_speech(raw_text[i], llm, speaker, i)
        except Exception:
            pass
    del llm
    del speaker
    torch.cuda.empty_cache()
    # 4. Render one image per scene
    llm = gen.loadsdxl()
    for elem in range(len(image_prompts)):
        gen.stableDiffusion(image_prompts[elem], SDBAD, llm, elem)
    del llm
    torch.cuda.empty_cache()
    # 5. (disabled) Animate each image with Stable Video Diffusion
    """
    llm = gen.loadsvd()
    for elem in range(len([name for name in os.listdir('./images')])):
        gen.stableVideoDiffusion(llm, f"./images/{elem}.png", elem)
    del llm
    torch.cuda.empty_cache()
    """
    # 6. Stitch images and narration into the final video
    video.create_video(".", ".", "out.mp4")
    return None

main("")

out.mp4 (Normal file, binary)

Binary file not shown.

promptTtoI.txt (Normal file, 1 addition)

@@ -0,0 +1 @@
You are an AI prompt engineer for Stable Diffusion 1.5.

promptUtoT.txt (Normal file, 1 addition)

@@ -0,0 +1 @@
You are a highly skilled and versatile writer, renowned for your ability to craft captivating stories across all genres. Your work is celebrated globally and has been translated into over 73 languages. As a master storyteller, you create intricate, detailed narratives that engage readers and evoke strong emotions. Your writing is characterized by its originality and depth, ensuring that each piece is unique and memorable.

video.py (Normal file, 68 additions)

@@ -0,0 +1,68 @@
import os
from moviepy import *

def create_video(image_folder, audio_folder, output_video):
    # Collect images/<i>.png and the matching audios/<i>.wav narration in scene order
    images = []
    audio_files = []
    for i in range(len([name for name in os.listdir('./images')])):
        images.append(f"./images/{i}.png")
        audio_files.append(f"./audios/{i}.wav")
    clips = []
    for img, audio in zip(images, audio_files):
        # Create an ImageClip
        img_path = os.path.join(image_folder, img)
        image_clip = ImageClip(img_path)
        # Create an AudioFileClip
        audio_path = os.path.join(audio_folder, audio)
        audio_clip = AudioFileClip(audio_path)
        audio_clip = CompositeAudioClip([audio_clip])
        # Show the image for exactly as long as its narration lasts
        image_clip = image_clip.with_duration(audio_clip.duration)
        # Set the audio of the image clip
        image_clip.audio = audio_clip
        # Append the image clip to the list of clips
        clips.append(image_clip)
    # Concatenate all the clips into a single video
    final_video = concatenate_videoclips(clips, method="compose")
    # Write the result to a file
    final_video.write_videofile(output_video, codec='libx264', audio_codec='aac', fps=24)

"""
# Alternative: assemble from the SVD clips in videos/ instead of still images
import os
from moviepy import *
def create_video_from_images_and_audio(image_folder, audio_folder, output_video):
    video_files = []
    audio_files = []
    for i in range(len([name for name in os.listdir('./images')])):
        video_files.append(f"./videos/{i}.mp4")
        audio_files.append(f"./audios/{i}.wav")
    clips = []
    for video, audio in zip(video_files, audio_files):
        video_clip = VideoFileClip(video)
        audio_clip = AudioFileClip(audio)
        # Loop the clip until it is at least as long as the narration, then trim it
        if video_clip.duration < audio_clip.duration:
            loops_needed = int(audio_clip.duration // video_clip.duration) + 1
            video_clip = video_clip.loop(loops=loops_needed)
        video_clip = video_clip.subclip(0, audio_clip.duration)
        video_clip = video_clip.set_audio(audio_clip)
        clips.append(video_clip)
    final_video = concatenate_videoclips(clips, method="compose")
    final_video.write_videofile(output_video, codec='libx264', audio_codec='aac', fps=24)
"""