From 31f2fec625b57374143fbafc5c19cef2f6b61080 Mon Sep 17 00:00:00 2001
From: Joachim
Date: Sun, 20 Apr 2025 12:37:25 +0200
Subject: [PATCH 01/10] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b7356a8..65e2a76 100644
--- a/README.md
+++ b/README.md
@@ -61,4 +61,4 @@ img3 --> vd;
 
 ## Exemple de sortie
 
-https://uwo.nya.pub/forge/Joachim/VideoStory/src/branch/main/out.mp4
\ No newline at end of file
+
\ No newline at end of file

From d74819d075ddd4bfb517359aa19dc12f8952f3bb Mon Sep 17 00:00:00 2001
From: Joachim
Date: Sun, 20 Apr 2025 12:38:38 +0200
Subject: [PATCH 02/10] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 65e2a76..b7356a8 100644
--- a/README.md
+++ b/README.md
@@ -61,4 +61,4 @@ img3 --> vd;
 
 ## Exemple de sortie
 
-
\ No newline at end of file
+https://uwo.nya.pub/forge/Joachim/VideoStory/src/branch/main/out.mp4
\ No newline at end of file

From 9d78d7d11376239b1cc0e058799712dd47297e2c Mon Sep 17 00:00:00 2001
From: Joachim
Date: Sun, 20 Apr 2025 12:40:14 +0200
Subject: [PATCH 03/10] Update gen.py

---
 gen.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gen.py b/gen.py
index e810283..b627dd7 100644
--- a/gen.py
+++ b/gen.py
@@ -8,7 +8,7 @@ import os
 
 def loadllama():
     llm = Llama(
-        model_path="models/llama3.gguf",
+        model_path="models/.gguf",
         n_ctx=4096,
         n_threads=6,
         n_gpu_layers=-1,
@@ -19,7 +19,7 @@ def loadtts():
     model_config = outetts.GGUFModelConfig_v1(
-        model_path="models/tts.gguf",
+        model_path="models/.gguf",
         language="en",
         n_gpu_layers=-1,
         verbose=False,
@@ -30,7 +30,7 @@ def loadtts():
     return interface, speaker
 
 def loadsdxl():
-    pipe = pipe = StableDiffusionPipeline.from_single_file("models/revAnimated_v2Rebirth.safetensors", torch_dtype=torch.float32, safety_checker=None)
+    pipe = StableDiffusionPipeline.from_single_file("models/.safetensors", torch_dtype=torch.float32, safety_checker=None)
     pipe.to("cuda")
     pipe.enable_xformers_memory_efficient_attention()
     pipe.enable_model_cpu_offload()

From 6c66ecacd50d7ed6f18c4210afe0fbe42fcec140 Mon Sep 17 00:00:00 2001
From: Joachim
Date: Sun, 20 Apr 2025 12:40:43 +0200
Subject: [PATCH 04/10] Update genXL.py

---
 genXL.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/genXL.py b/genXL.py
index bd62258..3e0ace8 100644
--- a/genXL.py
+++ b/genXL.py
@@ -8,7 +8,7 @@ import os
 
 def loadllama():
     llm = Llama(
-        model_path="models/llama3.gguf",
+        model_path="models/.gguf",
         n_ctx=4096,
         n_threads=6,
         n_gpu_layers=-1,
@@ -19,7 +19,7 @@ def loadtts():
     model_config = outetts.GGUFModelConfig_v1(
-        model_path="models/tts.gguf",
+        model_path="models/.gguf",
         language="en",
         n_gpu_layers=-1,
         verbose=False,
@@ -30,7 +30,7 @@ def loadtts():
     return interface, speaker
 
 def loadsdxl():
-    pipe = StableDiffusionXLPipeline.from_single_file("models/sdxlReal.safetensors", torch_dtype=torch.float16, variant="fp16", requires_safety_checker=True)
+    pipe = StableDiffusionXLPipeline.from_single_file("models/.safetensors", torch_dtype=torch.float16, variant="fp16", requires_safety_checker=True)
     pipe.to("cuda")
     pipe.enable_xformers_memory_efficient_attention()
     pipe.enable_model_cpu_offload()

From 5caf7c95db243a5035125744513fa5dc7fd404bb Mon Sep 17 00:00:00 2001
From: Joachim
Date: Sun, 20 Apr 2025 12:42:45 +0200
Subject: [PATCH 05/10] Update README.md

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index b7356a8..a89bce1 100644
--- a/README.md
+++ b/README.md
@@ -21,9 +21,11 @@ Dans le fichier `main.py`, ajoutez dans l'appel de main() le prompt.
 `SDBAD` est le prompt négatif de Stable Diffusion.
 `SYSTEMPROMPTI` est le prompt système de Stable Diffusion.
 
+`promptTtoI.txt` et `promptUtoT.txt` sont respectivement le prompt système de Stable Diffusion et celui de Llama.
+
 Dans le fichier `gen.py`, dans les fonctions `loadllama()`, `loadtts()` et `loadsdxl()` vous devez ajouter vos models (fichiers locaux).
 
-Le programme se lance avec `main.py`
+Le programme se lance avec `main.py`.
 
 ## Fonctionnement
 ```mermaid

From 972ab13880b91aa7742e558f1da3f55d59fef89b Mon Sep 17 00:00:00 2001
From: Joachim
Date: Sun, 20 Apr 2025 12:44:24 +0200
Subject: [PATCH 06/10] Delete clean.py

---
 clean.py | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 clean.py

diff --git a/clean.py b/clean.py
deleted file mode 100644
index fae096d..0000000
--- a/clean.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import re
-def clean(input_text):
-    output_text = re.sub(r'keywords:', ', ', input_text)
-    output_text = re.sub(r'\n', ', ', input_text)
-    return output_text
\ No newline at end of file

From 561326249068c797d4ca2c2cb4f44820a29b9b75 Mon Sep 17 00:00:00 2001
From: Joachim
Date: Wed, 23 Apr 2025 09:23:57 +0200
Subject: [PATCH 07/10] Update README.md

---
 README.md | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index a89bce1..c50c2ee 100644
--- a/README.md
+++ b/README.md
@@ -35,12 +35,15 @@
 img1["Image 1"]
 img2["Image 2"]
 img3["Image 3"]
 p1["Paragraphe 1"]
-p2["Paragraphe 2"]
-p3["Paragraphe 3"]
+p2["Paragraphe 2 + (1)"]
+p3["Paragraphe 3 + (1 + 2)"]
 fa["Fichier Audio"]
 vd{"Vidéo"}
 prt{"Prompt"}
 llm{{"Llama"}}
+llm1{{"Llama"}}
+llm2{{"Llama"}}
+llm3{{"Llama"}}
 tts{{"TTS"}}
 prt --> llm;
 llm --> Texte;
@@ -49,9 +52,12 @@ Texte --> p2;
 Texte --> p3;
 Texte --> tts;
 tts --> fa;
-p1 --> sd;
-p2 --> sd;
-p3 --> sd;
+p1 --> llm1;
+p2 --> llm2;
+p3 --> llm3;
+llm1 --> sd;
+llm2 --> sd;
+llm3 --> sd;
 sd --> img1;
 sd --> img2;
 sd --> img3;

From 1855c861c3903ffa2432962bf9e9355a15a60480 Mon Sep 17 00:00:00 2001
From: Joachim
Date: Wed, 23 Apr 2025 11:14:47 +0200
Subject: [PATCH 08/10] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c50c2ee..fa552a5 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ moviepy
 Dans le fichier `main.py`, ajoutez dans l'appel de main() le prompt.
 `SYSTEMPROMPTT` est le prompt système de Llama.
 `SDBAD` est le prompt négatif de Stable Diffusion.
-`SYSTEMPROMPTI` est le prompt système de Stable Diffusion.
+`SYSTEMPROMPTI` est le prompt système de Llama pour Stable Diffusion.
 
 `promptTtoI.txt` et `promptUtoT.txt` sont respectivement le prompt système de Stable Diffusion et celui de Llama.

From ad917ff7909463e23cf9f50a556a34fe9c6c9ebc Mon Sep 17 00:00:00 2001
From: Joachim
Date: Fri, 4 Jul 2025 12:19:50 +0200
Subject: [PATCH 09/10] Update README.md

---
 README.md | 155 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 140 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index fa552a5..e5572c4 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,9 @@
 # VideoStory
 
-C'est une expérience avec Stable Diffusion 1.5, Llama 3 et du TTS qui permet la génération de "vidéo" (suite d'images) avec une histoire narrée.
-Ce programme n'a pas vraiment pour vocation d'être utilisé.
+This is an experiment with Stable Diffusion 1.5, Llama 3, and TTS that allows for the generation of "video" (a sequence of images) with a narrated story. It potentially supports SD Video. This program is not really intended for practical use.
 
-## Librairie
-Voici les dépendances:
+## Library
+Here are the dependencies:
 ```
 re
 llama_cpp
 outetts
 diffusers
 torch
 os
 moviepy
 ```
 
-## Utilisation
-Dans le fichier `main.py`, ajoutez dans l'appel de main() le prompt.
-`SYSTEMPROMPTT` est le prompt système de Llama.
-`SDBAD` est le prompt négatif de Stable Diffusion.
-`SYSTEMPROMPTI` est le prompt système de Llama pour Stable Diffusion.
+## Usage
+In the `main.py` file, add the prompt to the call to `main()`.
+`SYSTEMPROMPTT` is the system prompt for Llama.
+`SDBAD` is the negative prompt for Stable Diffusion.
+`SYSTEMPROMPTI` is the system prompt Llama uses to generate Stable Diffusion prompts.
 
-`promptTtoI.txt` et `promptUtoT.txt` sont respectivement le prompt système de Stable Diffusion et celui de Llama.
+`promptTtoI.txt` and `promptUtoT.txt` contain the system prompts for Stable Diffusion and Llama, respectively.
 
-Dans le fichier `gen.py`, dans les fonctions `loadllama()`, `loadtts()` et `loadsdxl()` vous devez ajouter vos models (fichiers locaux).
+In the `gen.py` file, in the functions `loadllama()`, `loadtts()`, and `loadsdxl()`, you need to add your models (local files).
 
-Le programme se lance avec `main.py`.
+The program is launched with `main.py`.
 
-## Fonctionnement
+## Why does this project suck?
+
+### Context Limitation
+
+AIs are limited by the size of their context. Too much, and the AI goes crazy; not enough, and the output is worse than the usual trash. It is impossible to create a "real" long story fully with AI because:
+- If you generate it in one go, as in V1, the AI will create a pretty short story, and the further it goes, the more incoherent and repetitive it becomes.
+- If you generate it in multiple parts, as in V2, the story might be more coherent and better in the long term, but the overall quality will be lower because when you rebuild the whole story into one big text, there are a lot of "artifacts."
+
+### Self-Biasing Limitation
+
+AIs bias themselves all the time because of their context. If there weren't any context, there wouldn't be any bias, but also no output. AI self-biasing is the same thing as human biasing but on a much larger scale. Everything biases an AI toward its final output. The proof is that if you prompt the AI to generate a story about a cat, it will generate a story about a cat. However, this is also an issue because every word in its context is taken into account to generate the final output, along with all the "artifacts" it created along the way. For each artifact, ten more are generated, and the output rapidly becomes garbage. This is due to the fact that AIs are probabilistic machines, i.e., useless for tasks that require more than just probabilities.
+
+This self-bias is really visible in V2 because, at each pass, the AI's context is cut and modified. This means that instead of having one AI with one context and one bias, we have multiple versions of the AI with different biases. This creates a LOT of artifacts, as they all have different "states of mind" and "goals." You could visualize the AI's bias as a vector made of all the tokens/n-grams in its context. While V1 only uses one context, with one vector in one direction, V2 uses multiple contexts with multiple vectors all pointing in "kind of the same direction" but still diverging.
+
+### Conclusion
+
+To correct the issue, you would need to write the text yourself multiple times with various small wording variations and then train the AI on them. Then you would have a well-written, longer story, and V2's bias would probably be better (i.e., pointing more in the same direction).
+So yeah, shocker: writing your own story is better than using an AI to generate one, even with the most sophisticated methods. The same goes for image and audio generation.
+
+## Output example
+
+https://uwo.nya.pub/forge/Joachim/VideoStory/src/branch/main/out.mp4
+
+## Flow charts
+
+### V1
 ```mermaid
 flowchart TD;
 sd{{"Stable Diffusion"}}
@@ -67,6 +91,107 @@ img2 --> vd;
 img3 --> vd;
 ```
 
-## Exemple de sortie
+### V2 (Unpublished)
 
-https://uwo.nya.pub/forge/Joachim/VideoStory/src/branch/main/out.mp4
\ No newline at end of file
+```mermaid
+stateDiagram-v2
+state "Part 1" as p1
+state "Part 2" as p2
+state "Part N" as pN
+state "Gen Story p1" as Gp1
+state "Gen Story p2" as Gp2
+state "Gen Story pN" as GpN
+state "Summary 1" as S1
+state "Summary 2" as S2
+state "Summary N" as SN
+state "Prompt 1" as pt1
+state "Prompt 2" as pt2
+state "Prompt N" as ptN
+state "Gen illustration 1" as it1
+state "Gen illustration 2" as it2
+state "Gen illustration N" as itN
+state "Gen TTS 1" as tt1
+state "Gen TTS 2" as tt2
+state "Gen TTS N" as ttN
+state "Subtitle 1" as sub1
+state "Subtitle 2" as sub2
+state "Subtitle N" as subN
+state "Video part 1" as v1
+state "Video part 2" as v2
+state "Video part N" as vN
+state "Video Final" as vf
+World --> Base
+Description --> Base
+Name --> Base
+Base --> Master
+Master --> Player : Until the max number of iterations x is reached
+Player --> Master
+
+Logs --> p1
+Logs --> p2
+Logs --> pN
+
+p1 --> Gp1
+p2 --> Gp2
+pN --> GpN
+
+Master --> Logs
+Player --> Logs
+
+Gp1 --> S1
+Gp2 --> S2
+GpN --> SN
+
+S1 --> pt1
+S2 --> pt2
+SN --> ptN
+
+pt1 --> it1
+pt2 --> it2
+ptN --> itN
+
+Gp1 --> tt1
+Gp2 --> tt2
+GpN --> ttN
+
+Gp1 --> sub1
+Gp2 --> sub2
+GpN --> subN
+
+it1 --> v1
+tt1 --> v1
+sub1 --> v1
+it2 --> v2
+tt2 --> v2
+sub2 --> v2
+itN --> vN
+ttN --> vN
+subN --> vN
+
+v1 --> vf
+v2 --> vf
+vN --> vf
+
+World: World name
+Description: World description/rules
+Name: Main actor's name
+Logs: Roleplay's logs
+Master: AI leading the game
+Player: AI choosing next state, with only current state context
+p1: Part 1 of logs
+p2: Part 2 of logs
+pN: Part N of logs
+Gp1: Story generated with Part 1
+Gp2: Story generated with Part 2
+GpN: Story generated with Part N
+Base: Base prompt for leading AI
+S1: Story summary
+S2: Story summary
+SN: Story summary
+sub1: Video's subtitles
+sub2: Video's subtitles
+subN: Video's subtitles
+pt1: Gen SD prompt with simplified story
+pt2: Gen SD prompt with simplified story
+ptN: Gen SD prompt with simplified story
+```
\ No newline at end of file

From d4ec54632d35fdc8270b9730ec39735145837c99 Mon Sep 17 00:00:00 2001
From: Joachim
Date: Fri, 4 Jul 2025 12:22:43 +0200
Subject: [PATCH 10/10] Update README.md

---
 README.md | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index e5572c4..5863fd5 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,6 @@
 # VideoStory
 
-This is an experiment with Stable Diffusion 1.5, Llama 3, and TTS that allows for the generation of "video" (a sequence of images) with a narrated story. It potentially supports SD Video. This program is not really intended for practical use.
-
-## Library
-Here are the dependencies:
-```
-re
-llama_cpp
-outetts
-diffusers
-torch
-os
-moviepy
-```
-
-## Usage
-In the `main.py` file, add the prompt to the call to `main()`.
-`SYSTEMPROMPTT` is the system prompt for Llama.
-`SDBAD` is the negative prompt for Stable Diffusion.
-`SYSTEMPROMPTI` is the system prompt Llama uses to generate Stable Diffusion prompts.
-
-`promptTtoI.txt` and `promptUtoT.txt` contain the system prompts for Stable Diffusion and Llama, respectively.
-
-In the `gen.py` file, in the functions `loadllama()`, `loadtts()`, and `loadsdxl()`, you need to add your models (local files).
-
-The program is launched with `main.py`.
+This is an experiment with Stable Diffusion (1.5/SDXL), Llama (2/3), Phi, and others that allows for the generation of "video" (a sequence of images) with a narrated story. This program is not really intended for practical use.
 
 ## Why does this project suck?
@@ -194,4 +170,28 @@ subN: Video's subtitles
 pt1: Gen SD prompt with simplified story
 pt2: Gen SD prompt with simplified story
 ptN: Gen SD prompt with simplified story
-```
\ No newline at end of file
+```
+
+## Library
+Here are the dependencies:
+```
+re
+llama_cpp
+outetts
+diffusers
+torch
+os
+moviepy
+```
+
+## Usage
+In the `main.py` file, add the prompt to the call to `main()`.
+`SYSTEMPROMPTT` is the system prompt for Llama.
+`SDBAD` is the negative prompt for Stable Diffusion.
+`SYSTEMPROMPTI` is the system prompt Llama uses to generate Stable Diffusion prompts.
+
+`promptTtoI.txt` and `promptUtoT.txt` contain the system prompts for Stable Diffusion and Llama, respectively.
+
+In the `gen.py` file, in the functions `loadllama()`, `loadtts()`, and `loadsdxl()`, you need to add your models (local files).
+
+The program is launched with `main.py`.
\ No newline at end of file
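
For anyone trying to run the project from this series, here is a minimal sketch of how the pieces described in the Usage section fit together. It is inferred from the README and the loader functions shown in patches 03/04; the real `main.py` may differ, the `main()` signature is a guess, and the two prompt constants are placeholders.

```python
# Hypothetical wiring of main.py, inferred from the Usage section.
from gen import loadllama, loadtts, loadsdxl

SYSTEMPROMPTT = "You are a storyteller."  # placeholder Llama system prompt
SDBAD = "blurry, low quality, deformed"   # placeholder SD negative prompt

def main(prompt: str) -> None:
    llm = loadllama()                # llama_cpp model (models/*.gguf)
    interface, speaker = loadtts()   # outetts interface + default speaker
    pipe = loadsdxl()                # Stable Diffusion pipeline
    # Pipeline per the V1 flowchart: Llama turns `prompt` into a story,
    # the TTS narrates it, SD illustrates each paragraph, and moviepy
    # assembles the images and audio into out.mp4.
    ...

if __name__ == "__main__":
    main("A short story about a cat")  # the prompt goes into the call to main()
```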
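Patches 03 and 04 deliberately blank the model paths to `models/.gguf` and `models/.safetensors`, so the loaders will not run until you point them at real local files. Below is a sketch of `loadllama()` filled back in, reusing the file name the patches removed (`models/llama3.gguf`); substitute whatever GGUF model you actually have. The parameters are the ones shown in the patches.

```python
# loadllama() with a concrete local model path restored.
from llama_cpp import Llama

def loadllama():
    llm = Llama(
        model_path="models/llama3.gguf",  # any local GGUF chat model works
        n_ctx=4096,        # context window used by gen.py
        n_threads=6,       # CPU threads for non-offloaded work
        n_gpu_layers=-1,   # -1 = offload every layer to the GPU
    )
    return llm
```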
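Finally, the "Context Limitation" section contrasts V1's one-shot generation with V2's part-by-part generation over running summaries. A minimal sketch of that V2 loop is below, assuming a llama_cpp `Llama` instance; every helper name here is hypothetical, and the real unpublished V2 (with its Master/Player roleplay stage) is more involved.

```python
# Sketch of V2-style chunked generation: each part is conditioned on the
# summaries of earlier parts instead of their full text, trading some
# quality ("artifacts") for long-range coherence.

def generate(llm, system: str, user: str) -> str:
    out = llm.create_chat_completion(
        messages=[{"role": "system", "content": system},
                  {"role": "user", "content": user}],
    )
    return out["choices"][0]["message"]["content"]

def write_story_in_parts(llm, prompt: str, n_parts: int) -> list[str]:
    parts, summaries = [], []
    for _ in range(n_parts):
        context = " ".join(summaries)  # summaries so far, not the full story
        part = generate(llm, "You are a storyteller.",
                        f"Story so far: {context}\nContinue the story: {prompt}")
        parts.append(part)
        summaries.append(generate(llm, "Summarize in two sentences.", part))
    return parts
```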