svjack committed on
Commit ef46f0f · verified · 1 Parent(s): c31c4a4

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .gitignore +8 -0
  2. .ipynb_checkpoints/README-checkpoint.md +203 -0
  3. .python-version +1 -0
  4. README.md +203 -0
  5. cache_latents.py +339 -0
  6. cache_text_encoder_outputs.py +214 -0
  7. convert_lora.py +137 -0
  8. dataset/__init__.py +0 -0
  9. dataset/config_utils.py +381 -0
  10. dataset/dataset_config.md +461 -0
  11. dataset/image_video_dataset.py +1726 -0
  12. docs/advanced_config.md +316 -0
  13. docs/framepack.md +331 -0
  14. docs/sampling_during_training.md +116 -0
  15. docs/wan.md +531 -0
  16. fpack_cache_latents.py +381 -0
  17. fpack_cache_text_encoder_outputs.py +110 -0
  18. fpack_generate_video.py +1149 -0
  19. fpack_train_network.py +410 -0
  20. frame_pack/__init__.py +0 -0
  21. frame_pack/bucket_tools.py +30 -0
  22. frame_pack/clip_vision.py +14 -0
  23. frame_pack/framepack_utils.py +273 -0
  24. frame_pack/hunyuan.py +116 -0
  25. frame_pack/hunyuan_video_packed.py +2015 -0
  26. frame_pack/k_diffusion_hunyuan.py +128 -0
  27. frame_pack/uni_pc_fm.py +142 -0
  28. frame_pack/utils.py +617 -0
  29. frame_pack/wrapper.py +51 -0
  30. framepack_yichen_output/framepack-yichen-lora-000001.safetensors +3 -0
  31. framepack_yichen_output/framepack-yichen-lora-000002.safetensors +3 -0
  32. framepack_yichen_output/framepack-yichen-lora-000003.safetensors +3 -0
  33. framepack_yichen_output/framepack-yichen-lora-000004.safetensors +3 -0
  34. framepack_yichen_output/framepack-yichen-lora-000005.safetensors +3 -0
  35. framepack_yichen_output/framepack-yichen-lora-000006.safetensors +3 -0
  36. hunyuan_model/__init__.py +0 -0
  37. hunyuan_model/activation_layers.py +23 -0
  38. hunyuan_model/attention.py +295 -0
  39. hunyuan_model/autoencoder_kl_causal_3d.py +609 -0
  40. hunyuan_model/embed_layers.py +132 -0
  41. hunyuan_model/fp8_optimization.py +39 -0
  42. hunyuan_model/helpers.py +40 -0
  43. hunyuan_model/mlp_layers.py +118 -0
  44. hunyuan_model/models.py +1044 -0
  45. hunyuan_model/modulate_layers.py +76 -0
  46. hunyuan_model/norm_layers.py +79 -0
  47. hunyuan_model/pipeline_hunyuan_video.py +1100 -0
  48. hunyuan_model/posemb_layers.py +310 -0
  49. hunyuan_model/text_encoder.py +710 -0
  50. hunyuan_model/token_refiner.py +245 -0
.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ __pycache__/
2
+ .venv
3
+ venv/
4
+ logs/
5
+ uv.lock
6
+ main.exp
7
+ main.lib
8
+ main.obj
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,203 @@
1
+ # FramePack Dancing Image-to-Video Generation
2
+
3
+ This repository contains the steps and scripts needed to generate videos with the Yi Chen Dancing FramePack image-to-video LoRA. The LoRA (Low-Rank Adaptation) weights are applied on top of pre-trained FramePack/HunyuanVideo components to create anime-style dance videos from a source image and a text prompt.
4
+
5
+ ## Prerequisites
6
+
7
+ Before proceeding, ensure that you have the following installed on your system:
8
+
9
+ • **Ubuntu** (or a compatible Linux distribution)
10
+ • **Python 3.10** (the version pinned in `.python-version`)
11
+ • **pip** (Python package manager)
12
+ • **Git**
13
+ • **Git LFS** (Git Large File Storage)
14
+ • **FFmpeg**
15
+
16
+ ## Installation
17
+
18
+ 1. **Update and Install Dependencies**
19
+
20
+ ```bash
21
+ sudo apt-get update && sudo apt-get install cbm git-lfs ffmpeg
22
+ ```
23
+
24
+ 2. **Clone the Repository**
25
+
26
+ ```bash
27
+ git clone https://huggingface.co/svjack/YiChen_FramePack_lora_early
28
+ cd YiChen_FramePack_lora_early
29
+ ```
30
+
31
+ 3. **Install Python Dependencies**
32
+
33
+ ```bash
34
+ pip install torch torchvision
35
+ pip install -r requirements.txt
36
+ pip install ascii-magic matplotlib tensorboard huggingface_hub datasets
37
+ pip install moviepy==1.0.3
38
+ pip install sageattention==1.0.6
39
+ ```
40
+
41
+ 4. **Download Model Weights**
42
+
43
+ ```bash
44
+ git clone https://huggingface.co/lllyasviel/FramePackI2V_HY
45
+ git clone https://huggingface.co/hunyuanvideo-community/HunyuanVideo
46
+ git clone https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged
47
+ git clone https://huggingface.co/Comfy-Org/sigclip_vision_384
48
+ ```
49
+
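+ Alternatively, the same repositories can be fetched with the `huggingface_hub` Python API installed above. This is an optional sketch, not required by the scripts; the repo IDs mirror the `git clone` commands, and the `local_dir` values match the paths used in the examples below.
+
+ ```python
+ # Optional sketch: download the weight repositories with huggingface_hub
+ # instead of git clone. Repo IDs and target folders mirror the commands above.
+ from huggingface_hub import snapshot_download
+
+ for repo_id, local_dir in [
+     ("lllyasviel/FramePackI2V_HY", "FramePackI2V_HY"),
+     ("hunyuanvideo-community/HunyuanVideo", "HunyuanVideo"),
+     ("Comfy-Org/HunyuanVideo_repackaged", "HunyuanVideo_repackaged"),
+     ("Comfy-Org/sigclip_vision_384", "sigclip_vision_384"),
+ ]:
+     snapshot_download(repo_id=repo_id, local_dir=local_dir)
+ ```
+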
50
+ ## Usage
51
+
52
+ To generate a video, use the `fpack_generate_video.py` script with the appropriate parameters. Below are examples of how to generate videos using the Dancing model.
53
+
54
+
55
+
56
+ ### 1. Furina
57
+ - Source Image
58
+
59
+
60
+ ```bash
61
+ python fpack_generate_video.py \
62
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
63
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
64
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
65
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
66
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
67
+ --image_path fln.png \
68
+ --prompt "In the style of Yi Chen Dancing White Background , The character's movements shift dynamically throughout the video, transitioning from poised stillness to lively dance steps. Her expressions evolve seamlessly—starting with focused determination, then flashing surprise as she executes a quick spin, before breaking into a joyful smile mid-leap. Her hands flow through choreographed positions, sometimes extending gracefully like unfolding wings, other times clapping rhythmically against her wrists. During a dramatic hip sway, her fingers fan open near her cheek, then sweep downward as her whole body dips into a playful crouch, the sequins on her costume catching the light with every motion." \
69
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
70
+ --attn_mode sdpa --fp8_scaled \
71
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
72
+ --save_path save --output_type both \
73
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
74
+
75
+
76
+ ```
77
+
78
+ - Without Lora
79
+
80
+ - With Lora
81
+
82
+
83
+ ### 2. Roper
84
+ - Source Image
85
+
86
+
87
+
88
+ ```bash
89
+ python fpack_generate_video.py \
90
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
91
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
92
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
93
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
94
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
95
+ --image_path shengjiang.png \
96
+ --prompt "In the style of Yi Chen Dancing White Background , The character's movements shift dynamically throughout the video, transitioning from poised stillness to lively dance steps. Her expressions evolve seamlessly—starting with focused determination, then flashing surprise as she executes a quick spin, before breaking into a joyful smile mid-leap. Her hands flow through choreographed positions, sometimes extending gracefully like unfolding wings, other times clapping rhythmically against her wrists. During a dramatic hip sway, her fingers fan open near her cheek, then sweep downward as her whole body dips into a playful crouch, the sequins on her costume catching the light with every motion." \
97
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
98
+ --attn_mode sdpa --fp8_scaled \
99
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
100
+ --save_path save --output_type both \
101
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
102
+
103
+ ```
104
+
105
+ - With Lora
106
+
107
+
108
+
109
+ ### 3. Varesa
110
+ - Source Image
111
+
112
+
113
+ ```bash
114
+ python fpack_generate_video.py \
115
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
116
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
117
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
118
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
119
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
120
+ --image_path waliesha.jpg \
121
+ --prompt "In the style of Yi Chen Dancing White Background , The dancer’s energy pulses in waves—one moment a statue, poised and precise, the next a whirl of motion as her feet flicker across the floor. Her face tells its own story: brows knit in concentration, then eyes widening mid-turn as if startled by her own speed, before dissolving into laughter as she springs upward, weightless. Her arms carve the air—now arcing like ribbons unfurling, now snapping sharp as a whip’s crack, palms meeting wrists in staccato beats. A roll of her hips sends her fingers fluttering near her temple, then cascading down as she folds into a teasing dip, the beads on her dress scattering light like sparks." \
122
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
123
+ --attn_mode sdpa --fp8_scaled \
124
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
125
+ --save_path save --output_type both \
126
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
127
+
128
+ ```
129
+ - With Lora
130
+
131
+
132
+
133
+ ### 4. Scaramouche
134
+ - Source Image
135
+
136
+
137
+ ```bash
138
+ python fpack_generate_video.py \
139
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
140
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
141
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
142
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
143
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
144
+ --image_path shanbing.jpg \
145
+ --prompt "In the style of Yi Chen Dancing White Background , The dancer’s energy pulses in waves—one moment a statue, poised and precise, the next a whirl of motion as her feet flicker across the floor. Her face tells its own story: brows knit in concentration, then eyes widening mid-turn as if startled by her own speed, before dissolving into laughter as she springs upward, weightless. Her arms carve the air—now arcing like ribbons unfurling, now snapping sharp as a whip’s crack, palms meeting wrists in staccato beats. A roll of her hips sends her fingers fluttering near her temple, then cascading down as she folds into a teasing dip, the beads on her dress scattering light like sparks." \
146
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
147
+ --attn_mode sdpa --fp8_scaled \
148
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
149
+ --save_path save --output_type both \
150
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
151
+
152
+ ```
153
+
154
+ - With Lora
155
+
156
+
157
+
158
+
159
+ ## Parameters
160
+
161
+ * `--dit`: Path to the FramePack DiT (diffusion transformer) weights.
+ * `--vae`: Path to the HunyuanVideo VAE weights.
+ * `--text_encoder1`: Path to Text Encoder 1 (LLaVA-LLaMA-3) weights.
+ * `--text_encoder2`: Path to Text Encoder 2 (CLIP-L) weights.
+ * `--image_encoder`: Path to the SigLIP vision encoder weights.
+ * `--image_path`: Source image for image-to-video generation.
+ * `--prompt`: Textual prompt describing the desired motion.
+ * `--video_size`: Resolution of the generated video (e.g., `960 544`).
+ * `--video_seconds`: Length of the generated video in seconds.
+ * `--fps`: Frame rate of the generated video.
+ * `--infer_steps`: Number of inference steps.
+ * `--attn_mode`: Attention implementation (e.g., `sdpa`).
+ * `--fp8_scaled`: Use scaled FP8 precision for the DiT to reduce VRAM usage (optional).
+ * `--vae_chunk_size`: Chunk size for the causal 3D VAE.
+ * `--vae_spatial_tile_sample_min_size`: Minimum spatial tile size for VAE tiling.
+ * `--save_path`: Directory to save the generated video.
+ * `--output_type`: Output type (e.g., `both` for video and frames).
+ * `--seed`: Random seed for reproducible generation.
+ * `--lora_weight`: Path to the LoRA weights.
+ * `--lora_multiplier`: Multiplier (strength) for the LoRA weights.
175
+
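+ For batch generation over several source images, the flags above can be assembled programmatically. The following is a hypothetical helper (not part of this repository) that simply shells out to `fpack_generate_video.py` with the same settings as the examples; adjust the prompt and paths to your setup.
+
+ ```python
+ # Hypothetical batch driver: calls fpack_generate_video.py once per source image
+ # with the same flags as the examples above. Prompt and paths are placeholders.
+ import subprocess
+
+ PROMPT = "In the style of Yi Chen Dancing White Background , ..."  # paste a full prompt here
+ LORA = "framepack_yichen_output/framepack-yichen-lora-000006.safetensors"
+
+ def generate(image_path: str, seed: int = 1234) -> None:
+     cmd = [
+         "python", "fpack_generate_video.py",
+         "--dit", "FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors",
+         "--vae", "HunyuanVideo/vae/diffusion_pytorch_model.safetensors",
+         "--text_encoder1", "HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors",
+         "--text_encoder2", "HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors",
+         "--image_encoder", "sigclip_vision_384/sigclip_vision_patch14_384.safetensors",
+         "--image_path", image_path, "--prompt", PROMPT,
+         "--video_size", "960", "544", "--video_seconds", "3", "--fps", "30", "--infer_steps", "25",
+         "--attn_mode", "sdpa", "--fp8_scaled",
+         "--vae_chunk_size", "32", "--vae_spatial_tile_sample_min_size", "128",
+         "--save_path", "save", "--output_type", "both",
+         "--seed", str(seed), "--lora_multiplier", "1.0", "--lora_weight", LORA,
+     ]
+     subprocess.run(cmd, check=True)
+
+ for img in ["fln.png", "shengjiang.png"]:  # source images from the examples above
+     generate(img)
+ ```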
176
+
177
+
178
+ ## Output
179
+
180
+ The generated video and frames will be saved in the specified `save_path` directory.
181
+
182
+ ## Troubleshooting
183
+
184
+ • Ensure all dependencies are correctly installed.
185
+ • Verify that the model weights are downloaded and placed in the correct locations (a quick check is sketched below).
186
+ • Check for any missing Python packages and install them using `pip`.
187
+
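+ A quick way to verify the second point is to confirm that every weight file referenced by the example commands exists on disk. The sketch below assumes the default clone locations used in this README.
+
+ ```python
+ # Sketch: check that the weight files used in the example commands are present.
+ import os
+
+ weights = [
+     "FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors",
+     "HunyuanVideo/vae/diffusion_pytorch_model.safetensors",
+     "HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors",
+     "HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors",
+     "sigclip_vision_384/sigclip_vision_patch14_384.safetensors",
+     "framepack_yichen_output/framepack-yichen-lora-000006.safetensors",
+ ]
+ for path in weights:
+     print(("OK      " if os.path.exists(path) else "MISSING ") + path)
+ ```
+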
188
+ ## License
189
+
190
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
191
+
192
+ ## Acknowledgments
193
+
194
+ • **Hugging Face** for hosting the model weights.
+ • **lllyasviel** for the FramePackI2V_HY model.
+ • **Tencent Hunyuan** and the **hunyuanvideo-community** for the HunyuanVideo components.
+ • **Comfy-Org** for the repackaged text encoders and the SigLIP vision weights.
197
+
198
+ ## Contact
199
+
200
+ For any questions or issues, please open an issue on the repository or contact the maintainer.
201
+
202
+ ---
203
+
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.10
README.md ADDED
@@ -0,0 +1,203 @@
1
+ # FramePack Dancing Image-to-Video Generation
2
+
3
+ This repository contains the steps and scripts needed to generate videos with the Yi Chen Dancing FramePack image-to-video LoRA. The LoRA (Low-Rank Adaptation) weights are applied on top of pre-trained FramePack/HunyuanVideo components to create anime-style dance videos from a source image and a text prompt.
4
+
5
+ ## Prerequisites
6
+
7
+ Before proceeding, ensure that you have the following installed on your system:
8
+
9
+ • **Ubuntu** (or a compatible Linux distribution)
10
+ • **Python 3.10** (the version pinned in `.python-version`)
11
+ • **pip** (Python package manager)
12
+ • **Git**
13
+ • **Git LFS** (Git Large File Storage)
14
+ • **FFmpeg**
15
+
16
+ ## Installation
17
+
18
+ 1. **Update and Install Dependencies**
19
+
20
+ ```bash
21
+ sudo apt-get update && sudo apt-get install cbm git-lfs ffmpeg
22
+ ```
23
+
24
+ 2. **Clone the Repository**
25
+
26
+ ```bash
27
+ git clone https://huggingface.co/svjack/YiChen_FramePack_lora_early
28
+ cd YiChen_FramePack_lora_early
29
+ ```
30
+
31
+ 3. **Install Python Dependencies**
32
+
33
+ ```bash
34
+ pip install torch torchvision
35
+ pip install -r requirements.txt
36
+ pip install ascii-magic matplotlib tensorboard huggingface_hub datasets
37
+ pip install moviepy==1.0.3
38
+ pip install sageattention==1.0.6
39
+ ```
40
+
41
+ 4. **Download Model Weights**
42
+
43
+ ```bash
44
+ git clone https://huggingface.co/lllyasviel/FramePackI2V_HY
45
+ git clone https://huggingface.co/hunyuanvideo-community/HunyuanVideo
46
+ git clone https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged
47
+ git clone https://huggingface.co/Comfy-Org/sigclip_vision_384
48
+ ```
49
+
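+ Alternatively, the same repositories can be fetched with the `huggingface_hub` Python API installed above. This is an optional sketch, not required by the scripts; the repo IDs mirror the `git clone` commands, and the `local_dir` values match the paths used in the examples below.
+
+ ```python
+ # Optional sketch: download the weight repositories with huggingface_hub
+ # instead of git clone. Repo IDs and target folders mirror the commands above.
+ from huggingface_hub import snapshot_download
+
+ for repo_id, local_dir in [
+     ("lllyasviel/FramePackI2V_HY", "FramePackI2V_HY"),
+     ("hunyuanvideo-community/HunyuanVideo", "HunyuanVideo"),
+     ("Comfy-Org/HunyuanVideo_repackaged", "HunyuanVideo_repackaged"),
+     ("Comfy-Org/sigclip_vision_384", "sigclip_vision_384"),
+ ]:
+     snapshot_download(repo_id=repo_id, local_dir=local_dir)
+ ```
+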
50
+ ## Usage
51
+
52
+ To generate a video, use the `fpack_generate_video.py` script with the appropriate parameters. Below are examples of how to generate videos using the Dancing model.
53
+
54
+
55
+
56
+ ### 1. Furina
57
+ - Source Image
58
+
59
+
60
+ ```bash
61
+ python fpack_generate_video.py \
62
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
63
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
64
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
65
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
66
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
67
+ --image_path fln.png \
68
+ --prompt "In the style of Yi Chen Dancing White Background , The character's movements shift dynamically throughout the video, transitioning from poised stillness to lively dance steps. Her expressions evolve seamlessly—starting with focused determination, then flashing surprise as she executes a quick spin, before breaking into a joyful smile mid-leap. Her hands flow through choreographed positions, sometimes extending gracefully like unfolding wings, other times clapping rhythmically against her wrists. During a dramatic hip sway, her fingers fan open near her cheek, then sweep downward as her whole body dips into a playful crouch, the sequins on her costume catching the light with every motion." \
69
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
70
+ --attn_mode sdpa --fp8_scaled \
71
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
72
+ --save_path save --output_type both \
73
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
74
+
75
+
76
+ ```
77
+
78
+ - Without Lora
79
+
80
+ - With Lora
81
+
82
+
83
+ ### 2. Roper
84
+ - Source Image
85
+
86
+
87
+
88
+ ```bash
89
+ python fpack_generate_video.py \
90
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
91
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
92
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
93
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
94
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
95
+ --image_path shengjiang.png \
96
+ --prompt "In the style of Yi Chen Dancing White Background , The character's movements shift dynamically throughout the video, transitioning from poised stillness to lively dance steps. Her expressions evolve seamlessly—starting with focused determination, then flashing surprise as she executes a quick spin, before breaking into a joyful smile mid-leap. Her hands flow through choreographed positions, sometimes extending gracefully like unfolding wings, other times clapping rhythmically against her wrists. During a dramatic hip sway, her fingers fan open near her cheek, then sweep downward as her whole body dips into a playful crouch, the sequins on her costume catching the light with every motion." \
97
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
98
+ --attn_mode sdpa --fp8_scaled \
99
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
100
+ --save_path save --output_type both \
101
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
102
+
103
+ ```
104
+
105
+ - With Lora
106
+
107
+
108
+
109
+ ### 3. Varesa
110
+ - Source Image
111
+
112
+
113
+ ```bash
114
+ python fpack_generate_video.py \
115
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
116
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
117
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
118
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
119
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
120
+ --image_path waliesha.jpg \
121
+ --prompt "In the style of Yi Chen Dancing White Background , The dancer’s energy pulses in waves—one moment a statue, poised and precise, the next a whirl of motion as her feet flicker across the floor. Her face tells its own story: brows knit in concentration, then eyes widening mid-turn as if startled by her own speed, before dissolving into laughter as she springs upward, weightless. Her arms carve the air—now arcing like ribbons unfurling, now snapping sharp as a whip’s crack, palms meeting wrists in staccato beats. A roll of her hips sends her fingers fluttering near her temple, then cascading down as she folds into a teasing dip, the beads on her dress scattering light like sparks." \
122
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
123
+ --attn_mode sdpa --fp8_scaled \
124
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
125
+ --save_path save --output_type both \
126
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
127
+
128
+ ```
129
+ - With Lora
130
+
131
+
132
+
133
+ ### 4. Scaramouche
134
+ - Source Image
135
+
136
+
137
+ ```bash
138
+ python fpack_generate_video.py \
139
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
140
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
141
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
142
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
143
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
144
+ --image_path shanbing.jpg \
145
+ --prompt "In the style of Yi Chen Dancing White Background , The dancer’s energy pulses in waves—one moment a statue, poised and precise, the next a whirl of motion as her feet flicker across the floor. Her face tells its own story: brows knit in concentration, then eyes widening mid-turn as if startled by her own speed, before dissolving into laughter as she springs upward, weightless. Her arms carve the air—now arcing like ribbons unfurling, now snapping sharp as a whip’s crack, palms meeting wrists in staccato beats. A roll of her hips sends her fingers fluttering near her temple, then cascading down as she folds into a teasing dip, the beads on her dress scattering light like sparks." \
146
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
147
+ --attn_mode sdpa --fp8_scaled \
148
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
149
+ --save_path save --output_type both \
150
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
151
+
152
+ ```
153
+
154
+ - With Lora
155
+
156
+
157
+
158
+
159
+ ## Parameters
160
+
161
+ * `--dit`: Path to the FramePack DiT (diffusion transformer) weights.
+ * `--vae`: Path to the HunyuanVideo VAE weights.
+ * `--text_encoder1`: Path to Text Encoder 1 (LLaVA-LLaMA-3) weights.
+ * `--text_encoder2`: Path to Text Encoder 2 (CLIP-L) weights.
+ * `--image_encoder`: Path to the SigLIP vision encoder weights.
+ * `--image_path`: Source image for image-to-video generation.
+ * `--prompt`: Textual prompt describing the desired motion.
+ * `--video_size`: Resolution of the generated video (e.g., `960 544`).
+ * `--video_seconds`: Length of the generated video in seconds.
+ * `--fps`: Frame rate of the generated video.
+ * `--infer_steps`: Number of inference steps.
+ * `--attn_mode`: Attention implementation (e.g., `sdpa`).
+ * `--fp8_scaled`: Use scaled FP8 precision for the DiT to reduce VRAM usage (optional).
+ * `--vae_chunk_size`: Chunk size for the causal 3D VAE.
+ * `--vae_spatial_tile_sample_min_size`: Minimum spatial tile size for VAE tiling.
+ * `--save_path`: Directory to save the generated video.
+ * `--output_type`: Output type (e.g., `both` for video and frames).
+ * `--seed`: Random seed for reproducible generation.
+ * `--lora_weight`: Path to the LoRA weights.
+ * `--lora_multiplier`: Multiplier (strength) for the LoRA weights.
175
+
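+ For batch generation over several source images, the flags above can be assembled programmatically. The following is a hypothetical helper (not part of this repository) that simply shells out to `fpack_generate_video.py` with the same settings as the examples; adjust the prompt and paths to your setup.
+
+ ```python
+ # Hypothetical batch driver: calls fpack_generate_video.py once per source image
+ # with the same flags as the examples above. Prompt and paths are placeholders.
+ import subprocess
+
+ PROMPT = "In the style of Yi Chen Dancing White Background , ..."  # paste a full prompt here
+ LORA = "framepack_yichen_output/framepack-yichen-lora-000006.safetensors"
+
+ def generate(image_path: str, seed: int = 1234) -> None:
+     cmd = [
+         "python", "fpack_generate_video.py",
+         "--dit", "FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors",
+         "--vae", "HunyuanVideo/vae/diffusion_pytorch_model.safetensors",
+         "--text_encoder1", "HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors",
+         "--text_encoder2", "HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors",
+         "--image_encoder", "sigclip_vision_384/sigclip_vision_patch14_384.safetensors",
+         "--image_path", image_path, "--prompt", PROMPT,
+         "--video_size", "960", "544", "--video_seconds", "3", "--fps", "30", "--infer_steps", "25",
+         "--attn_mode", "sdpa", "--fp8_scaled",
+         "--vae_chunk_size", "32", "--vae_spatial_tile_sample_min_size", "128",
+         "--save_path", "save", "--output_type", "both",
+         "--seed", str(seed), "--lora_multiplier", "1.0", "--lora_weight", LORA,
+     ]
+     subprocess.run(cmd, check=True)
+
+ for img in ["fln.png", "shengjiang.png"]:  # source images from the examples above
+     generate(img)
+ ```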
176
+
177
+
178
+ ## Output
179
+
180
+ The generated video and frames will be saved in the specified `save_path` directory.
181
+
182
+ ## Troubleshooting
183
+
184
+ • Ensure all dependencies are correctly installed.
185
+ • Verify that the model weights are downloaded and placed in the correct locations (a quick check is sketched below).
186
+ • Check for any missing Python packages and install them using `pip`.
187
+
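+ A quick way to verify the second point is to confirm that every weight file referenced by the example commands exists on disk. The sketch below assumes the default clone locations used in this README.
+
+ ```python
+ # Sketch: check that the weight files used in the example commands are present.
+ import os
+
+ weights = [
+     "FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors",
+     "HunyuanVideo/vae/diffusion_pytorch_model.safetensors",
+     "HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors",
+     "HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors",
+     "sigclip_vision_384/sigclip_vision_patch14_384.safetensors",
+     "framepack_yichen_output/framepack-yichen-lora-000006.safetensors",
+ ]
+ for path in weights:
+     print(("OK      " if os.path.exists(path) else "MISSING ") + path)
+ ```
+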
188
+ ## License
189
+
190
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
191
+
192
+ ## Acknowledgments
193
+
194
+ • **Hugging Face** for hosting the model weights.
+ • **lllyasviel** for the FramePackI2V_HY model.
+ • **Tencent Hunyuan** and the **hunyuanvideo-community** for the HunyuanVideo components.
+ • **Comfy-Org** for the repackaged text encoders and the SigLIP vision weights.
197
+
198
+ ## Contact
199
+
200
+ For any questions or issues, please open an issue on the repository or contact the maintainer.
201
+
202
+ ---
203
+
cache_latents.py ADDED
@@ -0,0 +1,339 @@
1
+ import argparse
2
+ import os
3
+ import glob
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from dataset import config_utils
11
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
12
+ from PIL import Image
13
+
14
+ import logging
15
+
16
+ from dataset.image_video_dataset import BaseDataset, ItemInfo, save_latent_cache, ARCHITECTURE_HUNYUAN_VIDEO
17
+ from hunyuan_model.vae import load_vae
18
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
19
+ from utils.model_utils import str_to_dtype
20
+
21
+ logger = logging.getLogger(__name__)
22
+ logging.basicConfig(level=logging.INFO)
23
+
24
+
25
+ def show_image(image: Union[list[Union[Image.Image, np.ndarray]], Image.Image, np.ndarray]) -> int:
26
+ import cv2
27
+
28
+ imgs = (
29
+ [image]
30
+ if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image)
31
+ else [image[0], image[-1]]
32
+ )
33
+ if len(imgs) > 1:
34
+ print(f"Number of images: {len(image)}")
35
+ for i, img in enumerate(imgs):
36
+ if len(imgs) > 1:
37
+ print(f"{'First' if i == 0 else 'Last'} image: {img.shape}")
38
+ else:
39
+ print(f"Image: {img.shape}")
40
+ cv2_img = np.array(img) if isinstance(img, Image.Image) else img
41
+ cv2_img = cv2.cvtColor(cv2_img, cv2.COLOR_RGB2BGR)
42
+ cv2.imshow("image", cv2_img)
43
+ k = cv2.waitKey(0)
44
+ cv2.destroyAllWindows()
45
+ if k == ord("q") or k == ord("d"):
46
+ return k
47
+ return k
48
+
49
+
50
+ def show_console(
51
+ image: Union[list[Union[Image.Image, np.ndarray]], Image.Image, np.ndarray],
52
+ width: int,
53
+ back: str,
54
+ interactive: bool = False,
55
+ ) -> int:
56
+ from ascii_magic import from_pillow_image, Back
57
+
58
+ back = None
59
+ if back is not None:
60
+ back = getattr(Back, back.upper())
61
+
62
+ k = None
63
+ imgs = (
64
+ [image]
65
+ if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image)
66
+ else [image[0], image[-1]]
67
+ )
68
+ if len(imgs) > 1:
69
+ print(f"Number of images: {len(image)}")
70
+ for i, img in enumerate(imgs):
71
+ if len(imgs) > 1:
72
+ print(f"{'First' if i == 0 else 'Last'} image: {img.shape}")
73
+ else:
74
+ print(f"Image: {img.shape}")
75
+ pil_img = img if isinstance(img, Image.Image) else Image.fromarray(img)
76
+ ascii_img = from_pillow_image(pil_img)
77
+ ascii_img.to_terminal(columns=width, back=back)
78
+
79
+ if interactive:
80
+ k = input("Press q to quit, d to next dataset, other key to next: ")
81
+ if k == "q" or k == "d":
82
+ return ord(k)
83
+
84
+ if not interactive:
85
+ return ord(" ")
86
+ return ord(k) if k else ord(" ")
87
+
88
+
89
+ def save_video(image: Union[list[Union[Image.Image, np.ndarray]], Image.Image, np.ndarray], cache_path: str, fps: int = 24):
90
+ import av
91
+
92
+ directory = os.path.dirname(cache_path)
93
+ if not os.path.exists(directory):
94
+ os.makedirs(directory)
95
+
96
+ if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image):
97
+ # save image
98
+ image_path = cache_path.replace(".safetensors", ".jpg")
99
+ img = image if isinstance(image, Image.Image) else Image.fromarray(image)
100
+ img.save(image_path)
101
+ print(f"Saved image: {image_path}")
102
+ else:
103
+ imgs = image
104
+ print(f"Number of images: {len(imgs)}")
105
+ # save video
106
+ video_path = cache_path.replace(".safetensors", ".mp4")
107
+ height, width = imgs[0].shape[0:2]
108
+
109
+ # create output container
110
+ container = av.open(video_path, mode="w")
111
+
112
+ # create video stream
113
+ codec = "libx264"
114
+ pixel_format = "yuv420p"
115
+ stream = container.add_stream(codec, rate=fps)
116
+ stream.width = width
117
+ stream.height = height
118
+ stream.pix_fmt = pixel_format
119
+ stream.bit_rate = 1000000 # 1Mbit/s for preview quality
120
+
121
+ for frame_img in imgs:
122
+ if isinstance(frame_img, Image.Image):
123
+ frame = av.VideoFrame.from_image(frame_img)
124
+ else:
125
+ frame = av.VideoFrame.from_ndarray(frame_img, format="rgb24")
126
+ packets = stream.encode(frame)
127
+ for packet in packets:
128
+ container.mux(packet)
129
+
130
+ for packet in stream.encode():
131
+ container.mux(packet)
132
+
133
+ container.close()
134
+
135
+ print(f"Saved video: {video_path}")
136
+
137
+
138
+ def show_datasets(
139
+ datasets: list[BaseDataset],
140
+ debug_mode: str,
141
+ console_width: int,
142
+ console_back: str,
143
+ console_num_images: Optional[int],
144
+ fps: int = 24,
145
+ ):
146
+ if debug_mode != "video":
147
+ print(f"d: next dataset, q: quit")
148
+
149
+ num_workers = max(1, os.cpu_count() - 1)
150
+ for i, dataset in enumerate(datasets):
151
+ print(f"Dataset [{i}]")
152
+ batch_index = 0
153
+ num_images_to_show = console_num_images
154
+ k = None
155
+ for key, batch in dataset.retrieve_latent_cache_batches(num_workers):
156
+ print(f"bucket resolution: {key}, count: {len(batch)}")
157
+ for j, item_info in enumerate(batch):
158
+ item_info: ItemInfo
159
+ print(f"{batch_index}-{j}: {item_info}")
160
+ if debug_mode == "image":
161
+ k = show_image(item_info.content)
162
+ elif debug_mode == "console":
163
+ k = show_console(item_info.content, console_width, console_back, console_num_images is None)
164
+ if num_images_to_show is not None:
165
+ num_images_to_show -= 1
166
+ if num_images_to_show == 0:
167
+ k = ord("d") # next dataset
168
+ elif debug_mode == "video":
169
+ save_video(item_info.content, item_info.latent_cache_path, fps)
170
+ k = None # save next video
171
+
172
+ if k == ord("q"):
173
+ return
174
+ elif k == ord("d"):
175
+ break
176
+ if k == ord("d"):
177
+ break
178
+ batch_index += 1
179
+
180
+
181
+ def encode_and_save_batch(vae: AutoencoderKLCausal3D, batch: list[ItemInfo]):
182
+ contents = torch.stack([torch.from_numpy(item.content) for item in batch])
183
+ if len(contents.shape) == 4:
184
+ contents = contents.unsqueeze(1) # B, H, W, C -> B, F, H, W, C
185
+
186
+ contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
187
+ contents = contents.to(vae.device, dtype=vae.dtype)
188
+ contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
189
+
190
+ h, w = contents.shape[3], contents.shape[4]
191
+ if h < 8 or w < 8:
192
+ item = batch[0] # other items should have the same size
193
+ raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
194
+
195
+ # print(f"encode batch: {contents.shape}")
196
+ with torch.no_grad():
197
+ latent = vae.encode(contents).latent_dist.sample()
198
+ # latent = latent * vae.config.scaling_factor
199
+
200
+ # # debug: decode and save
201
+ # with torch.no_grad():
202
+ # latent_to_decode = latent / vae.config.scaling_factor
203
+ # images = vae.decode(latent_to_decode, return_dict=False)[0]
204
+ # images = (images / 2 + 0.5).clamp(0, 1)
205
+ # images = images.cpu().float().numpy()
206
+ # images = (images * 255).astype(np.uint8)
207
+ # images = images.transpose(0, 2, 3, 4, 1) # B, C, F, H, W -> B, F, H, W, C
208
+ # for b in range(images.shape[0]):
209
+ # for f in range(images.shape[1]):
210
+ # fln = os.path.splitext(os.path.basename(batch[b].item_key))[0]
211
+ # img = Image.fromarray(images[b, f])
212
+ # img.save(f"./logs/decode_{fln}_{b}_{f:03d}.jpg")
213
+
214
+ for item, l in zip(batch, latent):
215
+ # print(f"save latent cache: {item.latent_cache_path}, latent shape: {l.shape}")
216
+ save_latent_cache(item, l)
217
+
218
+
219
+ def encode_datasets(datasets: list[BaseDataset], encode: callable, args: argparse.Namespace):
220
+ num_workers = args.num_workers if args.num_workers is not None else max(1, os.cpu_count() - 1)
221
+ for i, dataset in enumerate(datasets):
222
+ logger.info(f"Encoding dataset [{i}]")
223
+ all_latent_cache_paths = []
224
+ for _, batch in tqdm(dataset.retrieve_latent_cache_batches(num_workers)):
225
+ all_latent_cache_paths.extend([item.latent_cache_path for item in batch])
226
+
227
+ if args.skip_existing:
228
+ filtered_batch = [item for item in batch if not os.path.exists(item.latent_cache_path)]
229
+ if len(filtered_batch) == 0:
230
+ continue
231
+ batch = filtered_batch
232
+
233
+ bs = args.batch_size if args.batch_size is not None else len(batch)
234
+ for i in range(0, len(batch), bs):
235
+ encode(batch[i : i + bs])
236
+
237
+ # normalize paths
238
+ all_latent_cache_paths = [os.path.normpath(p) for p in all_latent_cache_paths]
239
+ all_latent_cache_paths = set(all_latent_cache_paths)
240
+
241
+ # remove old cache files not in the dataset
242
+ all_cache_files = dataset.get_all_latent_cache_files()
243
+ for cache_file in all_cache_files:
244
+ if os.path.normpath(cache_file) not in all_latent_cache_paths:
245
+ if args.keep_cache:
246
+ logger.info(f"Keep cache file not in the dataset: {cache_file}")
247
+ else:
248
+ os.remove(cache_file)
249
+ logger.info(f"Removed old cache file: {cache_file}")
250
+
251
+
252
+ def main(args):
253
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
254
+ device = torch.device(device)
255
+
256
+ # Load dataset config
257
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
258
+ logger.info(f"Load dataset config from {args.dataset_config}")
259
+ user_config = config_utils.load_user_config(args.dataset_config)
260
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
261
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
262
+
263
+ datasets = train_dataset_group.datasets
264
+
265
+ if args.debug_mode is not None:
266
+ show_datasets(datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images)
267
+ return
268
+
269
+ assert args.vae is not None, "vae checkpoint is required"
270
+
271
+ # Load VAE model: HunyuanVideo VAE model is float16
272
+ vae_dtype = torch.float16 if args.vae_dtype is None else str_to_dtype(args.vae_dtype)
273
+ vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device=device, vae_path=args.vae)
274
+ vae.eval()
275
+ logger.info(f"Loaded VAE: {vae.config}, dtype: {vae.dtype}")
276
+
277
+ if args.vae_chunk_size is not None:
278
+ vae.set_chunk_size_for_causal_conv_3d(args.vae_chunk_size)
279
+ logger.info(f"Set chunk_size to {args.vae_chunk_size} for CausalConv3d in VAE")
280
+ if args.vae_spatial_tile_sample_min_size is not None:
281
+ vae.enable_spatial_tiling(True)
282
+ vae.tile_sample_min_size = args.vae_spatial_tile_sample_min_size
283
+ vae.tile_latent_min_size = args.vae_spatial_tile_sample_min_size // 8
284
+ elif args.vae_tiling:
285
+ vae.enable_spatial_tiling(True)
286
+
287
+ # Encode images
288
+ def encode(one_batch: list[ItemInfo]):
289
+ encode_and_save_batch(vae, one_batch)
290
+
291
+ encode_datasets(datasets, encode, args)
292
+
293
+
294
+ def setup_parser_common() -> argparse.ArgumentParser:
295
+ parser = argparse.ArgumentParser()
296
+
297
+ parser.add_argument("--dataset_config", type=str, required=True, help="path to dataset config .toml file")
298
+ parser.add_argument("--vae", type=str, required=False, default=None, help="path to vae checkpoint")
299
+ parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is float16")
300
+ parser.add_argument("--device", type=str, default=None, help="device to use, default is cuda if available")
301
+ parser.add_argument(
302
+ "--batch_size", type=int, default=None, help="batch size, override dataset config if dataset batch size > this"
303
+ )
304
+ parser.add_argument("--num_workers", type=int, default=None, help="number of workers for dataset. default is cpu count-1")
305
+ parser.add_argument("--skip_existing", action="store_true", help="skip existing cache files")
306
+ parser.add_argument("--keep_cache", action="store_true", help="keep cache files not in dataset")
307
+ parser.add_argument("--debug_mode", type=str, default=None, choices=["image", "console", "video"], help="debug mode")
308
+ parser.add_argument("--console_width", type=int, default=80, help="debug mode: console width")
309
+ parser.add_argument(
310
+ "--console_back", type=str, default=None, help="debug mode: console background color, one of ascii_magic.Back"
311
+ )
312
+ parser.add_argument(
313
+ "--console_num_images",
314
+ type=int,
315
+ default=None,
316
+ help="debug mode: not interactive, number of images to show for each dataset",
317
+ )
318
+ return parser
319
+
320
+
321
+ def hv_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
322
+ parser.add_argument(
323
+ "--vae_tiling",
324
+ action="store_true",
325
+ help="enable spatial tiling for VAE, default is False. If vae_spatial_tile_sample_min_size is set, this is automatically enabled",
326
+ )
327
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
328
+ parser.add_argument(
329
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
330
+ )
331
+ return parser
332
+
333
+
334
+ if __name__ == "__main__":
335
+ parser = setup_parser_common()
336
+ parser = hv_setup_parser(parser)
337
+
338
+ args = parser.parse_args()
339
+ main(args)
cache_text_encoder_outputs.py ADDED
@@ -0,0 +1,214 @@
1
+ import argparse
2
+ import os
3
+ from typing import Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+ from dataset import config_utils
10
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
11
+ import accelerate
12
+
13
+ from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO, BaseDataset, ItemInfo, save_text_encoder_output_cache
14
+ from hunyuan_model import text_encoder as text_encoder_module
15
+ from hunyuan_model.text_encoder import TextEncoder
16
+
17
+ import logging
18
+
19
+ from utils.model_utils import str_to_dtype
20
+
21
+ logger = logging.getLogger(__name__)
22
+ logging.basicConfig(level=logging.INFO)
23
+
24
+
25
+ def encode_prompt(text_encoder: TextEncoder, prompt: Union[str, list[str]]):
26
+ data_type = "video" # video only, image is not supported
27
+ text_inputs = text_encoder.text2tokens(prompt, data_type=data_type)
28
+
29
+ with torch.no_grad():
30
+ prompt_outputs = text_encoder.encode(text_inputs, data_type=data_type)
31
+
32
+ return prompt_outputs.hidden_state, prompt_outputs.attention_mask
33
+
34
+
35
+ def encode_and_save_batch(
36
+ text_encoder: TextEncoder, batch: list[ItemInfo], is_llm: bool, accelerator: Optional[accelerate.Accelerator]
37
+ ):
38
+ prompts = [item.caption for item in batch]
39
+ # print(prompts)
40
+
41
+ # encode prompt
42
+ if accelerator is not None:
43
+ with accelerator.autocast():
44
+ prompt_embeds, prompt_mask = encode_prompt(text_encoder, prompts)
45
+ else:
46
+ prompt_embeds, prompt_mask = encode_prompt(text_encoder, prompts)
47
+
48
+ # # convert to fp16 if needed
49
+ # if prompt_embeds.dtype == torch.float32 and text_encoder.dtype != torch.float32:
50
+ # prompt_embeds = prompt_embeds.to(text_encoder.dtype)
51
+
52
+ # save prompt cache
53
+ for item, embed, mask in zip(batch, prompt_embeds, prompt_mask):
54
+ save_text_encoder_output_cache(item, embed, mask, is_llm)
55
+
56
+
57
+ def prepare_cache_files_and_paths(datasets: list[BaseDataset]):
58
+ all_cache_files_for_dataset = [] # existing cache files
59
+ all_cache_paths_for_dataset = [] # all cache paths in the dataset
60
+ for dataset in datasets:
61
+ all_cache_files = [os.path.normpath(file) for file in dataset.get_all_text_encoder_output_cache_files()]
62
+ all_cache_files = set(all_cache_files)
63
+ all_cache_files_for_dataset.append(all_cache_files)
64
+
65
+ all_cache_paths_for_dataset.append(set())
66
+ return all_cache_files_for_dataset, all_cache_paths_for_dataset
67
+
68
+
69
+ def process_text_encoder_batches(
70
+ num_workers: Optional[int],
71
+ skip_existing: bool,
72
+ batch_size: int,
73
+ datasets: list[BaseDataset],
74
+ all_cache_files_for_dataset: list[set],
75
+ all_cache_paths_for_dataset: list[set],
76
+ encode: callable,
77
+ ):
78
+ num_workers = num_workers if num_workers is not None else max(1, os.cpu_count() - 1)
79
+ for i, dataset in enumerate(datasets):
80
+ logger.info(f"Encoding dataset [{i}]")
81
+ all_cache_files = all_cache_files_for_dataset[i]
82
+ all_cache_paths = all_cache_paths_for_dataset[i]
83
+ for batch in tqdm(dataset.retrieve_text_encoder_output_cache_batches(num_workers)):
84
+ # update cache files (it's ok if we update it multiple times)
85
+ all_cache_paths.update([os.path.normpath(item.text_encoder_output_cache_path) for item in batch])
86
+
87
+ # skip existing cache files
88
+ if skip_existing:
89
+ filtered_batch = [
90
+ item for item in batch if not os.path.normpath(item.text_encoder_output_cache_path) in all_cache_files
91
+ ]
92
+ # print(f"Filtered {len(batch) - len(filtered_batch)} existing cache files")
93
+ if len(filtered_batch) == 0:
94
+ continue
95
+ batch = filtered_batch
96
+
97
+ bs = batch_size if batch_size is not None else len(batch)
98
+ for i in range(0, len(batch), bs):
99
+ encode(batch[i : i + bs])
100
+
101
+
102
+ def post_process_cache_files(
103
+ datasets: list[BaseDataset], all_cache_files_for_dataset: list[set], all_cache_paths_for_dataset: list[set]
104
+ ):
105
+ for i, dataset in enumerate(datasets):
106
+ all_cache_files = all_cache_files_for_dataset[i]
107
+ all_cache_paths = all_cache_paths_for_dataset[i]
108
+ for cache_file in all_cache_files:
109
+ if cache_file not in all_cache_paths:
110
+ if args.keep_cache:
111
+ logger.info(f"Keep cache file not in the dataset: {cache_file}")
112
+ else:
113
+ os.remove(cache_file)
114
+ logger.info(f"Removed old cache file: {cache_file}")
115
+
116
+
117
+ def main(args):
118
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
119
+ device = torch.device(device)
120
+
121
+ # Load dataset config
122
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
123
+ logger.info(f"Load dataset config from {args.dataset_config}")
124
+ user_config = config_utils.load_user_config(args.dataset_config)
125
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
126
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
127
+
128
+ datasets = train_dataset_group.datasets
129
+
130
+ # define accelerator for fp8 inference
131
+ accelerator = None
132
+ if args.fp8_llm:
133
+ accelerator = accelerate.Accelerator(mixed_precision="fp16")
134
+
135
+ # prepare cache files and paths: all_cache_files_for_dataset = exisiting cache files, all_cache_paths_for_dataset = all cache paths in the dataset
136
+ all_cache_files_for_dataset, all_cache_paths_for_dataset = prepare_cache_files_and_paths(datasets)
137
+
138
+ # Load Text Encoder 1
139
+ text_encoder_dtype = torch.float16 if args.text_encoder_dtype is None else str_to_dtype(args.text_encoder_dtype)
140
+ logger.info(f"loading text encoder 1: {args.text_encoder1}")
141
+ text_encoder_1 = text_encoder_module.load_text_encoder_1(args.text_encoder1, device, args.fp8_llm, text_encoder_dtype)
142
+ text_encoder_1.to(device=device)
143
+
144
+ # Encode with Text Encoder 1 (LLM)
145
+ logger.info("Encoding with Text Encoder 1")
146
+
147
+ def encode_for_text_encoder_1(batch: list[ItemInfo]):
148
+ encode_and_save_batch(text_encoder_1, batch, is_llm=True, accelerator=accelerator)
149
+
150
+ process_text_encoder_batches(
151
+ args.num_workers,
152
+ args.skip_existing,
153
+ args.batch_size,
154
+ datasets,
155
+ all_cache_files_for_dataset,
156
+ all_cache_paths_for_dataset,
157
+ encode_for_text_encoder_1,
158
+ )
159
+ del text_encoder_1
160
+
161
+ # Load Text Encoder 2
162
+ logger.info(f"loading text encoder 2: {args.text_encoder2}")
163
+ text_encoder_2 = text_encoder_module.load_text_encoder_2(args.text_encoder2, device, text_encoder_dtype)
164
+ text_encoder_2.to(device=device)
165
+
166
+ # Encode with Text Encoder 2
167
+ logger.info("Encoding with Text Encoder 2")
168
+
169
+ def encode_for_text_encoder_2(batch: list[ItemInfo]):
170
+ encode_and_save_batch(text_encoder_2, batch, is_llm=False, accelerator=None)
171
+
172
+ process_text_encoder_batches(
173
+ args.num_workers,
174
+ args.skip_existing,
175
+ args.batch_size,
176
+ datasets,
177
+ all_cache_files_for_dataset,
178
+ all_cache_paths_for_dataset,
179
+ encode_for_text_encoder_2,
180
+ )
181
+ del text_encoder_2
182
+
183
+ # remove cache files not in dataset
184
+ post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset)
185
+
186
+
187
+ def setup_parser_common():
188
+ parser = argparse.ArgumentParser()
189
+
190
+ parser.add_argument("--dataset_config", type=str, required=True, help="path to dataset config .toml file")
191
+ parser.add_argument("--device", type=str, default=None, help="device to use, default is cuda if available")
192
+ parser.add_argument(
193
+ "--batch_size", type=int, default=None, help="batch size, override dataset config if dataset batch size > this"
194
+ )
195
+ parser.add_argument("--num_workers", type=int, default=None, help="number of workers for dataset. default is cpu count-1")
196
+ parser.add_argument("--skip_existing", action="store_true", help="skip existing cache files")
197
+ parser.add_argument("--keep_cache", action="store_true", help="keep cache files not in dataset")
198
+ return parser
199
+
200
+
201
+ def hv_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
202
+ parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory")
203
+ parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory")
204
+ parser.add_argument("--text_encoder_dtype", type=str, default=None, help="data type for Text Encoder, default is float16")
205
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
206
+ return parser
207
+
208
+
209
+ if __name__ == "__main__":
210
+ parser = setup_parser_common()
211
+ parser = hv_setup_parser(parser)
212
+
213
+ args = parser.parse_args()
214
+ main(args)
convert_lora.py ADDED
@@ -0,0 +1,137 @@
1
+ import argparse
2
+
3
+ import torch
4
+ from safetensors.torch import load_file, save_file
5
+ from safetensors import safe_open
6
+ from utils import model_utils
7
+
8
+ import logging
9
+
10
+
11
+ logger = logging.getLogger(__name__)
12
+ logging.basicConfig(level=logging.INFO)
13
+
14
+
15
+ def convert_from_diffusers(prefix, weights_sd):
16
+ # convert from diffusers(?) to default LoRA
17
+ # Diffusers format: {"diffusion_model.module.name.lora_A.weight": weight, "diffusion_model.module.name.lora_B.weight": weight, ...}
18
+ # default LoRA format: {"prefix_module_name.lora_down.weight": weight, "prefix_module_name.lora_up.weight": weight, ...}
19
+
20
+ # note: Diffusers has no alpha, so alpha is set to rank
21
+ new_weights_sd = {}
22
+ lora_dims = {}
23
+ for key, weight in weights_sd.items():
24
+ diffusers_prefix, key_body = key.split(".", 1)
25
+ if diffusers_prefix != "diffusion_model" and diffusers_prefix != "transformer":
26
+ logger.warning(f"unexpected key: {key} in diffusers format")
27
+ continue
28
+
29
+ new_key = f"{prefix}{key_body}".replace(".", "_").replace("_lora_A_", ".lora_down.").replace("_lora_B_", ".lora_up.")
30
+ new_weights_sd[new_key] = weight
31
+
32
+ lora_name = new_key.split(".")[0] # before first dot
33
+ if lora_name not in lora_dims and "lora_down" in new_key:
34
+ lora_dims[lora_name] = weight.shape[0]
35
+
36
+ # add alpha with rank
37
+ for lora_name, dim in lora_dims.items():
38
+ new_weights_sd[f"{lora_name}.alpha"] = torch.tensor(dim)
39
+
40
+ return new_weights_sd
41
+
42
+
43
+ def convert_to_diffusers(prefix, weights_sd):
44
+ # convert from default LoRA to diffusers
45
+
46
+ # get alphas
47
+ lora_alphas = {}
48
+ for key, weight in weights_sd.items():
49
+ if key.startswith(prefix):
50
+ lora_name = key.split(".", 1)[0] # before first dot
51
+ if lora_name not in lora_alphas and "alpha" in key:
52
+ lora_alphas[lora_name] = weight
53
+
54
+ new_weights_sd = {}
55
+ for key, weight in weights_sd.items():
56
+ if key.startswith(prefix):
57
+ if "alpha" in key:
58
+ continue
59
+
60
+ lora_name = key.split(".", 1)[0] # before first dot
61
+
62
+ module_name = lora_name[len(prefix) :] # remove "lora_unet_"
63
+ module_name = module_name.replace("_", ".") # replace "_" with "."
64
+ if ".cross.attn." in module_name or ".self.attn." in module_name:
65
+ # Wan2.1 lora name to module name: ugly but works
66
+ module_name = module_name.replace("cross.attn", "cross_attn") # fix cross attn
67
+ module_name = module_name.replace("self.attn", "self_attn") # fix self attn
68
+ module_name = module_name.replace("k.img", "k_img") # fix k img
69
+ module_name = module_name.replace("v.img", "v_img") # fix v img
70
+ else:
71
+ # HunyuanVideo lora name to module name: ugly but works
72
+ module_name = module_name.replace("double.blocks.", "double_blocks.") # fix double blocks
73
+ module_name = module_name.replace("single.blocks.", "single_blocks.") # fix single blocks
74
+ module_name = module_name.replace("img.", "img_") # fix img
75
+ module_name = module_name.replace("txt.", "txt_") # fix txt
76
+ module_name = module_name.replace("attn.", "attn_") # fix attn
77
+
78
+ diffusers_prefix = "diffusion_model"
79
+ if "lora_down" in key:
80
+ new_key = f"{diffusers_prefix}.{module_name}.lora_A.weight"
81
+ dim = weight.shape[0]
82
+ elif "lora_up" in key:
83
+ new_key = f"{diffusers_prefix}.{module_name}.lora_B.weight"
84
+ dim = weight.shape[1]
85
+ else:
86
+ logger.warning(f"unexpected key: {key} in default LoRA format")
87
+ continue
88
+
89
+ # scale weight by alpha
90
+ if lora_name in lora_alphas:
91
+ # we scale both down and up, so scale is sqrt
92
+ scale = lora_alphas[lora_name] / dim
93
+ scale = scale.sqrt()
94
+ weight = weight * scale
95
+ else:
96
+ logger.warning(f"missing alpha for {lora_name}")
97
+
98
+ new_weights_sd[new_key] = weight
99
+
100
+ return new_weights_sd
101
+
102
+
103
+ def convert(input_file, output_file, target_format):
104
+ logger.info(f"loading {input_file}")
105
+ weights_sd = load_file(input_file)
106
+ with safe_open(input_file, framework="pt") as f:
107
+ metadata = f.metadata()
108
+
109
+ logger.info(f"converting to {target_format}")
110
+ prefix = "lora_unet_"
111
+ if target_format == "default":
112
+ new_weights_sd = convert_from_diffusers(prefix, weights_sd)
113
+ metadata = metadata or {}
114
+ model_utils.precalculate_safetensors_hashes(new_weights_sd, metadata)
115
+ elif target_format == "other":
116
+ new_weights_sd = convert_to_diffusers(prefix, weights_sd)
117
+ else:
118
+ raise ValueError(f"unknown target format: {target_format}")
119
+
120
+ logger.info(f"saving to {output_file}")
121
+ save_file(new_weights_sd, output_file, metadata=metadata)
122
+
123
+ logger.info("done")
124
+
125
+
126
+ def parse_args():
127
+ parser = argparse.ArgumentParser(description="Convert LoRA weights between default and other formats")
128
+ parser.add_argument("--input", type=str, required=True, help="input model file")
129
+ parser.add_argument("--output", type=str, required=True, help="output model file")
130
+ parser.add_argument("--target", type=str, required=True, choices=["other", "default"], help="target format")
131
+ args = parser.parse_args()
132
+ return args
133
+
134
+
135
+ if __name__ == "__main__":
136
+ args = parse_args()
137
+ convert(args.input, args.output, args.target)
dataset/__init__.py ADDED
File without changes
dataset/config_utils.py ADDED
@@ -0,0 +1,381 @@
1
+ import argparse
2
+ from dataclasses import (
3
+ asdict,
4
+ dataclass,
5
+ )
6
+ import functools
7
+ import random
8
+ from textwrap import dedent, indent
9
+ import json
10
+ from pathlib import Path
11
+
12
+ # from toolz import curry
13
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
14
+
15
+ import toml
16
+ import voluptuous
17
+ from voluptuous import Any, ExactSequence, MultipleInvalid, Object, Schema
18
+
19
+ from .image_video_dataset import DatasetGroup, ImageDataset, VideoDataset
20
+
21
+ import logging
22
+
23
+ logger = logging.getLogger(__name__)
24
+ logging.basicConfig(level=logging.INFO)
25
+
26
+
27
+ @dataclass
28
+ class BaseDatasetParams:
29
+ resolution: Tuple[int, int] = (960, 544)
30
+ enable_bucket: bool = False
31
+ bucket_no_upscale: bool = False
32
+ caption_extension: Optional[str] = None
33
+ batch_size: int = 1
34
+ num_repeats: int = 1
35
+ cache_directory: Optional[str] = None
36
+ debug_dataset: bool = False
37
+ architecture: str = "no_default" # short style like "hv" or "wan"
38
+
39
+
40
+ @dataclass
41
+ class ImageDatasetParams(BaseDatasetParams):
42
+ image_directory: Optional[str] = None
43
+ image_jsonl_file: Optional[str] = None
44
+
45
+
46
+ @dataclass
47
+ class VideoDatasetParams(BaseDatasetParams):
48
+ video_directory: Optional[str] = None
49
+ video_jsonl_file: Optional[str] = None
50
+ control_directory: Optional[str] = None
51
+ target_frames: Sequence[int] = (1,)
52
+ frame_extraction: Optional[str] = "head"
53
+ frame_stride: Optional[int] = 1
54
+ frame_sample: Optional[int] = 1
55
+ max_frames: Optional[int] = 129
56
+ source_fps: Optional[float] = None
57
+
58
+
59
+ @dataclass
60
+ class DatasetBlueprint:
61
+ is_image_dataset: bool
62
+ params: Union[ImageDatasetParams, VideoDatasetParams]
63
+
64
+
65
+ @dataclass
66
+ class DatasetGroupBlueprint:
67
+ datasets: Sequence[DatasetBlueprint]
68
+
69
+
70
+ @dataclass
71
+ class Blueprint:
72
+ dataset_group: DatasetGroupBlueprint
73
+
74
+
75
+ class ConfigSanitizer:
76
+ # @curry
77
+ @staticmethod
78
+ def __validate_and_convert_twodim(klass, value: Sequence) -> Tuple:
79
+ Schema(ExactSequence([klass, klass]))(value)
80
+ return tuple(value)
81
+
82
+ # @curry
83
+ @staticmethod
84
+ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]) -> Tuple:
85
+ Schema(Any(klass, ExactSequence([klass, klass])))(value)
86
+ try:
87
+ Schema(klass)(value)
88
+ return (value, value)
89
+ except:
90
+ return ConfigSanitizer.__validate_and_convert_twodim(klass, value)
91
+
92
+ # datasets schema
93
+ DATASET_ASCENDABLE_SCHEMA = {
94
+ "caption_extension": str,
95
+ "batch_size": int,
96
+ "num_repeats": int,
97
+ "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int),
98
+ "enable_bucket": bool,
99
+ "bucket_no_upscale": bool,
100
+ }
101
+ IMAGE_DATASET_DISTINCT_SCHEMA = {
102
+ "image_directory": str,
103
+ "image_jsonl_file": str,
104
+ "cache_directory": str,
105
+ }
106
+ VIDEO_DATASET_DISTINCT_SCHEMA = {
107
+ "video_directory": str,
108
+ "video_jsonl_file": str,
109
+ "control_directory": str,
110
+ "target_frames": [int],
111
+ "frame_extraction": str,
112
+ "frame_stride": int,
113
+ "frame_sample": int,
114
+ "max_frames": int,
115
+ "cache_directory": str,
116
+ "source_fps": float,
117
+ }
118
+
119
+ # options handled by argparse but not handled by user config
120
+ ARGPARSE_SPECIFIC_SCHEMA = {
121
+ "debug_dataset": bool,
122
+ }
123
+
124
+ def __init__(self) -> None:
125
+ self.image_dataset_schema = self.__merge_dict(
126
+ self.DATASET_ASCENDABLE_SCHEMA,
127
+ self.IMAGE_DATASET_DISTINCT_SCHEMA,
128
+ )
129
+ self.video_dataset_schema = self.__merge_dict(
130
+ self.DATASET_ASCENDABLE_SCHEMA,
131
+ self.VIDEO_DATASET_DISTINCT_SCHEMA,
132
+ )
133
+
134
+ def validate_flex_dataset(dataset_config: dict):
135
+ if "video_directory" in dataset_config or "video_jsonl_file" in dataset_config:
136
+ return Schema(self.video_dataset_schema)(dataset_config)
137
+ else:
138
+ return Schema(self.image_dataset_schema)(dataset_config)
139
+
140
+ self.dataset_schema = validate_flex_dataset
141
+
142
+ self.general_schema = self.__merge_dict(
143
+ self.DATASET_ASCENDABLE_SCHEMA,
144
+ )
145
+ self.user_config_validator = Schema(
146
+ {
147
+ "general": self.general_schema,
148
+ "datasets": [self.dataset_schema],
149
+ }
150
+ )
151
+ self.argparse_schema = self.__merge_dict(
152
+ self.ARGPARSE_SPECIFIC_SCHEMA,
153
+ )
154
+ self.argparse_config_validator = Schema(Object(self.argparse_schema), extra=voluptuous.ALLOW_EXTRA)
155
+
156
+ def sanitize_user_config(self, user_config: dict) -> dict:
157
+ try:
158
+ return self.user_config_validator(user_config)
159
+ except MultipleInvalid:
160
+ # TODO: clarify the error message
161
+ logger.error("Invalid user config / ユーザ設定の形式が正しくないようです")
162
+ raise
163
+
164
+ # NOTE: Strictly speaking, the argparse result does not need to be sanitized,
+ # but doing so helps us detect bugs in the program early.
166
+ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> argparse.Namespace:
167
+ try:
168
+ return self.argparse_config_validator(argparse_namespace)
169
+ except MultipleInvalid:
170
+ # XXX: this should be a bug
171
+ logger.error(
172
+ "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。"
173
+ )
174
+ raise
175
+
176
+ # NOTE: a value is overwritten by a later dict if the same key already exists
177
+ @staticmethod
178
+ def __merge_dict(*dict_list: dict) -> dict:
179
+ merged = {}
180
+ for schema in dict_list:
181
+ # merged |= schema
182
+ for k, v in schema.items():
183
+ merged[k] = v
184
+ return merged
185
+
186
+
187
+ class BlueprintGenerator:
188
+ BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME = {}
189
+
190
+ def __init__(self, sanitizer: ConfigSanitizer):
191
+ self.sanitizer = sanitizer
192
+
193
+ # runtime_params is for parameters that are only configurable at runtime, such as the tokenizer
194
+ def generate(self, user_config: dict, argparse_namespace: argparse.Namespace, **runtime_params) -> Blueprint:
195
+ sanitized_user_config = self.sanitizer.sanitize_user_config(user_config)
196
+ sanitized_argparse_namespace = self.sanitizer.sanitize_argparse_namespace(argparse_namespace)
197
+
198
+ argparse_config = {k: v for k, v in vars(sanitized_argparse_namespace).items() if v is not None}
199
+ general_config = sanitized_user_config.get("general", {})
200
+
201
+ dataset_blueprints = []
202
+ for dataset_config in sanitized_user_config.get("datasets", []):
203
+ is_image_dataset = "image_directory" in dataset_config or "image_jsonl_file" in dataset_config
204
+ if is_image_dataset:
205
+ dataset_params_klass = ImageDatasetParams
206
+ else:
207
+ dataset_params_klass = VideoDatasetParams
208
+
209
+ params = self.generate_params_by_fallbacks(
210
+ dataset_params_klass, [dataset_config, general_config, argparse_config, runtime_params]
211
+ )
212
+ dataset_blueprints.append(DatasetBlueprint(is_image_dataset, params))
213
+
214
+ dataset_group_blueprint = DatasetGroupBlueprint(dataset_blueprints)
215
+
216
+ return Blueprint(dataset_group_blueprint)
217
+
218
+ @staticmethod
219
+ def generate_params_by_fallbacks(param_klass, fallbacks: Sequence[dict]):
220
+ name_map = BlueprintGenerator.BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME
221
+ search_value = BlueprintGenerator.search_value
222
+ default_params = asdict(param_klass())
223
+ param_names = default_params.keys()
224
+
225
+ params = {name: search_value(name_map.get(name, name), fallbacks, default_params.get(name)) for name in param_names}
226
+
227
+ return param_klass(**params)
228
+
229
+ @staticmethod
230
+ def search_value(key: str, fallbacks: Sequence[dict], default_value=None):
231
+ for cand in fallbacks:
232
+ value = cand.get(key)
233
+ if value is not None:
234
+ return value
235
+
236
+ return default_value
237
+
238
+
239
+ # if training is True, it will return a dataset group for training, otherwise for caching
240
+ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint, training: bool = False) -> DatasetGroup:
241
+ datasets: List[Union[ImageDataset, VideoDataset]] = []
242
+
243
+ for dataset_blueprint in dataset_group_blueprint.datasets:
244
+ if dataset_blueprint.is_image_dataset:
245
+ dataset_klass = ImageDataset
246
+ else:
247
+ dataset_klass = VideoDataset
248
+
249
+ dataset = dataset_klass(**asdict(dataset_blueprint.params))
250
+ datasets.append(dataset)
251
+
252
+ # assertion
253
+ cache_directories = [dataset.cache_directory for dataset in datasets]
254
+ num_of_unique_cache_directories = len(set(cache_directories))
255
+ if num_of_unique_cache_directories != len(cache_directories):
256
+ raise ValueError(
257
+ "cache directory should be unique for each dataset (note that cache directory is image/video directory if not specified)"
258
+ + " / cache directory は各データセットごとに異なる必要があります(指定されていない場合はimage/video directoryが使われるので注意)"
259
+ )
260
+
261
+ # print info
262
+ info = ""
263
+ for i, dataset in enumerate(datasets):
264
+ is_image_dataset = isinstance(dataset, ImageDataset)
265
+ info += dedent(
266
+ f"""\
267
+ [Dataset {i}]
268
+ is_image_dataset: {is_image_dataset}
269
+ resolution: {dataset.resolution}
270
+ batch_size: {dataset.batch_size}
271
+ num_repeats: {dataset.num_repeats}
272
+ caption_extension: "{dataset.caption_extension}"
273
+ enable_bucket: {dataset.enable_bucket}
274
+ bucket_no_upscale: {dataset.bucket_no_upscale}
275
+ cache_directory: "{dataset.cache_directory}"
276
+ debug_dataset: {dataset.debug_dataset}
277
+ """
278
+ )
279
+
280
+ if is_image_dataset:
281
+ info += indent(
282
+ dedent(
283
+ f"""\
284
+ image_directory: "{dataset.image_directory}"
285
+ image_jsonl_file: "{dataset.image_jsonl_file}"
286
+ \n"""
287
+ ),
288
+ " ",
289
+ )
290
+ else:
291
+ info += indent(
292
+ dedent(
293
+ f"""\
294
+ video_directory: "{dataset.video_directory}"
295
+ video_jsonl_file: "{dataset.video_jsonl_file}"
296
+ control_directory: "{dataset.control_directory}"
297
+ target_frames: {dataset.target_frames}
298
+ frame_extraction: {dataset.frame_extraction}
299
+ frame_stride: {dataset.frame_stride}
300
+ frame_sample: {dataset.frame_sample}
301
+ max_frames: {dataset.max_frames}
302
+ source_fps: {dataset.source_fps}
303
+ \n"""
304
+ ),
305
+ " ",
306
+ )
307
+ logger.info(f"{info}")
308
+
309
+ # make buckets first because it determines the length of dataset
310
+ # and set the same seed for all datasets
311
+ seed = random.randint(0, 2**31) # actual seed is seed + epoch_no
312
+ for i, dataset in enumerate(datasets):
313
+ # logger.info(f"[Dataset {i}]")
314
+ dataset.set_seed(seed)
315
+ if training:
316
+ dataset.prepare_for_training()
317
+
318
+ return DatasetGroup(datasets)
319
+
320
+
321
+ def load_user_config(file: str) -> dict:
322
+ file: Path = Path(file)
323
+ if not file.is_file():
324
+ raise ValueError(f"file not found / ファイルが見つかりません: {file}")
325
+
326
+ if file.name.lower().endswith(".json"):
327
+ try:
328
+ with open(file, "r", encoding="utf-8") as f:
329
+ config = json.load(f)
330
+ except Exception:
331
+ logger.error(
332
+ f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
333
+ )
334
+ raise
335
+ elif file.name.lower().endswith(".toml"):
336
+ try:
337
+ config = toml.load(file)
338
+ except Exception:
339
+ logger.error(
340
+ f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
341
+ )
342
+ raise
343
+ else:
344
+ raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}")
345
+
346
+ return config
347
+
348
+
349
+ # for config test
350
+ if __name__ == "__main__":
351
+ parser = argparse.ArgumentParser()
352
+ parser.add_argument("dataset_config")
353
+ config_args, remain = parser.parse_known_args()
354
+
355
+ parser = argparse.ArgumentParser()
356
+ parser.add_argument("--debug_dataset", action="store_true")
357
+ argparse_namespace = parser.parse_args(remain)
358
+
359
+ logger.info("[argparse_namespace]")
360
+ logger.info(f"{vars(argparse_namespace)}")
361
+
362
+ user_config = load_user_config(config_args.dataset_config)
363
+
364
+ logger.info("")
365
+ logger.info("[user_config]")
366
+ logger.info(f"{user_config}")
367
+
368
+ sanitizer = ConfigSanitizer()
369
+ sanitized_user_config = sanitizer.sanitize_user_config(user_config)
370
+
371
+ logger.info("")
372
+ logger.info("[sanitized_user_config]")
373
+ logger.info(f"{sanitized_user_config}")
374
+
375
+ blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace)
376
+
377
+ logger.info("")
378
+ logger.info("[blueprint]")
379
+ logger.info(f"{blueprint}")
380
+
381
+ dataset_group = generate_dataset_group_by_blueprint(blueprint.dataset_group)
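For reference, the snippet below sketches how this configuration module can be driven from code rather than from the command line, mirroring the `__main__` block above. The config path and the `debug_dataset` flag are placeholders, and it assumes the repository root is on `PYTHONPATH` so that `dataset.config_utils` is importable.

```python
# Minimal sketch: build a DatasetGroup from a TOML dataset config in code.
import argparse

from dataset.config_utils import (
    BlueprintGenerator,
    ConfigSanitizer,
    generate_dataset_group_by_blueprint,
    load_user_config,
)

user_config = load_user_config("dataset_config.toml")  # placeholder path
args = argparse.Namespace(debug_dataset=False)  # normally produced by the training script's parser

sanitizer = ConfigSanitizer()
blueprint = BlueprintGenerator(sanitizer).generate(user_config, args)

# training=True additionally calls prepare_for_training() on each dataset
dataset_group = generate_dataset_group_by_blueprint(blueprint.dataset_group, training=True)
```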
dataset/dataset_config.md ADDED
@@ -0,0 +1,461 @@
1
+ > 📝 Click on the language section to expand / 言語をクリックして展開
2
+
3
+ ## Dataset Configuration
4
+
5
+ Please create a TOML file for dataset configuration.
6
+
7
+ Image and video datasets are supported. The configuration file can include multiple datasets, either image or video datasets, with caption text files or metadata JSONL files.
8
+
9
+ The cache directory must be different for each dataset.
10
+
11
+ Each video is extracted frame by frame without additional processing and used for training. It is recommended to use videos with a frame rate of 24fps for HunyuanVideo, 16fps for Wan2.1, and 30fps for FramePack. You can check the videos that will be used for training with `--debug_mode video` when caching latents (see [here](/README.md#latent-caching)).
12
+ <details>
13
+ <summary>日本語</summary>
14
+
15
+ データセットの設定を行うためのTOMLファイルを作成してください。
16
+
17
+ 画像データセットと動画データセットがサポートされています。設定ファイルには、画像または動画データセットを複数含めることができます。キャプションテキストファイルまたはメタデータJSONLファイルを使用できます。
18
+
19
+ キャッシュディレクトリは、各データセットごとに異なるディレクトリである必要があります。
20
+
21
+ 動画は追加のプロセスなしでフレームごとに抽出され、学習に用いられます。そのため、HunyuanVideoは24fps、Wan2.1は16fps、FramePackは30fpsのフレームレートの動画を使用することをお勧めします。latentキャッシュ時の`--debug_mode video`を使用すると、学習される動画を確認できます([こちら](/README.ja.md#latentの事前キャッシュ)を参照)。
22
+ </details>
23
+
24
+ ### Sample for Image Dataset with Caption Text Files
25
+
26
+ ```toml
27
+ # resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
28
+ # otherwise, the default values will be used for each item
29
+
30
+ # general configurations
31
+ [general]
32
+ resolution = [960, 544]
33
+ caption_extension = ".txt"
34
+ batch_size = 1
35
+ enable_bucket = true
36
+ bucket_no_upscale = false
37
+
38
+ [[datasets]]
39
+ image_directory = "/path/to/image_dir"
40
+ cache_directory = "/path/to/cache_directory"
41
+ num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
42
+
43
+ # other datasets can be added here. each dataset can have different configurations
44
+ ```
45
+
46
+ `cache_directory` is optional; the default is None, which means the image directory itself is used. However, we recommend setting a dedicated cache directory to avoid accidentally sharing cache files between different datasets.
47
+
48
+ `num_repeats` is also available. It is optional; the default is 1 (no repetition). It simply repeats each image (or video) that many times to expand the dataset. For example, with `num_repeats = 2` and 20 images in the dataset, each image appears twice (with the same caption), for a total of 40 images. This is useful for balancing multiple datasets of different sizes.
49
+
50
+ <details>
51
+ <summary>日本語</summary>
52
+
53
+ `cache_directory` はオプションです。デフォルトは画像ディレクトリと同じディレクトリに設定されます。ただし、異なるデータセット間でキャッシュファイルが共有されるのを防ぐために、明示的に別のキャッシュディレクトリを設定することをお勧めします。
54
+
55
+ `num_repeats` はオプションで、デフォルトは 1 です(繰り返しなし)。画像(や動画)を、その回数だけ単純に繰り返してデータセットを拡張します。たとえば`num_repeats = 2`としたとき、画像20枚のデータセットなら、各画像が2枚ずつ(同一のキャプションで)計40枚存在した場合と同じになります。異なるデータ数のデータセット間でバランスを取るために使用可能です。
56
+
57
+ resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。省略時は各項目のデフォルト値が使用されます。
58
+
59
+ `[[datasets]]`以下を追加することで、他のデータセットを追加できます。各データセットには異なる設定を持てます。
60
+ </details>
61
+
62
+ ### Sample for Image Dataset with Metadata JSONL File
63
+
64
+ ```toml
65
+ # resolution, batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
66
+ # caption_extension is not required for metadata jsonl file
67
+ # cache_directory is required for each dataset with metadata jsonl file
68
+
69
+ # general configurations
70
+ [general]
71
+ resolution = [960, 544]
72
+ batch_size = 1
73
+ enable_bucket = true
74
+ bucket_no_upscale = false
75
+
76
+ [[datasets]]
77
+ image_jsonl_file = "/path/to/metadata.jsonl"
78
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
79
+ num_repeats = 1 # optional, default is 1. Same as above.
80
+
81
+ # other datasets can be added here. each dataset can have different configurations
82
+ ```
83
+
84
+ JSONL file format for metadata:
85
+
86
+ ```json
87
+ {"image_path": "/path/to/image1.jpg", "caption": "A caption for image1"}
88
+ {"image_path": "/path/to/image2.jpg", "caption": "A caption for image2"}
89
+ ```
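A metadata JSONL file in this format can be generated from an existing folder of images and caption text files with a few lines of Python. The sketch below is only illustrative; the directory path, the `.jpg`/`.txt` layout, and the output filename are assumptions.

```python
# Sketch: build metadata.jsonl from images with matching .txt caption files (assumed layout).
import json
from pathlib import Path

image_dir = Path("/path/to/image_dir")  # placeholder
with open("metadata.jsonl", "w", encoding="utf-8") as f:
    for image_path in sorted(image_dir.glob("*.jpg")):
        caption_path = image_path.with_suffix(".txt")
        if not caption_path.exists():
            continue  # skip images without a caption
        caption = caption_path.read_text(encoding="utf-8").strip()
        f.write(json.dumps({"image_path": str(image_path), "caption": caption}, ensure_ascii=False) + "\n")
```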
90
+
91
+ <details>
92
+ <summary>日本語</summary>
93
+
94
+ resolution, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。省略時は各項目のデフォルト値が使用されます。
95
+
96
+ metadata jsonl ファイルを使用する場合、caption_extension は必要ありません。また、cache_directory は必須です。
97
+
98
+ キャプションによるデータセットと同様に、複数のデータセットを追加できます。各データセットには異なる設定を持てます。
99
+ </details>
100
+
101
+
102
+ ### Sample for Video Dataset with Caption Text Files
103
+
104
+ ```toml
105
+ # Common parameters (resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)
106
+ # can be set in either general or datasets sections
107
+ # Video-specific parameters (target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)
108
+ # must be set in each datasets section
109
+
110
+ # general configurations
111
+ [general]
112
+ resolution = [960, 544]
113
+ caption_extension = ".txt"
114
+ batch_size = 1
115
+ enable_bucket = true
116
+ bucket_no_upscale = false
117
+
118
+ [[datasets]]
119
+ video_directory = "/path/to/video_dir"
120
+ cache_directory = "/path/to/cache_directory" # recommended to set cache directory
121
+ target_frames = [1, 25, 45]
122
+ frame_extraction = "head"
123
+ source_fps = 30.0 # optional, source fps for videos in the directory, decimal number
124
+
125
+ [[datasets]]
126
+ video_directory = "/path/to/video_dir2"
127
+ cache_directory = "/path/to/cache_directory2" # recommended to set cache directory
128
+ frame_extraction = "full"
129
+ max_frames = 45
130
+
131
+ # other datasets can be added here. each dataset can have different configurations
132
+ ```
133
+
134
+ __In HunyuanVideo and Wan2.1, each value in `target_frames` must be "N\*4+1" (N=0,1,2,...).__ Otherwise, it will be truncated to the nearest "N*4+1".
135
+
136
+ In FramePack, it is recommended to set `frame_extraction` to `full` and `max_frames` to a sufficiently large value, as it can handle longer videos. However, if the video is too long, an Out of Memory error may occur during VAE encoding. The videos in FramePack are trimmed to "N * latent_window_size * 4 + 1" frames (for example, 37, 73, 109... if `latent_window_size` is 9).
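These frame-count rules can be illustrated with a small sketch. This is an assumption-level illustration of the behavior described above (the helper names are hypothetical), not the actual training code.

```python
# Illustrative sketch of the frame-count rules described above.

def truncate_to_n4p1(frames: int) -> int:
    """HunyuanVideo / Wan2.1: a target_frames value is truncated down to the nearest N*4+1."""
    return (frames - 1) // 4 * 4 + 1

def framepack_trim_lengths(latent_window_size: int = 9, max_sections: int = 3) -> list[int]:
    """FramePack: videos are trimmed to N * latent_window_size * 4 + 1 frames."""
    return [n * latent_window_size * 4 + 1 for n in range(1, max_sections + 1)]

print(truncate_to_n4p1(48))       # 45
print(framepack_trim_lengths())   # [37, 73, 109]
```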
137
+
138
+ If the `source_fps` is specified, the videos in the directory are considered to be at this frame rate, and some frames will be skipped to match the model's frame rate (24 for HunyuanVideo and 16 for Wan2.1). __The value must be a decimal number, for example, `30.0` instead of `30`.__ The skipping is done automatically and does not consider the content of the images. Please check if the converted data is correct using `--debug_mode video`.
139
+
140
+ If `source_fps` is not specified (default), all frames of the video will be used regardless of the video's frame rate.
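The skipping rule is a simple fractional step over the source frames; the sketch below mirrors the logic of `load_video()` in `dataset/image_video_dataset.py`.

```python
# Which source frame indices are kept when a source_fps video is mapped to the model fps.
def select_frame_indices(num_frames: int, source_fps: float, target_fps: float) -> list[int]:
    delta = target_fps / source_fps  # e.g. 16 / 30 = 0.5333
    kept, position, previous = [], 0.0, -1
    for i in range(num_frames):
        target_index = int(position)
        position += delta
        if target_index == previous:
            continue  # this source frame maps to an already-filled target slot, so drop it
        previous = target_index
        kept.append(i)
    return kept

print(select_frame_indices(10, source_fps=30.0, target_fps=16.0))  # [0, 2, 4, 6, 8]
```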
141
+
142
+ <details>
143
+ <summary>日本語</summary>
144
+
145
+ 共通パラメータ(resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)は、generalまたはdatasetsのいずれかに設定できます。
146
+ 動画固有のパラメータ(target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)は、各datasetsセクションに設定する必要があります。
147
+
148
+ __HunyuanVideoおよびWan2.1では、target_framesの数値は「N\*4+1」である必要があります。__ これ以外の値の場合は、最も近いN\*4+1の値に切り捨てられます。
149
+
150
+ FramePackでも同様ですが、FramePackでは動画が長くても学習可能なため、 `frame_extraction`に`full` を指定し、`max_frames`を十分に大きな値に設定することをお勧めします。ただし、あまりにも長すぎるとVAEのencodeでOut of Memoryエラーが発生する可能性があります。FramePackの動画は、「N * latent_window_size * 4 + 1」フレームにトリミングされます(latent_window_sizeが9の場合、37、73、109……)。
151
+
152
+ `source_fps`を指定した場合、ディレクトリ内の動画をこのフレームレートとみなして、モデルのフレームレートにあうようにいくつかのフレームをスキップします(HunyuanVideoは24、Wan2.1は16)。__小数点を含む数値で指定してください。__ 例:`30`ではなく`30.0`。スキップは機械的に行われ、画像の内容は考慮しません。変換後のデータが正しいか、`--debug_mode video`で確認してください。
153
+
154
+ `source_fps`を指定しない場合、動画のフレームは(動画自体のフレームレートに関係なく)すべて使用されます。
155
+
156
+ 他の注意事項は画像データセットと同様です。
157
+ </details>
158
+
159
+ ### Sample for Video Dataset with Metadata JSONL File
160
+
161
+ ```toml
162
+ # Common parameters (resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)
163
+ # can be set in either general or datasets sections
164
+ # Video-specific parameters (target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)
165
+ # must be set in each datasets section
166
+
167
+ # caption_extension is not required for metadata jsonl file
168
+ # cache_directory is required for each dataset with metadata jsonl file
169
+
170
+ # general configurations
171
+ [general]
172
+ resolution = [960, 544]
173
+ batch_size = 1
174
+ enable_bucket = true
175
+ bucket_no_upscale = false
176
+
177
+ [[datasets]]
178
+ video_jsonl_file = "/path/to/metadata.jsonl"
179
+ target_frames = [1, 25, 45]
180
+ frame_extraction = "head"
181
+ cache_directory = "/path/to/cache_directory_head"
182
+ source_fps = 30.0 # optional, source fps for videos in the jsonl file
183
+ # same metadata jsonl file can be used for multiple datasets
184
+ [[datasets]]
185
+ video_jsonl_file = "/path/to/metadata.jsonl"
186
+ target_frames = [1]
187
+ frame_stride = 10
188
+ cache_directory = "/path/to/cache_directory_stride"
189
+
190
+ # other datasets can be added here. each dataset can have different configurations
191
+ ```
192
+
193
+ JSONL file format for metadata:
194
+
195
+ ```json
196
+ {"video_path": "/path/to/video1.mp4", "caption": "A caption for video1"}
197
+ {"video_path": "/path/to/video2.mp4", "caption": "A caption for video2"}
198
+ ```
199
+
200
+ `video_path` can be a directory containing multiple images.
201
+
202
+ <details>
203
+ <summary>日本語</summary>
204
+ metadata jsonl ファイルを使用する場合、caption_extension は必要ありません。また、cache_directory は必須です。
205
+
206
+ `video_path`は、複数の画像を含むディレクトリのパスでも構いません。
207
+
208
+ 他の注意事項は今までのデータセットと同様です。
209
+ </details>
210
+
211
+ ### frame_extraction Options
212
+
213
+ - `head`: Extract the first N frames from the video.
214
+ - `chunk`: Extract frames by splitting the video into chunks of N frames.
215
+ - `slide`: Extract frames from the video with a stride of `frame_stride`.
216
+ - `uniform`: Extract `frame_sample` samples uniformly from the video.
217
+ - `full`: Extract all frames from the video.
218
+
219
+ In the case of `full`, the entire video is used, but it is trimmed to "N*4+1" frames. It is also trimmed to `max_frames` if it exceeds that value. To avoid Out of Memory errors, please set `max_frames`.
220
+
221
+ The frame extraction methods other than `full` are recommended when the video contains repeated actions. `full` is recommended when each video represents a single complete motion.
222
+
223
+ For example, consider a video with 40 frames. The following diagrams illustrate each extraction:
224
+
225
+ <details>
226
+ <summary>日本語</summary>
227
+
228
+ - `head`: 動画から最初のNフレームを抽出します。
229
+ - `chunk`: 動画をNフレームずつに分割してフレームを抽出します。
230
+ - `slide`: `frame_stride`に指定したフレームごとに動画からNフレームを抽出します。
231
+ - `uniform`: 動画から一定間隔で、`frame_sample`個のNフレームを抽出します。
232
+ - `full`: 動画から全てのフレームを抽出します。
233
+
234
+ `full`の場合、各動画の全体を用いますが、「N*4+1」のフレーム数にトリミングされます。また`max_frames`を超える場合もその値にトリミングされます。Out of Memoryエラーを避けるために、`max_frames`を設定してください。
235
+
236
+ `full`以外の抽出方法は、動画が特定の動作を繰り返している場合にお勧めします。`full`はそれぞれの動画がひとつの完結したモーションの場合にお勧めします。
237
+
238
+ 例えば、40フレームの動画を例とした抽出について、以下の図で説明します。
239
+ </details>
240
+
241
+ ```
242
+ Original Video, 40 frames: x = frame, o = no frame
243
+ oooooooooooooooooooooooooooooooooooooooo
244
+
245
+ head, target_frames = [1, 13, 25] -> extract head frames:
246
+ xooooooooooooooooooooooooooooooooooooooo
247
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
248
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
249
+
250
+ chunk, target_frames = [13, 25] -> extract frames by splitting into chunks, into 13 and 25 frames:
251
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
252
+ oooooooooooooxxxxxxxxxxxxxoooooooooooooo
253
+ ooooooooooooooooooooooooooxxxxxxxxxxxxxo
254
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
255
+
256
+ NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will cause all frames to be extracted.
257
+ 注: frame_extraction "chunk" を使用する場合、target_frames に 1 を含めないでください。全てのフレームが抽出されてしまいます。
258
+
259
+ slide, target_frames = [1, 13, 25], frame_stride = 10 -> extract N frames with a stride of 10:
260
+ xooooooooooooooooooooooooooooooooooooooo
261
+ ooooooooooxooooooooooooooooooooooooooooo
262
+ ooooooooooooooooooooxooooooooooooooooooo
263
+ ooooooooooooooooooooooooooooooxooooooooo
264
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
265
+ ooooooooooxxxxxxxxxxxxxooooooooooooooooo
266
+ ooooooooooooooooooooxxxxxxxxxxxxxooooooo
267
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
268
+ ooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxooooo
269
+
270
+ uniform, target_frames =[1, 13, 25], frame_sample = 4 -> extract `frame_sample` samples uniformly, N frames each:
271
+ xooooooooooooooooooooooooooooooooooooooo
272
+ oooooooooooooxoooooooooooooooooooooooooo
273
+ oooooooooooooooooooooooooxoooooooooooooo
274
+ ooooooooooooooooooooooooooooooooooooooox
275
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
276
+ oooooooooxxxxxxxxxxxxxoooooooooooooooooo
277
+ ooooooooooooooooooxxxxxxxxxxxxxooooooooo
278
+ oooooooooooooooooooooooooooxxxxxxxxxxxxx
279
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
280
+ oooooxxxxxxxxxxxxxxxxxxxxxxxxxoooooooooo
281
+ ooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxooooo
282
+ oooooooooooooooxxxxxxxxxxxxxxxxxxxxxxxxx
283
+
284
+ Three Original Videos, 20, 25, 35 frames: x = frame, o = no frame
285
+
286
+ full, max_frames = 31 -> extract all frames (trimmed to the maximum length):
287
+ video1: xxxxxxxxxxxxxxxxx (trimmed to 17 frames)
288
+ video2: xxxxxxxxxxxxxxxxxxxxxxxxx (25 frames)
289
+ video3: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx (trimmed to 31 frames)
290
+ ```
291
+
292
+ ### Sample for Video Dataset with Control Images
293
+
294
+ The dataset with control videos is used for training ControlNet models.
295
+
296
+ The dataset configuration with caption text files is similar to the video dataset, but with an additional `control_directory` parameter.
297
+
298
+ The control video for each video is taken from `control_directory`, using the same filename as the video (the extension may differ), for example, `video_dir/video1.mp4` and `control_dir/video1.mp4` or `control_dir/video1.mov`. The control video can also be a directory without an extension, for example, `video_dir/video1.mp4` and `control_dir/video1`. A small sketch of this lookup rule follows the example below.
299
+
300
+ ```toml
301
+ [[datasets]]
302
+ video_directory = "/path/to/video_dir"
303
+ control_directory = "/path/to/control_dir" # required for dataset with control videos
304
+ cache_directory = "/path/to/cache_directory" # recommended to set cache directory
305
+ target_frames = [1, 25, 45]
306
+ frame_extraction = "head"
307
+ ```
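The lookup rule can be pictured as below. This is an illustrative sketch of the documented matching behavior; the function name is hypothetical and this is not the actual dataset code.

```python
# Sketch: locate the control counterpart of a video file under control_directory.
from pathlib import Path
from typing import Optional

def find_control_path(video_path: str, control_directory: str) -> Optional[Path]:
    stem = Path(video_path).stem  # "video1" for "video_dir/video1.mp4"
    control_dir = Path(control_directory)
    candidates = sorted(control_dir.glob(stem + ".*"))  # same filename, any extension
    if candidates:
        return candidates[0]
    as_directory = control_dir / stem  # a directory of frame images is also allowed
    return as_directory if as_directory.is_dir() else None
```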
308
+
309
+ The dataset configuration with a metadata JSONL file is the same as for the video dataset, but the metadata JSONL file must also include the control video paths. The control video path can be a directory containing multiple images.
310
+
311
+ ```json
312
+ {"video_path": "/path/to/video1.mp4", "control_path": "/path/to/control1.mp4", "caption": "A caption for video1"}
313
+ {"video_path": "/path/to/video2.mp4", "control_path": "/path/to/control2.mp4", "caption": "A caption for video2"}
314
+ ```
315
+
316
+ <details>
317
+ <summary>日本語</summary>
318
+ 制御動画を持つデータセットです。ControlNetモデルの学習に使用します。
319
+
320
+ キャプションを用いる場合のデータセット設定は動画データセットと似ていますが、`control_directory`パラメータが追加されています。上にある例を参照してください。ある動画に対する制御用動画として、動画と同じファイル名(または拡張子のみが異なるファイル名)の、`control_directory`にある動画が使用されます(例:`video_dir/video1.mp4`と`control_dir/video1.mp4`または`control_dir/video1.mov`)。また、拡張子なしのディレクトリ内の、複数枚の画像を制御用動画として使用することもできます(例:`video_dir/video1.mp4`と`control_dir/video1`)。
321
+
322
+ データセット設定でメタデータJSONLファイルを使用する場合は、動画と制御用動画のパスを含める必要があります。制御用動画のパスは、複数枚の画像を含むディレクトリのパスでも構いません。
323
+ </details>
324
+
325
+ ## Specifications
326
+
327
+ ```toml
328
+ # general configurations
329
+ [general]
330
+ resolution = [960, 544] # optional, [W, H], default is [960, 544]. This is the default resolution for all datasets
331
+ caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
332
+ batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
333
+ num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
334
+ enable_bucket = true # optional, default is false. Enable bucketing for datasets
335
+ bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false
336
+
337
+ ### Image Dataset
338
+
339
+ # sample image dataset with caption text files
340
+ [[datasets]]
341
+ image_directory = "/path/to/image_dir"
342
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
343
+ resolution = [960, 544] # required if general resolution is not set
344
+ batch_size = 4 # optional, overwrite the default batch size
345
+ num_repeats = 1 # optional, overwrite the default num_repeats
346
+ enable_bucket = false # optional, overwrite the default bucketing setting
347
+ bucket_no_upscale = true # optional, overwrite the default bucketing setting
348
+ cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
349
+
350
+ # sample image dataset with metadata **jsonl** file
351
+ [[datasets]]
352
+ image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and captions
353
+ resolution = [960, 544] # required if general resolution is not set
354
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
355
+ # caption_extension is not required for metadata jsonl file
356
+ # batch_size, num_repeats, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
357
+
358
+ ### Video Dataset
359
+
360
+ # sample video dataset with caption text files
361
+ [[datasets]]
362
+ video_directory = "/path/to/video_dir"
363
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
364
+ resolution = [960, 544] # required if general resolution is not set
365
+
366
+ control_directory = "/path/to/control_dir" # optional, required for dataset with control images
367
+
368
+ # following configurations must be set in each [[datasets]] section for video datasets
369
+
370
+ target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
371
+
372
+ # NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will cause all frames to be extracted.
373
+
374
+ frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
375
+ frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
376
+ frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
377
+ max_frames = 129 # optional, default is 129. Maximum number of frames to extract, available for "full" frame extraction
378
+ # batch_size, num_repeats, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
379
+
380
+ # sample video dataset with metadata jsonl file
381
+ [[datasets]]
382
+ video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
383
+
384
+ target_frames = [1, 79]
385
+
386
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
387
+ # frame_extraction, frame_stride, frame_sample, max_frames are also available for metadata jsonl file
388
+ ```
389
+
390
+ <!--
391
+ # sample image dataset with lance
392
+ [[datasets]]
393
+ image_lance_dataset = "/path/to/lance_dataset"
394
+ resolution = [960, 544] # required if general resolution is not set
395
+ # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
396
+ -->
397
+
398
+ Metadata in a .json file will be supported in the near future.
399
+
400
+
401
+
402
+ <!--
403
+
404
+ ```toml
405
+ # general configurations
406
+ [general]
407
+ resolution = [960, 544] # optional, [W, H], default is None. This is the default resolution for all datasets
408
+ caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
409
+ batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
410
+ enable_bucket = true # optional, default is false. Enable bucketing for datasets
411
+ bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false
412
+
413
+ # sample image dataset with caption text files
414
+ [[datasets]]
415
+ image_directory = "/path/to/image_dir"
416
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
417
+ resolution = [960, 544] # required if general resolution is not set
418
+ batch_size = 4 # optional, overwrite the default batch size
419
+ enable_bucket = false # optional, overwrite the default bucketing setting
420
+ bucket_no_upscale = true # optional, overwrite the default bucketing setting
421
+ cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
422
+
423
+ # sample image dataset with metadata **jsonl** file
424
+ [[datasets]]
425
+ image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and captions
426
+ resolution = [960, 544] # required if general resolution is not set
427
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
428
+ # caption_extension is not required for metadata jsonl file
429
+ # batch_size, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
430
+
431
+ # sample video dataset with caption text files
432
+ [[datasets]]
433
+ video_directory = "/path/to/video_dir"
434
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
435
+ resolution = [960, 544] # required if general resolution is not set
436
+ target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
437
+ frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
438
+ frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
439
+ frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
440
+ # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
441
+
442
+ # sample video dataset with metadata jsonl file
443
+ [[datasets]]
444
+ video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
445
+ target_frames = [1, 79]
446
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
447
+ # frame_extraction, frame_stride, frame_sample are also available for metadata jsonl file
448
+ ```
449
+
450
+ # sample image dataset with lance
451
+ [[datasets]]
452
+ image_lance_dataset = "/path/to/lance_dataset"
453
+ resolution = [960, 544] # required if general resolution is not set
454
+ # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
455
+
456
+ The metadata with .json file will be supported in the near future.
457
+
458
+
459
+
460
+
461
+ -->
dataset/image_video_dataset.py ADDED
@@ -0,0 +1,1726 @@
1
+ from concurrent.futures import ThreadPoolExecutor
2
+ import glob
3
+ import json
4
+ import math
5
+ import os
6
+ import random
7
+ import time
8
+ from typing import Optional, Sequence, Tuple, Union
9
+
10
+ import numpy as np
11
+ import torch
12
+ from safetensors.torch import save_file, load_file
13
+ from safetensors import safe_open
14
+ from PIL import Image
15
+ import cv2
16
+ import av
17
+
18
+ from utils import safetensors_utils
19
+ from utils.model_utils import dtype_to_str
20
+
21
+ import logging
22
+
23
+ logger = logging.getLogger(__name__)
24
+ logging.basicConfig(level=logging.INFO)
25
+
26
+
27
+ IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".PNG", ".JPG", ".JPEG", ".WEBP", ".BMP"]
28
+
29
+ try:
30
+ import pillow_avif
31
+
32
+ IMAGE_EXTENSIONS.extend([".avif", ".AVIF"])
33
+ except Exception:
34
+ pass
35
+
36
+ # JPEG-XL on Linux
37
+ try:
38
+ from jxlpy import JXLImagePlugin
39
+
40
+ IMAGE_EXTENSIONS.extend([".jxl", ".JXL"])
41
+ except Exception:
42
+ pass
43
+
44
+ # JPEG-XL on Windows
45
+ try:
46
+ import pillow_jxl
47
+
48
+ IMAGE_EXTENSIONS.extend([".jxl", ".JXL"])
49
+ except Exception:
50
+ pass
51
+
52
+ VIDEO_EXTENSIONS = [
53
+ ".mp4",
54
+ ".webm",
55
+ ".avi",
56
+ ".mkv",
57
+ ".mov",
58
+ ".flv",
59
+ ".wmv",
60
+ ".m4v",
61
+ ".mpg",
62
+ ".mpeg",
63
+ ".MP4",
64
+ ".WEBM",
65
+ ".AVI",
66
+ ".MKV",
67
+ ".MOV",
68
+ ".FLV",
69
+ ".WMV",
70
+ ".M4V",
71
+ ".MPG",
72
+ ".MPEG",
73
+ ] # some of them are not tested
74
+
75
+ ARCHITECTURE_HUNYUAN_VIDEO = "hv"
76
+ ARCHITECTURE_HUNYUAN_VIDEO_FULL = "hunyuan_video"
77
+ ARCHITECTURE_WAN = "wan"
78
+ ARCHITECTURE_WAN_FULL = "wan"
79
+ ARCHITECTURE_FRAMEPACK = "fp"
80
+ ARCHITECTURE_FRAMEPACK_FULL = "framepack"
81
+
82
+
83
+ def glob_images(directory, base="*"):
84
+ img_paths = []
85
+ for ext in IMAGE_EXTENSIONS:
86
+ if base == "*":
87
+ img_paths.extend(glob.glob(os.path.join(glob.escape(directory), base + ext)))
88
+ else:
89
+ img_paths.extend(glob.glob(glob.escape(os.path.join(directory, base + ext))))
90
+ img_paths = list(set(img_paths)) # remove duplicates
91
+ img_paths.sort()
92
+ return img_paths
93
+
94
+
95
+ def glob_videos(directory, base="*"):
96
+ video_paths = []
97
+ for ext in VIDEO_EXTENSIONS:
98
+ if base == "*":
99
+ video_paths.extend(glob.glob(os.path.join(glob.escape(directory), base + ext)))
100
+ else:
101
+ video_paths.extend(glob.glob(glob.escape(os.path.join(directory, base + ext))))
102
+ video_paths = list(set(video_paths)) # remove duplicates
103
+ video_paths.sort()
104
+ return video_paths
105
+
106
+
107
+ def divisible_by(num: int, divisor: int) -> int:
108
+ return num - num % divisor
109
+
110
+
111
+ def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: tuple[int, int]) -> np.ndarray:
112
+ """
113
+ Resize the image to the bucket resolution.
114
+
115
+ bucket_reso: **(width, height)**
116
+ """
117
+ is_pil_image = isinstance(image, Image.Image)
118
+ if is_pil_image:
119
+ image_width, image_height = image.size
120
+ else:
121
+ image_height, image_width = image.shape[:2]
122
+
123
+ if bucket_reso == (image_width, image_height):
124
+ return np.array(image) if is_pil_image else image
125
+
126
+ bucket_width, bucket_height = bucket_reso
127
+ if bucket_width == image_width or bucket_height == image_height:
128
+ image = np.array(image) if is_pil_image else image
129
+ else:
130
+ # resize the image so that it covers the bucket resolution (scale to match on the relatively shorter side)
131
+ scale_width = bucket_width / image_width
132
+ scale_height = bucket_height / image_height
133
+ scale = max(scale_width, scale_height)
134
+ image_width = int(image_width * scale + 0.5)
135
+ image_height = int(image_height * scale + 0.5)
136
+
137
+ if scale > 1:
138
+ image = Image.fromarray(image) if not is_pil_image else image
139
+ image = image.resize((image_width, image_height), Image.LANCZOS)
140
+ image = np.array(image)
141
+ else:
142
+ image = np.array(image) if is_pil_image else image
143
+ image = cv2.resize(image, (image_width, image_height), interpolation=cv2.INTER_AREA)
144
+
145
+ # crop the image to the bucket resolution
146
+ crop_left = (image_width - bucket_width) // 2
147
+ crop_top = (image_height - bucket_height) // 2
148
+ image = image[crop_top : crop_top + bucket_height, crop_left : crop_left + bucket_width]
149
+ return image
150
+
151
+
152
+ class ItemInfo:
153
+ def __init__(
154
+ self,
155
+ item_key: str,
156
+ caption: str,
157
+ original_size: tuple[int, int],
158
+ bucket_size: Optional[Union[tuple[int, int], tuple[int, int, int]]] = None,
159
+ frame_count: Optional[int] = None,
160
+ content: Optional[np.ndarray] = None,
161
+ latent_cache_path: Optional[str] = None,
162
+ ) -> None:
163
+ self.item_key = item_key
164
+ self.caption = caption
165
+ self.original_size = original_size
166
+ self.bucket_size = bucket_size
167
+ self.frame_count = frame_count
168
+ self.content = content
169
+ self.latent_cache_path = latent_cache_path
170
+ self.text_encoder_output_cache_path: Optional[str] = None
171
+ self.control_content: Optional[np.ndarray] = None
172
+
173
+ def __str__(self) -> str:
174
+ return (
175
+ f"ItemInfo(item_key={self.item_key}, caption={self.caption}, "
176
+ + f"original_size={self.original_size}, bucket_size={self.bucket_size}, "
177
+ + f"frame_count={self.frame_count}, latent_cache_path={self.latent_cache_path}, content={self.content.shape if self.content is not None else None})"
178
+ )
179
+
180
+
181
+ # We use a simple if-else approach to support multiple architectures.
182
+ # Maybe we can use a plugin system in the future.
183
+
184
+ # the keys of the dict are `<content_type>_FxHxW_<dtype>` for latents
185
+ # and `<content_type>_<dtype|mask>` for other tensors
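+ # For example (shapes are illustrative):
+ #   latents_9x60x104_<dtype>  -> latent tensor with F=9, H=60, W=104
+ #   llm_<dtype>, llm_mask     -> HunyuanVideo LLM text embedding and its attention mask
+ #   varlen_t5_<dtype>         -> Wan2.1 T5 text encoder embedding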
186
+
187
+
188
+ def save_latent_cache(item_info: ItemInfo, latent: torch.Tensor):
189
+ """HunyuanVideo architecture only. HunyuanVideo doesn't support I2V and control latents"""
190
+ assert latent.dim() == 4, "latent should be 4D tensor (channel, frame, height, width)"
191
+
192
+ _, F, H, W = latent.shape
193
+ dtype_str = dtype_to_str(latent.dtype)
194
+ sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu()}
195
+
196
+ save_latent_cache_common(item_info, sd, ARCHITECTURE_HUNYUAN_VIDEO_FULL)
197
+
198
+
199
+ def save_latent_cache_wan(
200
+ item_info: ItemInfo,
201
+ latent: torch.Tensor,
202
+ clip_embed: Optional[torch.Tensor],
203
+ image_latent: Optional[torch.Tensor],
204
+ control_latent: Optional[torch.Tensor],
205
+ ):
206
+ """Wan architecture only"""
207
+ assert latent.dim() == 4, "latent should be 4D tensor (channel, frame, height, width)"
208
+
209
+ _, F, H, W = latent.shape
210
+ dtype_str = dtype_to_str(latent.dtype)
211
+ sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu()}
212
+
213
+ if clip_embed is not None:
214
+ sd[f"clip_{dtype_str}"] = clip_embed.detach().cpu()
215
+
216
+ if image_latent is not None:
217
+ sd[f"latents_image_{F}x{H}x{W}_{dtype_str}"] = image_latent.detach().cpu()
218
+
219
+ if control_latent is not None:
220
+ sd[f"latents_control_{F}x{H}x{W}_{dtype_str}"] = control_latent.detach().cpu()
221
+
222
+ save_latent_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
223
+
224
+
225
+ def save_latent_cache_framepack(
226
+ item_info: ItemInfo,
227
+ latent: torch.Tensor,
228
+ latent_indices: torch.Tensor,
229
+ clean_latents: torch.Tensor,
230
+ clean_latent_indices: torch.Tensor,
231
+ clean_latents_2x: torch.Tensor,
232
+ clean_latent_2x_indices: torch.Tensor,
233
+ clean_latents_4x: torch.Tensor,
234
+ clean_latent_4x_indices: torch.Tensor,
235
+ image_embeddings: torch.Tensor,
236
+ ):
237
+ """FramePack architecture only"""
238
+ assert latent.dim() == 4, "latent should be 4D tensor (channel, frame, height, width)"
239
+
240
+ _, F, H, W = latent.shape
241
+ dtype_str = dtype_to_str(latent.dtype)
242
+ sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu().contiguous()}
243
+
244
+ # `latents_xxx` must have {F, H, W} suffix
245
+ indices_dtype_str = dtype_to_str(latent_indices.dtype)
246
+ sd[f"image_embeddings_{dtype_str}"] = image_embeddings.detach().cpu() # image embeddings dtype is same as latents dtype
247
+ sd[f"latent_indices_{indices_dtype_str}"] = latent_indices.detach().cpu()
248
+ sd[f"clean_latent_indices_{indices_dtype_str}"] = clean_latent_indices.detach().cpu()
249
+ sd[f"clean_latent_2x_indices_{indices_dtype_str}"] = clean_latent_2x_indices.detach().cpu()
250
+ sd[f"clean_latent_4x_indices_{indices_dtype_str}"] = clean_latent_4x_indices.detach().cpu()
251
+ sd[f"latents_clean_{F}x{H}x{W}_{dtype_str}"] = clean_latents.detach().cpu().contiguous()
252
+ sd[f"latents_clean_2x_{F}x{H}x{W}_{dtype_str}"] = clean_latents_2x.detach().cpu().contiguous()
253
+ sd[f"latents_clean_4x_{F}x{H}x{W}_{dtype_str}"] = clean_latents_4x.detach().cpu().contiguous()
254
+
255
+ # for key, value in sd.items():
256
+ # print(f"{key}: {value.shape}")
257
+ save_latent_cache_common(item_info, sd, ARCHITECTURE_FRAMEPACK_FULL)
258
+
259
+
260
+ def save_latent_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
261
+ metadata = {
262
+ "architecture": arch_fullname,
263
+ "width": f"{item_info.original_size[0]}",
264
+ "height": f"{item_info.original_size[1]}",
265
+ "format_version": "1.0.1",
266
+ }
267
+ if item_info.frame_count is not None:
268
+ metadata["frame_count"] = f"{item_info.frame_count}"
269
+
270
+ for key, value in sd.items():
271
+ # NaN check and show warning, replace NaN with 0
272
+ if torch.isnan(value).any():
273
+ logger.warning(f"{key} tensor has NaN: {item_info.item_key}, replace NaN with 0")
274
+ value[torch.isnan(value)] = 0
275
+
276
+ latent_dir = os.path.dirname(item_info.latent_cache_path)
277
+ os.makedirs(latent_dir, exist_ok=True)
278
+
279
+ save_file(sd, item_info.latent_cache_path, metadata=metadata)
280
+
281
+
282
+ def save_text_encoder_output_cache(item_info: ItemInfo, embed: torch.Tensor, mask: Optional[torch.Tensor], is_llm: bool):
283
+ """HunyuanVideo architecture only"""
284
+ assert (
285
+ embed.dim() == 1 or embed.dim() == 2
286
+ ), f"embed should be a 1D tensor (hidden_size,) or a 2D tensor (feature, hidden_size), got {embed.shape}"
287
+ assert mask is None or mask.dim() == 1, f"mask should be 1D tensor (feature), got {mask.shape}"
288
+
289
+ sd = {}
290
+ dtype_str = dtype_to_str(embed.dtype)
291
+ text_encoder_type = "llm" if is_llm else "clipL"
292
+ sd[f"{text_encoder_type}_{dtype_str}"] = embed.detach().cpu()
293
+ if mask is not None:
294
+ sd[f"{text_encoder_type}_mask"] = mask.detach().cpu()
295
+
296
+ save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_HUNYUAN_VIDEO_FULL)
297
+
298
+
299
+ def save_text_encoder_output_cache_wan(item_info: ItemInfo, embed: torch.Tensor):
300
+ """Wan architecture only. Wan2.1 only has a single text encoder"""
301
+
302
+ sd = {}
303
+ dtype_str = dtype_to_str(embed.dtype)
304
+ text_encoder_type = "t5"
305
+ sd[f"varlen_{text_encoder_type}_{dtype_str}"] = embed.detach().cpu()
306
+
307
+ save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
308
+
309
+
310
+ def save_text_encoder_output_cache_framepack(
311
+ item_info: ItemInfo, llama_vec: torch.Tensor, llama_attention_mask: torch.Tensor, clip_l_pooler: torch.Tensor
312
+ ):
313
+ """FramePack architecture only."""
314
+ sd = {}
315
+ dtype_str = dtype_to_str(llama_vec.dtype)
316
+ sd[f"llama_vec_{dtype_str}"] = llama_vec.detach().cpu()
317
+ sd["llama_attention_mask"] = llama_attention_mask.detach().cpu()
318
+ dtype_str = dtype_to_str(clip_l_pooler.dtype)
319
+ sd[f"clip_l_pooler_{dtype_str}"] = clip_l_pooler.detach().cpu()
320
+
321
+ save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_FRAMEPACK_FULL)
322
+
323
+
324
+ def save_text_encoder_output_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
325
+ for key, value in sd.items():
326
+ # NaN check and show warning, replace NaN with 0
327
+ if torch.isnan(value).any():
328
+ logger.warning(f"{key} tensor has NaN: {item_info.item_key}, replace NaN with 0")
329
+ value[torch.isnan(value)] = 0
330
+
331
+ metadata = {
332
+ "architecture": arch_fullname,
333
+ "caption1": item_info.caption,
334
+ "format_version": "1.0.1",
335
+ }
336
+
337
+ if os.path.exists(item_info.text_encoder_output_cache_path):
338
+ # load existing cache and update metadata
339
+ with safetensors_utils.MemoryEfficientSafeOpen(item_info.text_encoder_output_cache_path) as f:
340
+ existing_metadata = f.metadata()
341
+ for key in f.keys():
342
+ if key not in sd: # avoid overwriting by existing cache, we keep the new one
343
+ sd[key] = f.get_tensor(key)
344
+
345
+ assert existing_metadata["architecture"] == metadata["architecture"], "architecture mismatch"
346
+ if existing_metadata["caption1"] != metadata["caption1"]:
347
+ logger.warning(f"caption mismatch: existing={existing_metadata['caption1']}, new={metadata['caption1']}, overwrite")
348
+ # TODO verify format_version
349
+
350
+ existing_metadata.pop("caption1", None)
351
+ existing_metadata.pop("format_version", None)
352
+ metadata.update(existing_metadata) # copy existing metadata except caption and format_version
353
+ else:
354
+ text_encoder_output_dir = os.path.dirname(item_info.text_encoder_output_cache_path)
355
+ os.makedirs(text_encoder_output_dir, exist_ok=True)
356
+
357
+ safetensors_utils.mem_eff_save_file(sd, item_info.text_encoder_output_cache_path, metadata=metadata)
358
+
359
+
360
+ class BucketSelector:
361
+ RESOLUTION_STEPS_HUNYUAN = 16
362
+ RESOLUTION_STEPS_WAN = 16
363
+ RESOLUTION_STEPS_FRAMEPACK = 16
364
+
365
+ def __init__(
366
+ self, resolution: Tuple[int, int], enable_bucket: bool = True, no_upscale: bool = False, architecture: str = "no_default"
367
+ ):
368
+ self.resolution = resolution
369
+ self.bucket_area = resolution[0] * resolution[1]
370
+ self.architecture = architecture
371
+
372
+ if self.architecture == ARCHITECTURE_HUNYUAN_VIDEO:
373
+ self.reso_steps = BucketSelector.RESOLUTION_STEPS_HUNYUAN
374
+ elif self.architecture == ARCHITECTURE_WAN:
375
+ self.reso_steps = BucketSelector.RESOLUTION_STEPS_WAN
376
+ elif self.architecture == ARCHITECTURE_FRAMEPACK:
377
+ self.reso_steps = BucketSelector.RESOLUTION_STEPS_FRAMEPACK
378
+ else:
379
+ raise ValueError(f"Invalid architecture: {self.architecture}")
380
+
381
+ if not enable_bucket:
382
+ # only define one bucket
383
+ self.bucket_resolutions = [resolution]
384
+ self.no_upscale = False
385
+ else:
386
+ # prepare bucket resolution
387
+ self.no_upscale = no_upscale
388
+ sqrt_size = int(math.sqrt(self.bucket_area))
389
+ min_size = divisible_by(sqrt_size // 2, self.reso_steps)
390
+ self.bucket_resolutions = []
391
+ for w in range(min_size, sqrt_size + self.reso_steps, self.reso_steps):
392
+ h = divisible_by(self.bucket_area // w, self.reso_steps)
393
+ self.bucket_resolutions.append((w, h))
394
+ self.bucket_resolutions.append((h, w))
395
+
396
+ self.bucket_resolutions = list(set(self.bucket_resolutions))
397
+ self.bucket_resolutions.sort()
398
+
399
+ # calculate aspect ratio to find the nearest resolution
400
+ self.aspect_ratios = np.array([w / h for w, h in self.bucket_resolutions])
401
+
402
+ def get_bucket_resolution(self, image_size: tuple[int, int]) -> tuple[int, int]:
403
+ """
404
+ return the bucket resolution for the given image size, (width, height)
405
+ """
406
+ area = image_size[0] * image_size[1]
407
+ if self.no_upscale and area <= self.bucket_area:
408
+ w, h = image_size
409
+ w = divisible_by(w, self.reso_steps)
410
+ h = divisible_by(h, self.reso_steps)
411
+ return w, h
412
+
413
+ aspect_ratio = image_size[0] / image_size[1]
414
+ ar_errors = self.aspect_ratios - aspect_ratio
415
+ bucket_id = np.abs(ar_errors).argmin()
416
+ return self.bucket_resolutions[bucket_id]
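+ # Usage example (illustrative): with the default resolution (960, 544) and bucketing enabled,
+ # BucketSelector((960, 544), architecture=ARCHITECTURE_HUNYUAN_VIDEO).get_bucket_resolution((1920, 1080))
+ # returns the bucket whose aspect ratio is closest to 16:9, which is (960, 544) here.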
417
+
418
+
419
+ def load_video(
420
+ video_path: str,
421
+ start_frame: Optional[int] = None,
422
+ end_frame: Optional[int] = None,
423
+ bucket_selector: Optional[BucketSelector] = None,
424
+ bucket_reso: Optional[tuple[int, int]] = None,
425
+ source_fps: Optional[float] = None,
426
+ target_fps: Optional[float] = None,
427
+ ) -> list[np.ndarray]:
428
+ """
429
+ bucket_reso: if given, resize the video to the bucket resolution, (width, height)
430
+ """
431
+ if source_fps is None or target_fps is None:
432
+ if os.path.isfile(video_path):
433
+ container = av.open(video_path)
434
+ video = []
435
+ for i, frame in enumerate(container.decode(video=0)):
436
+ if start_frame is not None and i < start_frame:
437
+ continue
438
+ if end_frame is not None and i >= end_frame:
439
+ break
440
+ frame = frame.to_image()
441
+
442
+ if bucket_selector is not None and bucket_reso is None:
443
+ bucket_reso = bucket_selector.get_bucket_resolution(frame.size) # calc resolution from first frame
444
+
445
+ if bucket_reso is not None:
446
+ frame = resize_image_to_bucket(frame, bucket_reso)
447
+ else:
448
+ frame = np.array(frame)
449
+
450
+ video.append(frame)
451
+ container.close()
452
+ else:
453
+ # load images in the directory
454
+ image_files = glob_images(video_path)
455
+ image_files.sort()
456
+ video = []
457
+ for i in range(len(image_files)):
458
+ if start_frame is not None and i < start_frame:
459
+ continue
460
+ if end_frame is not None and i >= end_frame:
461
+ break
462
+
463
+ image_file = image_files[i]
464
+ image = Image.open(image_file).convert("RGB")
465
+
466
+ if bucket_selector is not None and bucket_reso is None:
467
+ bucket_reso = bucket_selector.get_bucket_resolution(image.size) # calc resolution from first frame
468
+ image = np.array(image)
469
+ if bucket_reso is not None:
470
+ image = resize_image_to_bucket(image, bucket_reso)
471
+
472
+ video.append(image)
473
+ else:
474
+ # drop frames to match the target fps. TODO: merge this with the branch above once this approach is verified
475
+ frame_index_delta = target_fps / source_fps # example: 16 / 30 = 0.5333
476
+ if os.path.isfile(video_path):
477
+ container = av.open(video_path)
478
+ video = []
479
+ frame_index_with_fraction = 0.0
480
+ previous_frame_index = -1
481
+ for i, frame in enumerate(container.decode(video=0)):
482
+ target_frame_index = int(frame_index_with_fraction)
483
+ frame_index_with_fraction += frame_index_delta
484
+
485
+ if target_frame_index == previous_frame_index: # drop this frame
486
+ continue
487
+
488
+ # accept this frame
489
+ previous_frame_index = target_frame_index
490
+
491
+ if start_frame is not None and target_frame_index < start_frame:
492
+ continue
493
+ if end_frame is not None and target_frame_index >= end_frame:
494
+ break
495
+ frame = frame.to_image()
496
+
497
+ if bucket_selector is not None and bucket_reso is None:
498
+ bucket_reso = bucket_selector.get_bucket_resolution(frame.size) # calc resolution from first frame
499
+
500
+ if bucket_reso is not None:
501
+ frame = resize_image_to_bucket(frame, bucket_reso)
502
+ else:
503
+ frame = np.array(frame)
504
+
505
+ video.append(frame)
506
+ container.close()
507
+ else:
508
+ # load images in the directory
509
+ image_files = glob_images(video_path)
510
+ image_files.sort()
511
+ video = []
512
+ frame_index_with_fraction = 0.0
513
+ previous_frame_index = -1
514
+ for i in range(len(image_files)):
515
+ target_frame_index = int(frame_index_with_fraction)
516
+ frame_index_with_fraction += frame_index_delta
517
+
518
+ if target_frame_index == previous_frame_index: # drop this frame
519
+ continue
520
+
521
+ # accept this frame
522
+ previous_frame_index = target_frame_index
523
+
524
+ if start_frame is not None and target_frame_index < start_frame:
525
+ continue
526
+ if end_frame is not None and target_frame_index >= end_frame:
527
+ break
528
+
529
+ image_file = image_files[i]
530
+ image = Image.open(image_file).convert("RGB")
531
+
532
+ if bucket_selector is not None and bucket_reso is None:
533
+ bucket_reso = bucket_selector.get_bucket_resolution(image.size) # calc resolution from first frame
534
+ image = np.array(image)
535
+ if bucket_reso is not None:
536
+ image = resize_image_to_bucket(image, bucket_reso)
537
+
538
+ video.append(image)
539
+
540
+ return video
541
+
542
+
543
+ class BucketBatchManager:
544
+
545
+ def __init__(self, bucketed_item_info: dict[Union[tuple[int, int], tuple[int, int, int]], list[ItemInfo]], batch_size: int):
546
+ self.batch_size = batch_size
547
+ self.buckets = bucketed_item_info
548
+ self.bucket_resos = list(self.buckets.keys())
549
+ self.bucket_resos.sort()
550
+
551
+ # indices for enumerating batches. each batch is reso + batch_idx. reso is (width, height) or (width, height, frames)
552
+ self.bucket_batch_indices: list[tuple[Union[tuple[int, int], tuple[int, int, int], int]]] = []
553
+ for bucket_reso in self.bucket_resos:
554
+ bucket = self.buckets[bucket_reso]
555
+ num_batches = math.ceil(len(bucket) / self.batch_size)
556
+ for i in range(num_batches):
557
+ self.bucket_batch_indices.append((bucket_reso, i))
558
+
559
+ self.shuffle()
560
+
561
+ def show_bucket_info(self):
562
+ for bucket_reso in self.bucket_resos:
563
+ bucket = self.buckets[bucket_reso]
564
+ logger.info(f"bucket: {bucket_reso}, count: {len(bucket)}")
565
+
566
+ logger.info(f"total batches: {len(self)}")
567
+
568
+ def shuffle(self):
569
+ # shuffle each bucket
570
+ for bucket in self.buckets.values():
571
+ random.shuffle(bucket)
572
+
573
+ # shuffle the order of batches
574
+ random.shuffle(self.bucket_batch_indices)
575
+
576
+ def __len__(self):
577
+ return len(self.bucket_batch_indices)
578
+
579
+ def __getitem__(self, idx):
580
+ bucket_reso, batch_idx = self.bucket_batch_indices[idx]
581
+ bucket = self.buckets[bucket_reso]
582
+ start = batch_idx * self.batch_size
583
+ end = min(start + self.batch_size, len(bucket))
584
+
585
+ batch_tensor_data = {}
586
+ varlen_keys = set()
587
+ for item_info in bucket[start:end]:
588
+ sd_latent = load_file(item_info.latent_cache_path)
589
+ sd_te = load_file(item_info.text_encoder_output_cache_path)
590
+ sd = {**sd_latent, **sd_te}
591
+
592
+ # TODO refactor this
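+ # cache keys are "<name>_<dtype>" (latents additionally carry a size token, e.g. "latents_FxHxW_<dtype>");
+ # keys ending in "_mask" keep their name, and a "varlen_" prefix marks variable-length tensors that are not stacked.
+ # the loop below strips these prefixes/suffixes so tensors are grouped under a common content key.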
593
+ for key in sd.keys():
594
+ is_varlen_key = key.startswith("varlen_") # varlen keys are not stacked
595
+ content_key = key
596
+
597
+ if is_varlen_key:
598
+ content_key = content_key.replace("varlen_", "")
599
+
600
+ if content_key.endswith("_mask"):
601
+ pass
602
+ else:
603
+ content_key = content_key.rsplit("_", 1)[0] # remove dtype
604
+ if content_key.startswith("latents_"):
605
+ content_key = content_key.rsplit("_", 1)[0] # remove FxHxW
606
+
607
+ if content_key not in batch_tensor_data:
608
+ batch_tensor_data[content_key] = []
609
+ batch_tensor_data[content_key].append(sd[key])
610
+
611
+ if is_varlen_key:
612
+ varlen_keys.add(content_key)
613
+
614
+ for key in batch_tensor_data.keys():
615
+ if key not in varlen_keys:
616
+ batch_tensor_data[key] = torch.stack(batch_tensor_data[key])
617
+
618
+ return batch_tensor_data
619
+
620
+
621
+ class ContentDatasource:
622
+ def __init__(self):
623
+ self.caption_only = False # set to True to only fetch caption for Text Encoder caching
624
+ self.has_control = False
625
+
626
+ def set_caption_only(self, caption_only: bool):
627
+ self.caption_only = caption_only
628
+
629
+ def is_indexable(self):
630
+ return False
631
+
632
+ def get_caption(self, idx: int) -> tuple[str, str]:
633
+ """
634
+ Returns caption. May not be called if is_indexable() returns False.
635
+ """
636
+ raise NotImplementedError
637
+
638
+ def __len__(self):
639
+ raise NotImplementedError
640
+
641
+ def __iter__(self):
642
+ raise NotImplementedError
643
+
644
+ def __next__(self):
645
+ raise NotImplementedError
646
+
647
+
648
+ class ImageDatasource(ContentDatasource):
649
+ def __init__(self):
650
+ super().__init__()
651
+
652
+ def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
653
+ """
654
+ Returns image data as a tuple of image path, image, and caption for the given index.
655
+ Key must be unique and valid as a file name.
656
+ May not be called if is_indexable() returns False.
657
+ """
658
+ raise NotImplementedError
659
+
660
+
661
+ class ImageDirectoryDatasource(ImageDatasource):
662
+ def __init__(self, image_directory: str, caption_extension: Optional[str] = None):
663
+ super().__init__()
664
+ self.image_directory = image_directory
665
+ self.caption_extension = caption_extension
666
+ self.current_idx = 0
667
+
668
+ # glob images
669
+ logger.info(f"glob images in {self.image_directory}")
670
+ self.image_paths = glob_images(self.image_directory)
671
+ logger.info(f"found {len(self.image_paths)} images")
672
+
673
+ def is_indexable(self):
674
+ return True
675
+
676
+ def __len__(self):
677
+ return len(self.image_paths)
678
+
679
+ def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
680
+ image_path = self.image_paths[idx]
681
+ image = Image.open(image_path).convert("RGB")
682
+
683
+ _, caption = self.get_caption(idx)
684
+
685
+ return image_path, image, caption
686
+
687
+ def get_caption(self, idx: int) -> tuple[str, str]:
688
+ image_path = self.image_paths[idx]
689
+ caption_path = os.path.splitext(image_path)[0] + self.caption_extension if self.caption_extension else ""
690
+ with open(caption_path, "r", encoding="utf-8") as f:
691
+ caption = f.read().strip()
692
+ return image_path, caption
693
+
694
+ def __iter__(self):
695
+ self.current_idx = 0
696
+ return self
697
+
698
+ def __next__(self) -> callable:
699
+ """
700
+ Returns a fetcher function that returns image data.
701
+ """
702
+ if self.current_idx >= len(self.image_paths):
703
+ raise StopIteration
704
+
705
+ if self.caption_only:
706
+
707
+ def create_caption_fetcher(index):
708
+ return lambda: self.get_caption(index)
709
+
710
+ fetcher = create_caption_fetcher(self.current_idx)
711
+ else:
712
+
713
+ def create_image_fetcher(index):
714
+ return lambda: self.get_image_data(index)
715
+
716
+ fetcher = create_image_fetcher(self.current_idx)
717
+
718
+ self.current_idx += 1
719
+ return fetcher
720
+
721
+
722
+ class ImageJsonlDatasource(ImageDatasource):
723
+ def __init__(self, image_jsonl_file: str):
724
+ super().__init__()
725
+ self.image_jsonl_file = image_jsonl_file
726
+ self.current_idx = 0
727
+
728
+ # load jsonl
729
+ logger.info(f"load image jsonl from {self.image_jsonl_file}")
730
+ self.data = []
731
+ with open(self.image_jsonl_file, "r", encoding="utf-8") as f:
732
+ for line in f:
733
+ try:
734
+ data = json.loads(line)
735
+ except json.JSONDecodeError:
736
+ logger.error(f"failed to load json: {line} @ {self.image_jsonl_file}")
737
+ raise
738
+ self.data.append(data)
739
+ logger.info(f"loaded {len(self.data)} images")
740
+
741
+ def is_indexable(self):
742
+ return True
743
+
744
+ def __len__(self):
745
+ return len(self.data)
746
+
747
+ def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
748
+ data = self.data[idx]
749
+ image_path = data["image_path"]
750
+ image = Image.open(image_path).convert("RGB")
751
+
752
+ caption = data["caption"]
753
+
754
+ return image_path, image, caption
755
+
756
+ def get_caption(self, idx: int) -> tuple[str, str]:
757
+ data = self.data[idx]
758
+ image_path = data["image_path"]
759
+ caption = data["caption"]
760
+ return image_path, caption
761
+
762
+ def __iter__(self):
763
+ self.current_idx = 0
764
+ return self
765
+
766
+ def __next__(self) -> callable:
767
+ if self.current_idx >= len(self.data):
768
+ raise StopIteration
769
+
770
+ if self.caption_only:
771
+
772
+ def create_caption_fetcher(index):
773
+ return lambda: self.get_caption(index)
774
+
775
+ fetcher = create_caption_fetcher(self.current_idx)
776
+
777
+ else:
778
+
779
+ def create_fetcher(index):
780
+ return lambda: self.get_image_data(index)
781
+
782
+ fetcher = create_fetcher(self.current_idx)
783
+
784
+ self.current_idx += 1
785
+ return fetcher
786
+
787
+
788
+ class VideoDatasource(ContentDatasource):
789
+ def __init__(self):
790
+ super().__init__()
791
+
792
+ # None means all frames
793
+ self.start_frame = None
794
+ self.end_frame = None
795
+
796
+ self.bucket_selector = None
797
+
798
+ self.source_fps = None
799
+ self.target_fps = None
800
+
801
+ def __len__(self):
802
+ raise NotImplementedError
803
+
804
+ def get_video_data_from_path(
805
+ self,
806
+ video_path: str,
807
+ start_frame: Optional[int] = None,
808
+ end_frame: Optional[int] = None,
809
+ bucket_selector: Optional[BucketSelector] = None,
810
+ ) -> tuple[str, list[Image.Image], str]:
811
+ # this method can resize the video if bucket_selector is given to reduce the memory usage
812
+
813
+ start_frame = start_frame if start_frame is not None else self.start_frame
814
+ end_frame = end_frame if end_frame is not None else self.end_frame
815
+ bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector
816
+
817
+ video = load_video(
818
+ video_path, start_frame, end_frame, bucket_selector, source_fps=self.source_fps, target_fps=self.target_fps
819
+ )
820
+ return video
821
+
822
+ def get_control_data_from_path(
823
+ self,
824
+ control_path: str,
825
+ start_frame: Optional[int] = None,
826
+ end_frame: Optional[int] = None,
827
+ bucket_selector: Optional[BucketSelector] = None,
828
+ ) -> list[Image.Image]:
829
+ start_frame = start_frame if start_frame is not None else self.start_frame
830
+ end_frame = end_frame if end_frame is not None else self.end_frame
831
+ bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector
832
+
833
+ control = load_video(
834
+ control_path, start_frame, end_frame, bucket_selector, source_fps=self.source_fps, target_fps=self.target_fps
835
+ )
836
+ return control
837
+
838
+ def set_start_and_end_frame(self, start_frame: Optional[int], end_frame: Optional[int]):
839
+ self.start_frame = start_frame
840
+ self.end_frame = end_frame
841
+
842
+ def set_bucket_selector(self, bucket_selector: BucketSelector):
843
+ self.bucket_selector = bucket_selector
844
+
845
+ def set_source_and_target_fps(self, source_fps: Optional[float], target_fps: Optional[float]):
846
+ self.source_fps = source_fps
847
+ self.target_fps = target_fps
848
+
849
+ def __iter__(self):
850
+ raise NotImplementedError
851
+
852
+ def __next__(self):
853
+ raise NotImplementedError
854
+
855
+
856
+ class VideoDirectoryDatasource(VideoDatasource):
857
+ def __init__(self, video_directory: str, caption_extension: Optional[str] = None, control_directory: Optional[str] = None):
858
+ super().__init__()
859
+ self.video_directory = video_directory
860
+ self.caption_extension = caption_extension
861
+ self.control_directory = control_directory # newly added: control image directory
862
+ self.current_idx = 0
863
+
864
+ # glob videos
865
+ logger.info(f"glob videos in {self.video_directory}")
866
+ self.video_paths = glob_videos(self.video_directory)
867
+ logger.info(f"found {len(self.video_paths)} videos")
868
+
869
+ # glob control images if specified
870
+ if self.control_directory is not None:
871
+ logger.info(f"glob control videos in {self.control_directory}")
872
+ self.has_control = True
873
+ self.control_paths = {}
874
+ for video_path in self.video_paths:
875
+ video_basename = os.path.basename(video_path)
876
+ # construct control path from video path
877
+ # for example: video_path = "vid/video.mp4" -> control_path = "control/video.mp4"
878
+ control_path = os.path.join(self.control_directory, video_basename)
879
+ if os.path.exists(control_path):
880
+ self.control_paths[video_path] = control_path
881
+ else:
882
+ # use the same base name for control path
883
+ base_name = os.path.splitext(video_basename)[0]
884
+
885
+ # directory with images. for example: video_path = "vid/video.mp4" -> control_path = "control/video"
886
+ potential_path = os.path.join(self.control_directory, base_name) # no extension
887
+ if os.path.isdir(potential_path):
888
+ self.control_paths[video_path] = potential_path
889
+ else:
890
+ # another extension for control path
891
+ # for example: video_path = "vid/video.mp4" -> control_path = "control/video.mov"
892
+ for ext in VIDEO_EXTENSIONS:
893
+ potential_path = os.path.join(self.control_directory, base_name + ext)
894
+ if os.path.exists(potential_path):
895
+ self.control_paths[video_path] = potential_path
896
+ break
897
+
898
+ logger.info(f"found {len(self.control_paths)} matching control videos/images")
899
+ # check if all videos have matching control paths, if not, raise an error
900
+ missing_controls = len(self.video_paths) - len(self.control_paths)
901
+ if missing_controls > 0:
902
+ # logger.warning(f"Could not find matching control videos/images for {missing_controls} videos")
903
+ missing_controls_videos = [video_path for video_path in self.video_paths if video_path not in self.control_paths]
904
+ logger.error(
905
+ f"Could not find matching control videos/images for {missing_controls} videos: {missing_controls_videos}"
906
+ )
907
+ raise ValueError(f"Could not find matching control videos/images for {missing_controls} videos")
908
+
909
+ def is_indexable(self):
910
+ return True
911
+
912
+ def __len__(self):
913
+ return len(self.video_paths)
914
+
915
+ def get_video_data(
916
+ self,
917
+ idx: int,
918
+ start_frame: Optional[int] = None,
919
+ end_frame: Optional[int] = None,
920
+ bucket_selector: Optional[BucketSelector] = None,
921
+ ) -> tuple[str, list[Image.Image], str, Optional[list[Image.Image]]]:
922
+ video_path = self.video_paths[idx]
923
+ video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
924
+
925
+ _, caption = self.get_caption(idx)
926
+
927
+ control = None
928
+ if self.control_directory is not None and video_path in self.control_paths:
929
+ control_path = self.control_paths[video_path]
930
+ control = self.get_control_data_from_path(control_path, start_frame, end_frame, bucket_selector)
931
+
932
+ return video_path, video, caption, control
933
+
934
+ def get_caption(self, idx: int) -> tuple[str, str]:
935
+ video_path = self.video_paths[idx]
936
+ caption_path = os.path.splitext(video_path)[0] + self.caption_extension if self.caption_extension else ""
937
+ with open(caption_path, "r", encoding="utf-8") as f:
938
+ caption = f.read().strip()
939
+ return video_path, caption
940
+
941
+ def __iter__(self):
942
+ self.current_idx = 0
943
+ return self
944
+
945
+ def __next__(self):
946
+ if self.current_idx >= len(self.video_paths):
947
+ raise StopIteration
948
+
949
+ if self.caption_only:
950
+
951
+ def create_caption_fetcher(index):
952
+ return lambda: self.get_caption(index)
953
+
954
+ fetcher = create_caption_fetcher(self.current_idx)
955
+
956
+ else:
957
+
958
+ def create_fetcher(index):
959
+ return lambda: self.get_video_data(index)
960
+
961
+ fetcher = create_fetcher(self.current_idx)
962
+
963
+ self.current_idx += 1
964
+ return fetcher
965
+
966
+
967
+ class VideoJsonlDatasource(VideoDatasource):
968
+ def __init__(self, video_jsonl_file: str):
969
+ super().__init__()
970
+ self.video_jsonl_file = video_jsonl_file
971
+ self.current_idx = 0
972
+
973
+ # load jsonl
974
+ logger.info(f"load video jsonl from {self.video_jsonl_file}")
975
+ self.data = []
976
+ with open(self.video_jsonl_file, "r", encoding="utf-8") as f:
977
+ for line in f:
978
+ data = json.loads(line)
979
+ self.data.append(data)
980
+ logger.info(f"loaded {len(self.data)} videos")
981
+
982
+ # Check if there are control paths in the JSONL
983
+ self.has_control = any("control_path" in item for item in self.data)
984
+ if self.has_control:
985
+ control_count = sum(1 for item in self.data if "control_path" in item)
986
+ if control_count < len(self.data):
987
+ missing_control_videos = [item["video_path"] for item in self.data if "control_path" not in item]
988
+ logger.error(f"Some videos do not have control paths in JSONL data: {missing_control_videos}")
989
+ raise ValueError(f"Some videos do not have control paths in JSONL data: {missing_control_videos}")
990
+ logger.info(f"found {control_count} control videos/images in JSONL data")
991
+
992
+ def is_indexable(self):
993
+ return True
994
+
995
+ def __len__(self):
996
+ return len(self.data)
997
+
998
+ def get_video_data(
999
+ self,
1000
+ idx: int,
1001
+ start_frame: Optional[int] = None,
1002
+ end_frame: Optional[int] = None,
1003
+ bucket_selector: Optional[BucketSelector] = None,
1004
+ ) -> tuple[str, list[Image.Image], str, Optional[list[Image.Image]]]:
1005
+ data = self.data[idx]
1006
+ video_path = data["video_path"]
1007
+ video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
1008
+
1009
+ caption = data["caption"]
1010
+
1011
+ control = None
1012
+ if "control_path" in data and data["control_path"]:
1013
+ control_path = data["control_path"]
1014
+ control = self.get_control_data_from_path(control_path, start_frame, end_frame, bucket_selector)
1015
+
1016
+ return video_path, video, caption, control
1017
+
1018
+ def get_caption(self, idx: int) -> tuple[str, str]:
1019
+ data = self.data[idx]
1020
+ video_path = data["video_path"]
1021
+ caption = data["caption"]
1022
+ return video_path, caption
1023
+
1024
+ def __iter__(self):
1025
+ self.current_idx = 0
1026
+ return self
1027
+
1028
+ def __next__(self):
1029
+ if self.current_idx >= len(self.data):
1030
+ raise StopIteration
1031
+
1032
+ if self.caption_only:
1033
+
1034
+ def create_caption_fetcher(index):
1035
+ return lambda: self.get_caption(index)
1036
+
1037
+ fetcher = create_caption_fetcher(self.current_idx)
1038
+
1039
+ else:
1040
+
1041
+ def create_fetcher(index):
1042
+ return lambda: self.get_video_data(index)
1043
+
1044
+ fetcher = create_fetcher(self.current_idx)
1045
+
1046
+ self.current_idx += 1
1047
+ return fetcher
1048
+
1049
+
1050
+ class BaseDataset(torch.utils.data.Dataset):
1051
+ def __init__(
1052
+ self,
1053
+ resolution: Tuple[int, int] = (960, 544),
1054
+ caption_extension: Optional[str] = None,
1055
+ batch_size: int = 1,
1056
+ num_repeats: int = 1,
1057
+ enable_bucket: bool = False,
1058
+ bucket_no_upscale: bool = False,
1059
+ cache_directory: Optional[str] = None,
1060
+ debug_dataset: bool = False,
1061
+ architecture: str = "no_default",
1062
+ ):
1063
+ self.resolution = resolution
1064
+ self.caption_extension = caption_extension
1065
+ self.batch_size = batch_size
1066
+ self.num_repeats = num_repeats
1067
+ self.enable_bucket = enable_bucket
1068
+ self.bucket_no_upscale = bucket_no_upscale
1069
+ self.cache_directory = cache_directory
1070
+ self.debug_dataset = debug_dataset
1071
+ self.architecture = architecture
1072
+ self.seed = None
1073
+ self.current_epoch = 0
1074
+
1075
+ if not self.enable_bucket:
1076
+ self.bucket_no_upscale = False
1077
+
1078
+ def get_metadata(self) -> dict:
1079
+ metadata = {
1080
+ "resolution": self.resolution,
1081
+ "caption_extension": self.caption_extension,
1082
+ "batch_size_per_device": self.batch_size,
1083
+ "num_repeats": self.num_repeats,
1084
+ "enable_bucket": bool(self.enable_bucket),
1085
+ "bucket_no_upscale": bool(self.bucket_no_upscale),
1086
+ }
1087
+ return metadata
1088
+
1089
+ def get_all_latent_cache_files(self):
1090
+ return glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))
1091
+
1092
+ def get_all_text_encoder_output_cache_files(self):
1093
+ return glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}_te.safetensors"))
1094
+
1095
+ def get_latent_cache_path(self, item_info: ItemInfo) -> str:
1096
+ """
1097
+ Returns the cache path for the latent tensor.
1098
+
1099
+ item_info: ItemInfo object
1100
+
1101
+ Returns:
1102
+ str: cache path
1103
+
1104
+ cache_path is based on the item_key and the resolution.
1105
+ """
1106
+ w, h = item_info.original_size
1107
+ basename = os.path.splitext(os.path.basename(item_info.item_key))[0]
1108
+ assert self.cache_directory is not None, "cache_directory is required / cache_directoryは必須です"
1109
+ return os.path.join(self.cache_directory, f"{basename}_{w:04d}x{h:04d}_{self.architecture}.safetensors")
1110
+
1111
+ def get_text_encoder_output_cache_path(self, item_info: ItemInfo) -> str:
1112
+ basename = os.path.splitext(os.path.basename(item_info.item_key))[0]
1113
+ assert self.cache_directory is not None, "cache_directory is required / cache_directoryは必須です"
1114
+ return os.path.join(self.cache_directory, f"{basename}_{self.architecture}_te.safetensors")
1115
+
1116
+ def retrieve_latent_cache_batches(self, num_workers: int):
1117
+ raise NotImplementedError
1118
+
1119
+ def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
1120
+ raise NotImplementedError
1121
+
1122
+ def prepare_for_training(self):
1123
+ pass
1124
+
1125
+ def set_seed(self, seed: int):
1126
+ self.seed = seed
1127
+
1128
+ def set_current_epoch(self, epoch):
1129
+ if not self.current_epoch == epoch: # shuffle buckets when epoch is incremented
1130
+ if epoch > self.current_epoch:
1131
+ logger.info("epoch is incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch))
1132
+ num_epochs = epoch - self.current_epoch
1133
+ for _ in range(num_epochs):
1134
+ self.current_epoch += 1
1135
+ self.shuffle_buckets()
1136
+ # self.current_epoch seems to be reset to 0 in the next epoch; this may be caused by skipped_dataloader?
1137
+ else:
1138
+ logger.warning("epoch is not incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch))
1139
+ self.current_epoch = epoch
1140
+
1141
+ def set_current_step(self, step):
1142
+ self.current_step = step
1143
+
1144
+ def set_max_train_steps(self, max_train_steps):
1145
+ self.max_train_steps = max_train_steps
1146
+
1147
+ def shuffle_buckets(self):
1148
+ raise NotImplementedError
1149
+
1150
+ def __len__(self):
1151
+ raise NotImplementedError
1152
+
1153
+ def __getitem__(self, idx):
1154
+ raise NotImplementedError
1155
+
1156
+ def _default_retrieve_text_encoder_output_cache_batches(self, datasource: ContentDatasource, batch_size: int, num_workers: int):
1157
+ datasource.set_caption_only(True)
1158
+ executor = ThreadPoolExecutor(max_workers=num_workers)
1159
+
1160
+ data: list[ItemInfo] = []
1161
+ futures = []
1162
+
1163
+ def aggregate_future(consume_all: bool = False):
1164
+ while len(futures) >= num_workers or (consume_all and len(futures) > 0):
1165
+ completed_futures = [future for future in futures if future.done()]
1166
+ if len(completed_futures) == 0:
1167
+ if len(futures) >= num_workers or consume_all: # to avoid adding too many futures
1168
+ time.sleep(0.1)
1169
+ continue
1170
+ else:
1171
+ break # submit batch if possible
1172
+
1173
+ for future in completed_futures:
1174
+ item_key, caption = future.result()
1175
+ item_info = ItemInfo(item_key, caption, (0, 0), (0, 0))
1176
+ item_info.text_encoder_output_cache_path = self.get_text_encoder_output_cache_path(item_info)
1177
+ data.append(item_info)
1178
+
1179
+ futures.remove(future)
1180
+
1181
+ def submit_batch(flush: bool = False):
1182
+ nonlocal data
1183
+ if len(data) >= batch_size or (len(data) > 0 and flush):
1184
+ batch = data[0:batch_size]
1185
+ if len(data) > batch_size:
1186
+ data = data[batch_size:]
1187
+ else:
1188
+ data = []
1189
+ return batch
1190
+ return None
1191
+
1192
+ for fetch_op in datasource:
1193
+ future = executor.submit(fetch_op)
1194
+ futures.append(future)
1195
+ aggregate_future()
1196
+ while True:
1197
+ batch = submit_batch()
1198
+ if batch is None:
1199
+ break
1200
+ yield batch
1201
+
1202
+ aggregate_future(consume_all=True)
1203
+ while True:
1204
+ batch = submit_batch(flush=True)
1205
+ if batch is None:
1206
+ break
1207
+ yield batch
1208
+
1209
+ executor.shutdown()
1210
+
1211
+
1212
+ class ImageDataset(BaseDataset):
1213
+ def __init__(
1214
+ self,
1215
+ resolution: Tuple[int, int],
1216
+ caption_extension: Optional[str],
1217
+ batch_size: int,
1218
+ num_repeats: int,
1219
+ enable_bucket: bool,
1220
+ bucket_no_upscale: bool,
1221
+ image_directory: Optional[str] = None,
1222
+ image_jsonl_file: Optional[str] = None,
1223
+ cache_directory: Optional[str] = None,
1224
+ debug_dataset: bool = False,
1225
+ architecture: str = "no_default",
1226
+ ):
1227
+ super(ImageDataset, self).__init__(
1228
+ resolution,
1229
+ caption_extension,
1230
+ batch_size,
1231
+ num_repeats,
1232
+ enable_bucket,
1233
+ bucket_no_upscale,
1234
+ cache_directory,
1235
+ debug_dataset,
1236
+ architecture,
1237
+ )
1238
+ self.image_directory = image_directory
1239
+ self.image_jsonl_file = image_jsonl_file
1240
+ if image_directory is not None:
1241
+ self.datasource = ImageDirectoryDatasource(image_directory, caption_extension)
1242
+ elif image_jsonl_file is not None:
1243
+ self.datasource = ImageJsonlDatasource(image_jsonl_file)
1244
+ else:
1245
+ raise ValueError("image_directory or image_jsonl_file must be specified")
1246
+
1247
+ if self.cache_directory is None:
1248
+ self.cache_directory = self.image_directory
1249
+
1250
+ self.batch_manager = None
1251
+ self.num_train_items = 0
1252
+
1253
+ def get_metadata(self):
1254
+ metadata = super().get_metadata()
1255
+ if self.image_directory is not None:
1256
+ metadata["image_directory"] = os.path.basename(self.image_directory)
1257
+ if self.image_jsonl_file is not None:
1258
+ metadata["image_jsonl_file"] = os.path.basename(self.image_jsonl_file)
1259
+ return metadata
1260
+
1261
+ def get_total_image_count(self):
1262
+ return len(self.datasource) if self.datasource.is_indexable() else None
1263
+
1264
+ def retrieve_latent_cache_batches(self, num_workers: int):
1265
+ bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
1266
+ executor = ThreadPoolExecutor(max_workers=num_workers)
1267
+
1268
+ batches: dict[tuple[int, int], list[ItemInfo]] = {} # (width, height) -> [ItemInfo]
1269
+ futures = []
1270
+
1271
+ # aggregate futures and sort by bucket resolution
1272
+ def aggregate_future(consume_all: bool = False):
1273
+ while len(futures) >= num_workers or (consume_all and len(futures) > 0):
1274
+ completed_futures = [future for future in futures if future.done()]
1275
+ if len(completed_futures) == 0:
1276
+ if len(futures) >= num_workers or consume_all: # to avoid adding too many futures
1277
+ time.sleep(0.1)
1278
+ continue
1279
+ else:
1280
+ break # submit batch if possible
1281
+
1282
+ for future in completed_futures:
1283
+ original_size, item_key, image, caption = future.result()
1284
+ bucket_height, bucket_width = image.shape[:2]
1285
+ bucket_reso = (bucket_width, bucket_height)
1286
+
1287
+ item_info = ItemInfo(item_key, caption, original_size, bucket_reso, content=image)
1288
+ item_info.latent_cache_path = self.get_latent_cache_path(item_info)
1289
+
1290
+ if bucket_reso not in batches:
1291
+ batches[bucket_reso] = []
1292
+ batches[bucket_reso].append(item_info)
1293
+
1294
+ futures.remove(future)
1295
+
1296
+ # submit batch if some bucket has enough items
1297
+ def submit_batch(flush: bool = False):
1298
+ for key in batches:
1299
+ if len(batches[key]) >= self.batch_size or flush:
1300
+ batch = batches[key][0 : self.batch_size]
1301
+ if len(batches[key]) > self.batch_size:
1302
+ batches[key] = batches[key][self.batch_size :]
1303
+ else:
1304
+ del batches[key]
1305
+ return key, batch
1306
+ return None, None
1307
+
1308
+ for fetch_op in self.datasource:
1309
+
1310
+ # fetch and resize image in a separate thread
1311
+ def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, Image.Image, str]:
1312
+ image_key, image, caption = op()
1313
+ image: Image.Image
1314
+ image_size = image.size
1315
+
1316
+ bucket_reso = bucket_selector.get_bucket_resolution(image_size)
1317
+ image = resize_image_to_bucket(image, bucket_reso)
1318
+ return image_size, image_key, image, caption
1319
+
1320
+ future = executor.submit(fetch_and_resize, fetch_op)
1321
+ futures.append(future)
1322
+ aggregate_future()
1323
+ while True:
1324
+ key, batch = submit_batch()
1325
+ if key is None:
1326
+ break
1327
+ yield key, batch
1328
+
1329
+ aggregate_future(consume_all=True)
1330
+ while True:
1331
+ key, batch = submit_batch(flush=True)
1332
+ if key is None:
1333
+ break
1334
+ yield key, batch
1335
+
1336
+ executor.shutdown()
1337
+
1338
+ def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
1339
+ return self._default_retrieve_text_encoder_output_cache_batches(self.datasource, self.batch_size, num_workers)
1340
+
1341
+ def prepare_for_training(self):
1342
+ bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
1343
+
1344
+ # glob cache files
1345
+ latent_cache_files = glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))
1346
+
1347
+ # assign cache files to item info
1348
+ bucketed_item_info: dict[tuple[int, int], list[ItemInfo]] = {} # (width, height) -> [ItemInfo]
1349
+ for cache_file in latent_cache_files:
1350
+ tokens = os.path.basename(cache_file).split("_")
1351
+
1352
+ image_size = tokens[-2] # 0000x0000
1353
+ image_width, image_height = map(int, image_size.split("x"))
1354
+ image_size = (image_width, image_height)
1355
+
1356
+ item_key = "_".join(tokens[:-2])
1357
+ text_encoder_output_cache_file = os.path.join(self.cache_directory, f"{item_key}_{self.architecture}_te.safetensors")
1358
+ if not os.path.exists(text_encoder_output_cache_file):
1359
+ logger.warning(f"Text encoder output cache file not found: {text_encoder_output_cache_file}")
1360
+ continue
1361
+
1362
+ bucket_reso = bucket_selector.get_bucket_resolution(image_size)
1363
+ item_info = ItemInfo(item_key, "", image_size, bucket_reso, latent_cache_path=cache_file)
1364
+ item_info.text_encoder_output_cache_path = text_encoder_output_cache_file
1365
+
1366
+ bucket = bucketed_item_info.get(bucket_reso, [])
1367
+ for _ in range(self.num_repeats):
1368
+ bucket.append(item_info)
1369
+ bucketed_item_info[bucket_reso] = bucket
1370
+
1371
+ # prepare batch manager
1372
+ self.batch_manager = BucketBatchManager(bucketed_item_info, self.batch_size)
1373
+ self.batch_manager.show_bucket_info()
1374
+
1375
+ self.num_train_items = sum([len(bucket) for bucket in bucketed_item_info.values()])
1376
+
1377
+ def shuffle_buckets(self):
1378
+ # set random seed for this epoch
1379
+ random.seed(self.seed + self.current_epoch)
1380
+ self.batch_manager.shuffle()
1381
+
1382
+ def __len__(self):
1383
+ if self.batch_manager is None:
1384
+ return 100 # dummy value
1385
+ return len(self.batch_manager)
1386
+
1387
+ def __getitem__(self, idx):
1388
+ return self.batch_manager[idx]
1389
+
1390
+
1391
+ class VideoDataset(BaseDataset):
1392
+ TARGET_FPS_HUNYUAN = 24.0
1393
+ TARGET_FPS_WAN = 16.0
1394
+ TARGET_FPS_FRAMEPACK = 30.0
1395
+
1396
+ def __init__(
1397
+ self,
1398
+ resolution: Tuple[int, int],
1399
+ caption_extension: Optional[str],
1400
+ batch_size: int,
1401
+ num_repeats: int,
1402
+ enable_bucket: bool,
1403
+ bucket_no_upscale: bool,
1404
+ frame_extraction: Optional[str] = "head",
1405
+ frame_stride: Optional[int] = 1,
1406
+ frame_sample: Optional[int] = 1,
1407
+ target_frames: Optional[list[int]] = None,
1408
+ max_frames: Optional[int] = None,
1409
+ source_fps: Optional[float] = None,
1410
+ video_directory: Optional[str] = None,
1411
+ video_jsonl_file: Optional[str] = None,
1412
+ control_directory: Optional[str] = None,
1413
+ cache_directory: Optional[str] = None,
1414
+ debug_dataset: bool = False,
1415
+ architecture: str = "no_default",
1416
+ ):
1417
+ super(VideoDataset, self).__init__(
1418
+ resolution,
1419
+ caption_extension,
1420
+ batch_size,
1421
+ num_repeats,
1422
+ enable_bucket,
1423
+ bucket_no_upscale,
1424
+ cache_directory,
1425
+ debug_dataset,
1426
+ architecture,
1427
+ )
1428
+ self.video_directory = video_directory
1429
+ self.video_jsonl_file = video_jsonl_file
1430
+ self.control_directory = control_directory
1431
+ self.frame_extraction = frame_extraction
1432
+ self.frame_stride = frame_stride
1433
+ self.frame_sample = frame_sample
1434
+ self.max_frames = max_frames
1435
+ self.source_fps = source_fps
1436
+
1437
+ if self.architecture == ARCHITECTURE_HUNYUAN_VIDEO:
1438
+ self.target_fps = VideoDataset.TARGET_FPS_HUNYUAN
1439
+ elif self.architecture == ARCHITECTURE_WAN:
1440
+ self.target_fps = VideoDataset.TARGET_FPS_WAN
1441
+ elif self.architecture == ARCHITECTURE_FRAMEPACK:
1442
+ self.target_fps = VideoDataset.TARGET_FPS_FRAMEPACK
1443
+ else:
1444
+ raise ValueError(f"Unsupported architecture: {self.architecture}")
1445
+
1446
+ if target_frames is not None:
1447
+ target_frames = list(set(target_frames))
1448
+ target_frames.sort()
1449
+
1450
+ # round each value to N*4+1
1451
+ rounded_target_frames = [(f - 1) // 4 * 4 + 1 for f in target_frames]
1452
+ rounded_target_frames = list(set(rounded_target_frames))
1453
+ rounded_target_frames.sort()
1454
+
1455
+ # if value is changed, warn
1456
+ if target_frames != rounded_target_frames:
1457
+ logger.warning(f"target_frames are rounded to {rounded_target_frames}")
1458
+
1459
+ target_frames = tuple(rounded_target_frames)
1460
+
1461
+ self.target_frames = target_frames
1462
+
1463
+ if video_directory is not None:
1464
+ self.datasource = VideoDirectoryDatasource(video_directory, caption_extension, control_directory)
1465
+ elif video_jsonl_file is not None:
1466
+ self.datasource = VideoJsonlDatasource(video_jsonl_file)
+ else:
+ raise ValueError("video_directory or video_jsonl_file must be specified")
1467
+
1468
+ if self.frame_extraction == "uniform" and self.frame_sample == 1:
1469
+ self.frame_extraction = "head"
1470
+ logger.warning("frame_sample is set to 1 for frame_extraction=uniform. frame_extraction is changed to head.")
1471
+ if self.frame_extraction == "head":
1472
+ # head extraction. we can limit the number of frames to be extracted
1473
+ self.datasource.set_start_and_end_frame(0, max(self.target_frames))
1474
+
1475
+ if self.cache_directory is None:
1476
+ self.cache_directory = self.video_directory
1477
+
1478
+ self.batch_manager = None
1479
+ self.num_train_items = 0
1480
+ self.has_control = self.datasource.has_control
1481
+
1482
+ def get_metadata(self):
1483
+ metadata = super().get_metadata()
1484
+ if self.video_directory is not None:
1485
+ metadata["video_directory"] = os.path.basename(self.video_directory)
1486
+ if self.video_jsonl_file is not None:
1487
+ metadata["video_jsonl_file"] = os.path.basename(self.video_jsonl_file)
1488
+ if self.control_directory is not None:
1489
+ metadata["control_directory"] = os.path.basename(self.control_directory)
1490
+ metadata["frame_extraction"] = self.frame_extraction
1491
+ metadata["frame_stride"] = self.frame_stride
1492
+ metadata["frame_sample"] = self.frame_sample
1493
+ metadata["target_frames"] = self.target_frames
1494
+ metadata["max_frames"] = self.max_frames
1495
+ metadata["source_fps"] = self.source_fps
1496
+ metadata["has_control"] = self.has_control
1497
+ return metadata
1498
+
1499
+ def retrieve_latent_cache_batches(self, num_workers: int):
1500
+ bucket_selector = BucketSelector(self.resolution, architecture=self.architecture)
1501
+ self.datasource.set_bucket_selector(bucket_selector)
1502
+ if self.source_fps is not None:
1503
+ self.datasource.set_source_and_target_fps(self.source_fps, self.target_fps)
1504
+ else:
1505
+ self.datasource.set_source_and_target_fps(None, None) # no conversion
1506
+
1507
+ executor = ThreadPoolExecutor(max_workers=num_workers)
1508
+
1509
+ # key: (width, height, frame_count), value: [ItemInfo]
1510
+ batches: dict[tuple[int, int, int], list[ItemInfo]] = {}
1511
+ futures = []
1512
+
1513
+ def aggregate_future(consume_all: bool = False):
1514
+ while len(futures) >= num_workers or (consume_all and len(futures) > 0):
1515
+ completed_futures = [future for future in futures if future.done()]
1516
+ if len(completed_futures) == 0:
1517
+ if len(futures) >= num_workers or consume_all: # to avoid adding too many futures
1518
+ time.sleep(0.1)
1519
+ continue
1520
+ else:
1521
+ break # submit batch if possible
1522
+
1523
+ for future in completed_futures:
1524
+ original_frame_size, video_key, video, caption, control = future.result()
1525
+
1526
+ frame_count = len(video)
1527
+ video = np.stack(video, axis=0)
1528
+ height, width = video.shape[1:3]
1529
+ bucket_reso = (width, height) # already resized
1530
+
1531
+ # process control images if available
1532
+ control_video = None
1533
+ if control is not None:
1534
+ # set frame count to the same as video
1535
+ if len(control) > frame_count:
1536
+ control = control[:frame_count]
1537
+ elif len(control) < frame_count:
1538
+ # if control is shorter than video, repeat the last frame
1539
+ last_frame = control[-1]
1540
+ control.extend([last_frame] * (frame_count - len(control)))
1541
+ control_video = np.stack(control, axis=0)
1542
+
1543
+ crop_pos_and_frames = []
1544
+ if self.frame_extraction == "head":
1545
+ for target_frame in self.target_frames:
1546
+ if frame_count >= target_frame:
1547
+ crop_pos_and_frames.append((0, target_frame))
1548
+ elif self.frame_extraction == "chunk":
1549
+ # split by target_frames
1550
+ for target_frame in self.target_frames:
1551
+ for i in range(0, frame_count, target_frame):
1552
+ if i + target_frame <= frame_count:
1553
+ crop_pos_and_frames.append((i, target_frame))
1554
+ elif self.frame_extraction == "slide":
1555
+ # slide window
1556
+ for target_frame in self.target_frames:
1557
+ if frame_count >= target_frame:
1558
+ for i in range(0, frame_count - target_frame + 1, self.frame_stride):
1559
+ crop_pos_and_frames.append((i, target_frame))
1560
+ elif self.frame_extraction == "uniform":
1561
+ # select N frames uniformly
1562
+ for target_frame in self.target_frames:
1563
+ if frame_count >= target_frame:
1564
+ frame_indices = np.linspace(0, frame_count - target_frame, self.frame_sample, dtype=int)
1565
+ for i in frame_indices:
1566
+ crop_pos_and_frames.append((i, target_frame))
1567
+ elif self.frame_extraction == "full":
1568
+ # select all frames
1569
+ target_frame = min(frame_count, self.max_frames)
1570
+ target_frame = (target_frame - 1) // 4 * 4 + 1 # round to N*4+1
1571
+ crop_pos_and_frames.append((0, target_frame))
1572
+ else:
1573
+ raise ValueError(f"frame_extraction {self.frame_extraction} is not supported")
1574
+
1575
+ for crop_pos, target_frame in crop_pos_and_frames:
1576
+ cropped_video = video[crop_pos : crop_pos + target_frame]
1577
+ body, ext = os.path.splitext(video_key)
1578
+ item_key = f"{body}_{crop_pos:05d}-{target_frame:03d}{ext}"
1579
+ batch_key = (*bucket_reso, target_frame) # bucket_reso with frame_count
1580
+
1581
+ # crop control video if available
1582
+ cropped_control = None
1583
+ if control_video is not None:
1584
+ cropped_control = control_video[crop_pos : crop_pos + target_frame]
1585
+
1586
+ item_info = ItemInfo(
1587
+ item_key, caption, original_frame_size, batch_key, frame_count=target_frame, content=cropped_video
1588
+ )
1589
+ item_info.latent_cache_path = self.get_latent_cache_path(item_info)
1590
+ item_info.control_content = cropped_control # None is allowed
1591
+
1592
+ batch = batches.get(batch_key, [])
1593
+ batch.append(item_info)
1594
+ batches[batch_key] = batch
1595
+
1596
+ futures.remove(future)
1597
+
1598
+ def submit_batch(flush: bool = False):
1599
+ for key in batches:
1600
+ if len(batches[key]) >= self.batch_size or flush:
1601
+ batch = batches[key][0 : self.batch_size]
1602
+ if len(batches[key]) > self.batch_size:
1603
+ batches[key] = batches[key][self.batch_size :]
1604
+ else:
1605
+ del batches[key]
1606
+ return key, batch
1607
+ return None, None
1608
+
1609
+ for operator in self.datasource:
1610
+
1611
+ def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, list[np.ndarray], str, Optional[list[np.ndarray]]]:
1612
+ result = op()
1613
+
1614
+ if len(result) == 3: # for backward compatibility TODO remove this in the future
1615
+ video_key, video, caption = result
1616
+ control = None
1617
+ else:
1618
+ video_key, video, caption, control = result
1619
+
1620
+ video: list[np.ndarray]
1621
+ frame_size = (video[0].shape[1], video[0].shape[0])
1622
+
1623
+ # resize if necessary
1624
+ bucket_reso = bucket_selector.get_bucket_resolution(frame_size)
1625
+ video = [resize_image_to_bucket(frame, bucket_reso) for frame in video]
1626
+
1627
+ # resize control if necessary
1628
+ if control is not None:
1629
+ control = [resize_image_to_bucket(frame, bucket_reso) for frame in control]
1630
+
1631
+ return frame_size, video_key, video, caption, control
1632
+
1633
+ future = executor.submit(fetch_and_resize, operator)
1634
+ futures.append(future)
1635
+ aggregate_future()
1636
+ while True:
1637
+ key, batch = submit_batch()
1638
+ if key is None:
1639
+ break
1640
+ yield key, batch
1641
+
1642
+ aggregate_future(consume_all=True)
1643
+ while True:
1644
+ key, batch = submit_batch(flush=True)
1645
+ if key is None:
1646
+ break
1647
+ yield key, batch
1648
+
1649
+ executor.shutdown()
1650
+
1651
+ def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
1652
+ return self._default_retrieve_text_encoder_output_cache_batches(self.datasource, self.batch_size, num_workers)
1653
+
1654
+ def prepare_for_training(self):
1655
+ bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
1656
+
1657
+ # glob cache files
1658
+ latent_cache_files = glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))
1659
+
1660
+ # assign cache files to item info
1661
+ bucketed_item_info: dict[tuple[int, int, int], list[ItemInfo]] = {} # (width, height, frame_count) -> [ItemInfo]
1662
+ for cache_file in latent_cache_files:
1663
+ tokens = os.path.basename(cache_file).split("_")
1664
+
1665
+ image_size = tokens[-2] # 0000x0000
1666
+ image_width, image_height = map(int, image_size.split("x"))
1667
+ image_size = (image_width, image_height)
1668
+
1669
+ frame_pos, frame_count = tokens[-3].split("-")[:2] # "00000-000", or optional section index "00000-000-00"
1670
+ frame_pos, frame_count = int(frame_pos), int(frame_count)
1671
+
1672
+ item_key = "_".join(tokens[:-3])
1673
+ text_encoder_output_cache_file = os.path.join(self.cache_directory, f"{item_key}_{self.architecture}_te.safetensors")
1674
+ if not os.path.exists(text_encoder_output_cache_file):
1675
+ logger.warning(f"Text encoder output cache file not found: {text_encoder_output_cache_file}")
1676
+ continue
1677
+
1678
+ bucket_reso = bucket_selector.get_bucket_resolution(image_size)
1679
+ bucket_reso = (*bucket_reso, frame_count)
1680
+ item_info = ItemInfo(item_key, "", image_size, bucket_reso, frame_count=frame_count, latent_cache_path=cache_file)
1681
+ item_info.text_encoder_output_cache_path = text_encoder_output_cache_file
1682
+
1683
+ bucket = bucketed_item_info.get(bucket_reso, [])
1684
+ for _ in range(self.num_repeats):
1685
+ bucket.append(item_info)
1686
+ bucketed_item_info[bucket_reso] = bucket
1687
+
1688
+ # prepare batch manager
1689
+ self.batch_manager = BucketBatchManager(bucketed_item_info, self.batch_size)
1690
+ self.batch_manager.show_bucket_info()
1691
+
1692
+ self.num_train_items = sum([len(bucket) for bucket in bucketed_item_info.values()])
1693
+
1694
+ def shuffle_buckets(self):
1695
+ # set random seed for this epoch
1696
+ random.seed(self.seed + self.current_epoch)
1697
+ self.batch_manager.shuffle()
1698
+
1699
+ def __len__(self):
1700
+ if self.batch_manager is None:
1701
+ return 100 # dummy value
1702
+ return len(self.batch_manager)
1703
+
1704
+ def __getitem__(self, idx):
1705
+ return self.batch_manager[idx]
1706
+
1707
+
1708
+ class DatasetGroup(torch.utils.data.ConcatDataset):
1709
+ def __init__(self, datasets: Sequence[Union[ImageDataset, VideoDataset]]):
1710
+ super().__init__(datasets)
1711
+ self.datasets: list[Union[ImageDataset, VideoDataset]] = datasets
1712
+ self.num_train_items = 0
1713
+ for dataset in self.datasets:
1714
+ self.num_train_items += dataset.num_train_items
1715
+
1716
+ def set_current_epoch(self, epoch):
1717
+ for dataset in self.datasets:
1718
+ dataset.set_current_epoch(epoch)
1719
+
1720
+ def set_current_step(self, step):
1721
+ for dataset in self.datasets:
1722
+ dataset.set_current_step(step)
1723
+
1724
+ def set_max_train_steps(self, max_train_steps):
1725
+ for dataset in self.datasets:
1726
+ dataset.set_max_train_steps(max_train_steps)
docs/advanced_config.md ADDED
@@ -0,0 +1,316 @@
1
+ > 📝 Click on the language section to expand / 言語をクリックして展開
2
+
3
+ # Advanced configuration / 高度な設定
4
+
5
+ ## Table of contents / 目次
6
+
7
+ - [How to specify `network_args`](#how-to-specify-network_args--network_argsの指定方法)
8
+ - [LoRA+](#lora)
9
+ - [Select the target modules of LoRA](#select-the-target-modules-of-lora--loraの対象モジュールを選択する)
10
+ - [Save and view logs in TensorBoard format](#save-and-view-logs-in-tensorboard-format--tensorboard形式のログの保存と参照)
11
+ - [Save and view logs in wandb](#save-and-view-logs-in-wandb--wandbでログの保存と参照)
12
+ - [FP8 weight optimization for models](#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化)
13
+ - [PyTorch Dynamo optimization for model training](#pytorch-dynamo-optimization-for-model-training--モデルの学習におけるpytorch-dynamoの最適化)
14
+
15
+ ## How to specify `network_args` / `network_args`の指定方法
16
+
17
+ The `--network_args` option passes detailed arguments to LoRA. Specify the arguments in the form of `key=value` in `--network_args`.
18
+
19
+ <details>
20
+ <summary>日本語</summary>
21
+ `--network_args`オプションは、LoRAへの詳細な引数を指定するためのオプションです。`--network_args`には、`key=value`の形式で引数を指定します。
22
+ </details>
23
+
24
+ ### Example / 記述例
25
+
26
+ If you specify it on the command line, write as follows. / コマンドラインで指定する場合は以下のように記述します。
27
+
28
+ ```bash
29
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py --dit ...
30
+ --network_module networks.lora --network_dim 32
31
+ --network_args "key1=value1" "key2=value2" ...
32
+ ```
33
+
34
+ If you specify it in the configuration file, write as follows. / 設定ファイルで指定する場合は以下のように記述します。
35
+
36
+ ```toml
37
+ network_args = ["key1=value1", "key2=value2", ...]
38
+ ```
39
+
40
+ If you specify `"verbose=True"`, detailed information of LoRA will be displayed. / `"verbose=True"`を指定するとLoRAの詳細な情報が表示されます。
41
+
42
+ ```bash
43
+ --network_args "verbose=True" "key1=value1" "key2=value2" ...
44
+ ```
45
+
46
+ ## LoRA+
47
+
48
+ LoRA+ is a method that improves training speed by increasing the learning rate of the UP side (LoRA-B) of LoRA. Specify the multiplier for the learning rate. The original paper recommends 16, but adjust as needed; starting from around 4 seems to work well. For details, please refer to the [related PR of sd-scripts](https://github.com/kohya-ss/sd-scripts/pull/1233).
49
+
50
+ Specify `loraplus_lr_ratio` with `--network_args`.
51
+
52
+ <details>
53
+ <summary>日本語</summary>
54
+
55
+ LoRA+は、LoRAのUP側(LoRA-B)の学習率を上げることで学習速度を向上させる手法です。学習率に対する倍率を指定します。元論文では16を推奨していますが、必要に応じて調整してください。4程度から始めるとよいようです。詳細は[sd-scriptsの関連PR](https://github.com/kohya-ss/sd-scripts/pull/1233)を参照してください。
56
+
57
+ `--network_args`で`loraplus_lr_ratio`を指定します。
58
+ </details>
59
+
60
+ ### Example / 記述例
61
+
62
+ ```bash
63
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py --dit ...
64
+ --network_module networks.lora --network_dim 32 --network_args "loraplus_lr_ratio=4" ...
65
+ ```
66
+
67
+ ## Select the target modules of LoRA / LoRAの対象モジュールを選択する
68
+
69
+ *This feature is highly experimental and the specification may change. / この機能は特に実験的なもので、仕様は変更される可能性があります。*
70
+
71
+ By specifying `exclude_patterns` and `include_patterns` with `--network_args`, you can select the target modules of LoRA.
72
+
73
+ `exclude_patterns` excludes modules that match the specified pattern. `include_patterns` targets only modules that match the specified pattern.
74
+
75
+ Specify the values as a list. For example, `"exclude_patterns=[r'.*single_blocks.*', r'.*double_blocks\.[0-9]\..*']"`.
76
+
77
+ The pattern is a regular expression for the module name. The module name is in the form of `double_blocks.0.img_mod.linear` or `single_blocks.39.modulation.linear`. The regular expression is not a partial match but a complete match.
78
+
79
+ The patterns are applied in the order of `exclude_patterns`→`include_patterns`. By default, the Linear layers of `img_mod`, `txt_mod`, and `modulation` of double blocks and single blocks are excluded.
80
+
81
+ (`.*(img_mod|txt_mod|modulation).*` is specified.)
82
+
83
+ <details>
84
+ <summary>日本語</summary>
85
+
86
+ `--network_args`で`exclude_patterns`と`include_patterns`を指定することで、LoRAの対象モジュールを選択することができます。
87
+
88
+ `exclude_patterns`は、指定したパターンに一致するモジュールを除外します。`include_patterns`は、指定したパターンに一致するモジュールのみを対象とします。
89
+
90
+ 値は、リストで指定します。`"exclude_patterns=[r'.*single_blocks.*', r'.*double_blocks\.[0-9]\..*']"`のようになります。
91
+
92
+ パターンは、モジュール名に対する正規表現です。モジュール名は、たとえば`double_blocks.0.img_mod.linear`や`single_blocks.39.modulation.linear`のような形式です。正規表現は部分一致ではなく完全一致です。
93
+
94
+ パターンは、`exclude_patterns`→`include_patterns`の順で適用されます。デフォルトは、double blocksとsingle blocksのLinear層のうち、`img_mod`、`txt_mod`、`modulation`が除外されています。
95
+
96
+ (`.*(img_mod|txt_mod|modulation).*`が指定されています。)
97
+ </details>
98
+
99
+ ### Example / 記述例
100
+
101
+ Only the modules of double blocks / double blocksのモジュールのみを対象とする場合:
102
+
103
+ ```bash
104
+ --network_args "exclude_patterns=[r'.*single_blocks.*']"
105
+ ```
106
+
107
+ Only the Linear modules of single blocks from the 10th onward / single blocksの10番目以降のLinearモジュールのみを対象とする場合:
108
+
109
+ ```bash
110
+ --network_args "exclude_patterns=[r'.*']" "include_patterns=[r'.*single_blocks\.\d{2}\.linear.*']"
111
+ ```
112
+
113
+ ## Save and view logs in TensorBoard format / TensorBoard形式のログの保存と参照
114
+
115
+ Specify the folder to save the logs with the `--logging_dir` option. Logs in TensorBoard format will be saved.
116
+
117
+ For example, if you specify `--logging_dir=logs`, a `logs` folder will be created in the working folder, and logs will be saved in the date folder inside it.
118
+
119
+ Also, if you specify the `--log_prefix` option, the specified string will be added before the date. For example, use `--logging_dir=logs --log_prefix=lora_setting1_` for identification.
120
+
121
+ To view logs in TensorBoard, open another command prompt and activate the virtual environment. Then enter the following in the working folder.
122
+
123
+ ```powershell
124
+ tensorboard --logdir=logs
125
+ ```
126
+
127
+ (tensorboard installation is required.)
128
+
129
+ Then open a browser and access http://localhost:6006/ to display it.
130
+
131
+ <details>
132
+ <summary>日本語</summary>
133
+ `--logging_dir`オプションにログ保存先フォルダを指定してください。TensorBoard形式のログが保存されます。
134
+
135
+ たとえば`--logging_dir=logs`と指定すると、作業フォルダにlogsフォルダが作成され、その中の日時フォルダにログが保存されます。
136
+
137
+ また`--log_prefix`オプションを指定すると、日時の前に指定した文字列が追加されます。`--logging_dir=logs --log_prefix=lora_setting1_`などとして識別用にお使いください。
138
+
139
+ TensorBoardでログを確認するには、別のコマンドプロンプトを開き、仮想環境を有効にしてから、作業フォルダで以下のように入力します。
140
+
141
+ ```powershell
142
+ tensorboard --logdir=logs
143
+ ```
144
+
145
+ (tensorboardのインストールが必要です。)
146
+
147
+ その後ブラウザを開き、http://localhost:6006/ へアクセスすると表示されます。
148
+ </details>
149
+
150
+ ## Save and view logs in wandb / wandbでログの保存と参照
151
+
152
+ The `--log_with wandb` option is available to save logs in wandb format. `tensorboard` or `all` can also be specified. The default is `tensorboard`.
153
+
154
+ Specify the project name with `--log_tracker_name` when using wandb.
155
+
156
+ <details>
157
+ <summary>日本語</summary>
158
+ `--log_with wandb`オプションを指定するとwandb形式でログを保存することができます。`tensorboard`や`all`も指定可能です。デフォルトは`tensorboard`です。
159
+
160
+ wandbを使用する場合は、`--log_tracker_name`でプロジェクト名を指定してください。
161
+ </details>
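+
+ ### Example / 記述例
+
+ A minimal sketch, reusing the training command from the sections above; the project name `my-lora-project` is just a placeholder:
+
+ ```bash
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py --dit ...
+ --log_with wandb --log_tracker_name my-lora-project
+ ```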
162
+
163
+ ## FP8 weight optimization for models / モデルの重みのFP8への最適化
164
+
165
+ The `--fp8_scaled` option is available to quantize the weights of the model to FP8 (E4M3) format with appropriate scaling. This reduces the VRAM usage while maintaining precision. Important weights are kept in FP16/BF16/FP32 format.
166
+
167
+ The model weights must be in fp16 or bf16. Weights that have been pre-converted to float8_e4m3 cannot be used.
168
+
169
+ Wan2.1 inference and training are supported.
170
+
171
+ Specify the `--fp8_scaled` option in addition to the `--fp8` option during inference.
172
+
173
+ Specify the `--fp8_scaled` option in addition to the `--fp8_base` option during training.
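+
+ For example (a sketch only; it assumes the Wan2.1 scripts `wan_generate_video.py` and `wan_train_network.py` covered in `docs/wan.md`, with `...` standing for your usual options):
+
+ ```bash
+ # inference
+ python wan_generate_video.py --fp8 --fp8_scaled ...
+ # training
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 wan_train_network.py --fp8_base --fp8_scaled ...
+ ```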
174
+
175
+ Acknowledgments: This feature is based on the [implementation](https://github.com/Tencent/HunyuanVideo/blob/7df4a45c7e424a3f6cd7d653a7ff1f60cddc1eb1/hyvideo/modules/fp8_optimization.py) of [HunyuanVideo](https://github.com/Tencent/HunyuanVideo). The selection of high-precision modules is based on the [implementation](https://github.com/tdrussell/diffusion-pipe/blob/407c04fdae1c9ab5e67b54d33bef62c3e0a8dbc7/models/wan.py) of [diffusion-pipe](https://github.com/tdrussell/diffusion-pipe). I would like to thank these repositories.
176
+
177
+ <details>
178
+ <summary>日本語</summary>
179
+ 重みを単純にFP8へcastするのではなく、適切なスケーリングでFP8形式に量子化することで、精度を維持しつつVRAM使用量を削減します。また、重要な重みはFP16/BF16/FP32形式で保持します。
180
+
181
+ モデルの重みは、fp16またはbf16が必要です。あらかじめfloat8_e4m3に変換された重みは使用できません。
182
+
183
+ Wan2.1の推論、学習のみ対応しています。
184
+
185
+ 推論時は`--fp8`オプションに加えて `--fp8_scaled`オプションを指定してください。
186
+
187
+ 学習時は`--fp8_base`オプションに加えて `--fp8_scaled`オプションを指定してください。
188
+
189
+ 謝辞:この機能は、[HunyuanVideo](https://github.com/Tencent/HunyuanVideo)の[実装](https://github.com/Tencent/HunyuanVideo/blob/7df4a45c7e424a3f6cd7d653a7ff1f60cddc1eb1/hyvideo/modules/fp8_optimization.py)を参考にしました。また、高精度モジュールの選択においては[diffusion-pipe](https://github.com/tdrussell/diffusion-pipe)の[実装](https://github.com/tdrussell/diffusion-pipe/blob/407c04fdae1c9ab5e67b54d33bef62c3e0a8dbc7/models/wan.py)を参考にしました。これらのリポジトリに感謝します。
190
+
191
+ </details>
192
+
193
+ ### Key features and implementation details / 主な特徴と実装の詳細
194
+
195
+ - Implements FP8 (E4M3) weight quantization for Linear layers
196
+ - Reduces VRAM requirements by using 8-bit weights for storage (slightly increased compared to existing `--fp8` `--fp8_base` options)
197
+ - Quantizes weights to FP8 format with appropriate scaling instead of simple cast to FP8
198
+ - Maintains computational precision by dequantizing to original precision (FP16/BF16/FP32) during forward pass
199
+ - Preserves important weights in FP16/BF16/FP32 format
200
+
201
+ The implementation:
202
+
203
+ 1. Quantizes weights to FP8 format with appropriate scaling
204
+ 2. Replaces weights by FP8 quantized weights and stores scale factors in model state dict
205
+ 3. Applies monkey patching to Linear layers for transparent dequantization during computation
206
+
207
+ <details>
208
+ <summary>日本語</summary>
209
+
210
+ - Linear層のFP8(E4M3)重み量子化を実装
211
+ - 8ビットの重みを使用することでVRAM使用量を削減(既存の`--fp8` `--fp8_base` オプションに比べて微増)
212
+ - 単純なFP8へのcastではなく、適切な値でスケールして重みをFP8形式に量子化
213
+ - forward時に元の精度(FP16/BF16/FP32)に逆量子化して計算精度を維持
214
+ - 精度が重要な重みはFP16/BF16/FP32のまま保持
215
+
216
+ 実装:
217
+
218
+ 1. 精度を維持できる適切な倍率で重みをFP8形式に量子化
219
+ 2. 重みをFP8量子化重みに置き換え、倍率をモデルのstate dictに保存
220
+ 3. Linear層にmonkey patchingすることでモデルを変更せずに逆量子化
221
+ </details>
222
+
223
+ ## PyTorch Dynamo optimization for model training / モデルの学習におけるPyTorch Dynamoの最適化
224
+
225
+ The PyTorch Dynamo options are now available to optimize the training process. PyTorch Dynamo is a Python-level JIT compiler designed to make unmodified PyTorch programs faster by using TorchInductor, a deep learning compiler. This integration allows for potential speedups in training while maintaining model accuracy.
226
+
227
+ [PR #215](https://github.com/kohya-ss/musubi-tuner/pull/215) added this feature.
228
+
229
+ Specify the `--dynamo_backend` option to enable Dynamo optimization with one of the available backends from the `DynamoBackend` enum.
230
+
231
+ Additional options allow for fine-tuning the Dynamo behavior:
232
+ - `--dynamo_mode`: Controls the optimization strategy
233
+ - `--dynamo_fullgraph`: Enables fullgraph mode for potentially better optimization
234
+ - `--dynamo_dynamic`: Enables dynamic shape handling
235
+
236
+ The `--dynamo_dynamic` option has been reported to have many problems based on the validation in PR #215.
237
+
238
+ ### Available options:
239
+
240
+ ```
241
+ --dynamo_backend {NO, INDUCTOR, NVFUSER, CUDAGRAPHS, CUDAGRAPHS_FALLBACK, etc.}
242
+ Specifies the Dynamo backend to use (default is NO, which disables Dynamo)
243
+
244
+ --dynamo_mode {default, reduce-overhead, max-autotune}
245
+ Specifies the optimization mode (default is 'default')
246
+ - 'default': Standard optimization
247
+ - 'reduce-overhead': Focuses on reducing compilation overhead
248
+ - 'max-autotune': Performs extensive autotuning for potentially better performance
249
+
250
+ --dynamo_fullgraph
251
+ Flag to enable fullgraph mode, which attempts to capture and optimize the entire model graph
252
+
253
+ --dynamo_dynamic
254
+ Flag to enable dynamic shape handling for models with variable input shapes
255
+ ```
256
+
257
+ ### Usage example:
258
+
259
+ ```bash
260
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode default
261
+ ```
262
+
263
+ For more aggressive optimization:
264
+ ```bash
265
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode max-autotune --dynamo_fullgraph
266
+ ```
267
+
268
+ Note: The best combination of options may depend on your specific model and hardware. Experimentation may be necessary to find the optimal configuration.
269
+
270
+ <details>
271
+ <summary>日本語</summary>
272
+ PyTorch Dynamoオプションが学習プロセスを最適化するために追加されました。PyTorch Dynamoは、TorchInductor(ディープラーニングコンパイラ)を使用して、変更を加えることなくPyTorchプログラムを高速化するためのPythonレベルのJITコンパイラです。この統合により、モデルの精度を維持しながら学習の高速化が期待できます。
273
+
274
+ [PR #215](https://github.com/kohya-ss/musubi-tuner/pull/215) で追加されました。
275
+
276
+ `--dynamo_backend`オプションを指定して、`DynamoBackend`列挙型から利用可能なバックエンドの一つを選択することで、Dynamo最適化を有効にします。
277
+
278
+ 追加のオプションにより、Dynamoの動作を微調整できます:
279
+ - `--dynamo_mode`:最適化戦略を制御します
280
+ - `--dynamo_fullgraph`:より良い最適化の可能性のためにフルグラフモードを有効にします
281
+ - `--dynamo_dynamic`:動的形状処理を有効にします
282
+
283
+ PR #215での検証によると、`--dynamo_dynamic`には問題が多いことが報告されています。
284
+
285
+ __利用可能なオプション:__
286
+
287
+ ```
288
+ --dynamo_backend {NO, INDUCTOR, NVFUSER, CUDAGRAPHS, CUDAGRAPHS_FALLBACK, など}
289
+ 使用するDynamoバックエンドを指定します(デフォルトはNOで、Dynamoを無効にします)
290
+
291
+ --dynamo_mode {default, reduce-overhead, max-autotune}
292
+ 最適化モードを指定します(デフォルトは 'default')
293
+ - 'default':標準的な最適化
294
+ - 'reduce-overhead':コンパイルのオーバーヘッド削減に焦点を当てる
295
+ - 'max-autotune':より良いパフォーマンスのために広範な自動調整を実行
296
+
297
+ --dynamo_fullgraph
298
+ フルグラフモードを有効にするフラグ。モデルグラフ全体をキャプチャして最適化しようとします
299
+
300
+ --dynamo_dynamic
301
+ 可変入力形状を持つモデルのための動的形状処理を有効にするフラグ
302
+ ```
303
+
304
+ __使用例:__
305
+
306
+ ```bash
307
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode default
308
+ ```
309
+
310
+ より積極的な最適化の場合:
311
+ ```bash
312
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode max-autotune --dynamo_fullgraph
313
+ ```
314
+
315
+ 注意:最適なオプションの組み合わせは、特定のモデルとハードウェアに依存する場合があります。最適な構成を見つけるために実験が必要かもしれません。
316
+ </details>
docs/framepack.md ADDED
@@ -0,0 +1,331 @@
1
+ # FramePack
2
+
3
+ ## Overview / 概要
4
+
5
+ This document describes the usage of the [FramePack](https://github.com/lllyasviel/FramePack) architecture within the Musubi Tuner framework. FramePack is a novel video generation architecture developed by lllyasviel.
6
+
7
+ Key differences from HunyuanVideo:
8
+ - FramePack only supports Image-to-Video (I2V) generation. Text-to-Video (T2V) is not supported.
9
+ - It utilizes a different DiT model architecture and requires an additional Image Encoder. The VAE is the same as HunyuanVideo's. The Text Encoders appear to be the same as HunyuanVideo's, but we follow the original FramePack method for using them.
10
+ - Caching and training scripts are specific to FramePack (`fpack_*.py`).
11
+ - Due to its progressive generation nature, VRAM usage can be significantly lower, especially for longer videos, compared to other architectures.
12
+
13
+ This feature is experimental.
14
+
15
+ <details>
16
+ <summary>日本語</summary>
17
+ このドキュメントは、Musubi Tunerフレームワーク内での[FramePack](https://github.com/lllyasviel/FramePack) アーキテクチャの使用法について説明しています。FramePackは、lllyasviel氏によって開発された新しいビデオ生成アーキテクチャです。
18
+
19
+ HunyuanVideoとの主な違いは次のとおりです。
20
+ - FramePackは、画像からビデオ(I2V)生成のみをサポートしています。テキストからビデオ(T2V)はサポートされていません。
21
+ - 異なるDiTモデルアーキテクチャを使用し、追加の画像エンコーダーが必要です。VAEはHunyuanVideoと同じです。テキストエンコーダーはHunyuanVideoと同じと思われますが、FramePack公式と同じ方法で推論を行っています。
22
+ - キャッシングと学習スクリプトはFramePack専用(`fpack_*.py`)です。
23
+ - セクションずつ生成するため、他のアーキテクチャと比較して、特に長いビデオの場合、VRAM使用量が大幅に少なくなる可能性があります。
24
+
25
+ この機能は実験的なものです。
26
+ </details>
27
+
28
+ ## Download the model / モデルのダウンロード
29
+
30
+ You need to download the DiT, VAE, Text Encoder 1 (LLaMA), Text Encoder 2 (CLIP), and Image Encoder (SigLIP) models specifically for FramePack. Several download options are available for each component.
31
+
32
+ **Note:** The weights are publicly available on the following page: [maybleMyers/framepack_h1111](https://huggingface.co/maybleMyers/framepack_h1111). Thank you, maybleMyers!
33
+
34
+ ### DiT Model
35
+
36
+ Choose one of the following methods:
37
+
38
+ 1. **From lllyasviel's Hugging Face repo:** Download the three `.safetensors` files (starting with `diffusion_pytorch_model-00001-of-00003.safetensors`) from [lllyasviel/FramePackI2V_HY](https://huggingface.co/lllyasviel/FramePackI2V_HY). Specify the path to the first file (`...-00001-of-00003.safetensors`) as the `--dit` argument.
39
+ 2. **From local FramePack installation:** If you have cloned and run the official FramePack repository, the model might be downloaded locally. Specify the path to the snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--lllyasviel--FramePackI2V_HY/snapshots/<hex-uuid-folder>`.
40
+ 3. **From Kijai's Hugging Face repo:** Download the single file `FramePackI2V_HY_bf16.safetensors` from [Kijai/HunyuanVideo_comfy](https://huggingface.co/Kijai/HunyuanVideo_comfy/blob/main/FramePackI2V_HY_bf16.safetensors). Specify the path to this file as the `--dit` argument.
41
+
42
+ ### VAE Model
43
+
44
+ Choose one of the following methods:
45
+
46
+ 1. **Use official HunyuanVideo VAE:** Follow the instructions in the main [README.md](../README.md#model-download).
47
+ 2. **From hunyuanvideo-community Hugging Face repo:** Download `vae/diffusion_pytorch_model.safetensors` from [hunyuanvideo-community/HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo).
48
+ 3. **From local FramePack installation:** If you have cloned and run the official FramePack repository, the VAE might be downloaded locally within the HunyuanVideo community model snapshot. Specify the path to the snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--hunyuanvideo-community--HunyuanVideo/snapshots/<hex-uuid-folder>`.
49
+
50
+ ### Text Encoder 1 (LLaMA) Model
51
+
52
+ Choose one of the following methods:
53
+
54
+ 1. **From Comfy-Org Hugging Face repo:** Download `split_files/text_encoders/llava_llama3_fp16.safetensors` from [Comfy-Org/HunyuanVideo_repackaged](https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged).
55
+ 2. **From hunyuanvideo-community Hugging Face repo:** Download the four `.safetensors` files (starting with `text_encoder/model-00001-of-00004.safetensors`) from [hunyuanvideo-community/HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo). Specify the path to the first file (`...-00001-of-00004.safetensors`) as the `--text_encoder1` argument.
56
+ 3. **From local FramePack installation:** (Same as VAE) Specify the path to the HunyuanVideo community model snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--hunyuanvideo-community--HunyuanVideo/snapshots/<hex-uuid-folder>`.
57
+
58
+ ### Text Encoder 2 (CLIP) Model
59
+
60
+ Choose one of the following methods:
61
+
62
+ 1. **From Comfy-Org Hugging Face repo:** Download `split_files/text_encoders/clip_l.safetensors` from [Comfy-Org/HunyuanVideo_repackaged](https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged).
63
+ 2. **From hunyuanvideo-community Hugging Face repo:** Download `text_encoder_2/model.safetensors` from [hunyuanvideo-community/HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo).
64
+ 3. **From local FramePack installation:** (Same as VAE) Specify the path to the HunyuanVideo community model snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--hunyuanvideo-community--HunyuanVideo/snapshots/<hex-uuid-folder>`.
65
+
66
+ ### Image Encoder (SigLIP) Model
67
+
68
+ Choose one of the following methods:
69
+
70
+ 1. **From Comfy-Org Hugging Face repo:** Download `sigclip_vision_patch14_384.safetensors` from [Comfy-Org/sigclip_vision_384](https://huggingface.co/Comfy-Org/sigclip_vision_384).
71
+ 2. **From lllyasviel's Hugging Face repo:** Download `image_encoder/model.safetensors` from [lllyasviel/flux_redux_bfl](https://huggingface.co/lllyasviel/flux_redux_bfl).
72
+ 3. **From local FramePack installation:** If you have cloned and run the official FramePack repository, the model might be downloaded locally. Specify the path to the snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--lllyasviel--flux_redux_bfl/snapshots/<hex-uuid-folder>`.
73
+
74
+ <details>
75
+ <summary>日本語</summary>
76
+
77
+ ※以下のページに重みが一括で公開されています。maybleMyers 氏に感謝いたします。: https://huggingface.co/maybleMyers/framepack_h1111
78
+
79
+ DiT、VAE、テキストエンコーダー1(LLaMA)、テキストエンコーダー2(CLIP)、および画像エンコーダー(SigLIP)モデルは複数の方法でダウンロードできます。英語の説明を参考にして、ダウンロードしてください。
80
+
81
+ FramePack公式のリポジトリをクローンして実行した場合、モデルはローカルにダウンロードされている可能性があります。スナップショットディレクトリへのパスを指定してください。例:`path/to/FramePack/hf_download/hub/models--lllyasviel--flux_redux_bfl/snapshots/<hex-uuid-folder>`
82
+
83
+ HunyuanVideoの推論をComfyUIですでに行っている場合、いくつかのモデルはすでにダウンロードされている可能性があります。
84
+ </details>
85
+
86
+ ## Pre-caching / 事前キャッシング
87
+
88
+ The default resolution for FramePack is 640x640. See [the source code](../frame_pack/bucket_tools.py) for the default resolution of each bucket.
89
+
90
+ The dataset for training must be a video dataset. Image datasets are not supported. You can train on videos of any length. Specify `frame_extraction` as `full` and set `max_frames` to a sufficiently large value. However, if the video is too long, you may run out of VRAM during VAE encoding.
91
+
92
+ ### Latent Pre-caching / latentの事前キャッシング
93
+
94
+ Latent pre-caching uses a dedicated script for FramePack. You **must** provide the Image Encoder model.
95
+
96
+ ```bash
97
+ python fpack_cache_latents.py \
98
+ --dataset_config path/to/toml --vanilla_sampling \
99
+ --vae path/to/vae_model.safetensors \
100
+ --image_encoder path/to/image_encoder_model.safetensors \
101
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128
102
+ ```
103
+
104
+ Key differences from HunyuanVideo caching:
105
+ - Uses `fpack_cache_latents.py`.
106
+ - Requires the `--image_encoder` argument pointing to the downloaded SigLIP model.
107
+ - You can use the `--latent_window_size` argument (default 9) which defines the size of the latent sections FramePack processes (omitted in the example). This value should typically not be changed unless you understand the implications.
108
+ - The script generates multiple cache files per video, each corresponding to a different section, with the section index appended to the filename (e.g., `..._frame_pos-0000-count_...` becomes `..._frame_pos-0000-0000-count_...`, `..._frame_pos-0000-0001-count_...`, etc.).
109
+ - Image embeddings are calculated using the Image Encoder and stored in the cache files alongside the latents.
110
+
111
+ By default, the sampling method is Inverted anti-drifting from the paper (the same as during inference, using the latents and indices in reverse order). You can switch to the paper's Vanilla sampling (using the temporally ordered latents and indices) by specifying `--vanilla_sampling`. Preliminary tests suggest that Vanilla sampling may yield better quality. If you change this option, overwrite the existing cache without specifying `--skip_existing`.
112
+
113
+ For VRAM savings during VAE decoding, consider using `--vae_chunk_size` and `--vae_spatial_tile_sample_min_size`. If VRAM is overflowing and using shared memory, it is recommended to set `--vae_chunk_size` to 16 or 8, and `--vae_spatial_tile_sample_min_size` to 64 or 32.
114
+
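+ For example, on a VRAM-constrained machine the caching command above could use smaller values (illustrative only):
+
+ ```bash
+ --vae_chunk_size 16 --vae_spatial_tile_sample_min_size 64
+ ```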
115
+ <details>
116
+ <summary>日本語</summary>
117
+ FramePackのデフォルト解像度は640x640です。各バケットのデフォルト解像度については、[ソースコード](../frame_pack/bucket_tools.py)を参照してください。
118
+
119
+ 画像データセットでの学習は行えません。また動画の長さによらず学習可能です。 `frame_extraction` に `full` を指定して、`max_frames` に十分に大きな値を指定してください。ただし、あまりにも長いとVAEのencodeでVRAMが不足する可能性があります。
120
+
121
+ latentの事前キャッシングはFramePack専用のスクリプトを使用します。画像エンコーダーモデルを指定する必要があります。
122
+
123
+ HunyuanVideoのキャッシングとの主な違いは次のとおりです。
124
+ - `fpack_cache_latents.py`を使用します。
125
+ - ダウンロードしたSigLIPモデルを指す`--image_encoder`引数が必要です。
126
+ - `--latent_window_size`引数(デフォルト9)を指定できます(例では省略)。これは、FramePackが処理するlatentセクションのサイズを定義します。この値は、影響を理解していない限り、通常変更しないでください。
127
+ - スクリプトは、各ビデオに対して複数のキャッシュファイルを生成します。各ファイルは異なるセクションに対応し、セクションインデックスがファイル名に追加されます(例:`..._frame_pos-0000-count_...`は`..._frame_pos-0000-0000-count_...`、`..._frame_pos-0000-0001-count_...`などになります)。
128
+ - 画像埋め込みは画像エンコーダーを使用して計算され、latentとともにキャッシュファイルに保存されます。
129
+
130
+ デフォルトでは、論文のサンプリング方法 Inverted anti-drifting (推論時と同じ、逆順の latent と index を使用)を使用します。`--vanilla_sampling`を指定すると Vanilla sampling (時間順の latent と index を使用)に変更できます。簡単なテストの結果では、Vanilla sampling の方が品質が良いようです。このオプションの有無を変更する場合には `--skip_existing` を指定せずに既存のキャッシュを上書きしてください。
131
+
132
+ VAEのdecode時のVRAM節約のために、`--vae_chunk_size`と`--vae_spatial_tile_sample_min_size`を使用することを検討してください。VRAMがあふれて共有メモリを使用している場合には、`--vae_chunk_size`を16、8などに、`--vae_spatial_tile_sample_min_size`を64、32などに変更することをお勧めします。
133
+ </details>
134
+
135
+ ### Text Encoder Output Pre-caching / テキストエンコーダー出力の事前キャッシング
136
+
137
+ Text encoder output pre-caching also uses a dedicated script.
138
+
139
+ ```bash
140
+ python fpack_cache_text_encoder_outputs.py \
141
+ --dataset_config path/to/toml \
142
+ --text_encoder1 path/to/text_encoder1 \
143
+ --text_encoder2 path/to/text_encoder2 \
144
+ --batch_size 16
145
+ ```
146
+
147
+ Key differences from HunyuanVideo caching:
148
+ - Uses `fpack_cache_text_encoder_outputs.py`.
149
+ - Requires both `--text_encoder1` (LLaMA) and `--text_encoder2` (CLIP) arguments.
150
+ - Uses `--fp8_llm` option to run the LLaMA Text Encoder 1 in fp8 mode for VRAM savings (similar to `--fp8_t5` in Wan2.1).
151
+ - Saves LLaMA embeddings, attention mask, and CLIP pooler output to the cache file.
152
+
153
+ <details>
154
+ <summary>日本語</summary>
155
+ テキストエンコーダー出力の事前キャッシングも専用のスクリプトを使用します。
156
+
157
+ HunyuanVideoのキャッシングとの主な違いは次のとおりです。
158
+ - `fpack_cache_text_encoder_outputs.py`を使用します。
159
+ - LLaMAとCLIPの両方の引数が必要です。
160
+ - LLaMAテキストエンコーダー1をfp8モードで実行するための`--fp8_llm`オプションを使用します(Wan2.1の`--fp8_t5`に似ています)。
161
+ - LLaMAの埋め込み、アテンションマスク、CLIPのプーラー出力をキャッシュファイルに保存します。
162
+
163
+ </details>
164
+
165
+
166
+ ## Training / 学習
167
+
168
+ ### Training
169
+
170
+ Training uses a dedicated script `fpack_train_network.py`. Remember FramePack only supports I2V training.
171
+
172
+ ```bash
173
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 fpack_train_network.py \
174
+ --dit path/to/dit_model \
175
+ --vae path/to/vae_model.safetensors \
176
+ --text_encoder1 path/to/text_encoder1 \
177
+ --text_encoder2 path/to/text_encoder2 \
178
+ --image_encoder path/to/image_encoder_model.safetensors \
179
+ --dataset_config path/to/toml \
180
+ --sdpa --mixed_precision bf16 \
181
+ --optimizer_type adamw8bit --learning_rate 2e-4 --gradient_checkpointing \
182
+ --timestep_sampling shift --weighting_scheme none --discrete_flow_shift 3.0 \
183
+ --max_data_loader_n_workers 2 --persistent_data_loader_workers \
184
+ --network_module networks.lora_framepack --network_dim 32 \
185
+ --max_train_epochs 16 --save_every_n_epochs 1 --seed 42 \
186
+ --output_dir path/to/output_dir --output_name name-of-lora
187
+ ```
188
+
189
+ If you use the command prompt (Windows, not PowerShell), you may need to write them in a single line, or use `^` at the end of each line to continue the command.
190
+
191
+ The maximum value for `--blocks_to_swap` is 36. The default resolution for FramePack is 640x640, which requires around 17GB of VRAM. If you run out of VRAM, consider lowering the dataset resolution.
192
+
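+ If you still run out of VRAM, the memory-saving options listed below can be added to the command above, for example (a sketch; the values are illustrative):
+
+ ```bash
+ --fp8_base --fp8_scaled --blocks_to_swap 36 --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128
+ ```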
193
+ Key differences from HunyuanVideo training:
194
+ - Uses `fpack_train_network.py`.
195
+ - **Requires** specifying `--vae`, `--text_encoder1`, `--text_encoder2`, and `--image_encoder`.
196
+ - **Requires** specifying `--network_module networks.lora_framepack`.
197
+ - Optional `--latent_window_size` argument (default 9, should match caching).
198
+ - Memory saving options like `--fp8_base` (for DiT) and `--fp8_llm` (for Text Encoder 1) are available. `--fp8_scaled` is recommended when using `--fp8_base` for DiT.
199
+ - `--vae_chunk_size` and `--vae_spatial_tile_sample_min_size` options are available for the VAE to prevent out-of-memory during sampling (similar to caching).
200
+ - `--gradient_checkpointing` is available for memory savings.
201
+ <!-- - Use `convert_lora.py` for converting the LoRA weights after training, similar to HunyuanVideo. -->
202
+
203
+ Training settings (learning rate, optimizers, etc.) are experimental. Feedback is welcome.
204
+
205
+ <details>
206
+ <summary>日本語</summary>
207
+ FramePackの学習は専用のスクリプト`fpack_train_network.py`を使用します。FramePackはI2V学習のみをサポートしています。
208
+
209
+ コマンド記述例は英語版を参考にしてください。WindowsでPowerShellではなくコマンドプロンプトを使用している場合、コマンドを1行で記述するか、各行の末尾に`^`を付けてコマンドを続ける必要があります。
210
+
211
+ `--blocks_to_swap`の最大値は36です。FramePackのデフォルト解像度(640x640)では、17GB程度のVRAMが必要です。VRAM容量が不足する場合は、データセットの解像度を下げてください。
212
+
213
+ HunyuanVideoの学習との主な違いは次のとおりです。
214
+ - `fpack_train_network.py`を使用します。
215
+ - `--vae`、`--text_encoder1`、`--text_encoder2`、`--image_encoder`を指定する必要があります。
216
+ - `--network_module networks.lora_framepack`を指定する必要があります。
217
+ - 必要に応じて`--latent_window_size`引数(デフォルト9)を指定できます(キャッシング時と一致させる必要があります)。
218
+ - `--fp8_base`(DiT用)や`--fp8_llm`(テキストエンコーダー1用)などのメモリ節約オプションが利用可能です。`--fp8_base`指定時は、`--fp8_scaled`を使用することをお勧めします。
219
+ - サンプル生成時にメモリ不足を防ぐため、VAE用の`--vae_chunk_size`、`--vae_spatial_tile_sample_min_size`オプションが利用可能です(キャッシング時と同様)。
220
+ - メモリ節約のために`--gradient_checkpointing`が利用可能です。
221
+
222
+ </details>
223
+
224
+ ## Inference
225
+
226
+ Inference uses a dedicated script `fpack_generate_video.py`.
227
+
228
+ ```bash
229
+ python fpack_generate_video.py \
230
+ --dit path/to/dit_model \
231
+ --vae path/to/vae_model.safetensors \
232
+ --text_encoder1 path/to/text_encoder1 \
233
+ --text_encoder2 path/to/text_encoder2 \
234
+ --image_encoder path/to/image_encoder_model.safetensors \
235
+ --image_path path/to/start_image.jpg \
236
+ --prompt "A cat walks on the grass, realistic style." \
237
+ --video_size 512 768 --video_seconds 5 --fps 30 --infer_steps 25 \
238
+ --attn_mode sdpa --fp8_scaled \
239
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
240
+ --save_path path/to/save/dir --output_type both \
241
+ --seed 1234 --lora_multiplier 1.0 --lora_weight path/to/lora.safetensors
242
+ ```
243
+ <!-- --embedded_cfg_scale 10.0 --guidance_scale 1.0 \ -->
244
+
245
+ Key differences from HunyuanVideo inference:
246
+ - Uses `fpack_generate_video.py`.
247
+ - **Requires** specifying `--vae`, `--text_encoder1`, `--text_encoder2`, and `--image_encoder`.
248
+ - **Requires** specifying `--image_path` for the starting frame.
249
+ - **Requires** specifying `--video_seconds` (length of the video in seconds).
250
+ - `--video_size` is the size of the generated video, height and width are specified in that order.
251
+ - `--prompt`: Prompt for generation.
252
+ - Optional `--latent_window_size` argument (default 9, should match caching and training).
253
+ - `--fp8_scaled` option is available for DiT to reduce memory usage. Quality may be slightly lower. `--fp8_llm` option is available to reduce memory usage of Text Encoder 1. `--fp8` alone is also an option for DiT but `--fp8_scaled` potentially offers better quality.
254
+ - LoRA loading options (`--lora_weight`, `--lora_multiplier`, `--include_patterns`, `--exclude_patterns`) are available. `--lycoris` is also supported.
255
+ - `--embedded_cfg_scale` (default 10.0) controls the distilled guidance scale.
256
+ - `--guidance_scale` (default 1.0) controls the standard classifier-free guidance scale. **Changing this from 1.0 is generally not recommended for the base FramePack model.**
257
+ - `--guidance_rescale` (default 0.0) is available but typically not needed.
258
+ - `--bulk_decode` option can decode all frames at once, potentially faster but uses more VRAM during decoding. `--vae_chunk_size` and `--vae_spatial_tile_sample_min_size` options are recommended to prevent out-of-memory errors.
259
+ - `--sample_solver` (default `unipc`) is available but only `unipc` is implemented.
260
+ - `--save_merged_model` option is available to save the DiT model after merging LoRA weights. Inference is skipped if this is specified.
261
+ - Batch and interactive modes (`--from_file`, `--interactive`) are **not yet implemented** for FramePack generation.
262
+
263
+ **Section-specific Prompts**
264
+
265
+ You can now provide different prompts for different sections of the video using the `--prompt` argument. Use `;;;` to separate sections and specify the starting section index followed by a colon (e.g., `0:prompt A;;;3:prompt B`). Each definition should be in the format `INDEX:PROMPT_TEXT`.
266
+
267
+ * `INDEX` can be:
268
+ * A non-negative integer (e.g., `0`, `3`): The prompt applies to this section index.
269
+ * A negative integer (e.g., `-1`, `-2`): The prompt applies to the k-th section from the end (e.g., `-1` for the last section, `-2` for the second to last).
270
+ * A range (e.g., `0-2`, `3-5`): The prompt applies to all sections within this inclusive range.
271
+ * If some parts are not specified with an index, the prompt associated with index `0` will be used (e.g., `0:prompt A;;;-1:prompt B` means the last section is prompt B, and all others are prompt A).
272
+ * This can be used with the end image guidance feature to specify a different prompt for the last section.
273
+ * If no index is specified for a part (e.g., `prompt A;;;3:prompt B`), it defaults to index `0`.
274
+ * Example 1: `"0:A cat walks;;;3:The cat sits down;;;-1:The cat sleeps"`
275
+ * Example 2: `"0:A cat turns around;;;-1:A cat walks towards the camera"`
276
+
277
+ **End Image Guidance**
278
+
279
+ Specify an `--end_image_path` to guide the generation towards a specific final frame. This is highly experimental.
280
+
281
+ * `--end_image_path` : Path to an image to be used as a target for the final frame. The generation process for the last section will be conditioned on this image's VAE latent and image encoder embedding. This may affect the naturalness of the transition into the final frames.
282
+
283
+ Other options like `--video_size`, `--fps`, `--infer_steps`, `--save_path`, `--output_type`, `--seed`, `--attn_mode`, `--blocks_to_swap`, `--vae_chunk_size`, `--vae_spatial_tile_sample_min_size` function similarly to HunyuanVideo/Wan2.1 where applicable.
284
+
285
+ The maximum value for `--blocks_to_swap` is 38.
286
+ <details>
287
+ <summary>日本語</summary>
288
+
289
+ FramePackの推論は専用のスクリプト`fpack_generate_video.py`を使用します。コマンド記述例は英語版を参考にしてください。
290
+
291
+ HunyuanVideoの推論との主な違いは次のとおりです。
292
+ - `fpack_generate_video.py`を使用します。
293
+ - `--vae`、`--text_encoder1`、`--text_encoder2`、`--image_encoder`を指定する必要があります。
294
+ - `--image_path`を指定する必要があります(開始フレーム)。
295
+ - `--video_seconds`を指定する必要があります(秒単位でのビデオの長さを指定)。
296
+ - `--video_size`は生成するビデオのサイズで、高さと幅をその順番で指定します。
297
+ - `--prompt`: 生成用のプロンプトです。
298
+ - 必要に応じて`--latent_window_size`引数(デフォルト9)を指定できます(キャッシング時、学習時と一致させる必要があります)。
299
+ - DiTのメモリ使用量を削減するために、`--fp8_scaled`オプションを指定可能です。品質はやや低下する可能性があります。またText Encoder 1のメモリ使用量を削減するために、`--fp8_llm`オプションを指定可能です。DiT用に`--fp8`単独のオプションも用意されていますが、`--fp8_scaled`の方が品質が良い可能性があります。
300
+ - LoRAの読み込みオプション(`--lora_weight`、`--lora_multiplier`、`--include_patterns`、`--exclude_patterns`)が利用可能です。LyCORISもサポートされています。
301
+ - `--embedded_cfg_scale`(デフォルト10.0)は、蒸留されたガイダンススケールを制御します。通常は変更しないでください。
302
+ - `--guidance_scale`(デフォルト1.0)は、標準の分類器フリーガイダンススケールを制御します。**FramePackモデルのベースモデルでは、通常1.0から変更しないことをお勧めします。**
303
+ - `--guidance_rescale`(デフォルト0.0)も利用可能ですが、通常は必要ありません。
304
+ - `--bulk_decode`オプションは、すべてのフレームを一度にデコードできるオプションです。高速ですが、デコード中にVRAMを多く使用します。VRAM不足エラーを防ぐために、`--vae_chunk_size`と`--vae_spatial_tile_sample_min_size`オプションを指定することをお勧めします。
305
+ - `--sample_solver`(デフォルト`unipc`)は利用可能ですが、`unipc`のみが実装されています。
306
+ - `--save_merged_model`オプションは、LoRAの重みをマージした後にDiTモデルを保存するためのオプションです。これを指定すると推論はスキップされます。
307
+ - バッチモードとインタラクティブモード(`--from_file`、`--interactive`)はFramePack生成には**まだ実装されていません**。
308
+
309
+ **セクション別プロンプト:**
310
+
311
+ `--prompt`引数を使用して、ビデオの異なるセクションに異なるプロンプトを指定できるようになりました。セクションを区切るには`;;;`を使用し、開始セクションインデックスの後にコロンを付けて指定します(例:`0:プロンプトA;;;3:プロンプトB`)。各定義は`インデックス:プロンプトテキスト`の形式である必要があります。
312
+
313
+ * `インデックス`には以下を指定できます:
314
+ * 非負の整数(例:`0`, `3`):このセクションインデックスに対してプロンプトが適用されます。
315
+ * 負の整数(例:`-1`, `-2`):最後からk番目のセクションにプロンプトが適用されます(例:`-1`は最後のセクション、`-2`は最後から2番目のセクション)。
316
+ * 範囲(例:`0-2`, `3-5`):この範囲(両端を含む)内のすべてのセクションにプロンプトが適用されます。
317
+ * インデックスが指定されていない部分は、インデックス`0`のプロンプトが適用されます。(例:`0:プロンプトA;;;-1:プロンプトB`なら、一番最後がプロンプトB、それ以外はプロンプトAになります。)
318
+ * 終端画像ガイダンスを使用する場合、この形式をお勧めします。
319
+ * ある部分にインデックスが指定されていない場合(例:`プロンプトA;;;3:プロンプトB`)、インデックス`0`として扱われます。
320
+
321
+
322
+ **終端画像ガイダンス**
323
+
324
+ `--end_image_path`を指定して、生成を特定の最終フレームに誘導します。これは非常に実験的な機能です。
325
+
326
+ - `--end_image_path` : 最終フレームのターゲットとして使用する画像へのパス。最後のセクションの生成プロセスは、この画像を初期画像として生成されます。これは最終フレームへの遷移の自然さに影響を与える可能性があります。
327
+
328
+ `--video_size`、`--fps`、`--infer_steps`、`--save_path`、`--output_type`、`--seed`、`--attn_mode`、`--blocks_to_swap`、`--vae_chunk_size`、`--vae_spatial_tile_sample_min_size`などの他のオプションは、HunyuanVideo/Wan2.1と同様に機能します。
329
+
330
+ `--blocks_to_swap`の最大値は38です。
331
+ </details>
docs/sampling_during_training.md ADDED
@@ -0,0 +1,116 @@
1
+ > 📝 Click on the language section to expand / 言語をクリックして展開
2
+
3
+ # Sampling during training / 学習中のサンプル画像生成
4
+
5
+ By preparing a prompt file, you can generate sample images during training.
6
+
7
+ Please be aware that sample generation consumes a considerable amount of VRAM, so be careful when generating samples for videos with a large number of frames. Also, since generation takes time, adjust the frequency of sample generation as needed.
8
+
9
+ <details>
10
+ <summary>日本語</summary>
11
+
12
+ プロンプトファイルを用意することで、学習中にサンプル画像を生成することができます。
13
+
14
+ VRAMをそれなりに消費しますので、特にフレーム数が多い動画を生成する場合は注意してください。また生成には時間がかかりますので、サンプル画像生成の頻度は適宜調整してください。
15
+ </details>
16
+
17
+ ## How to use / 使い方
18
+
19
+ ### Command line options for training with sampling / サンプル画像生成に関連する学習時のコマンドラインオプション
20
+
21
+ Example of command line options for training with sampling / 記述例:
22
+
23
+ ```bash
24
+ --vae path/to/ckpts/hunyuan-video-t2v-720p/vae/pytorch_model.pt
25
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128
26
+ --text_encoder1 path/to/ckpts/text_encoder
27
+ --text_encoder2 path/to/ckpts/text_encoder_2
28
+ --sample_prompts /path/to/prompt_file.txt
29
+ --sample_every_n_epochs 1 --sample_every_n_steps 1000 --sample_at_first
30
+ ```
31
+
32
+ `--vae`, `--vae_chunk_size`, `--vae_spatial_tile_sample_min_size`, `--text_encoder1`, `--text_encoder2` are the same as when generating images, so please refer to [here](/README.md#inference) for details. `--fp8_llm` can also be specified.
33
+
34
+ `--sample_prompts` specifies the path to the prompt file used for sample image generation. Details are described below.
35
+
36
+ `--sample_every_n_epochs` specifies how often to generate sample images in epochs, and `--sample_every_n_steps` specifies how often to generate sample images in steps.
37
+
38
+ `--sample_at_first` is specified when generating sample images at the beginning of training.
39
+
40
+ Sample images and videos are saved in the `sample` directory in the directory specified by `--output_dir`. They are saved as `.png` for still images and `.mp4` for videos.
41
+
42
+ <details>
43
+ <summary>日本語</summary>
44
+
45
+ `--vae`、`--vae_chunk_size`、`--vae_spatial_tile_sample_min_size`、`--text_encoder1`、`--text_encoder2`は、画像生成時と同様ですので、詳細は[こちら](/README.ja.md#推論)を参照してください。`--fp8_llm`も指定可能です。
46
+
47
+ `--sample_prompts`は、サンプル画像生成に使用するプロンプトファイルのパスを指定します。詳細は後述します。
48
+
49
+ `--sample_every_n_epochs`は、何エポックごとにサンプル画像を生成するかを、`--sample_every_n_steps`は、何ステップごとにサンプル画像を生成するかを指定します。
50
+
51
+ `--sample_at_first`は、学習開始時にサンプル画像を生成する場合に指定します。
52
+
53
+ サンプル画像、動画は、`--output_dir`で指定したディレクトリ内の、`sample`ディレクトリに保存されます。静止画の場合は`.png`、動画の場合は`.mp4`で保存されます。
54
+ </details>
55
+
56
+ ### Prompt file / プロンプトファイル
57
+
58
+ The prompt file is a text file that contains the prompts for generating sample images. The example is as follows. / プロンプトファイルは、サンプル画像生成のためのプロンプトを記述したテキストファイルです。例は以下の通りです。
59
+
60
+ ```
61
+ # prompt 1: for generating a cat video
62
+ A cat walks on the grass, realistic style. --w 640 --h 480 --f 25 --d 1 --s 20
63
+
64
+ # prompt 2: for generating a dog image
65
+ A dog runs on the beach, realistic style. --w 960 --h 544 --f 1 --d 2 --s 20
66
+ ```
67
+
68
+ A line starting with `#` is a comment.
69
+
70
+ * `--w` specifies the width of the generated image or video. The default is 256.
71
+ * `--h` specifies the height. The default is 256.
72
+ * `--f` specifies the number of frames. The default is 1, which generates a still image.
73
+ * `--d` specifies the seed. The default is random.
74
+ * `--s` specifies the number of steps in generation. The default is 20.
75
+ * `--g` specifies the embedded guidance scale (not the CFG scale). The default is 6.0 for HunyuanVideo and 10.0 for FramePack, matching the default value used during inference for each architecture. Specify 1.0 for SkyReels V1 models. This option is ignored for Wan2.1 models.
76
+ * `--fs` specifies the discrete flow shift. The default is 14.5, which corresponds to 20 steps. The HunyuanVideo paper recommends 7.0 for 50 steps and 17.0 for fewer than 20 steps (e.g. 10). This option is ignored for FramePack models (10.0 is always used).
77
+
78
+ If you train I2V models, you must add the following option.
79
+
80
+ * `--i path/to/image.png`: the image path for image2video inference.
81
+
82
+ If you train Wan2.1-Fun-Control models, you must add the following option.
83
+
84
+ * `--cn path/to/control_video_or_dir_of_images`: the path to the video or directory containing multiple images for control.
85
+
86
+ If you train the model with classifier free guidance (such as Wan2.1), you can use the additional options below.
87
+
88
+ * `--n negative prompt...`: the negative prompt for the classifier free guidance. The default prompt for each model is used if omitted.
89
+ * `--l 6.0`: the classifier free guidance scale. Should be set to 6.0 for SkyReels V1 models. 5.0 is the default value for Wan2.1 (if omitted).
90
+
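+ For example, a prompt line for a Wan2.1 I2V model might look like this (a sketch; the image path and negative prompt are placeholders):
+
+ ```
+ A cat walks on the grass, realistic style. --w 832 --h 480 --f 81 --d 1 --s 20 --i path/to/start_image.png --n low quality, blurry --l 5.0
+ ```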
91
+ <details>
92
+ <summary>日本語</summary>
93
+
94
+ `#` で始まる行はコメントです。
95
+
96
+ * `--w` 生成画像、動画の幅を指定します。省略時は256です。
97
+ * `--h` 高さを指定します。省略時は256です。
98
+ * `--f` フレーム数を指定します。省略時は1で、静止画を生成します。
99
+ * `--d` シードを指定します。省略時はランダムです。
100
+ * `--s` 生成におけるステップ数を指定します。省略時は20です。
101
+ * `--g` embedded guidance scaleを指定します(CFG scaleではありません)。省略時はHunyuanVideoは6.0、FramePackは10.0で、各アーキテクチャの推論時のデフォルト値です。SkyReels V1モデルの場合は1.0を指定してください。Wan2.1モデルの場合はこのオプションは無視されます。
102
+ * `--fs` discrete flow shiftを指定します。省略時は14.5で、ステップ数20の場合に対応した値です。HunyuanVideoの論文では、ステップ数50の場合は7.0、ステップ数20未満(10など)で17.0が推奨されています。FramePackモデルはこのオプションは無視され、10.0が使用されます。
103
+
104
+ I2Vモデルを学習する場合、以下のオプションを追加してください。
105
+
106
+ * `--i path/to/image.png`: image2video推論用の画像パス。
107
+
108
+ Wan2.1-Fun-Controlモデルを学習する場合、以下のオプションを追加してください。
109
+
110
+ * `--cn path/to/control_video_or_dir_of_images`: control用の動画または複数枚の画像を含むディレクトリのパス。
111
+
112
+ classifier free guidance(ネガティブプロンプト)を必要とするモデル(Wan2.1など)を学習する場合、以下の追加オプションを使用できます。
113
+
114
+ *`--n negative prompt...`: classifier free guidance用のネガティブプロンプト。省略時はモデルごとのデフォルトプロンプトが使用されます。
115
+ *`--l 6.0`: classifier free guidance scale。SkyReels V1モデルの場合は6.0に設定してください。Wan2.1の場合はデフォルト値が5.0です(省略時)。
116
+ </details>
docs/wan.md ADDED
@@ -0,0 +1,531 @@
1
+ > 📝 Click on the language section to expand / 言語をクリックして展開
2
+
3
+ # Wan 2.1
4
+
5
+ ## Overview / 概要
6
+
7
+ This is an unofficial training and inference script for [Wan2.1](https://github.com/Wan-Video/Wan2.1). The features are as follows.
8
+
9
+ - fp8 support and memory reduction by block swap: inference of 720x1280x81-frame videos with 24GB VRAM, and training with 720x1280 images with 24GB VRAM
10
+ - Inference without installing Flash attention (using PyTorch's scaled dot product attention)
11
+ - Supports xformers and Sage attention
12
+
13
+ This feature is experimental.
14
+
15
+ <details>
16
+ <summary>日本語</summary>
17
+ [Wan2.1](https://github.com/Wan-Video/Wan2.1) の非公式の学習および推論スクリプトです。
18
+
19
+ 以下の特徴があります。
20
+
21
+ - fp8対応およびblock swapによる省メモリ化:720x1280x81framesの動画を24GB VRAMで推論可能、720x1280の画像での学習が24GB VRAMで可能
22
+ - Flash attentionのインストールなしでの実行(PyTorchのscaled dot product attentionを使用)
23
+ - xformersおよびSage attention対応
24
+
25
+ この機能は実験的なものです。
26
+ </details>
27
+
28
+ ## Download the model / モデルのダウンロード
29
+
30
+ Download the T5 `models_t5_umt5-xxl-enc-bf16.pth` and CLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` from the following page: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P/tree/main
31
+
32
+ Download the VAE from the above page `Wan2.1_VAE.pth` or download `split_files/vae/wan_2.1_vae.safetensors` from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
33
+
34
+ Download the DiT weights from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
35
+
36
+ Wan2.1 Fun Control model weights can be downloaded from [here](https://huggingface.co/alibaba-pai/Wan2.1-Fun-14B-Control). Navigate to each weight page and download. The Fun Control model seems to support not only T2V but also I2V tasks.
37
+
38
+ Please select the appropriate weights according to T2V, I2V, resolution, model size, etc.
39
+
40
+ `fp16` and `bf16` models can be used, and `fp8_e4m3fn` models can be used if `--fp8` (or `--fp8_base`) is specified without specifying `--fp8_scaled`. **Please note that `fp8_scaled` models are not supported even with `--fp8_scaled`.**
41
+
42
+ (Thanks to Comfy-Org for providing the repackaged weights.)
43
+
44
+ ### Model support matrix / モデルサポートマトリックス
45
+
46
+ * columns: training dtype (列:学習時のデータ型)
47
+ * rows: model dtype (行:モデルのデータ型)
48
+
49
+ | model \ training |bf16|fp16|--fp8_base|--fp8_base & --fp8_scaled|
50
+ |--|--|--|--|--|
51
+ |bf16|✓|--|✓|✓|
52
+ |fp16|--|✓|✓|✓|
53
+ |fp8_e4m3fn|--|--|✓|--|
54
+ |fp8_scaled|--|--|--|--|
55
+
56
+ <details>
57
+ <summary>日本語</summary>
58
+ T5 `models_t5_umt5-xxl-enc-bf16.pth` およびCLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を、次のページからダウンロードしてください:https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P/tree/main
59
+
60
+ VAEは上のページから `Wan2.1_VAE.pth` をダウンロードするか、次のページから `split_files/vae/wan_2.1_vae.safetensors` をダウンロードしてください:https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
61
+
62
+ DiTの重みを次のページからダウンロードしてください:https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
63
+
64
+ Wan2.1 Fun Controlモデルの重みは、[こちら](https://huggingface.co/alibaba-pai/Wan2.1-Fun-14B-Control)から、それぞれの重みのページに遷移し、ダウンロードしてください。Fun ControlモデルはT2VだけでなくI2Vタスクにも対応しているようです。
65
+
66
+ T2VやI2V、解像度、モデルサイズなどにより適切な重みを選択してください。
67
+
68
+ `fp16` および `bf16` モデルを使用できます。また、`--fp8` (または`--fp8_base`)を指定し`--fp8_scaled`を指定をしないときには `fp8_e4m3fn` モデルを使用できます。**`fp8_scaled` モデルはいずれの場合もサポートされていませんのでご注意ください。**
69
+
70
+ (repackaged版の重みを提供してくださっているComfy-Orgに感謝いたします。)
71
+ </details>
72
+
73
+ ## Pre-caching / 事前キャッシュ
74
+
75
+ ### Latent Pre-caching
76
+
77
+ Latent pre-caching is almost the same as in HunyuanVideo. Create the cache using the following command:
78
+
79
+ ```bash
80
+ python wan_cache_latents.py --dataset_config path/to/toml --vae path/to/wan_2.1_vae.safetensors
81
+ ```
82
+
83
+ If you train I2V models, add `--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` to specify the CLIP model. If not specified, the training will raise an error.
84
+
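+ For I2V training, the caching command might look like this (a sketch combining the options above):
+
+ ```bash
+ python wan_cache_latents.py --dataset_config path/to/toml --vae path/to/wan_2.1_vae.safetensors --clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
+ ```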
85
+ If you're running low on VRAM, specify `--vae_cache_cpu` to use the CPU for the VAE internal cache, which will reduce VRAM usage somewhat.
86
+
87
+ The control video settings are required for training the Fun-Control model. Please refer to [Dataset Settings](/dataset/dataset_config.md#sample-for-video-dataset-with-control-images) for details.
88
+
89
+ <details>
90
+ <summary>日本語</summary>
91
+ latentの事前キャッシングはHunyuanVideoとほぼ同じです。上のコマンド例を使用してキャッシュを作成してください。
92
+
93
+ I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。指定しないと学習時にエラーが発生します。
94
+
95
+ VRAMが不足している場合は、`--vae_cache_cpu` を指定するとVAEの内部キャッシュにCPUを使うことで、使用VRAMを多少削減できます。
96
+
97
+ Fun-Controlモデルを学習する場合は、制御用動画の設定が必要です。[データセット設定](/dataset/dataset_config.md#sample-for-video-dataset-with-control-images)を参照してください。
98
+ </details>
99
+
100
+ ### Text Encoder Output Pre-caching
101
+
102
+ Text encoder output pre-caching is also almost the same as in HunyuanVideo. Create the cache using the following command:
103
+
104
+ ```bash
105
+ python wan_cache_text_encoder_outputs.py --dataset_config path/to/toml --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth --batch_size 16
106
+ ```
107
+
108
+ Adjust `--batch_size` according to your available VRAM.
109
+
110
+ For systems with limited VRAM (less than ~16GB), use `--fp8_t5` to run the T5 in fp8 mode.
111
+
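+ For example, on a low-VRAM system (a sketch; adjust the batch size as needed):
+
+ ```bash
+ python wan_cache_text_encoder_outputs.py --dataset_config path/to/toml --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth --batch_size 4 --fp8_t5
+ ```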
112
+ <details>
113
+ <summary>日本語</summary>
114
+ テキストエンコーダ出力の事前キャッシングもHunyuanVideoとほぼ同じです。上のコマンド例を使用してキャッシュを作成してください。
115
+
116
+ 使用可能なVRAMに合わせて `--batch_size` を調整してください。
117
+
118
+ VRAMが限られているシステム(約16GB未満)の場合は、T5をfp8モードで実行するために `--fp8_t5` を使用してください。
119
+ </details>
120
+
121
+ ## Training / 学習
122
+
123
+ ### Training
124
+
125
+ Start training using the following command (input as a single line):
126
+
127
+ ```bash
128
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 wan_train_network.py
129
+ --task t2v-1.3B
130
+ --dit path/to/wan2.1_xxx_bf16.safetensors
131
+ --dataset_config path/to/toml --sdpa --mixed_precision bf16 --fp8_base
132
+ --optimizer_type adamw8bit --learning_rate 2e-4 --gradient_checkpointing
133
+ --max_data_loader_n_workers 2 --persistent_data_loader_workers
134
+ --network_module networks.lora_wan --network_dim 32
135
+ --timestep_sampling shift --discrete_flow_shift 3.0
136
+ --max_train_epochs 16 --save_every_n_epochs 1 --seed 42
137
+ --output_dir path/to/output_dir --output_name name-of-lora
138
+ ```
139
+ The above is an example. The appropriate values for `timestep_sampling` and `discrete_flow_shift` need to be determined by experimentation.
140
+
141
+ For additional options, use `python wan_train_network.py --help` (note that many options are unverified).
142
+
143
+ `--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (for Wan2.1 official models), `t2v-1.3B-FC`, `t2v-14B-FC`, and `i2v-14B-FC` (for Wan2.1 Fun Control model). Specify the DiT weights for the task with `--dit`.
144
+
145
+ Don't forget to specify `--network_module networks.lora_wan`.
146
+
147
+ Other options are mostly the same as `hv_train_network.py`.
148
+
149
+ Use `convert_lora.py` for converting the LoRA weights after training, as in HunyuanVideo.
150
+
151
+ <details>
152
+ <summary>日本語</summary>
153
+ `timestep_sampling`や`discrete_flow_shift`は一例です。どのような値が適切かは実験が必要です。
154
+
155
+ その他のオプションについては `python wan_train_network.py --help` を使用してください(多くのオプションは未検証です)。
156
+
157
+ `--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (これらはWan2.1公式モデル)、`t2v-1.3B-FC`, `t2v-14B-FC`, `i2v-14B-FC`(Wan2.1-Fun Controlモデル)を指定します。`--dit`に、taskに応じたDiTの重みを指定してください。
158
+
159
+ `--network_module` に `networks.lora_wan` を指定することを忘れないでください。
160
+
161
+ その他のオプションは、ほぼ`hv_train_network.py`と同様です。
162
+
163
+ 学習後のLoRAの重みの変換は、HunyuanVideoと同様に`convert_lora.py`を使用してください。
164
+ </details>
165
+
166
+ ### Command line options for training with sampling / サンプル画像生成に関連する学習時のコマンドラインオプション
167
+
168
+ Example of command line options for training with sampling / 記述例:
169
+
170
+ ```bash
171
+ --vae path/to/wan_2.1_vae.safetensors
172
+ --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth
173
+ --sample_prompts /path/to/prompt_file.txt
174
+ --sample_every_n_epochs 1 --sample_every_n_steps 1000 --sample_at_first
175
+ ```
176
+ Each option is the same as for inference and as for HunyuanVideo. Please refer to [here](/docs/sampling_during_training.md) for details.
177
+
178
+ If you train I2V models, add `--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` to specify the CLIP model.
179
+
180
+ You can specify the initial image, the negative prompt and the control video (for Wan2.1-Fun-Control) in the prompt file. Please refer to [here](/docs/sampling_during_training.md#prompt-file--プロンプトファイル).
181
+
182
+ <details>
183
+ <summary>日本語</summary>
184
+ 各オプションは推論時、およびHunyuanVideoの場合と同様です。[こちら](/docs/sampling_during_training.md)を参照してください。
185
+
186
+ I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。
187
+
188
+ プロンプトファイルで、初期画像やネガティブプロンプト、制御動画(Wan2.1-Fun-Control用)等を指定できます。[こちら](/docs/sampling_during_training.md#prompt-file--プロンプトファイル)を参照してください。
189
+ </details>
190
+
191
+
192
+ ## Inference / 推論
193
+
194
+ ### Inference Options Comparison / 推論オプション比較
195
+
196
+ #### Speed Comparison (Faster → Slower) / 速度比較(速い→遅い)
197
+ *Note: Results may vary depending on GPU type*
198
+
199
+ fp8_fast > bf16/fp16 (no block swap) > fp8 > fp8_scaled > bf16/fp16 (block swap)
200
+
201
+ #### Quality Comparison (Higher → Lower) / 品質比較(高→低)
202
+
203
+ bf16/fp16 > fp8_scaled > fp8 >> fp8_fast
204
+
205
+ ### T2V Inference / T2V推論
206
+
207
+ The following is an example of T2V inference (input as a single line):
208
+
209
+ ```bash
210
+ python wan_generate_video.py --fp8 --task t2v-1.3B --video_size 832 480 --video_length 81 --infer_steps 20
211
+ --prompt "prompt for the video" --save_path path/to/save.mp4 --output_type both
212
+ --dit path/to/wan2.1_t2v_1.3B_bf16_etc.safetensors --vae path/to/wan_2.1_vae.safetensors
213
+ --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth
214
+ --attn_mode torch
215
+ ```
216
+
217
+ `--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (these are Wan2.1 official models), `t2v-1.3B-FC`, `t2v-14B-FC` and `i2v-14B-FC` (for Wan2.1-Fun Control model).
218
+
219
+ `--attn_mode` is `torch`, `sdpa` (same as `torch`), `xformers`, `sageattn`, `flash2`, `flash` (same as `flash2`) or `flash3`. `torch` is the default. Other options require the corresponding library to be installed. `flash3` (Flash attention 3) is not tested.
220
+
221
+ Specifying `--fp8` runs DiT in fp8 mode. fp8 can significantly reduce memory consumption but may impact output quality.
222
+
223
+ `--fp8_scaled` can be specified in addition to `--fp8` to apply fp8 weight optimization to the model. This slightly increases memory consumption and slows down inference, but improves output quality. See [here](advanced_config.md#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化) for details.
224
+
225
+ The `--fp8_fast` option is also available for faster inference on RTX 40x0 GPUs. It requires the `--fp8_scaled` option. **This option seems to degrade the output quality.**
226
+
227
+ `--fp8_t5` can be used to specify the T5 model in fp8 format. This option reduces memory usage for the T5 model.
228
+
229
+ `--negative_prompt` can be used to specify a negative prompt. If omitted, the default negative prompt is used.
230
+
231
+ `--flow_shift` can be used to specify the flow shift (default 3.0 for I2V with 480p, 5.0 for others).
232
+
233
+ `--guidance_scale` can be used to specify the guidance scale for classifier free guidance (default 5.0).
234
+
235
+ `--blocks_to_swap` is the number of blocks to swap during inference. The default value is None (no block swap). The maximum value is 39 for 14B model and 29 for 1.3B model.
236
+
237
+ `--vae_cache_cpu` enables VAE cache in main memory. This reduces VRAM usage slightly but processing is slower.
238
+
239
+ `--compile` enables torch.compile. See [here](/README.md#inference) for details.
240
+
241
+ `--trim_tail_frames` can be used to trim the tail frames when saving. The default is 0.
242
+
243
+ `--cfg_skip_mode` specifies the mode for skipping CFG in different steps. The default is `none` (all steps). `--cfg_apply_ratio` specifies the ratio of steps where CFG is applied. See below for details.
244
+
245
+ `--include_patterns` and `--exclude_patterns` can be used to specify which LoRA modules to apply or exclude during inference. If not specified, all modules are applied by default. These options accept regular expressions.
246
+
247
+ `--include_patterns` specifies the modules to be applied, and `--exclude_patterns` specifies the modules to be excluded. The regular expression is matched against the LoRA key name, and include takes precedence.
248
+
249
+ The key name to be searched is in sd-scripts format (`lora_unet_<module_name with dot replaced by _>`). For example, `lora_unet_blocks_9_cross_attn_k`.
250
+
251
+ For example, if you specify `--exclude_patterns "blocks_[23]\d_"`, it will exclude modules containing `blocks_20` to `blocks_39`. If you specify `--include_patterns "cross_attn" --exclude_patterns "blocks_(0|1|2|3|4)_"`, it will apply LoRA to modules containing `cross_attn` and not containing `blocks_0` to `blocks_4`.
252
+
253
+ If you specify multiple LoRA weights, please specify them with multiple arguments. For example: `--include_patterns "cross_attn" ".*" --exclude_patterns "dummy_do_not_exclude" "blocks_(0|1|2|3|4)"`. `".*"` is a regex that matches everything. `dummy_do_not_exclude` is a dummy regex that does not match anything.
254
+
255
+ `--cpu_noise` generates the initial noise on the CPU. This may produce the same results as ComfyUI for the same seed (depending on other settings).
256
+
257
+ If you are using the Fun Control model, specify the control video with `--control_path`. You can specify a video file or a folder containing multiple image files. The number of frames in the video file (or the number of images) should be at least the number specified in `--video_length` (plus 1 frame if you specify `--end_image_path`).
258
+
259
+ Please try to match the aspect ratio of the control video with the aspect ratio specified in `--video_size` (there may be some deviation from the initial image of I2V due to the use of bucketing processing).
260
+
261
+ Other options are same as `hv_generate_video.py` (some options are not supported, please check the help).
262
+
263
+ <details>
264
+ <summary>日本語</summary>
265
+ `--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (これらはWan2.1公式モデル)、`t2v-1.3B-FC`, `t2v-14B-FC`, `i2v-14B-FC`(Wan2.1-Fun Controlモデル)を指定します。
266
+
267
+ `--attn_mode` には `torch`, `sdpa`(`torch`と同じ)、`xformers`, `sageattn`, `flash2`, `flash`(`flash2`と同じ), `flash3` のいずれかを指定します。デフォルトは `torch` です。その他のオプションを使用する場合は、対応するライブラリをインストールする必要があります。`flash3`(Flash attention 3)は未テストです。
268
+
269
+ `--fp8` を指定するとDiTモデルをfp8形式で実行します。fp8はメモリ消費を大幅に削減できますが、出力品質に影響を与える可能性があります。
270
+
271
+ `--fp8_scaled` を `--fp8` と併用すると、fp8への重み量子化を行います。メモリ消費と速度はわずかに悪化しますが、出力品質が向上します。詳しくは[こちら](advanced_config.md#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化)を参照してください。
272
+
273
+ `--fp8_fast` オプションはRTX 40x0 GPUでの高速推論に使用されるオプションです。このオプションは `--fp8_scaled` オプションが必要です。**出力品質が劣化するようです。**
274
+
275
+ `--fp8_t5` を指定するとT5モデルをfp8形式で実行します。T5モデル呼び出し時のメモリ使用量を削減します。
276
+
277
+ `--negative_prompt` でネガティブプロンプトを指定できます。省略した場合はデフォルトのネガティブプロンプトが使用されます。
278
+
279
+ `--flow_shift` でflow shiftを指定できます(480pのI2Vの場合はデフォルト3.0、それ以外は5.0)。
280
+
281
+ `--guidance_scale` でclassifier free guianceのガイダンススケールを指定できます(デフォルト5.0)。
282
+
283
+ `--blocks_to_swap` は推論時のblock swapの数です。デフォルト値はNone(block swapなし)です。最大値は14Bモデルの場合39、1.3Bモデルの場合29です。
284
+
285
+ `--vae_cache_cpu` を有効にすると、VAEのキャッシュをメインメモリに保持します。VRAM使用量が多少減りますが、処理は遅くなります。
286
+
287
+ `--compile`でtorch.compileを有効にします。詳細については[こちら](/README.md#inference)を参照してください。
288
+
289
+ `--trim_tail_frames` で保存時に末尾のフレームをトリミングできます。デフォルトは0です。
290
+
291
+ `--cfg_skip_mode` は異なるステップでCFGをスキップするモードを指定します。デフォルトは `none`(全ステップ)。`--cfg_apply_ratio` はCFGが適用されるステップの割合を指定します。詳細は後述します。
292
+
293
+ LoRAのどのモジュールを適用するかを、`--include_patterns`と`--exclude_patterns`で指定できます(未指定時・デフォルトは全モジュールが適用されます)。
294
+ これらのオプションには、正規表現を指定します。`--include_patterns`は適用するモジュール、`--exclude_patterns`は適用しないモジュールを指定します。正規表現がLoRAのキー名に含まれるかどうかで判断され、includeが優先されます。
295
+
296
+ 検索対象となるキー名は sd-scripts 形式(`lora_unet_<モジュール名のドットを_に置換したもの>`)です。例:`lora_unet_blocks_9_cross_attn_k`
297
+
298
+ たとえば `--exclude_patterns "blocks_[23]\d_"`のみを指定すると、`blocks_20`から`blocks_39`を含むモジュールが除外されます。`--include_patterns "cross_attn" --exclude_patterns "blocks_(0|1|2|3|4)_"`のようにincludeとexcludeを指定すると、`cross_attn`を含むモジュールで、かつ`blocks_0`から`blocks_4`を含まないモジュールにLoRAが適用されます。
299
+
300
+ 複数のLoRAの重みを指定する場合は、複数個の引数で指定してください。例:`--include_patterns "cross_attn" ".*" --exclude_patterns "dummy_do_not_exclude" "blocks_(0|1|2|3|4)"` `".*"`は全てにマッチする正規表現です。`dummy_do_not_exclude`は何にもマッチしないダミーの正規表現です。
301
+
302
+ `--cpu_noise`を指定すると初期ノイズをCPUで生成します。これにより同一seed時の結果がComfyUIと同じになる可能性があります(他の設定にもよります)。
303
+
304
+ Fun Controlモデルを使用する場合は、`--control_path`で制御用の映像を指定します。動画ファイル、または複数枚の画像ファイルを含んだフォルダを指定できます。動画ファイルのフレーム数(または画像の枚数)は、`--video_length`で指定したフレーム数以上にしてください(後述の`--end_image_path`を指定した場合は、さらに+1フレーム)。
305
+
306
+ 制御用の映像のアスペクト比は、`--video_size`で指定したアスペクト比とできるかぎり合わせてください(bucketingの処理を流用しているためI2Vの初期画像とズレる場合があります)。
307
+
308
+ その他のオプションは `hv_generate_video.py` と同じです(一部のオプションはサポートされていないため、ヘルプを確認してください)。
309
+ </details>
310
+
311
+ #### CFG Skip Mode / CFGスキップモード
312
+
313
+ These options allow you to balance generation speed against prompt accuracy. Skipping more steps results in faster generation with potential quality degradation.
314
+
315
+ Setting `--cfg_apply_ratio` to 0.5 speeds up the denoising loop by up to 25%.
316
+
317
+ `--cfg_skip_mode` specifies one of the following modes:
318
+
319
+ - `early`: Skips CFG in early steps for faster generation, applying guidance mainly in later refinement steps
320
+ - `late`: Skips CFG in later steps, applying guidance during initial structure formation
321
+ - `middle`: Skips CFG in middle steps, applying guidance in both early and later steps
322
+ - `early_late`: Skips CFG in both early and late steps, applying only in middle steps
323
+ - `alternate`: Applies CFG in alternate steps based on the specified ratio
324
+ - `none`: Applies CFG at all steps (default)
325
+
326
+ `--cfg_apply_ratio` specifies a value from 0.0 to 1.0 controlling the proportion of steps where CFG is applied. For example, setting 0.5 means CFG will be applied in only 50% of the steps.
327
+
328
+ If num_steps is 10, the following table shows the steps where CFG is applied based on the `--cfg_skip_mode` option (A means CFG is applied, S means it is skipped, `--cfg_apply_ratio` is 0.6):
329
+
330
+ | skip mode | CFG apply pattern |
331
+ |---|---|
332
+ | early | SSSSAAAAAA |
333
+ | late | AAAAAASSSS |
334
+ | middle | AAASSSSAAA |
335
+ | early_late | SSAAAAAASS |
336
+ | alternate | SASASAASAS |
337
+
338
+ The appropriate settings are unknown, but you may want to try `late` or `early_late` mode with a ratio of around 0.3 to 0.5.
339
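+ For example, the following combination applies CFG in roughly half of the steps and skips it in the later steps (a sketch, not a tuned recommendation):
+
+ ```bash
+ --cfg_skip_mode late --cfg_apply_ratio 0.5
+ ```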
+ <details>
340
+ <summary>日本語</summary>
341
+ これらのオプションは、生成速度とプロンプトの精度のバランスを取ることができます。スキップされるステップが多いほど、生成速度が速くなりますが、品質が低下する可能性があります。
342
+
343
+ ratioに0.5を指定することで、デノイジングのループが最大25%程度、高速化されます。
344
+
345
+ `--cfg_skip_mode` は次のモードのいずれかを指定します:
346
+
347
+ - `early`:初期のステップでCFGをスキップして、主に終盤の精細化のステップで適用します
348
+ - `late`:終盤のステップでCFGをスキップし、初期の構造が決まる段階で適用します
349
+ - `middle`:中間のステップでCFGをスキップし、初期と終盤のステップの両方で適用します
350
+ - `early_late`:初期と終盤のステップの両方でCFGをスキップし、中間のステップのみ適用します
351
+ - `alternate`:指定された割合に基づいてCFGを適用します
352
+
353
+ `--cfg_apply_ratio` は、CFGが適用されるステップの割合を0.0から1.0の値で指定します。たとえば、0.5に設定すると、CFGはステップの50%のみで適用されます。
354
+
355
+ 具体的なパターンは上のテーブルを参照してください。
356
+
357
+ 適切な設定は不明ですが、モードは`late`または`early_late`、ratioは0.3~0.5程度から試してみると良いかもしれません。
358
+ </details>
359
+
360
+ #### Skip Layer Guidance
361
+
362
+ Skip Layer Guidance is a feature that uses the output of a model with some blocks skipped as the unconditional output of classifier free guidance. It was originally proposed in [SD 3.5](https://github.com/comfyanonymous/ComfyUI/pull/5404) and first applied in Wan2GP in [this PR](https://github.com/deepbeepmeep/Wan2GP/pull/61). It may improve the quality of generated videos.
363
+
364
+ The implementation of SD 3.5 is [here](https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py), and the implementation of Wan2GP (the PR mentioned above) has some different specifications. This inference script allows you to choose between the two methods.
365
+
366
+ *The SD3.5 method applies the SLG output in addition to cond and uncond (which slows down inference). The Wan2GP method uses only the cond and SLG outputs.*
367
+
368
+ The following arguments are available:
369
+
370
+ - `--slg_mode`: Specifies the SLG mode. `original` for SD 3.5 method, `uncond` for Wan2GP method. Default is None (no SLG).
371
+ - `--slg_layers`: Specifies the indices of the blocks (layers) to skip in SLG, separated by commas. Example: `--slg_layers 4,5,6`. Default is empty (no skip). If this option is not specified, `--slg_mode` is ignored.
372
+ - `--slg_scale`: Specifies the scale of SLG when `original`. Default is 3.0.
373
+ - `--slg_start`: Specifies the start step of SLG application in inference steps from 0.0 to 1.0. Default is 0.0 (applied from the beginning).
374
+ - `--slg_end`: Specifies the end step of SLG application in inference steps from 0.0 to 1.0. Default is 0.3 (applied up to 30% from the beginning).
375
+
376
+ Appropriate settings are unknown, but you may want to start with `original` mode, a scale of around 3.0, a start ratio of 0.0, an end ratio of 0.5, and layers 4, 5, and 6 skipped.
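+ 
+ As a concrete starting point (a sketch with placeholder paths; enter as a single line), that suggestion corresponds to:
+ 
+ ```bash
+ python wan_generate_video.py --task t2v-14B --prompt "prompt for the video" --save_path path/to/save.mp4
+  --dit path/to/model.safetensors --vae path/to/vae.safetensors --t5 path/to/t5_model.pth
+  --slg_mode original --slg_layers 4,5,6 --slg_scale 3.0 --slg_start 0.0 --slg_end 0.5
+ ```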
377
+
378
+ <details>
379
+ <summary>日本語</summary>
380
+ Skip Layer Guidanceは、一部のblockをスキップしたモデル出力をclassifier free guidanceのunconditional出力に使用する機能です。元々は[SD 3.5](https://github.com/comfyanonymous/ComfyUI/pull/5404)で提案されたもので、Wan2.1には[Wan2GPのこちらのPR](https://github.com/deepbeepmeep/Wan2GP/pull/61)で初めて適用されました。生成動画の品質が向上する可能性があります。
381
+
382
+ SD 3.5の実装は[こちら](https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py)で、Wan2GPの実装(前述のPR)は一部仕様が異なります。この推論スクリプトでは両者の方式を選択できるようになっています。
383
+
384
+ ※SD3.5方式はcondとuncondに加えてslg outputを適用します(速度が低下します)。Wan2GP方式はcondとslg outputのみを使用します。
385
+
386
+ 以下の引数があります。
387
+
388
+ - `--slg_mode`:SLGのモードを指定します。`original`でSD 3.5の方式、`uncond`でWan2GPの方式です。デフォルトはNoneで、SLGを使用しません。
389
+ - `--slg_layers`:SLGでスキップするblock (layer)のインデクスをカンマ区切りで指定します。例:`--slg_layers 4,5,6`。デフォルトは空(スキップしない)です。このオプションを指定しないと`--slg_mode`は無視されます。
390
+ - `--slg_scale`:`original`のときのSLGのスケールを指定します。デフォルトは3.0です。
391
+ - `--slg_start`:推論ステップのSLG適用開始ステップを0.0から1.0の割合で指定します。デフォルトは0.0です(最初から適用)。
392
+ - `--slg_end`:推論ステップのSLG適用終了ステップを0.0から1.0の割合で指定します。デフォルトは0.3です(最初から30%まで適用)。
393
+
394
+ 適切な設定は不明ですが、`original`モードでスケールを3.0程度、開始割合を0.0、終了割合を0.5程度に設定し、4, 5, 6のlayerをスキップする設定から始めると良いかもしれません。
395
+ </details>
396
+
397
+ ### I2V Inference / I2V推論
398
+
399
+ The following is an example of I2V inference (input as a single line):
400
+
401
+ ```bash
402
+ python wan_generate_video.py --fp8 --task i2v-14B --video_size 832 480 --video_length 81 --infer_steps 20
403
+ --prompt "prompt for the video" --save_path path/to/save.mp4 --output_type both
404
+ --dit path/to/wan2.1_i2v_480p_14B_bf16_etc.safetensors --vae path/to/wan_2.1_vae.safetensors
405
+ --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth --clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
406
+ --attn_mode torch --image_path path/to/image.jpg
407
+ ```
408
+
409
+ Add `--clip` to specify the CLIP model. `--image_path` is the path to the image to be used as the initial frame.
410
+
411
+ `--end_image_path` can be used to specify the end image. This option is experimental. When this option is specified, the saved video will be slightly longer than the specified number of frames and will have noise, so it is recommended to specify `--trim_tail_frames 3` to trim the tail frames.
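+ 
+ For example, the following options (paths are placeholders) can be appended to the I2V command above:
+ 
+ ```bash
+  --end_image_path path/to/end_image.jpg --trim_tail_frames 3
+ ```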
412
+
413
+ You can also use the Fun Control model for I2V inference. Specify the control video with `--control_path`.
414
+
415
+ Other options are the same as for T2V inference.
416
+
417
+ <details>
418
+ <summary>日本語</summary>
419
+ `--clip` を追加してCLIPモデルを指定します。`--image_path` は初期フレームとして使用する画像のパスです。
420
+
421
+ `--end_image_path` で終了画像を指定できます。このオプションは実験的なものです。このオプションを指定すると、保存される動画が指定フレーム数よりもやや多くなり、かつノイズが乗るため、`--trim_tail_frames 3` などを指定して末尾のフレームをトリミングすることをお勧めします。
422
+
423
+ I2V推論でもFun Controlモデルが使用できます。`--control_path` で制御用の映像を指定します。
424
+
425
+ その他のオプションはT2V推論と同じです。
426
+ </details>
427
+
428
+ ### New Batch and Interactive Modes / 新しいバッチモードとインタラクティブモード
429
+
430
+ In addition to single video generation, Wan 2.1 now supports batch generation from a file and interactive prompt input:
431
+
432
+ #### Batch Mode from File / ファイルからのバッチモード
433
+
434
+ Generate multiple videos from prompts stored in a text file:
435
+
436
+ ```bash
437
+ python wan_generate_video.py --from_file prompts.txt --task t2v-14B
438
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors
439
+ --t5 path/to/t5_model.pth --save_path output_directory
440
+ ```
441
+
442
+ The prompts file format:
443
+ - One prompt per line
444
+ - Empty lines and lines starting with # are ignored (comments)
445
+ - Each line can include prompt-specific parameters using command-line style format:
446
+
447
+ ```
448
+ A beautiful sunset over mountains --w 832 --h 480 --f 81 --d 42 --s 20
449
+ A busy city street at night --w 480 --h 832 --g 7.5 --n low quality, blurry
450
+ ```
451
+
452
+ Supported inline parameters (if omitted, the default values from the command line are used; see the example after this list):
453
+ - `--w`: Width
454
+ - `--h`: Height
455
+ - `--f`: Frame count
456
+ - `--d`: Seed
457
+ - `--s`: Inference steps
458
+ - `--g` or `--l`: Guidance scale
459
+ - `--fs`: Flow shift
460
+ - `--i`: Image path (for I2V)
461
+ - `--cn`: Control path (for Fun Control)
462
+ - `--n`: Negative prompt
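+ 
+ For example, an I2V line in the prompts file (the image path is a placeholder) could look like this:
+ 
+ ```
+ A person dancing in a studio --i path/to/start_image.jpg --w 832 --h 480 --f 81 --d 42
+ ```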
463
+
464
+ In batch mode, models are loaded once and reused for all prompts, significantly improving overall generation time compared to multiple single runs.
465
+
466
+ #### Interactive Mode / インタラクティブモード
467
+
468
+ Interactive command-line interface for entering prompts:
469
+
470
+ ```bash
471
+ python wan_generate_video.py --interactive --task t2v-14B
472
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors
473
+ --t5 path/to/t5_model.pth --save_path output_directory
474
+ ```
475
+
476
+ In interactive mode:
477
+ - Enter prompts directly at the command line
478
+ - Use the same inline parameter format as batch mode
479
+ - Use Ctrl+D (or Ctrl+Z on Windows) to exit
480
+ - Models remain loaded between generations for efficiency
481
+
482
+ <details>
483
+ <summary>日本語</summary>
484
+ 単一動画の生成に加えて、Wan 2.1は現在、ファイルからのバッチ生成とインタラクティブなプロンプト入力をサポートしています。
485
+
486
+ #### ファイルからのバッチモード
487
+
488
+ テキストファイルに保存されたプロンプトから複数の動画を生成します:
489
+
490
+ ```bash
491
+ python wan_generate_video.py --from_file prompts.txt --task t2v-14B
492
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors
493
+ --t5 path/to/t5_model.pth --save_path output_directory
494
+ ```
495
+
496
+ プロンプトファイルの形式:
497
+ - 1行に1つのプロンプト
498
+ - 空行や#で始まる行は無視されます(コメント)
499
+ - 各行にはコマンドライン形式でプロンプト固有のパラメータを含めることができます:
500
+
501
+ サポートされているインラインパラメータ(省略した場合、コマンドラインのデフォルト値が使用されます)
502
+ - `--w`: 幅
503
+ - `--h`: 高さ
504
+ - `--f`: フレーム数
505
+ - `--d`: シード
506
+ - `--s`: 推論ステップ
507
+ - `--g` または `--l`: ガイダンススケール
508
+ - `--fs`: フローシフト
509
+ - `--i`: 画像パス(I2V用)
510
+ - `--cn`: コントロールパス(Fun Control用)
511
+ - `--n`: ネガティブプロンプト
512
+
513
+ バッチモードでは、モデルは一度だけロードされ、すべてのプロンプトで再利用されるため、複数回の単一実行と比較して全体的な生成時間が大幅に改善されます。
514
+
515
+ #### インタラクティブモード
516
+
517
+ プロンプトを入力するためのインタラクティブなコマンドラインインターフェース:
518
+
519
+ ```bash
520
+ python wan_generate_video.py --interactive --task t2v-14B
521
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors
522
+ --t5 path/to/t5_model.pth --save_path output_directory
523
+ ```
524
+
525
+ インタラクティブモードでは:
526
+ - コマンドラインで直接プロンプトを入力
527
+ - バッチモードと同じインラインパラメータ形式を使用
528
+ - 終了するには Ctrl+D (Windowsでは Ctrl+Z) を使用
529
+ - 効率のため、モデルは生成間で読み込まれたままになります
530
+ </details>
531
+
fpack_cache_latents.py ADDED
@@ -0,0 +1,381 @@
1
+ import argparse
2
+ import logging
3
+ import math
4
+ import os
5
+ from typing import List
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from tqdm import tqdm
11
+ from transformers import SiglipImageProcessor, SiglipVisionModel
12
+
13
+ from dataset import config_utils
14
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
15
+ from dataset.image_video_dataset import BaseDataset, ItemInfo, save_latent_cache_framepack, ARCHITECTURE_FRAMEPACK
16
+ from frame_pack import hunyuan
17
+ from frame_pack.framepack_utils import load_image_encoders, load_vae
18
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
19
+ from frame_pack.clip_vision import hf_clip_vision_encode
20
+ import cache_latents
21
+
22
+ logger = logging.getLogger(__name__)
23
+ logging.basicConfig(level=logging.INFO)
24
+
25
+
26
+ def encode_and_save_batch(
27
+ vae: AutoencoderKLCausal3D,
28
+ feature_extractor: SiglipImageProcessor,
29
+ image_encoder: SiglipVisionModel,
30
+ batch: List[ItemInfo],
31
+ latent_window_size: int,
32
+ vanilla_sampling: bool = False,
33
+ ):
34
+ """Encode a batch of original RGB videos and save FramePack section caches."""
35
+
36
+ # Stack batch into tensor (B,C,F,H,W) in RGB order
37
+ contents = torch.stack([torch.from_numpy(item.content) for item in batch])
38
+ if len(contents.shape) == 4:
39
+ contents = contents.unsqueeze(1) # B, H, W, C -> B, F, H, W, C
40
+
41
+ contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
42
+ contents = contents.to(vae.device, dtype=vae.dtype)
43
+ contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
44
+
45
+ height, width = contents.shape[3], contents.shape[4]
46
+ if height < 8 or width < 8:
47
+ item = batch[0] # other items should have the same size
48
+ raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
49
+
50
+ # calculate latent frame count from original frame count (4n+1)
51
+ latent_f = (batch[0].frame_count - 1) // 4 + 1
52
+
53
+ # calculate the total number of sections (excluding the first frame, divided by window size)
54
+ total_latent_sections = math.floor((latent_f - 1) / latent_window_size)
55
+ if total_latent_sections < 1:
56
+ min_frames_needed = latent_window_size * 4 + 1
57
+ raise ValueError(
58
+ f"Not enough frames for FramePack: {batch[0].frame_count} frames ({latent_f} latent frames), minimum required: {min_frames_needed} frames ({latent_window_size+1} latent frames)"
59
+ )
60
+
61
+ # 実際に処理する潜在変数のフレーム数 (セクション境界に合わせる)
62
+ latent_f_aligned = total_latent_sections * latent_window_size + 1
63
+ # 実際に処理する元のフレーム数
64
+ frame_count_aligned = (latent_f_aligned - 1) * 4 + 1
65
+ if frame_count_aligned != batch[0].frame_count:
66
+ logger.info(
67
+ f"Frame count mismatch: required={frame_count_aligned} != actual={batch[0].frame_count}, trimming to {frame_count_aligned}"
68
+ )
69
+ contents = contents[:, :, :frame_count_aligned, :, :]
70
+
71
+ latent_f = latent_f_aligned # Update to the aligned value
72
+
73
+ # VAE encode (list of tensor -> stack)
74
+ latents = hunyuan.vae_encode(contents, vae) # include scaling factor
75
+ latents = latents.to("cpu") # (B, C, latent_f, H/8, W/8)
76
+
77
+ # Vision encoding per‑item (once)
78
+ images = np.stack([item.content[0] for item in batch], axis=0) # B, H, W, C
79
+
80
+ # encode image with image encoder
81
+ image_embeddings = []
82
+ with torch.no_grad():
83
+ for image in images:
84
+ image_encoder_output = hf_clip_vision_encode(image, feature_extractor, image_encoder)
85
+ image_embeddings.append(image_encoder_output.last_hidden_state)
86
+ image_embeddings = torch.cat(image_embeddings, dim=0) # B, LEN, 1152
87
+ image_embeddings = image_embeddings.to("cpu") # Save memory
88
+
89
+ if not vanilla_sampling:
90
+ # padding is reversed for inference (future to past)
91
+ latent_paddings = list(reversed(range(total_latent_sections)))
92
+ # Note: The padding trick for inference. See the paper for details.
93
+ if total_latent_sections > 4:
94
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
95
+
96
+ for b, item in enumerate(batch):
97
+ original_latent_cache_path = item.latent_cache_path
98
+ video_lat = latents[b : b + 1] # keep batch dim, 1, C, F, H, W
99
+
100
+ # emulate inference step (history latents)
101
+ # Note: In inference, history_latents stores *generated* future latents.
102
+ # Here, for caching, we just need its shape and type for clean_* tensors.
103
+ # The actual content doesn't matter much as clean_* will be overwritten.
104
+ history_latents = torch.zeros(
105
+ (1, video_lat.shape[1], 1 + 2 + 16, video_lat.shape[3], video_lat.shape[4]), dtype=video_lat.dtype
106
+ ) # C=16 for HY
107
+
108
+ latent_f_index = latent_f - latent_window_size # Start from the last section
109
+ section_index = total_latent_sections - 1
110
+
111
+ for latent_padding in latent_paddings:
112
+ is_last_section = section_index == 0 # the last section in inference order == the first section in time
113
+ latent_padding_size = latent_padding * latent_window_size
114
+ if is_last_section:
115
+ assert latent_f_index == 1, "Last section should be starting from frame 1"
116
+
117
+ # indices generation (same as inference)
118
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
119
+ (
120
+ clean_latent_indices_pre, # Index for start_latent
121
+ blank_indices, # Indices for padding (future context in inference)
122
+ latent_indices, # Indices for the target latents to predict
123
+ clean_latent_indices_post, # Index for the most recent history frame
124
+ clean_latent_2x_indices, # Indices for the next 2 history frames
125
+ clean_latent_4x_indices, # Indices for the next 16 history frames
126
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
127
+
128
+ # Indices for clean_latents (start + recent history)
129
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
130
+
131
+ # clean latents preparation (emulating inference)
132
+ clean_latents_pre = video_lat[:, :, 0:1, :, :] # Always the first frame (start_latent)
133
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
134
+ [1, 2, 16], dim=2
135
+ )
136
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) # Combine start frame + placeholder
137
+
138
+ # Target latents for this section (ground truth)
139
+ target_latents = video_lat[:, :, latent_f_index : latent_f_index + latent_window_size, :, :]
140
+
141
+ # save cache (file path is inside item.latent_cache_path pattern), remove batch dim
142
+ item.latent_cache_path = append_section_idx_to_latent_cache_path(original_latent_cache_path, section_index)
143
+ save_latent_cache_framepack(
144
+ item_info=item,
145
+ latent=target_latents.squeeze(0), # Ground truth for this section
146
+ latent_indices=latent_indices.squeeze(0), # Indices for the ground truth section
147
+ clean_latents=clean_latents.squeeze(0), # Start frame + history placeholder
148
+ clean_latent_indices=clean_latent_indices.squeeze(0), # Indices for start frame + history placeholder
149
+ clean_latents_2x=clean_latents_2x.squeeze(0), # History placeholder
150
+ clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0), # Indices for history placeholder
151
+ clean_latents_4x=clean_latents_4x.squeeze(0), # History placeholder
152
+ clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0), # Indices for history placeholder
153
+ image_embeddings=image_embeddings[b],
154
+ )
155
+
156
+ if is_last_section: # If this was the first section generated in inference (time=0)
157
+ # History gets the start frame + the generated first section
158
+ generated_latents_for_history = video_lat[:, :, : latent_window_size + 1, :, :]
159
+ else:
160
+ # History gets the generated current section
161
+ generated_latents_for_history = target_latents # Use true latents as stand-in for generated
162
+
163
+ history_latents = torch.cat([generated_latents_for_history, history_latents], dim=2)
164
+
165
+ section_index -= 1
166
+ latent_f_index -= latent_window_size
167
+
168
+ else:
169
+ # Vanilla Sampling Logic
170
+ for b, item in enumerate(batch):
171
+ original_latent_cache_path = item.latent_cache_path
172
+ video_lat = latents[b : b + 1] # Keep batch dim: 1, C, F_aligned, H, W
173
+ img_emb = image_embeddings[b] # LEN, 1152
174
+
175
+ for section_index in range(total_latent_sections):
176
+ target_start_f = section_index * latent_window_size + 1
177
+ target_end_f = target_start_f + latent_window_size
178
+ target_latents = video_lat[:, :, target_start_f:target_end_f, :, :]
179
+
180
+ # Clean latents preparation (Vanilla)
181
+
182
+ # Get clean_latents_pre (Always frame 0)
183
+ clean_latents_pre = video_lat[:, :, 0:1, :, :]
184
+
185
+ # Frame indices for past context (relative to anchor)
186
+ idx_post_frame = target_start_f - 1 # Frame index of the last frame of section i-1
187
+ idx_2x_frame_1 = idx_post_frame - 1
188
+ idx_2x_frame_2 = idx_post_frame - 2
189
+ idx_4x_start_frame = idx_post_frame - idx_2x_frame_2 - 16
190
+
191
+ # Helper function to get frame or zeros if index is out of bounds
192
+ def get_frame_or_zeros(frame_idx):
193
+ if frame_idx >= 0:
194
+ # Ensure frame_idx doesn't exceed the actual length
195
+ if frame_idx < video_lat.shape[2]:
196
+ return video_lat[:, :, frame_idx : frame_idx + 1, :, :]
197
+ else:
198
+ # This case should ideally not happen if indexing is correct
199
+ logger.warning(
200
+ f"Attempted to access frame {frame_idx} beyond latent length {video_lat.shape[2]}. Returning zeros."
201
+ )
202
+ return torch.zeros_like(clean_latents_pre)
203
+ else:
204
+ return torch.zeros_like(clean_latents_pre)
205
+
206
+ # Get clean_latents_post (frame at idx_post_frame)
207
+ clean_latents_post = get_frame_or_zeros(idx_post_frame)
208
+
209
+ # Get clean_latents_2x (frames at idx_2x_frame_1, idx_2x_frame_2)
210
+ frame_2x_1 = get_frame_or_zeros(idx_2x_frame_1)
211
+ frame_2x_2 = get_frame_or_zeros(idx_2x_frame_2)
212
+ clean_latents_2x = torch.cat(
213
+ [frame_2x_2, frame_2x_1], dim=2
214
+ ) # Order might matter (older first?) - assuming order [..., t-2, t-1]
215
+
216
+ # Get clean_latents_4x (16 frames ending at idx_4x_start_frame)
217
+ clean_latents_4x_list = []
218
+ for i in range(16):
219
+ frame_idx = idx_4x_start_frame + i
220
+ clean_latents_4x_list.append(get_frame_or_zeros(frame_idx))
221
+ clean_latents_4x = torch.cat(clean_latents_4x_list, dim=2) # Ensure correct temporal order [..., t-18, ..., t-3]
222
+
223
+ # Combine pre and post for the main clean_latents input
224
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) # (1, C, 2, H, W)
225
+
226
+ # Indices generation (Vanilla with Offset)
227
+ vanilla_offset_size = section_index * latent_window_size # Offset based on section index
228
+ # print(f"Vanilla offset size: {vanilla_offset_size}")
229
+
230
+ # Calculate total length including the offset
231
+ total_length = sum([1, vanilla_offset_size, latent_window_size, 1, 2, 16])
232
+ indices = torch.arange(0, total_length).unsqueeze(0)
233
+
234
+ # Split indices including the offset part
235
+ (
236
+ clean_latent_indices_pre, # Index for frame 0
237
+ past_offset_indices, # Indices representing the time offset *before* section i
238
+ latent_indices, # Indices for the target latents (section i)
239
+ clean_latent_indices_post, # Index for frame from end of section i-1
240
+ clean_latent_2x_indices, # Indices for frames from end of section i-2, i-3
241
+ clean_latent_4x_indices, # Indices for the 16 past frames
242
+ ) = indices.split([1, vanilla_offset_size, latent_window_size, 1, 2, 16], dim=1)
243
+
244
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
245
+
246
+ # Save cache
247
+ item.latent_cache_path = append_section_idx_to_latent_cache_path(original_latent_cache_path, section_index)
248
+ save_latent_cache_framepack(
249
+ item_info=item,
250
+ latent=target_latents.squeeze(0),
251
+ latent_indices=latent_indices.squeeze(0), # Indices for target section i
252
+ clean_latents=clean_latents.squeeze(0), # Past clean frames
253
+ clean_latent_indices=clean_latent_indices.squeeze(0), # Indices for clean_latents_pre/post
254
+ clean_latents_2x=clean_latents_2x.squeeze(0), # Past clean frames (2x)
255
+ clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0), # Indices for clean_latents_2x
256
+ clean_latents_4x=clean_latents_4x.squeeze(0), # Past clean frames (4x)
257
+ clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0), # Indices for clean_latents_4x
258
+ image_embeddings=img_emb,
259
+ # Note: We don't explicitly save past_offset_indices,
260
+ # but its size influences the absolute values in other indices.
261
+ )
262
+
263
+
264
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
265
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
266
+ parser.add_argument("--latent_window_size", type=int, default=9, help="FramePack latent window size (default 9)")
267
+ parser.add_argument(
268
+ "--vanilla_sampling",
269
+ action="store_true",
270
+ help="Generate cache for vanilla (autoregressive) sampling instead of inference emulation",
271
+ )
272
+ return parser
273
+
274
+
275
+ def main(args: argparse.Namespace):
276
+ device = args.device if hasattr(args, "device") and args.device else ("cuda" if torch.cuda.is_available() else "cpu")
277
+ device = torch.device(device)
278
+
279
+ # Load dataset config
280
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
281
+ logger.info(f"Load dataset config from {args.dataset_config}")
282
+ user_config = config_utils.load_user_config(args.dataset_config)
283
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_FRAMEPACK)
284
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
285
+
286
+ datasets = train_dataset_group.datasets
287
+
288
+ if args.debug_mode is not None:
289
+ cache_latents.show_datasets(
290
+ datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images, fps=16
291
+ )
292
+ return
293
+
294
+ assert args.vae is not None, "vae checkpoint is required"
295
+
296
+ logger.info(f"Loading VAE model from {args.vae}")
297
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device=device)
298
+ vae.to(device)
299
+
300
+ logger.info(f"Loading image encoder from {args.image_encoder}")
301
+ feature_extractor, image_encoder = load_image_encoders(args)
302
+ image_encoder.eval()
303
+ image_encoder.to(device)
304
+
305
+ logger.info(f"Cache generation mode: {'Vanilla Sampling' if args.vanilla_sampling else 'Inference Emulation'}")
306
+
307
+ # encoding closure
308
+ def encode(batch: List[ItemInfo]):
309
+ encode_and_save_batch(vae, feature_extractor, image_encoder, batch, args.latent_window_size, args.vanilla_sampling)
310
+
311
+ # reuse core loop from cache_latents with no change
312
+ encode_datasets_framepack(datasets, encode, args)
313
+
314
+
315
+ def append_section_idx_to_latent_cache_path(latent_cache_path: str, section_idx: int) -> str:
316
+ tokens = latent_cache_path.split("_")
317
+ tokens[-3] = f"{tokens[-3]}-{section_idx:04d}" # append section index to "frame_pos-count"
318
+ return "_".join(tokens)
319
+
320
+
321
+ def encode_datasets_framepack(datasets: list[BaseDataset], encode: callable, args: argparse.Namespace):
322
+ num_workers = args.num_workers if args.num_workers is not None else max(1, os.cpu_count() - 1)
323
+ for i, dataset in enumerate(datasets):
324
+ logger.info(f"Encoding dataset [{i}]")
325
+ all_latent_cache_paths = []
326
+ for _, batch in tqdm(dataset.retrieve_latent_cache_batches(num_workers)):
327
+ batch: list[ItemInfo] = batch # type: ignore
328
+
329
+ # latent_cache_path is "{basename}_{w:04d}x{h:04d}_{self.architecture}.safetensors"
330
+ # we expand it to "{basename}_{section_idx:04d}_{w:04d}x{h:04d}_{self.architecture}.safetensors"
331
+ filtered_batch = []
332
+ for item in batch:
333
+ latent_f = (item.frame_count - 1) // 4 + 1
334
+ num_sections = math.floor((latent_f - 1) / args.latent_window_size)
335
+ all_existing = True
336
+ for sec in range(num_sections):
337
+ p = append_section_idx_to_latent_cache_path(item.latent_cache_path, sec)
338
+ all_latent_cache_paths.append(p)
339
+ all_existing = all_existing and os.path.exists(p)
340
+
341
+ if all_existing:
342
+ filtered_batch.append(item)
343
+
344
+ if args.skip_existing:
345
+ if len(filtered_batch) == 0:
346
+ continue
347
+ batch = filtered_batch
348
+
349
+ bs = args.batch_size if args.batch_size is not None else len(batch)
350
+ for i in range(0, len(batch), bs):
351
+ encode(batch[i : i + bs])
352
+
353
+ # normalize paths
354
+ all_latent_cache_paths = [os.path.normpath(p) for p in all_latent_cache_paths]
355
+ all_latent_cache_paths = set(all_latent_cache_paths)
356
+
357
+ # remove old cache files not in the dataset
358
+ all_cache_files = dataset.get_all_latent_cache_files()
359
+ for cache_file in all_cache_files:
360
+ if os.path.normpath(cache_file) not in all_latent_cache_paths:
361
+ if args.keep_cache:
362
+ logger.info(f"Keep cache file not in the dataset: {cache_file}")
363
+ else:
364
+ os.remove(cache_file)
365
+ logger.info(f"Removed old cache file: {cache_file}")
366
+
367
+
368
+ if __name__ == "__main__":
369
+ parser = cache_latents.setup_parser_common()
370
+ parser = cache_latents.hv_setup_parser(parser) # VAE
371
+ parser = framepack_setup_parser(parser)
372
+
373
+ args = parser.parse_args()
374
+
375
+ if args.vae_dtype is not None:
376
+ raise ValueError("VAE dtype is not supported in FramePack")
377
+ # if args.batch_size != 1:
378
+ # args.batch_size = 1
379
+ # logger.info("Batch size is set to 1 for FramePack.")
380
+
381
+ main(args)
fpack_cache_text_encoder_outputs.py ADDED
@@ -0,0 +1,110 @@
1
+ import argparse
2
+ import os
3
+ from typing import Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from tqdm import tqdm
8
+ from transformers import LlamaTokenizerFast, LlamaModel, CLIPTokenizer, CLIPTextModel
9
+ from dataset import config_utils
10
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
11
+ from dataset.image_video_dataset import ARCHITECTURE_FRAMEPACK, ItemInfo, save_text_encoder_output_cache_framepack
12
+ import cache_text_encoder_outputs
13
+ from frame_pack import hunyuan
14
+ from frame_pack.framepack_utils import load_text_encoder1, load_text_encoder2
15
+
16
+ import logging
17
+
18
+ from frame_pack.utils import crop_or_pad_yield_mask
19
+
20
+ logger = logging.getLogger(__name__)
21
+ logging.basicConfig(level=logging.INFO)
22
+
23
+
24
+ def encode_and_save_batch(
25
+ tokenizer1: LlamaTokenizerFast,
26
+ text_encoder1: LlamaModel,
27
+ tokenizer2: CLIPTokenizer,
28
+ text_encoder2: CLIPTextModel,
29
+ batch: list[ItemInfo],
30
+ device: torch.device,
31
+ ):
32
+ prompts = [item.caption for item in batch]
33
+
34
+ # encode prompt
35
+ # FramePack's encode_prompt_conds only supports single prompt, so we need to encode each prompt separately
36
+ list_of_llama_vec = []
37
+ list_of_llama_attention_mask = []
38
+ list_of_clip_l_pooler = []
39
+ for prompt in prompts:
40
+ with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
41
+ # llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(prompts, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
42
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
43
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
44
+
45
+ list_of_llama_vec.append(llama_vec.squeeze(0))
46
+ list_of_llama_attention_mask.append(llama_attention_mask.squeeze(0))
47
+ list_of_clip_l_pooler.append(clip_l_pooler.squeeze(0))
48
+
49
+ # save prompt cache
50
+ for item, llama_vec, llama_attention_mask, clip_l_pooler in zip(
51
+ batch, list_of_llama_vec, list_of_llama_attention_mask, list_of_clip_l_pooler
52
+ ):
53
+ # save llama_vec and clip_l_pooler to cache
54
+ save_text_encoder_output_cache_framepack(item, llama_vec, llama_attention_mask, clip_l_pooler)
55
+
56
+
57
+ def main(args):
58
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
59
+ device = torch.device(device)
60
+
61
+ # Load dataset config
62
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
63
+ logger.info(f"Load dataset config from {args.dataset_config}")
64
+ user_config = config_utils.load_user_config(args.dataset_config)
65
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_FRAMEPACK)
66
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
67
+
68
+ datasets = train_dataset_group.datasets
69
+
70
+ # prepare cache files and paths: all_cache_files_for_dataset = existing cache files, all_cache_paths_for_dataset = all cache paths in the dataset
71
+ all_cache_files_for_dataset, all_cache_paths_for_dataset = cache_text_encoder_outputs.prepare_cache_files_and_paths(datasets)
72
+
73
+ # load text encoder
74
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
75
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
76
+ text_encoder2.to(device)
77
+
78
+ # Encode with Text Encoders
79
+ logger.info("Encoding with Text Encoders")
80
+
81
+ def encode_for_text_encoder(batch: list[ItemInfo]):
82
+ encode_and_save_batch(tokenizer1, text_encoder1, tokenizer2, text_encoder2, batch, device)
83
+
84
+ cache_text_encoder_outputs.process_text_encoder_batches(
85
+ args.num_workers,
86
+ args.skip_existing,
87
+ args.batch_size,
88
+ datasets,
89
+ all_cache_files_for_dataset,
90
+ all_cache_paths_for_dataset,
91
+ encode_for_text_encoder,
92
+ )
93
+
94
+ # remove cache files not in dataset
95
+ cache_text_encoder_outputs.post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset)
96
+
97
+
98
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
99
+ parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory")
100
+ parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory")
101
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
102
+ return parser
103
+
104
+
105
+ if __name__ == "__main__":
106
+ parser = cache_text_encoder_outputs.setup_parser_common()
107
+ parser = framepack_setup_parser(parser)
108
+
109
+ args = parser.parse_args()
110
+ main(args)
fpack_generate_video.py ADDED
@@ -0,0 +1,1149 @@
1
+ import argparse
2
+ from datetime import datetime
3
+ import gc
4
+ import json
5
+ import random
6
+ import os
7
+ import re
8
+ import time
9
+ import math
10
+ import copy
11
+ from typing import Tuple, Optional, List, Union, Any, Dict
12
+
13
+ import torch
14
+ from safetensors.torch import load_file, save_file
15
+ from safetensors import safe_open
16
+ from PIL import Image
17
+ import cv2
18
+ import numpy as np
19
+ import torchvision.transforms.functional as TF
20
+ from transformers import LlamaModel
21
+ from tqdm import tqdm
22
+
23
+ from networks import lora_framepack
24
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
25
+ from frame_pack import hunyuan
26
+ from frame_pack.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked, load_packed_model
27
+ from frame_pack.utils import crop_or_pad_yield_mask, resize_and_center_crop, soft_append_bcthw
28
+ from frame_pack.bucket_tools import find_nearest_bucket
29
+ from frame_pack.clip_vision import hf_clip_vision_encode
30
+ from frame_pack.k_diffusion_hunyuan import sample_hunyuan
31
+ from dataset import image_video_dataset
32
+
33
+ try:
34
+ from lycoris.kohya import create_network_from_weights
35
+ except:
36
+ pass
37
+
38
+ from utils.device_utils import clean_memory_on_device
39
+ from hv_generate_video import save_images_grid, save_videos_grid, synchronize_device
40
+ from wan_generate_video import merge_lora_weights
41
+ from frame_pack.framepack_utils import load_vae, load_text_encoder1, load_text_encoder2, load_image_encoders
42
+ from dataset.image_video_dataset import load_video
43
+
44
+ import logging
45
+
46
+ logger = logging.getLogger(__name__)
47
+ logging.basicConfig(level=logging.INFO)
48
+
49
+
50
+ class GenerationSettings:
51
+ def __init__(self, device: torch.device, dit_weight_dtype: Optional[torch.dtype] = None):
52
+ self.device = device
53
+ self.dit_weight_dtype = dit_weight_dtype
54
+
55
+
56
+ def parse_args() -> argparse.Namespace:
57
+ """parse command line arguments"""
58
+ parser = argparse.ArgumentParser(description="FramePack inference script")
59
+
60
+ # WAN arguments
61
+ # parser.add_argument("--ckpt_dir", type=str, default=None, help="The path to the checkpoint directory (Wan 2.1 official).")
62
+ parser.add_argument(
63
+ "--sample_solver", type=str, default="unipc", choices=["unipc", "dpm++", "vanilla"], help="The solver used to sample."
64
+ )
65
+
66
+ parser.add_argument("--dit", type=str, default=None, help="DiT directory or path")
67
+ parser.add_argument("--vae", type=str, default=None, help="VAE directory or path")
68
+ parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory or path")
69
+ parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory or path")
70
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image Encoder directory or path")
71
+ # LoRA
72
+ parser.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
73
+ parser.add_argument("--lora_multiplier", type=float, nargs="*", default=1.0, help="LoRA multiplier")
74
+ parser.add_argument("--include_patterns", type=str, nargs="*", default=None, help="LoRA module include patterns")
75
+ parser.add_argument("--exclude_patterns", type=str, nargs="*", default=None, help="LoRA module exclude patterns")
76
+ parser.add_argument(
77
+ "--save_merged_model",
78
+ type=str,
79
+ default=None,
80
+ help="Save merged model to path. If specified, no inference will be performed.",
81
+ )
82
+
83
+ # inference
84
+ parser.add_argument(
85
+ "--prompt",
86
+ type=str,
87
+ default=None,
88
+ help="prompt for generation. If `;;;` is used, it will be split into sections. Example: `section_index:prompt` or "
89
+ "`section_index:prompt;;;section_index:prompt;;;...`, section_index can be `0` or `-1` or `0-2`, `-1` means last section, `0-2` means from 0 to 2 (inclusive).",
90
+ )
91
+ parser.add_argument(
92
+ "--negative_prompt",
93
+ type=str,
94
+ default=None,
95
+ help="negative prompt for generation, default is empty string. should not change.",
96
+ )
97
+ parser.add_argument("--video_size", type=int, nargs=2, default=[256, 256], help="video size, height and width")
98
+ parser.add_argument("--video_seconds", type=float, default=5.0, help="video length, Default is 5.0 seconds")
99
+ parser.add_argument("--fps", type=int, default=30, help="video fps, Default is 30")
100
+ parser.add_argument("--infer_steps", type=int, default=25, help="number of inference steps, Default is 25")
101
+ parser.add_argument("--save_path", type=str, required=True, help="path to save generated video")
102
+ parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
103
+ # parser.add_argument(
104
+ # "--cpu_noise", action="store_true", help="Use CPU to generate noise (compatible with ComfyUI). Default is False."
105
+ # )
106
+ parser.add_argument("--latent_window_size", type=int, default=9, help="latent window size, default is 9. should not change.")
107
+ parser.add_argument(
108
+ "--embedded_cfg_scale", type=float, default=10.0, help="Embeded CFG scale (distilled CFG Scale), default is 10.0"
109
+ )
110
+ parser.add_argument(
111
+ "--guidance_scale",
112
+ type=float,
113
+ default=1.0,
114
+ help="Guidance scale for classifier free guidance. Default is 1.0, should not change.",
115
+ )
116
+ parser.add_argument("--guidance_rescale", type=float, default=0.0, help="CFG Re-scale, default is 0.0. Should not change.")
117
+ # parser.add_argument("--video_path", type=str, default=None, help="path to video for video2video inference")
118
+ parser.add_argument("--image_path", type=str, default=None, help="path to image for image2video inference")
119
+ parser.add_argument("--end_image_path", type=str, default=None, help="path to end image for image2video inference")
120
+ # parser.add_argument(
121
+ # "--control_path",
122
+ # type=str,
123
+ # default=None,
124
+ # help="path to control video for inference with controlnet. video file or directory with images",
125
+ # )
126
+ # parser.add_argument("--trim_tail_frames", type=int, default=0, help="trim tail N frames from the video before saving")
127
+
128
+ # # Flow Matching
129
+ # parser.add_argument(
130
+ # "--flow_shift",
131
+ # type=float,
132
+ # default=None,
133
+ # help="Shift factor for flow matching schedulers. Default depends on task.",
134
+ # )
135
+
136
+ parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
137
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8")
138
+ # parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arithmetic (RTX 4XXX+), only for fp8_scaled")
139
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
140
+ parser.add_argument(
141
+ "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
142
+ )
143
+ parser.add_argument(
144
+ "--attn_mode",
145
+ type=str,
146
+ default="torch",
147
+ choices=["flash", "torch", "sageattn", "xformers", "sdpa"], # "flash2", "flash3",
148
+ help="attention mode",
149
+ )
150
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
151
+ parser.add_argument(
152
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
153
+ )
154
+ parser.add_argument("--bulk_decode", action="store_true", help="decode all frames at once")
155
+ parser.add_argument("--blocks_to_swap", type=int, default=0, help="number of blocks to swap in the model")
156
+ parser.add_argument(
157
+ "--output_type", type=str, default="video", choices=["video", "images", "latent", "both"], help="output type"
158
+ )
159
+ parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
160
+ parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
161
+ parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
162
+ # parser.add_argument("--compile", action="store_true", help="Enable torch.compile")
163
+ # parser.add_argument(
164
+ # "--compile_args",
165
+ # nargs=4,
166
+ # metavar=("BACKEND", "MODE", "DYNAMIC", "FULLGRAPH"),
167
+ # default=["inductor", "max-autotune-no-cudagraphs", "False", "False"],
168
+ # help="Torch.compile settings",
169
+ # )
170
+
171
+ # New arguments for batch and interactive modes
172
+ parser.add_argument("--from_file", type=str, default=None, help="Read prompts from a file")
173
+ parser.add_argument("--interactive", action="store_true", help="Interactive mode: read prompts from console")
174
+
175
+ args = parser.parse_args()
176
+
177
+ # Validate arguments
178
+ if args.from_file and args.interactive:
179
+ raise ValueError("Cannot use both --from_file and --interactive at the same time")
180
+
181
+ if args.prompt is None and not args.from_file and not args.interactive:
182
+ raise ValueError("Either --prompt, --from_file or --interactive must be specified")
183
+
184
+ return args
185
+
186
+
187
+ def parse_prompt_line(line: str) -> Dict[str, Any]:
188
+ """Parse a prompt line into a dictionary of argument overrides
189
+
190
+ Args:
191
+ line: Prompt line with options
192
+
193
+ Returns:
194
+ Dict[str, Any]: Dictionary of argument overrides
195
+ """
196
+ # TODO common function with hv_train_network.line_to_prompt_dict
197
+ parts = line.split(" --")
198
+ prompt = parts[0].strip()
199
+
200
+ # Create dictionary of overrides
201
+ overrides = {"prompt": prompt}
202
+
203
+ for part in parts[1:]:
204
+ if not part.strip():
205
+ continue
206
+ option_parts = part.split(" ", 1)
207
+ option = option_parts[0].strip()
208
+ value = option_parts[1].strip() if len(option_parts) > 1 else ""
209
+
210
+ # Map options to argument names
211
+ if option == "w":
212
+ overrides["video_size_width"] = int(value)
213
+ elif option == "h":
214
+ overrides["video_size_height"] = int(value)
215
+ elif option == "f":
216
+ overrides["video_seconds"] = float(value)
217
+ elif option == "d":
218
+ overrides["seed"] = int(value)
219
+ elif option == "s":
220
+ overrides["infer_steps"] = int(value)
221
+ elif option == "g" or option == "l":
222
+ overrides["guidance_scale"] = float(value)
223
+ # elif option == "fs":
224
+ # overrides["flow_shift"] = float(value)
225
+ elif option == "i":
226
+ overrides["image_path"] = value
227
+ elif option == "cn":
228
+ overrides["control_path"] = value
229
+ elif option == "n":
230
+ overrides["negative_prompt"] = value
231
+
232
+ return overrides
233
+
234
+
235
+ def apply_overrides(args: argparse.Namespace, overrides: Dict[str, Any]) -> argparse.Namespace:
236
+ """Apply overrides to args
237
+
238
+ Args:
239
+ args: Original arguments
240
+ overrides: Dictionary of overrides
241
+
242
+ Returns:
243
+ argparse.Namespace: New arguments with overrides applied
244
+ """
245
+ args_copy = copy.deepcopy(args)
246
+
247
+ for key, value in overrides.items():
248
+ if key == "video_size_width":
249
+ args_copy.video_size[1] = value
250
+ elif key == "video_size_height":
251
+ args_copy.video_size[0] = value
252
+ else:
253
+ setattr(args_copy, key, value)
254
+
255
+ return args_copy
256
+
257
+
258
+ def check_inputs(args: argparse.Namespace) -> Tuple[int, int, float]:
259
+ """Validate video size and length
260
+
261
+ Args:
262
+ args: command line arguments
263
+
264
+ Returns:
265
+ Tuple[int, int, float]: (height, width, video_seconds)
266
+ """
267
+ height = args.video_size[0]
268
+ width = args.video_size[1]
269
+
270
+ video_seconds = args.video_seconds
271
+
272
+ if height % 8 != 0 or width % 8 != 0:
273
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
274
+
275
+ return height, width, video_seconds
276
+
277
+
278
+ # region DiT model
279
+
280
+
281
+ def load_dit_model(args: argparse.Namespace, device: torch.device) -> HunyuanVideoTransformer3DModelPacked:
282
+ """load DiT model
283
+
284
+ Args:
285
+ args: command line arguments
286
+ device: device to use
287
+ dit_dtype: data type for the model
288
+ dit_weight_dtype: data type for the model weights. None for as-is
289
+
290
+ Returns:
291
+ HunyuanVideoTransformer3DModelPacked: DiT model
292
+ """
293
+ loading_device = "cpu"
294
+ if args.blocks_to_swap == 0 and not args.fp8_scaled and args.lora_weight is None:
295
+ loading_device = device
296
+
297
+ # do not fp8 optimize because we will merge LoRA weights
298
+ model = load_packed_model(device, args.dit, args.attn_mode, loading_device)
299
+ return model
300
+
301
+
302
+ def optimize_model(model: HunyuanVideoTransformer3DModelPacked, args: argparse.Namespace, device: torch.device) -> None:
303
+ """optimize the model (FP8 conversion, device move etc.)
304
+
305
+ Args:
306
+ model: dit model
307
+ args: command line arguments
308
+ device: device to use
309
+ """
310
+ if args.fp8_scaled:
311
+ # load state dict as-is and optimize to fp8
312
+ state_dict = model.state_dict()
313
+
314
+ # if no blocks to swap, we can move the weights to GPU after optimization on GPU (omit redundant CPU->GPU copy)
315
+ move_to_device = args.blocks_to_swap == 0 # if blocks_to_swap > 0, we will keep the model on CPU
316
+ state_dict = model.fp8_optimization(state_dict, device, move_to_device, use_scaled_mm=False) # args.fp8_fast)
317
+
318
+ info = model.load_state_dict(state_dict, strict=True, assign=True)
319
+ logger.info(f"Loaded FP8 optimized weights: {info}")
320
+
321
+ if args.blocks_to_swap == 0:
322
+ model.to(device) # make sure all parameters are on the right device (e.g. RoPE etc.)
323
+ else:
324
+ # simple cast to dit_dtype
325
+ target_dtype = None # load as-is (dit_weight_dtype == dtype of the weights in state_dict)
326
+ target_device = None
327
+
328
+ if args.fp8:
329
+ target_dtype = torch.float8_e4m3fn
330
+
331
+ if args.blocks_to_swap == 0:
332
+ logger.info(f"Move model to device: {device}")
333
+ target_device = device
334
+
335
+ if target_device is not None and target_dtype is not None:
336
+ model.to(target_device, target_dtype) # move and cast at the same time. this reduces redundant copy operations
337
+
338
+ # if args.compile:
339
+ # compile_backend, compile_mode, compile_dynamic, compile_fullgraph = args.compile_args
340
+ # logger.info(
341
+ # f"Torch Compiling[Backend: {compile_backend}; Mode: {compile_mode}; Dynamic: {compile_dynamic}; Fullgraph: {compile_fullgraph}]"
342
+ # )
343
+ # torch._dynamo.config.cache_size_limit = 32
344
+ # for i in range(len(model.blocks)):
345
+ # model.blocks[i] = torch.compile(
346
+ # model.blocks[i],
347
+ # backend=compile_backend,
348
+ # mode=compile_mode,
349
+ # dynamic=compile_dynamic.lower() in "true",
350
+ # fullgraph=compile_fullgraph.lower() in "true",
351
+ # )
352
+
353
+ if args.blocks_to_swap > 0:
354
+ logger.info(f"Enable swap {args.blocks_to_swap} blocks to CPU from device: {device}")
355
+ model.enable_block_swap(args.blocks_to_swap, device, supports_backward=False)
356
+ model.move_to_device_except_swap_blocks(device)
357
+ model.prepare_block_swap_before_forward()
358
+ else:
359
+ # make sure the model is on the right device
360
+ model.to(device)
361
+
362
+ model.eval().requires_grad_(False)
363
+ clean_memory_on_device(device)
364
+
365
+
366
+ # endregion
367
+
368
+
369
+ def decode_latent(
370
+ latent_window_size: int,
371
+ total_latent_sections: int,
372
+ bulk_decode: bool,
373
+ vae: AutoencoderKLCausal3D,
374
+ latent: torch.Tensor,
375
+ device: torch.device,
376
+ ) -> torch.Tensor:
377
+ logger.info(f"Decoding video...")
378
+ if latent.ndim == 4:
379
+ latent = latent.unsqueeze(0) # add batch dimension
380
+
381
+ vae.to(device)
382
+ if not bulk_decode:
383
+ latent_window_size = latent_window_size # default is 9
384
+ # total_latent_sections = (args.video_seconds * 30) / (latent_window_size * 4)
385
+ # total_latent_sections = int(max(round(total_latent_sections), 1))
386
+ num_frames = latent_window_size * 4 - 3
387
+
388
+ latents_to_decode = []
389
+ latent_frame_index = 0
390
+ for i in range(total_latent_sections - 1, -1, -1):
391
+ is_last_section = i == total_latent_sections - 1
392
+ generated_latent_frames = (num_frames + 3) // 4 + (1 if is_last_section else 0)
393
+ section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
394
+
395
+ section_latent = latent[:, :, latent_frame_index : latent_frame_index + section_latent_frames, :, :]
396
+ latents_to_decode.append(section_latent)
397
+
398
+ latent_frame_index += generated_latent_frames
399
+
400
+ latents_to_decode = latents_to_decode[::-1] # reverse the order of latents to decode
401
+
402
+ history_pixels = None
403
+ for latent in tqdm(latents_to_decode):
404
+ if history_pixels is None:
405
+ history_pixels = hunyuan.vae_decode(latent, vae).cpu()
406
+ else:
407
+ overlapped_frames = latent_window_size * 4 - 3
408
+ current_pixels = hunyuan.vae_decode(latent, vae).cpu()
409
+ history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
410
+ clean_memory_on_device(device)
411
+ else:
412
+ # bulk decode
413
+ logger.info(f"Bulk decoding")
414
+ history_pixels = hunyuan.vae_decode(latent, vae).cpu()
415
+ vae.to("cpu")
416
+
417
+ print(f"Decoded. Pixel shape {history_pixels.shape}")
418
+ return history_pixels[0] # remove batch dimension
419
+
420
+
421
+ def prepare_i2v_inputs(
422
+ args: argparse.Namespace,
423
+ device: torch.device,
424
+ vae: AutoencoderKLCausal3D,
425
+ encoded_context: Optional[Dict] = None,
426
+ encoded_context_n: Optional[Dict] = None,
427
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
428
+ """Prepare inputs for I2V
429
+
430
+ Args:
431
+ args: command line arguments
432
+ config: model configuration
433
+ device: device to use
434
+ vae: VAE model, used for image encoding
435
+ encoded_context: Pre-encoded text context
436
+
437
+ Returns:
438
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
439
+ (noise, context, context_null, y, (arg_c, arg_null))
440
+ """
441
+
442
+ height, width, video_seconds = check_inputs(args)
443
+
444
+ # prepare image
445
+ def preprocess_image(image_path: str):
446
+ image = Image.open(image_path).convert("RGB")
447
+
448
+ image_np = np.array(image) # PIL to numpy, HWC
449
+
450
+ image_np = image_video_dataset.resize_image_to_bucket(image_np, (width, height))
451
+ image_tensor = torch.from_numpy(image_np).float() / 127.5 - 1.0 # -1 to 1.0, HWC
452
+ image_tensor = image_tensor.permute(2, 0, 1)[None, :, None] # HWC -> CHW -> NCFHW, N=1, C=3, F=1
453
+ return image_tensor, image_np
454
+
455
+ img_tensor, img_np = preprocess_image(args.image_path)
456
+ if args.end_image_path is not None:
457
+ end_img_tensor, end_img_np = preprocess_image(args.end_image_path)
458
+ else:
459
+ end_img_tensor, end_img_np = None, None
460
+
461
+ # configure negative prompt
462
+ n_prompt = args.negative_prompt if args.negative_prompt else ""
463
+
464
+ if encoded_context is None:
465
+ # load text encoder
466
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
467
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
468
+ text_encoder2.to(device)
469
+
470
+ # parse section prompts
471
+ section_prompts = {}
472
+ if ";;;" in args.prompt:
473
+ section_prompt_strs = args.prompt.split(";;;")
474
+ for section_prompt_str in section_prompt_strs:
475
+ if ":" not in section_prompt_str:
476
+ start = end = 0
477
+ prompt_str = section_prompt_str.strip()
478
+ else:
479
+ index_str, prompt_str = section_prompt_str.split(":", 1)
480
+ index_str = index_str.strip()
481
+ prompt_str = prompt_str.strip()
482
+
483
+ m = re.match(r"^(-?\d+)(-\d+)?$", index_str)
484
+ if m:
485
+ start = int(m.group(1))
486
+ end = int(m.group(2)[1:]) if m.group(2) is not None else start
487
+ else:
488
+ start = end = 0
489
+ prompt_str = section_prompt_str.strip()
490
+ for i in range(start, end + 1):
491
+ section_prompts[i] = prompt_str
492
+ else:
493
+ section_prompts[0] = args.prompt
494
+
495
+ # assert 0 in section_prompts, "Section prompts must contain section 0"
496
+ if 0 not in section_prompts:
497
+ # use smallest section index. prefer positive index over negative index
498
+ # if all section indices are negative, use the smallest negative index
499
+ indices = list(section_prompts.keys())
500
+ if all(i < 0 for i in indices):
501
+ section_index = min(indices)
502
+ else:
503
+ section_index = min(i for i in indices if i >= 0)
504
+ section_prompts[0] = section_prompts[section_index]
505
+ print(section_prompts)
506
+
507
+ logger.info(f"Encoding prompt")
508
+ llama_vecs = {}
509
+ llama_attention_masks = {}
510
+ clip_l_poolers = {}
511
+ with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
512
+ for index, prompt in section_prompts.items():
513
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
514
+ llama_vec = llama_vec.cpu()
515
+ clip_l_pooler = clip_l_pooler.cpu()
516
+
517
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
518
+
519
+ llama_vecs[index] = llama_vec
520
+ llama_attention_masks[index] = llama_attention_mask
521
+ clip_l_poolers[index] = clip_l_pooler
522
+
523
+ if args.guidance_scale == 1.0:
524
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vecs[0]), torch.zeros_like(clip_l_poolers[0])
525
+ else:
526
+ with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
527
+ llama_vec_n, clip_l_pooler_n = hunyuan.encode_prompt_conds(
528
+ n_prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2
529
+ )
530
+ llama_vec_n = llama_vec_n.cpu()
531
+ clip_l_pooler_n = clip_l_pooler_n.cpu()
532
+
533
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
534
+
535
+ # free text encoder and clean memory
536
+ del text_encoder1, text_encoder2, tokenizer1, tokenizer2
537
+ clean_memory_on_device(device)
538
+
539
+ # load image encoder
540
+ feature_extractor, image_encoder = load_image_encoders(args)
541
+ image_encoder.to(device)
542
+
543
+ # encode image with image encoder
544
+ with torch.no_grad():
545
+ image_encoder_output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
546
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state.cpu()
547
+
548
+ if end_img_np is not None:
549
+ with torch.no_grad():
550
+ end_image_encoder_output = hf_clip_vision_encode(end_img_np, feature_extractor, image_encoder)
551
+ end_image_encoder_last_hidden_state = end_image_encoder_output.last_hidden_state.cpu()
552
+ else:
553
+ end_image_encoder_last_hidden_state = None
554
+
555
+ # free image encoder and clean memory
556
+ del image_encoder, feature_extractor
557
+ clean_memory_on_device(device)
558
+ else:
559
+ # Use pre-encoded context
560
+ llama_vecs = encoded_context["llama_vecs"]
561
+ llama_attention_masks = encoded_context["llama_attention_masks"]
562
+ clip_l_poolers = encoded_context["clip_l_poolers"]
563
+ llama_vec_n = encoded_context_n["llama_vec"]
564
+ llama_attention_mask_n = encoded_context_n["llama_attention_mask"]
565
+ clip_l_pooler_n = encoded_context_n["clip_l_pooler"]
566
+ image_encoder_last_hidden_state = encoded_context["image_encoder_last_hidden_state"]
567
+
568
+ # # end frame image
569
+ # if args.end_image_path is not None:
570
+ # end_img = Image.open(args.end_image_path).convert("RGB")
571
+ # end_img_cv2 = np.array(end_img) # PIL to numpy
572
+ # else:
573
+ # end_img = None
574
+ # end_img_cv2 = None
575
+ # has_end_image = end_img is not None
576
+
577
+ # VAE encoding
578
+ logger.info(f"Encoding image to latent space")
579
+ vae.to(device)
580
+ start_latent = hunyuan.vae_encode(img_tensor, vae).cpu()
581
+ if end_img_tensor is not None:
582
+ end_latent = hunyuan.vae_encode(end_img_tensor, vae).cpu()
583
+ else:
584
+ end_latent = None
585
+ vae.to("cpu") # move VAE to CPU to save memory
586
+ clean_memory_on_device(device)
587
+
588
+ # prepare model input arguments
589
+ arg_c = {}
590
+ for index in llama_vecs.keys():
591
+ llama_vec = llama_vecs[index]
592
+ llama_attention_mask = llama_attention_masks[index]
593
+ clip_l_pooler = clip_l_poolers[index]
594
+ arg_c_i = {
595
+ "llama_vec": llama_vec,
596
+ "llama_attention_mask": llama_attention_mask,
597
+ "clip_l_pooler": clip_l_pooler,
598
+ "image_encoder_last_hidden_state": image_encoder_last_hidden_state,
599
+ "end_image_encoder_last_hidden_state": end_image_encoder_last_hidden_state,
600
+ "prompt": section_prompts[index], # for debugging
601
+ }
602
+ arg_c[index] = arg_c_i
603
+
604
+ arg_null = {
605
+ "llama_vec": llama_vec_n,
606
+ "llama_attention_mask": llama_attention_mask_n,
607
+ "clip_l_pooler": clip_l_pooler_n,
608
+ "image_encoder_last_hidden_state": image_encoder_last_hidden_state,
609
+ "end_image_encoder_last_hidden_state": end_image_encoder_last_hidden_state,
610
+ }
611
+
612
+ return height, width, video_seconds, start_latent, end_latent, arg_c, arg_null
613
+
614
+
615
+ # def setup_scheduler(args: argparse.Namespace, config, device: torch.device) -> Tuple[Any, torch.Tensor]:
616
+ # """setup scheduler for sampling
617
+
618
+ # Args:
619
+ # args: command line arguments
620
+ # config: model configuration
621
+ # device: device to use
622
+
623
+ # Returns:
624
+ # Tuple[Any, torch.Tensor]: (scheduler, timesteps)
625
+ # """
626
+ # if args.sample_solver == "unipc":
627
+ # scheduler = FlowUniPCMultistepScheduler(num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False)
628
+ # scheduler.set_timesteps(args.infer_steps, device=device, shift=args.flow_shift)
629
+ # timesteps = scheduler.timesteps
630
+ # elif args.sample_solver == "dpm++":
631
+ # scheduler = FlowDPMSolverMultistepScheduler(
632
+ # num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False
633
+ # )
634
+ # sampling_sigmas = get_sampling_sigmas(args.infer_steps, args.flow_shift)
635
+ # timesteps, _ = retrieve_timesteps(scheduler, device=device, sigmas=sampling_sigmas)
636
+ # elif args.sample_solver == "vanilla":
637
+ # scheduler = FlowMatchDiscreteScheduler(num_train_timesteps=config.num_train_timesteps, shift=args.flow_shift)
638
+ # scheduler.set_timesteps(args.infer_steps, device=device)
639
+ # timesteps = scheduler.timesteps
640
+
641
+ # # FlowMatchDiscreteScheduler does not support generator argument in step method
642
+ # org_step = scheduler.step
643
+
644
+ # def step_wrapper(
645
+ # model_output: torch.Tensor,
646
+ # timestep: Union[int, torch.Tensor],
647
+ # sample: torch.Tensor,
648
+ # return_dict: bool = True,
649
+ # generator=None,
650
+ # ):
651
+ # return org_step(model_output, timestep, sample, return_dict=return_dict)
652
+
653
+ # scheduler.step = step_wrapper
654
+ # else:
655
+ # raise NotImplementedError("Unsupported solver.")
656
+
657
+ # return scheduler, timesteps
658
+
659
+
660
+ def generate(args: argparse.Namespace, gen_settings: GenerationSettings, shared_models: Optional[Dict] = None) -> torch.Tensor:
661
+ """main function for generation
662
+
663
+ Args:
664
+ args: command line arguments
665
+ shared_models: dictionary containing pre-loaded models and encoded data
666
+
667
+ Returns:
668
+ tuple: (VAE model, generated latent history)
669
+ """
670
+ device, dit_weight_dtype = (gen_settings.device, gen_settings.dit_weight_dtype)
671
+
672
+ # prepare seed
673
+ seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
674
+ args.seed = seed # set seed to args for saving
675
+
676
+ # Check if we have shared models
677
+ if shared_models is not None:
678
+ # Use shared models and encoded data
679
+ vae = shared_models.get("vae")
680
+ model = shared_models.get("model")
681
+ encoded_context = shared_models.get("encoded_contexts", {}).get(args.prompt)
682
+ n_prompt = args.negative_prompt if args.negative_prompt else ""
683
+ encoded_context_n = shared_models.get("encoded_contexts", {}).get(n_prompt)
684
+
685
+ height, width, video_seconds, start_latent, end_latent, context, context_null = prepare_i2v_inputs(
686
+ args, device, vae, encoded_context, encoded_context_n
687
+ )
688
+ else:
689
+ # prepare inputs without shared models
690
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
691
+ height, width, video_seconds, start_latent, end_latent, context, context_null = prepare_i2v_inputs(args, device, vae)
692
+
693
+ # load DiT model
694
+ model = load_dit_model(args, device)
695
+
696
+ # merge LoRA weights
697
+ if args.lora_weight is not None and len(args.lora_weight) > 0:
698
+ merge_lora_weights(lora_framepack, model, args, device) # ugly hack to common merge_lora_weights function
699
+ # if we only want to save the model, we can skip the rest
700
+ if args.save_merged_model:
701
+ return None
702
+
703
+ # optimize model: fp8 conversion, block swap etc.
704
+ optimize_model(model, args, device)
705
+
706
+ # sampling
707
+ latent_window_size = args.latent_window_size # default is 9
708
+ # ex: (5s * 30fps) / (9 * 4) = 4.16 -> 4 sections, 60s -> 1800 / 36 = 50 sections
709
+ total_latent_sections = (video_seconds * 30) / (latent_window_size * 4)
710
+ total_latent_sections = int(max(round(total_latent_sections), 1))
711
+
712
+ # set random generator
713
+ seed_g = torch.Generator(device="cpu")
714
+ seed_g.manual_seed(seed)
715
+ num_frames = latent_window_size * 4 - 3
716
+
717
+ logger.info(
718
+ f"Video size: {height}x{width}@{video_seconds} (HxW@seconds), fps: {args.fps}, "
719
+ f"infer_steps: {args.infer_steps}, frames per generation: {num_frames}"
720
+ )
721
+
722
+ history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
723
+ # history_pixels = None
724
+ total_generated_latent_frames = 0
725
+
726
+ latent_paddings = reversed(range(total_latent_sections))
727
+
728
+ if total_latent_sections > 4:
729
+ # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
730
+ # items looks better than expanding it when total_latent_sections > 4
731
+ # One can try to remove below trick and just
732
+ # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
733
+ # 4 sections: 3, 2, 1, 0. 50 sections: 3, 2, 2, ... 2, 1, 0
734
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
735
+
736
+ for section_index_reverse, latent_padding in enumerate(latent_paddings):
737
+ section_index = total_latent_sections - 1 - section_index_reverse
738
+
739
+ is_last_section = latent_padding == 0
740
+ is_first_section = section_index_reverse == 0
741
+ latent_padding_size = latent_padding * latent_window_size
742
+
743
+ logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
744
+
745
+ reference_start_latent = start_latent
746
+ apply_end_image = args.end_image_path is not None and is_first_section
747
+ if apply_end_image:
748
+ latent_padding_size = 0
749
+ reference_start_latent = end_latent
750
+ logger.info(f"Apply experimental end image, latent_padding_size = {latent_padding_size}")
751
+
752
+ # sum([1, 3, 9, 1, 2, 16]) = 32
753
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
754
+ (
755
+ clean_latent_indices_pre,
756
+ blank_indices,
757
+ latent_indices,
758
+ clean_latent_indices_post,
759
+ clean_latent_2x_indices,
760
+ clean_latent_4x_indices,
761
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
762
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
763
+
764
+ clean_latents_pre = reference_start_latent.to(history_latents)
765
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
766
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
767
+
768
+ # if use_teacache:
769
+ # transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
770
+ # else:
771
+ # transformer.initialize_teacache(enable_teacache=False)
772
+
773
+ section_index_from_last = -(section_index_reverse + 1) # -1, -2 ...
774
+ if section_index_from_last in context:
775
+ prompt_index = section_index_from_last
776
+ elif section_index in context:
777
+ prompt_index = section_index
778
+ else:
779
+ prompt_index = 0
780
+ context_for_index = context[prompt_index]
781
+ # if args.section_prompts is not None:
782
+ logger.info(f"Section {section_index}: {context_for_index['prompt']}")
783
+
784
+ llama_vec = context_for_index["llama_vec"].to(device, dtype=torch.bfloat16)
785
+ llama_attention_mask = context_for_index["llama_attention_mask"].to(device)
786
+ clip_l_pooler = context_for_index["clip_l_pooler"].to(device, dtype=torch.bfloat16)
787
+
788
+ if not apply_end_image:
789
+ image_encoder_last_hidden_state = context_for_index["image_encoder_last_hidden_state"].to(device, dtype=torch.bfloat16)
790
+ else:
791
+ image_encoder_last_hidden_state = context_for_index["end_image_encoder_last_hidden_state"].to(
792
+ device, dtype=torch.bfloat16
793
+ )
794
+
795
+ llama_vec_n = context_null["llama_vec"].to(device, dtype=torch.bfloat16)
796
+ llama_attention_mask_n = context_null["llama_attention_mask"].to(device)
797
+ clip_l_pooler_n = context_null["clip_l_pooler"].to(device, dtype=torch.bfloat16)
798
+
799
+ generated_latents = sample_hunyuan(
800
+ transformer=model,
801
+ sampler=args.sample_solver,
802
+ width=width,
803
+ height=height,
804
+ frames=num_frames,
805
+ real_guidance_scale=args.guidance_scale,
806
+ distilled_guidance_scale=args.embedded_cfg_scale,
807
+ guidance_rescale=args.guidance_rescale,
808
+ # shift=3.0,
809
+ num_inference_steps=args.infer_steps,
810
+ generator=seed_g,
811
+ prompt_embeds=llama_vec,
812
+ prompt_embeds_mask=llama_attention_mask,
813
+ prompt_poolers=clip_l_pooler,
814
+ negative_prompt_embeds=llama_vec_n,
815
+ negative_prompt_embeds_mask=llama_attention_mask_n,
816
+ negative_prompt_poolers=clip_l_pooler_n,
817
+ device=device,
818
+ dtype=torch.bfloat16,
819
+ image_embeddings=image_encoder_last_hidden_state,
820
+ latent_indices=latent_indices,
821
+ clean_latents=clean_latents,
822
+ clean_latent_indices=clean_latent_indices,
823
+ clean_latents_2x=clean_latents_2x,
824
+ clean_latent_2x_indices=clean_latent_2x_indices,
825
+ clean_latents_4x=clean_latents_4x,
826
+ clean_latent_4x_indices=clean_latent_4x_indices,
827
+ )
828
+
829
+ if is_last_section:
830
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
831
+
832
+ total_generated_latent_frames += int(generated_latents.shape[2])
833
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
834
+
835
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
836
+
837
+ logger.info(f"Generated. Latent shape {real_history_latents.shape}")
838
+
839
+ # # TODO support saving intermediate video
840
+ # clean_memory_on_device(device)
841
+ # vae.to(device)
842
+ # if history_pixels is None:
843
+ # history_pixels = hunyuan.vae_decode(real_history_latents, vae).cpu()
844
+ # else:
845
+ # section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
846
+ # overlapped_frames = latent_window_size * 4 - 3
847
+ # current_pixels = hunyuan.vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
848
+ # history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
849
+ # vae.to("cpu")
850
+ # # if not is_last_section:
851
+ # # # save intermediate video
852
+ # # save_video(history_pixels[0], args, total_generated_latent_frames)
853
+ # print(f"Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}")
854
+
855
+ # Only clean up shared models if they were created within this function
856
+ if shared_models is None:
857
+ # free memory
858
+ del model
859
+ # del scheduler
860
+ synchronize_device(device)
861
+
862
+ # wait for 5 seconds until block swap is done
863
+ logger.info("Waiting for 5 seconds to finish block swap")
864
+ time.sleep(5)
865
+
866
+ gc.collect()
867
+ clean_memory_on_device(device)
868
+
869
+ return vae, real_history_latents
870
+
871
+
872
+ def save_latent(latent: torch.Tensor, args: argparse.Namespace, height: int, width: int) -> str:
873
+ """Save latent to file
874
+
875
+ Args:
876
+ latent: Latent tensor
877
+ args: command line arguments
878
+ height: height of frame
879
+ width: width of frame
880
+
881
+ Returns:
882
+ str: Path to saved latent file
883
+ """
884
+ save_path = args.save_path
885
+ os.makedirs(save_path, exist_ok=True)
886
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
887
+
888
+ seed = args.seed
889
+ video_seconds = args.video_seconds
890
+ latent_path = f"{save_path}/{time_flag}_{seed}_latent.safetensors"
891
+
892
+ if args.no_metadata:
893
+ metadata = None
894
+ else:
895
+ metadata = {
896
+ "seeds": f"{seed}",
897
+ "prompt": f"{args.prompt}",
898
+ "height": f"{height}",
899
+ "width": f"{width}",
900
+ "video_seconds": f"{video_seconds}",
901
+ "infer_steps": f"{args.infer_steps}",
902
+ "guidance_scale": f"{args.guidance_scale}",
903
+ "latent_window_size": f"{args.latent_window_size}",
904
+ "embedded_cfg_scale": f"{args.embedded_cfg_scale}",
905
+ "guidance_rescale": f"{args.guidance_rescale}",
906
+ "sample_solver": f"{args.sample_solver}",
908
+ "fps": f"{args.fps}",
909
+ }
910
+ if args.negative_prompt is not None:
911
+ metadata["negative_prompt"] = f"{args.negative_prompt}"
912
+
913
+ sd = {"latent": latent.contiguous()}
914
+ save_file(sd, latent_path, metadata=metadata)
915
+ logger.info(f"Latent saved to: {latent_path}")
916
+
917
+ return latent_path
918
+
919
+
920
+ def save_video(
921
+ video: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None, latent_frames: Optional[int] = None
922
+ ) -> str:
923
+ """Save video to file
924
+
925
+ Args:
926
+ video: Video tensor
927
+ args: command line arguments
928
+ original_base_name: Original base name (if latents are loaded from files)
929
+
930
+ Returns:
931
+ str: Path to saved video file
932
+ """
933
+ save_path = args.save_path
934
+ os.makedirs(save_path, exist_ok=True)
935
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
936
+
937
+ seed = args.seed
938
+ original_name = "" if original_base_name is None else f"_{original_base_name}"
939
+ latent_frames = "" if latent_frames is None else f"_{latent_frames}"
940
+ video_path = f"{save_path}/{time_flag}_{seed}{original_name}{latent_frames}.mp4"
941
+
942
+ video = video.unsqueeze(0)
943
+ save_videos_grid(video, video_path, fps=args.fps, rescale=True)
944
+ logger.info(f"Video saved to: {video_path}")
945
+
946
+ return video_path
947
+
948
+
949
+ def save_images(sample: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None) -> str:
950
+ """Save images to directory
951
+
952
+ Args:
953
+ sample: Video tensor
954
+ args: command line arguments
955
+ original_base_name: Original base name (if latents are loaded from files)
956
+
957
+ Returns:
958
+ str: Path to saved images directory
959
+ """
960
+ save_path = args.save_path
961
+ os.makedirs(save_path, exist_ok=True)
962
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
963
+
964
+ seed = args.seed
965
+ original_name = "" if original_base_name is None else f"_{original_base_name}"
966
+ image_name = f"{time_flag}_{seed}{original_name}"
967
+ sample = sample.unsqueeze(0)
968
+ save_images_grid(sample, save_path, image_name, rescale=True)
969
+ logger.info(f"Sample images saved to: {save_path}/{image_name}")
970
+
971
+ return f"{save_path}/{image_name}"
972
+
973
+
974
+ def save_output(
975
+ args: argparse.Namespace,
976
+ vae: AutoencoderKLCausal3D,
977
+ latent: torch.Tensor,
978
+ device: torch.device,
979
+ original_base_names: Optional[List[str]] = None,
980
+ ) -> None:
981
+ """save output
982
+
983
+ Args:
984
+ args: command line arguments
985
+ vae: VAE model
986
+ latent: latent tensor
987
+ device: device to use
988
+ original_base_names: original base names (if latents are loaded from files)
989
+ """
990
+ height, width = latent.shape[-2], latent.shape[-1] # BCTHW
991
+ height *= 8
992
+ width *= 8
993
+ # print(f"Saving output. Latent shape {latent.shape}; pixel shape {height}x{width}")
994
+ if args.output_type == "latent" or args.output_type == "both":
995
+ # save latent
996
+ save_latent(latent, args, height, width)
997
+ if args.output_type == "latent":
998
+ return
999
+
1000
+ total_latent_sections = (args.video_seconds * 30) / (args.latent_window_size * 4)
1001
+ total_latent_sections = int(max(round(total_latent_sections), 1))
1002
+ video = decode_latent(args.latent_window_size, total_latent_sections, args.bulk_decode, vae, latent, device)
1003
+
1004
+ if args.output_type == "video" or args.output_type == "both":
1005
+ # save video
1006
+ original_name = "" if original_base_names is None else f"_{original_base_names[0]}"
1007
+ save_video(video, args, original_name)
1008
+
1009
+ elif args.output_type == "images":
1010
+ # save images
1011
+ original_name = "" if original_base_names is None else f"_{original_base_names[0]}"
1012
+ save_images(video, args, original_name)
1013
+
1014
+
1015
+ def preprocess_prompts_for_batch(prompt_lines: List[str], base_args: argparse.Namespace) -> List[Dict]:
1016
+ """Process multiple prompts for batch mode
1017
+
1018
+ Args:
1019
+ prompt_lines: List of prompt lines
1020
+ base_args: Base command line arguments
1021
+
1022
+ Returns:
1023
+ List[Dict]: List of prompt data dictionaries
1024
+ """
1025
+ prompts_data = []
1026
+
1027
+ for line in prompt_lines:
1028
+ line = line.strip()
1029
+ if not line or line.startswith("#"): # Skip empty lines and comments
1030
+ continue
1031
+
1032
+ # Parse prompt line and create override dictionary
1033
+ prompt_data = parse_prompt_line(line)
1034
+ logger.info(f"Parsed prompt data: {prompt_data}")
1035
+ prompts_data.append(prompt_data)
1036
+
1037
+ return prompts_data
1038
+
1039
+
1040
+ def get_generation_settings(args: argparse.Namespace) -> GenerationSettings:
1041
+ device = torch.device(args.device)
1042
+
1043
+ dit_weight_dtype = None # default
1044
+ if args.fp8_scaled:
1045
+ dit_weight_dtype = None # various precision weights, so don't cast to specific dtype
1046
+ elif args.fp8:
1047
+ dit_weight_dtype = torch.float8_e4m3fn
1048
+
1049
+ logger.info(f"Using device: {device}, DiT weight weight precision: {dit_weight_dtype}")
1050
+
1051
+ gen_settings = GenerationSettings(device=device, dit_weight_dtype=dit_weight_dtype)
1052
+ return gen_settings
1053
+
1054
+
1055
+ def main():
1056
+ # Parse arguments
1057
+ args = parse_args()
1058
+
1059
+ # Check if latents are provided
1060
+ latents_mode = args.latent_path is not None and len(args.latent_path) > 0
1061
+
1062
+ # Set device
1063
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
1064
+ device = torch.device(device)
1065
+ logger.info(f"Using device: {device}")
1066
+ args.device = device
1067
+
1068
+ if latents_mode:
1069
+ # Original latent decode mode
1070
+ original_base_names = []
1071
+ latents_list = []
1072
+ seeds = []
1073
+
1074
+ assert len(args.latent_path) == 1, "Only one latent path is supported for now"
1075
+
1076
+ for latent_path in args.latent_path:
1077
+ original_base_names.append(os.path.splitext(os.path.basename(latent_path))[0])
1078
+ seed = 0
1079
+
1080
+ if os.path.splitext(latent_path)[1] != ".safetensors":
1081
+ latents = torch.load(latent_path, map_location="cpu")
1082
+ else:
1083
+ latents = load_file(latent_path)["latent"]
1084
+ with safe_open(latent_path, framework="pt") as f:
1085
+ metadata = f.metadata()
1086
+ if metadata is None:
1087
+ metadata = {}
1088
+ logger.info(f"Loaded metadata: {metadata}")
1089
+
1090
+ if "seeds" in metadata:
1091
+ seed = int(metadata["seeds"])
1092
+ if "height" in metadata and "width" in metadata:
1093
+ height = int(metadata["height"])
1094
+ width = int(metadata["width"])
1095
+ args.video_size = [height, width]
1096
+ if "video_seconds" in metadata:
1097
+ args.video_seconds = float(metadata["video_seconds"])
1098
+
1099
+ seeds.append(seed)
1100
+ logger.info(f"Loaded latent from {latent_path}. Shape: {latents.shape}")
1101
+
1102
+ if latents.ndim == 5: # [BCTHW]
1103
+ latents = latents.squeeze(0) # [CTHW]
1104
+
1105
+ latents_list.append(latents)
1106
+
1107
+ latent = torch.stack(latents_list, dim=0) # [N, ...], must be same shape
1108
+
1109
+ args.seed = seeds[0]
1110
+
1111
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
1112
+ save_output(args, vae, latent, device, original_base_names)
1113
+
1114
+ elif args.from_file:
1115
+ # Batch mode from file
1116
+
1117
+ # Read prompts from file
1118
+ with open(args.from_file, "r", encoding="utf-8") as f:
1119
+ prompt_lines = f.readlines()
1120
+
1121
+ # Process prompts
1122
+ prompts_data = preprocess_prompts_for_batch(prompt_lines, args)
1123
+ # process_batch_prompts(prompts_data, args)
1124
+ raise NotImplementedError("Batch mode is not implemented yet.")
1125
+
1126
+ elif args.interactive:
1127
+ # Interactive mode
1128
+ # process_interactive(args)
1129
+ raise NotImplementedError("Interactive mode is not implemented yet.")
1130
+
1131
+ else:
1132
+ # Single prompt mode (original behavior)
1133
+
1134
+ # Generate latent
1135
+ gen_settings = get_generation_settings(args)
1136
+ vae, latent = generate(args, gen_settings)
1137
+ # print(f"Generated latent shape: {latent.shape}")
1138
+
1139
+ # # Save latent and video
1140
+ # if args.save_merged_model:
1141
+ # return
1142
+
1143
+ save_output(args, vae, latent[0], device)
1144
+
1145
+ logger.info("Done!")
1146
+
1147
+
1148
+ if __name__ == "__main__":
1149
+ main()
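
The section-prompt handling near the top of `fpack_generate_video.py` accepts entries of the form `index:prompt` or `start-end:prompt`, with negative indices counted back from the last section. A minimal sketch of how the index part resolves (the splitting of the full prompt string into individual entries happens earlier in the script and is assumed here):

```python
import re

# Sketch of the section-index parsing used in fpack_generate_video.py above.
def parse_section_index(index_str: str) -> tuple:
    m = re.match(r"^(-?\d+)(-\d+)?$", index_str)
    if m:
        start = int(m.group(1))
        end = int(m.group(2)[1:]) if m.group(2) is not None else start
    else:
        start = end = 0  # unparsable index falls back to section 0
    return start, end

print(parse_section_index("2"))    # (2, 2)   -> section 2 only
print(parse_section_index("1-3"))  # (1, 3)   -> sections 1, 2 and 3
print(parse_section_index("-1"))   # (-1, -1) -> the last section of the video
```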
fpack_train_network.py ADDED
@@ -0,0 +1,410 @@
1
+ import argparse
2
+ import gc
3
+ import math
4
+ import time
5
+ from typing import Optional
6
+ from PIL import Image
7
+
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torchvision.transforms.functional as TF
12
+ from tqdm import tqdm
13
+ from accelerate import Accelerator, init_empty_weights
14
+
15
+ from dataset import image_video_dataset
16
+ from dataset.image_video_dataset import ARCHITECTURE_FRAMEPACK, ARCHITECTURE_FRAMEPACK_FULL, load_video
17
+ from fpack_generate_video import decode_latent
18
+ from frame_pack import hunyuan
19
+ from frame_pack.clip_vision import hf_clip_vision_encode
20
+ from frame_pack.framepack_utils import load_image_encoders, load_text_encoder1, load_text_encoder2
21
+ from frame_pack.framepack_utils import load_vae as load_framepack_vae
22
+ from frame_pack.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked, load_packed_model
23
+ from frame_pack.k_diffusion_hunyuan import sample_hunyuan
24
+ from frame_pack.utils import crop_or_pad_yield_mask
25
+ from dataset.image_video_dataset import resize_image_to_bucket
26
+ from hv_train_network import NetworkTrainer, load_prompts, clean_memory_on_device, setup_parser_common, read_config_from_file
27
+
28
+ import logging
29
+
30
+ logger = logging.getLogger(__name__)
31
+ logging.basicConfig(level=logging.INFO)
32
+
33
+ from utils import model_utils
34
+ from utils.safetensors_utils import load_safetensors, MemoryEfficientSafeOpen
35
+
36
+
37
+ class FramePackNetworkTrainer(NetworkTrainer):
38
+ def __init__(self):
39
+ super().__init__()
40
+
41
+ # region model specific
42
+
43
+ @property
44
+ def architecture(self) -> str:
45
+ return ARCHITECTURE_FRAMEPACK
46
+
47
+ @property
48
+ def architecture_full_name(self) -> str:
49
+ return ARCHITECTURE_FRAMEPACK_FULL
50
+
51
+ def handle_model_specific_args(self, args):
52
+ self._i2v_training = True
53
+ self._control_training = False
54
+ self.default_guidance_scale = 10.0 # embedded guidance scale
55
+
56
+ def process_sample_prompts(
57
+ self,
58
+ args: argparse.Namespace,
59
+ accelerator: Accelerator,
60
+ sample_prompts: str,
61
+ ):
62
+ device = accelerator.device
63
+
64
+ logger.info(f"cache Text Encoder outputs for sample prompt: {sample_prompts}")
65
+ prompts = load_prompts(sample_prompts)
66
+
67
+ # load text encoder
68
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
69
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
70
+ text_encoder2.to(device)
71
+
72
+ sample_prompts_te_outputs = {} # (prompt) -> (t1 embeds, t1 mask, t2 embeds)
73
+ for prompt_dict in prompts:
74
+ for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
75
+ if p is None or p in sample_prompts_te_outputs:
76
+ continue
77
+ logger.info(f"cache Text Encoder outputs for prompt: {p}")
78
+ with torch.amp.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
79
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(p, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
80
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
81
+
82
+ llama_vec = llama_vec.to("cpu")
83
+ llama_attention_mask = llama_attention_mask.to("cpu")
84
+ clip_l_pooler = clip_l_pooler.to("cpu")
85
+ sample_prompts_te_outputs[p] = (llama_vec, llama_attention_mask, clip_l_pooler)
86
+ del text_encoder1, text_encoder2
87
+ clean_memory_on_device(device)
88
+
89
+ # image embedding for I2V training
90
+ feature_extractor, image_encoder = load_image_encoders(args)
91
+ image_encoder.to(device)
92
+
93
+ # encode image with image encoder
94
+ sample_prompts_image_embs = {}
95
+ for prompt_dict in prompts:
96
+ image_path = prompt_dict.get("image_path", None)
97
+ assert image_path is not None, "image_path should be set for I2V training"
98
+ if image_path in sample_prompts_image_embs:
99
+ continue
100
+
101
+ logger.info(f"Encoding image to image encoder context: {image_path}")
102
+
103
+ height = prompt_dict.get("height", 256)
104
+ width = prompt_dict.get("width", 256)
105
+
106
+ img = Image.open(image_path).convert("RGB")
107
+ img_np = np.array(img) # PIL to numpy, HWC
108
+ img_np = image_video_dataset.resize_image_to_bucket(img_np, (width, height)) # returns a numpy array
109
+
110
+ with torch.no_grad():
111
+ image_encoder_output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
112
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
113
+
114
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to("cpu")
115
+ sample_prompts_image_embs[image_path] = image_encoder_last_hidden_state
116
+
117
+ del image_encoder
118
+ clean_memory_on_device(device)
119
+
120
+ # prepare sample parameters
121
+ sample_parameters = []
122
+ for prompt_dict in prompts:
123
+ prompt_dict_copy = prompt_dict.copy()
124
+
125
+ p = prompt_dict.get("prompt", "")
126
+ llama_vec, llama_attention_mask, clip_l_pooler = sample_prompts_te_outputs[p]
127
+ prompt_dict_copy["llama_vec"] = llama_vec
128
+ prompt_dict_copy["llama_attention_mask"] = llama_attention_mask
129
+ prompt_dict_copy["clip_l_pooler"] = clip_l_pooler
130
+
131
+ p = prompt_dict.get("negative_prompt", "")
132
+ llama_vec, llama_attention_mask, clip_l_pooler = sample_prompts_te_outputs[p]
133
+ prompt_dict_copy["negative_llama_vec"] = llama_vec
134
+ prompt_dict_copy["negative_llama_attention_mask"] = llama_attention_mask
135
+ prompt_dict_copy["negative_clip_l_pooler"] = clip_l_pooler
136
+
137
+ p = prompt_dict.get("image_path", None)
138
+ prompt_dict_copy["image_encoder_last_hidden_state"] = sample_prompts_image_embs[p]
139
+
140
+ sample_parameters.append(prompt_dict_copy)
141
+
142
+ clean_memory_on_device(accelerator.device)
143
+ return sample_parameters
144
+
145
+ def do_inference(
146
+ self,
147
+ accelerator,
148
+ args,
149
+ sample_parameter,
150
+ vae,
151
+ dit_dtype,
152
+ transformer,
153
+ discrete_flow_shift,
154
+ sample_steps,
155
+ width,
156
+ height,
157
+ frame_count,
158
+ generator,
159
+ do_classifier_free_guidance,
160
+ guidance_scale,
161
+ cfg_scale,
162
+ image_path=None,
163
+ control_video_path=None,
164
+ ):
165
+ """architecture dependent inference"""
166
+ model: HunyuanVideoTransformer3DModelPacked = transformer
167
+ device = accelerator.device
168
+ if cfg_scale is None:
169
+ cfg_scale = 1.0
170
+ do_classifier_free_guidance = do_classifier_free_guidance and cfg_scale != 1.0
171
+
172
+ # prepare parameters
173
+ latent_window_size = args.latent_window_size # default is 9
174
+ latent_f = (frame_count - 1) // 4 + 1
175
+ total_latent_sections = math.floor((latent_f - 1) / latent_window_size)
176
+ if total_latent_sections < 1:
177
+ logger.warning(f"Not enough frames for FramePack: {latent_f}, minimum: {latent_window_size*4+1}")
178
+ return None
179
+
180
+ latent_f = total_latent_sections * latent_window_size + 1
181
+ actual_frame_count = (latent_f - 1) * 4 + 1
182
+ if actual_frame_count != frame_count:
183
+ logger.info(f"Frame count mismatch: {actual_frame_count} != {frame_count}, trimming to {actual_frame_count}")
184
+ frame_count = actual_frame_count
185
+ num_frames = latent_window_size * 4 - 3
186
+
187
+ # prepare start latent
188
+ image = Image.open(image_path).convert("RGB")
189
+ image = resize_image_to_bucket(image, (width, height)) # returns a numpy array
190
+ image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(1).unsqueeze(0).float() # 1, C, 1, H, W
191
+ image = image / 127.5 - 1 # -1 to 1
192
+
193
+ # VAE encoding
194
+ logger.info(f"Encoding image to latent space")
195
+ vae.to(device)
196
+ start_latent = hunyuan.vae_encode(image, vae)
197
+ vae.to("cpu") # move VAE to CPU to save memory
198
+ clean_memory_on_device(device)
199
+
200
+ # sampling
201
+ history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
202
+ total_generated_latent_frames = 0
203
+
204
+ latent_paddings = reversed(range(total_latent_sections))
205
+
206
+ if total_latent_sections > 4:
207
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
208
+
209
+ for latent_padding in latent_paddings:
210
+ is_last_section = latent_padding == 0
211
+ latent_padding_size = latent_padding * latent_window_size
212
+
213
+ logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
214
+
215
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
216
+ (
217
+ clean_latent_indices_pre,
218
+ blank_indices,
219
+ latent_indices,
220
+ clean_latent_indices_post,
221
+ clean_latent_2x_indices,
222
+ clean_latent_4x_indices,
223
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
224
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
225
+
226
+ clean_latents_pre = start_latent.to(history_latents)
227
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
228
+ [1, 2, 16], dim=2
229
+ )
230
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
231
+
232
+ # if use_teacache:
233
+ # transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
234
+ # else:
235
+ # transformer.initialize_teacache(enable_teacache=False)
236
+
237
+ llama_vec = sample_parameter["llama_vec"].to(device, dtype=torch.bfloat16)
238
+ llama_attention_mask = sample_parameter["llama_attention_mask"].to(device)
239
+ clip_l_pooler = sample_parameter["clip_l_pooler"].to(device, dtype=torch.bfloat16)
240
+ if cfg_scale == 1.0:
241
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
242
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
243
+ else:
244
+ llama_vec_n = sample_parameter["negative_llama_vec"].to(device, dtype=torch.bfloat16)
245
+ llama_attention_mask_n = sample_parameter["negative_llama_attention_mask"].to(device)
246
+ clip_l_pooler_n = sample_parameter["negative_clip_l_pooler"].to(device, dtype=torch.bfloat16)
247
+ image_encoder_last_hidden_state = sample_parameter["image_encoder_last_hidden_state"].to(device, dtype=torch.bfloat16)
248
+
249
+ generated_latents = sample_hunyuan(
250
+ transformer=model,
251
+ sampler=args.sample_solver,
252
+ width=width,
253
+ height=height,
254
+ frames=num_frames,
255
+ real_guidance_scale=cfg_scale,
256
+ distilled_guidance_scale=guidance_scale,
257
+ guidance_rescale=0.0,
258
+ # shift=3.0,
259
+ num_inference_steps=sample_steps,
260
+ generator=generator,
261
+ prompt_embeds=llama_vec,
262
+ prompt_embeds_mask=llama_attention_mask,
263
+ prompt_poolers=clip_l_pooler,
264
+ negative_prompt_embeds=llama_vec_n,
265
+ negative_prompt_embeds_mask=llama_attention_mask_n,
266
+ negative_prompt_poolers=clip_l_pooler_n,
267
+ device=device,
268
+ dtype=torch.bfloat16,
269
+ image_embeddings=image_encoder_last_hidden_state,
270
+ latent_indices=latent_indices,
271
+ clean_latents=clean_latents,
272
+ clean_latent_indices=clean_latent_indices,
273
+ clean_latents_2x=clean_latents_2x,
274
+ clean_latent_2x_indices=clean_latent_2x_indices,
275
+ clean_latents_4x=clean_latents_4x,
276
+ clean_latent_4x_indices=clean_latent_4x_indices,
277
+ )
278
+
279
+ if is_last_section:
280
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
281
+
282
+ total_generated_latent_frames += int(generated_latents.shape[2])
283
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
284
+
285
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
286
+
287
+ logger.info(f"Generated. Latent shape {real_history_latents.shape}")
288
+
289
+ # wait for 5 seconds until block swap is done
290
+ logger.info("Waiting for 5 seconds to finish block swap")
291
+ time.sleep(5)
292
+
293
+ gc.collect()
294
+ clean_memory_on_device(device)
295
+
296
+ video = decode_latent(latent_window_size, total_latent_sections, args.bulk_decode, vae, real_history_latents, device)
297
+ video = video.to("cpu", dtype=torch.float32).unsqueeze(0) # add batch dimension
298
+ video = (video / 2 + 0.5).clamp(0, 1) # -1 to 1 -> 0 to 1
299
+ clean_memory_on_device(device)
300
+
301
+ return video
302
+
303
+ def load_vae(self, args: argparse.Namespace, vae_dtype: torch.dtype, vae_path: str):
304
+ vae_path = args.vae
305
+ logger.info(f"Loading VAE model from {vae_path}")
306
+ vae = load_framepack_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, "cpu")
307
+ return vae
308
+
309
+ def load_transformer(
310
+ self,
311
+ accelerator: Accelerator,
312
+ args: argparse.Namespace,
313
+ dit_path: str,
314
+ attn_mode: str,
315
+ split_attn: bool,
316
+ loading_device: str,
317
+ dit_weight_dtype: Optional[torch.dtype],
318
+ ):
319
+ logger.info(f"Loading DiT model from {dit_path}")
320
+ device = accelerator.device
321
+ model = load_packed_model(device, dit_path, attn_mode, loading_device, args.fp8_scaled, split_attn)
322
+ return model
323
+
324
+ def scale_shift_latents(self, latents):
325
+ # FramePack VAE includes scaling
326
+ return latents
327
+
328
+ def call_dit(
329
+ self,
330
+ args: argparse.Namespace,
331
+ accelerator: Accelerator,
332
+ transformer,
333
+ latents: torch.Tensor,
334
+ batch: dict[str, torch.Tensor],
335
+ noise: torch.Tensor,
336
+ noisy_model_input: torch.Tensor,
337
+ timesteps: torch.Tensor,
338
+ network_dtype: torch.dtype,
339
+ ):
340
+ model: HunyuanVideoTransformer3DModelPacked = transformer
341
+ device = accelerator.device
342
+ batch_size = latents.shape[0]
343
+
344
+ # maybe model.dtype is better than network_dtype...
345
+ distilled_guidance = torch.tensor([args.guidance_scale * 1000.0] * batch_size).to(device=device, dtype=network_dtype)
346
+ latents = latents.to(device=accelerator.device, dtype=network_dtype)
347
+ noisy_model_input = noisy_model_input.to(device=accelerator.device, dtype=network_dtype)
348
+ # for k, v in batch.items():
349
+ # if isinstance(v, torch.Tensor):
350
+ # print(f"{k}: {v.shape} {v.dtype} {v.device}")
351
+ with accelerator.autocast():
352
+ model_pred = model(
353
+ hidden_states=noisy_model_input,
354
+ timestep=timesteps,
355
+ encoder_hidden_states=batch["llama_vec"],
356
+ encoder_attention_mask=batch["llama_attention_mask"],
357
+ pooled_projections=batch["clip_l_pooler"],
358
+ guidance=distilled_guidance,
359
+ latent_indices=batch["latent_indices"],
360
+ clean_latents=batch["latents_clean"],
361
+ clean_latent_indices=batch["clean_latent_indices"],
362
+ clean_latents_2x=batch["latents_clean_2x"],
363
+ clean_latent_2x_indices=batch["clean_latent_2x_indices"],
364
+ clean_latents_4x=batch["latents_clean_4x"],
365
+ clean_latent_4x_indices=batch["clean_latent_4x_indices"],
366
+ image_embeddings=batch["image_embeddings"],
367
+ return_dict=False,
368
+ )
369
+ model_pred = model_pred[0] # returns tuple (model_pred, )
370
+
371
+ # flow matching loss
372
+ target = noise - latents
373
+
374
+ return model_pred, target
375
+
376
+ # endregion model specific
377
+
378
+
379
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
380
+ """FramePack specific parser setup"""
381
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT / DiTにスケーリングされたfp8を使う")
382
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for LLM / LLMにfp8を使う")
383
+ parser.add_argument("--text_encoder1", type=str, help="Text Encoder 1 directory / テキストエンコーダ1のディレクトリ")
384
+ parser.add_argument("--text_encoder2", type=str, help="Text Encoder 2 directory / テキストエンコーダ2のディレクトリ")
385
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
386
+ parser.add_argument(
387
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
388
+ )
389
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
390
+ parser.add_argument("--latent_window_size", type=int, default=9, help="FramePack latent window size (default 9)")
391
+ parser.add_argument("--bulk_decode", action="store_true", help="decode all frames at once in sample generation")
392
+ return parser
393
+
394
+
395
+ if __name__ == "__main__":
396
+ parser = setup_parser_common()
397
+ parser = framepack_setup_parser(parser)
398
+
399
+ args = parser.parse_args()
400
+ args = read_config_from_file(args, parser)
401
+
402
+ assert (
403
+ args.vae_dtype is None or args.vae_dtype == "float16"
404
+ ), "VAE dtype must be float16 / VAEのdtypeはfloat16でなければなりません"
405
+ args.vae_dtype = "float16" # fixed
406
+ args.dit_dtype = "bfloat16" # fixed
407
+ args.sample_solver = "unipc" # for sample generation, fixed to unipc
408
+
409
+ trainer = FramePackNetworkTrainer()
410
+ trainer.train(args)
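
The frame and section arithmetic in `do_inference` above determines how many latent sections are sampled and how many pixel frames each section contributes. A worked example restating those formulas (no new behavior; values assume the default `latent_window_size` of 9):

```python
import math

frame_count = 145         # requested pixel frames
latent_window_size = 9    # default --latent_window_size

latent_f = (frame_count - 1) // 4 + 1                                     # 37 latent frames (4x temporal compression)
total_latent_sections = math.floor((latent_f - 1) / latent_window_size)   # 4 sections
latent_f = total_latent_sections * latent_window_size + 1                 # 37, already aligned in this example
actual_frame_count = (latent_f - 1) * 4 + 1                               # 145 pixel frames after trimming
num_frames_per_section = latent_window_size * 4 - 3                       # 33 pixel frames generated per section

print(total_latent_sections, actual_frame_count, num_frames_per_section)  # 4 145 33
```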
frame_pack/__init__.py ADDED
File without changes
frame_pack/bucket_tools.py ADDED
@@ -0,0 +1,30 @@
1
+ bucket_options = {
2
+ 640: [
3
+ (416, 960),
4
+ (448, 864),
5
+ (480, 832),
6
+ (512, 768),
7
+ (544, 704),
8
+ (576, 672),
9
+ (608, 640),
10
+ (640, 608),
11
+ (672, 576),
12
+ (704, 544),
13
+ (768, 512),
14
+ (832, 480),
15
+ (864, 448),
16
+ (960, 416),
17
+ ],
18
+ }
19
+
20
+
21
+ def find_nearest_bucket(h, w, resolution=640):
22
+ min_metric = float('inf')
23
+ best_bucket = None
24
+ for (bucket_h, bucket_w) in bucket_options[resolution]:
25
+ metric = abs(h * bucket_w - w * bucket_h)
26
+ if metric <= min_metric:
27
+ min_metric = metric
28
+ best_bucket = (bucket_h, bucket_w)
29
+ return best_bucket
30
+
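
A small usage sketch of `find_nearest_bucket`: the bucket minimizing `|h * bucket_w - w * bucket_h|` (i.e. the closest aspect ratio) is returned, and because the comparison uses `<=`, ties resolve to the later entry in the bucket list.

```python
from frame_pack.bucket_tools import find_nearest_bucket

print(find_nearest_bucket(720, 1280))   # (480, 832): closest h:w ratio to 720:1280
print(find_nearest_bucket(1024, 1024))  # (640, 608): square input ties between (608, 640)
                                        # and (640, 608); the later bucket wins
```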
frame_pack/clip_vision.py ADDED
@@ -0,0 +1,14 @@
1
+ import numpy as np
2
+
3
+
4
+ def hf_clip_vision_encode(image, feature_extractor, image_encoder):
5
+ assert isinstance(image, np.ndarray)
6
+ assert image.ndim == 3 and image.shape[2] == 3
7
+ assert image.dtype == np.uint8
8
+
9
+ preprocessed = feature_extractor.preprocess(images=image, return_tensors="pt").to(
10
+ device=image_encoder.device, dtype=image_encoder.dtype
11
+ )
12
+ image_encoder_output = image_encoder(**preprocessed)
13
+
14
+ return image_encoder_output
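
A minimal sketch of calling `hf_clip_vision_encode` together with `load_image_encoders` from `frame_pack/framepack_utils.py` (shown below); the checkpoint path is a placeholder and a CUDA device is assumed.

```python
import numpy as np
import torch
from types import SimpleNamespace

from frame_pack.clip_vision import hf_clip_vision_encode
from frame_pack.framepack_utils import load_image_encoders

# Hypothetical checkpoint path for the SigLIP vision encoder.
args = SimpleNamespace(image_encoder="path/to/sigclip_vision_384.safetensors")
feature_extractor, image_encoder = load_image_encoders(args)
image_encoder.to("cuda")

# hf_clip_vision_encode expects an HWC uint8 RGB array.
img_np = np.zeros((480, 832, 3), dtype=np.uint8)
with torch.no_grad():
    output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
image_embeddings = output.last_hidden_state  # per-patch embeddings fed to the DiT
```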
frame_pack/framepack_utils.py ADDED
@@ -0,0 +1,273 @@
1
+ import os
2
+ import logging
3
+ from types import SimpleNamespace
4
+ from typing import Optional, Union
5
+
6
+ import accelerate
7
+ from accelerate import Accelerator, init_empty_weights
8
+ import torch
9
+ from safetensors.torch import load_file
10
+ from transformers import (
11
+ LlamaTokenizerFast,
12
+ LlamaConfig,
13
+ LlamaModel,
14
+ CLIPTokenizer,
15
+ CLIPTextModel,
16
+ CLIPConfig,
17
+ SiglipImageProcessor,
18
+ SiglipVisionModel,
19
+ SiglipVisionConfig,
20
+ )
21
+
22
+ from utils.safetensors_utils import load_split_weights
23
+ from hunyuan_model.vae import load_vae as hunyuan_load_vae
24
+
25
+ import logging
26
+
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(level=logging.INFO)
29
+
30
+
31
+ def load_vae(
32
+ vae_path: str, vae_chunk_size: Optional[int], vae_spatial_tile_sample_min_size: Optional[int], device: Union[str, torch.device]
33
+ ):
34
+ # single file and directory (contains 'vae') support
35
+ if os.path.isdir(vae_path):
36
+ vae_path = os.path.join(vae_path, "vae", "diffusion_pytorch_model.safetensors")
37
+ else:
38
+ vae_path = vae_path
39
+
40
+ vae_dtype = torch.float16 # if vae_dtype is None else str_to_dtype(vae_dtype)
41
+ vae, _, s_ratio, t_ratio = hunyuan_load_vae(vae_dtype=vae_dtype, device=device, vae_path=vae_path)
42
+ vae.eval()
43
+ # vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}
44
+
45
+ # set chunk_size to CausalConv3d recursively
46
+ chunk_size = vae_chunk_size
47
+ if chunk_size is not None:
48
+ vae.set_chunk_size_for_causal_conv_3d(chunk_size)
49
+ logger.info(f"Set chunk_size to {chunk_size} for CausalConv3d")
50
+
51
+ if vae_spatial_tile_sample_min_size is not None:
52
+ vae.enable_spatial_tiling(True)
53
+ vae.tile_sample_min_size = vae_spatial_tile_sample_min_size
54
+ vae.tile_latent_min_size = vae_spatial_tile_sample_min_size // 8
55
+ logger.info(f"Enabled spatial tiling with min size {vae_spatial_tile_sample_min_size}")
56
+ # elif vae_tiling:
57
+ else:
58
+ vae.enable_spatial_tiling(True)
59
+
60
+ return vae
61
+
62
+
63
+ # region Text Encoders
64
+
65
+ # Text Encoder configs are copied from HunyuanVideo repo
66
+
67
+ LLAMA_CONFIG = {
68
+ "architectures": ["LlamaModel"],
69
+ "attention_bias": False,
70
+ "attention_dropout": 0.0,
71
+ "bos_token_id": 128000,
72
+ "eos_token_id": 128001,
73
+ "head_dim": 128,
74
+ "hidden_act": "silu",
75
+ "hidden_size": 4096,
76
+ "initializer_range": 0.02,
77
+ "intermediate_size": 14336,
78
+ "max_position_embeddings": 8192,
79
+ "mlp_bias": False,
80
+ "model_type": "llama",
81
+ "num_attention_heads": 32,
82
+ "num_hidden_layers": 32,
83
+ "num_key_value_heads": 8,
84
+ "pretraining_tp": 1,
85
+ "rms_norm_eps": 1e-05,
86
+ "rope_scaling": None,
87
+ "rope_theta": 500000.0,
88
+ "tie_word_embeddings": False,
89
+ "torch_dtype": "float16",
90
+ "transformers_version": "4.46.3",
91
+ "use_cache": True,
92
+ "vocab_size": 128320,
93
+ }
94
+
95
+ CLIP_CONFIG = {
96
+ # "_name_or_path": "/raid/aryan/llava-llama-3-8b-v1_1-extracted/text_encoder_2",
97
+ "architectures": ["CLIPTextModel"],
98
+ "attention_dropout": 0.0,
99
+ "bos_token_id": 0,
100
+ "dropout": 0.0,
101
+ "eos_token_id": 2,
102
+ "hidden_act": "quick_gelu",
103
+ "hidden_size": 768,
104
+ "initializer_factor": 1.0,
105
+ "initializer_range": 0.02,
106
+ "intermediate_size": 3072,
107
+ "layer_norm_eps": 1e-05,
108
+ "max_position_embeddings": 77,
109
+ "model_type": "clip_text_model",
110
+ "num_attention_heads": 12,
111
+ "num_hidden_layers": 12,
112
+ "pad_token_id": 1,
113
+ "projection_dim": 768,
114
+ "torch_dtype": "float16",
115
+ "transformers_version": "4.48.0.dev0",
116
+ "vocab_size": 49408,
117
+ }
118
+
119
+
120
+ def load_text_encoder1(
121
+ args, fp8_llm: Optional[bool] = False, device: Optional[Union[str, torch.device]] = None
122
+ ) -> tuple[LlamaTokenizerFast, LlamaModel]:
123
+ # single file, split file and directory (contains 'text_encoder') support
124
+ logger.info(f"Loading text encoder 1 tokenizer")
125
+ tokenizer1 = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer")
126
+
127
+ logger.info(f"Loading text encoder 1 from {args.text_encoder1}")
128
+ if os.path.isdir(args.text_encoder1):
129
+ # load from directory, configs are in the directory
130
+ text_encoder1 = LlamaModel.from_pretrained(args.text_encoder1, subfolder="text_encoder", torch_dtype=torch.float16)
131
+ else:
132
+ # load from file, we create the model with the appropriate config
133
+ config = LlamaConfig(**LLAMA_CONFIG)
134
+ with init_empty_weights():
135
+ text_encoder1 = LlamaModel._from_config(config, torch_dtype=torch.float16)
136
+
137
+ state_dict = load_split_weights(args.text_encoder1)
138
+
139
+ # support weights from ComfyUI
140
+ if "model.embed_tokens.weight" in state_dict:
141
+ for key in list(state_dict.keys()):
142
+ if key.startswith("model."):
143
+ new_key = key.replace("model.", "")
144
+ state_dict[new_key] = state_dict[key]
145
+ del state_dict[key]
146
+ if "tokenizer" in state_dict:
147
+ state_dict.pop("tokenizer")
148
+ if "lm_head.weight" in state_dict:
149
+ state_dict.pop("lm_head.weight")
150
+
151
+ # # support weights from ComfyUI
152
+ # if "tokenizer" in state_dict:
153
+ # state_dict.pop("tokenizer")
154
+
155
+ text_encoder1.load_state_dict(state_dict, strict=True, assign=True)
156
+
157
+ if fp8_llm:
158
+ org_dtype = text_encoder1.dtype
159
+ logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
160
+ text_encoder1.to(device=device, dtype=torch.float8_e4m3fn)
161
+
162
+ # prepare LLM for fp8
163
+ def prepare_fp8(llama_model: LlamaModel, target_dtype):
164
+ def forward_hook(module):
165
+ def forward(hidden_states):
166
+ input_dtype = hidden_states.dtype
167
+ hidden_states = hidden_states.to(torch.float32)
168
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
169
+ hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
170
+ return module.weight.to(input_dtype) * hidden_states.to(input_dtype)
171
+
172
+ return forward
173
+
174
+ for module in llama_model.modules():
175
+ if module.__class__.__name__ in ["Embedding"]:
176
+ # print("set", module.__class__.__name__, "to", target_dtype)
177
+ module.to(target_dtype)
178
+ if module.__class__.__name__ in ["LlamaRMSNorm"]:
179
+ # print("set", module.__class__.__name__, "hooks")
180
+ module.forward = forward_hook(module)
181
+
182
+ prepare_fp8(text_encoder1, org_dtype)
183
+ else:
184
+ text_encoder1.to(device)
185
+
186
+ text_encoder1.eval()
187
+ return tokenizer1, text_encoder1
188
+
189
+
190
+ def load_text_encoder2(args) -> tuple[CLIPTokenizer, CLIPTextModel]:
191
+ # single file and directory (contains 'text_encoder_2') support
192
+ logger.info(f"Loading text encoder 2 tokenizer")
193
+ tokenizer2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer_2")
194
+
195
+ logger.info(f"Loading text encoder 2 from {args.text_encoder2}")
196
+ if os.path.isdir(args.text_encoder2):
197
+ # load from directory, configs are in the directory
198
+ text_encoder2 = CLIPTextModel.from_pretrained(args.text_encoder2, subfolder="text_encoder_2", torch_dtype=torch.float16)
199
+ else:
200
+ # we only have one file, so we can load it directly
201
+ config = CLIPConfig(**CLIP_CONFIG)
202
+ with init_empty_weights():
203
+ text_encoder2 = CLIPTextModel._from_config(config, torch_dtype=torch.float16)
204
+
205
+ state_dict = load_file(args.text_encoder2)
206
+
207
+ text_encoder2.load_state_dict(state_dict, strict=True, assign=True)
208
+
209
+ text_encoder2.eval()
210
+ return tokenizer2, text_encoder2
211
+
212
+
213
+ # endregion
214
+
215
+ # region image encoder
216
+
217
+ # Siglip configs are copied from FramePack repo
218
+ FEATURE_EXTRACTOR_CONFIG = {
219
+ "do_convert_rgb": None,
220
+ "do_normalize": True,
221
+ "do_rescale": True,
222
+ "do_resize": True,
223
+ "image_mean": [0.5, 0.5, 0.5],
224
+ "image_processor_type": "SiglipImageProcessor",
225
+ "image_std": [0.5, 0.5, 0.5],
226
+ "processor_class": "SiglipProcessor",
227
+ "resample": 3,
228
+ "rescale_factor": 0.00392156862745098,
229
+ "size": {"height": 384, "width": 384},
230
+ }
231
+ IMAGE_ENCODER_CONFIG = {
232
+ "_name_or_path": "/home/lvmin/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-Redux-dev/snapshots/1282f955f706b5240161278f2ef261d2a29ad649/image_encoder",
233
+ "architectures": ["SiglipVisionModel"],
234
+ "attention_dropout": 0.0,
235
+ "hidden_act": "gelu_pytorch_tanh",
236
+ "hidden_size": 1152,
237
+ "image_size": 384,
238
+ "intermediate_size": 4304,
239
+ "layer_norm_eps": 1e-06,
240
+ "model_type": "siglip_vision_model",
241
+ "num_attention_heads": 16,
242
+ "num_channels": 3,
243
+ "num_hidden_layers": 27,
244
+ "patch_size": 14,
245
+ "torch_dtype": "bfloat16",
246
+ "transformers_version": "4.46.2",
247
+ }
248
+
249
+
250
+ def load_image_encoders(args):
251
+ logger.info(f"Loading image encoder feature extractor")
252
+ feature_extractor = SiglipImageProcessor(**FEATURE_EXTRACTOR_CONFIG)
253
+
254
+ # single file, split file and directory (contains 'image_encoder') support
255
+ logger.info(f"Loading image encoder from {args.image_encoder}")
256
+ if os.path.isdir(args.image_encoder):
257
+ # load from directory, configs are in the directory
258
+ image_encoder = SiglipVisionModel.from_pretrained(args.image_encoder, subfolder="image_encoder", torch_dtype=torch.float16)
259
+ else:
260
+ # load from file, we create the model with the appropriate config
261
+ config = SiglipVisionConfig(**IMAGE_ENCODER_CONFIG)
262
+ with init_empty_weights():
263
+ image_encoder = SiglipVisionModel._from_config(config, torch_dtype=torch.float16)
264
+
265
+ state_dict = load_file(args.image_encoder)
266
+
267
+ image_encoder.load_state_dict(state_dict, strict=True, assign=True)
268
+
269
+ image_encoder.eval()
270
+ return feature_extractor, image_encoder
271
+
272
+
273
+ # endregion
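
A minimal sketch of loading both text encoders above and producing the `llama_vec` / `clip_l_pooler` pair used throughout this commit; the checkpoint paths are placeholders, a CUDA device is assumed, and `encode_prompt_conds` / `crop_or_pad_yield_mask` come from `frame_pack/hunyuan.py` (next file) and `frame_pack/utils.py`.

```python
import torch
from types import SimpleNamespace

from frame_pack import hunyuan
from frame_pack.framepack_utils import load_text_encoder1, load_text_encoder2
from frame_pack.utils import crop_or_pad_yield_mask

# Hypothetical checkpoint paths.
args = SimpleNamespace(
    text_encoder1="path/to/llava_llama3_fp16.safetensors",
    text_encoder2="path/to/clip_l.safetensors",
)
device = torch.device("cuda")

tokenizer1, text_encoder1 = load_text_encoder1(args, fp8_llm=False, device=device)
tokenizer2, text_encoder2 = load_text_encoder2(args)
text_encoder2.to(device)

with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
    llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(
        "A character dancing on a stage", text_encoder1, text_encoder2, tokenizer1, tokenizer2
    )
# Pad or crop the LLaMA embeddings to a fixed length of 512 and obtain the attention mask.
llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
```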
frame_pack/hunyuan.py ADDED
@@ -0,0 +1,116 @@
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import torch
5
+
6
+ # from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import DEFAULT_PROMPT_TEMPLATE
7
+ # from diffusers_helper.utils import crop_or_pad_yield_mask
8
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
9
+ from hunyuan_model.text_encoder import PROMPT_TEMPLATE
10
+
11
+
12
+ @torch.no_grad()
13
+ def encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2, max_length=256):
14
+ assert isinstance(prompt, str)
15
+
16
+ prompt = [prompt]
17
+
18
+ # LLAMA
19
+
20
+ prompt_llama = [PROMPT_TEMPLATE["dit-llm-encode-video"]["template"].format(p) for p in prompt]
21
+ crop_start = PROMPT_TEMPLATE["dit-llm-encode-video"]["crop_start"]
22
+
23
+ llama_inputs = tokenizer(
24
+ prompt_llama,
25
+ padding="max_length",
26
+ max_length=max_length + crop_start,
27
+ truncation=True,
28
+ return_tensors="pt",
29
+ return_length=False,
30
+ return_overflowing_tokens=False,
31
+ return_attention_mask=True,
32
+ )
33
+
34
+ llama_input_ids = llama_inputs.input_ids.to(text_encoder.device)
35
+ llama_attention_mask = llama_inputs.attention_mask.to(text_encoder.device)
36
+ llama_attention_length = int(llama_attention_mask.sum())
37
+
38
+ llama_outputs = text_encoder(
39
+ input_ids=llama_input_ids,
40
+ attention_mask=llama_attention_mask,
41
+ output_hidden_states=True,
42
+ )
43
+
44
+ llama_vec = llama_outputs.hidden_states[-3][:, crop_start:llama_attention_length]
45
+ # llama_vec_remaining = llama_outputs.hidden_states[-3][:, llama_attention_length:]
46
+ llama_attention_mask = llama_attention_mask[:, crop_start:llama_attention_length]
47
+
48
+ assert torch.all(llama_attention_mask.bool())
49
+
50
+ # CLIP
51
+
52
+ clip_l_input_ids = tokenizer_2(
53
+ prompt,
54
+ padding="max_length",
55
+ max_length=77,
56
+ truncation=True,
57
+ return_overflowing_tokens=False,
58
+ return_length=False,
59
+ return_tensors="pt",
60
+ ).input_ids
61
+ clip_l_pooler = text_encoder_2(clip_l_input_ids.to(text_encoder_2.device), output_hidden_states=False).pooler_output
62
+
63
+ return llama_vec, clip_l_pooler
64
+
65
+
66
+ @torch.no_grad()
67
+ def vae_decode_fake(latents):
68
+ latent_rgb_factors = [
69
+ [-0.0395, -0.0331, 0.0445],
70
+ [0.0696, 0.0795, 0.0518],
71
+ [0.0135, -0.0945, -0.0282],
72
+ [0.0108, -0.0250, -0.0765],
73
+ [-0.0209, 0.0032, 0.0224],
74
+ [-0.0804, -0.0254, -0.0639],
75
+ [-0.0991, 0.0271, -0.0669],
76
+ [-0.0646, -0.0422, -0.0400],
77
+ [-0.0696, -0.0595, -0.0894],
78
+ [-0.0799, -0.0208, -0.0375],
79
+ [0.1166, 0.1627, 0.0962],
80
+ [0.1165, 0.0432, 0.0407],
81
+ [-0.2315, -0.1920, -0.1355],
82
+ [-0.0270, 0.0401, -0.0821],
83
+ [-0.0616, -0.0997, -0.0727],
84
+ [0.0249, -0.0469, -0.1703],
85
+ ] # From comfyui
86
+
87
+ latent_rgb_factors_bias = [0.0259, -0.0192, -0.0761]
88
+
89
+ weight = torch.tensor(latent_rgb_factors, device=latents.device, dtype=latents.dtype).transpose(0, 1)[:, :, None, None, None]
90
+ bias = torch.tensor(latent_rgb_factors_bias, device=latents.device, dtype=latents.dtype)
91
+
92
+ images = torch.nn.functional.conv3d(latents, weight, bias=bias, stride=1, padding=0, dilation=1, groups=1)
93
+ images = images.clamp(0.0, 1.0)
94
+
95
+ return images
96
+
97
+
98
+ @torch.no_grad()
99
+ def vae_decode(latents, vae, image_mode=False) -> torch.Tensor:
100
+ latents = latents / vae.config.scaling_factor
101
+
102
+ if not image_mode:
103
+ image = vae.decode(latents.to(device=vae.device, dtype=vae.dtype)).sample
104
+ else:
105
+ latents = latents.to(device=vae.device, dtype=vae.dtype).unbind(2)
106
+ image = [vae.decode(l.unsqueeze(2)).sample for l in latents]
107
+ image = torch.cat(image, dim=2)
108
+
109
+ return image
110
+
111
+
112
+ @torch.no_grad()
113
+ def vae_encode(image, vae: AutoencoderKLCausal3D) -> torch.Tensor:
114
+ latents = vae.encode(image.to(device=vae.device, dtype=vae.dtype)).latent_dist.sample()
115
+ latents = latents * vae.config.scaling_factor
116
+ return latents
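# Round-trip sketch (assumes `vae` is a loaded AutoencoderKLCausal3D and `video` is a pixel tensor
# of shape (B, C, T, H, W) in the VAE's expected value range); scaling_factor is applied
# symmetrically by vae_encode / vae_decode, so no manual scaling is needed:
#
#   latents = vae_encode(video, vae)    # (B, 16, T', H/8, W/8) scaled latents
#   recon = vae_decode(latents, vae)    # back to pixel space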
frame_pack/hunyuan_video_packed.py ADDED
@@ -0,0 +1,2015 @@
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import glob
5
+ import math
6
+ import numbers
7
+ import os
8
+ from types import SimpleNamespace
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+
11
+ import torch
12
+ import einops
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ from modules.custom_offloading_utils import ModelOffloader
18
+ from utils.safetensors_utils import load_split_weights
19
+ from modules.fp8_optimization_utils import apply_fp8_monkey_patch, optimize_state_dict_with_fp8
20
+ from accelerate import init_empty_weights
21
+
22
+ try:
23
+ # raise NotImplementedError
24
+ from xformers.ops import memory_efficient_attention as xformers_attn_func
25
+
26
+ print("Xformers is installed!")
27
+ except:
28
+ print("Xformers is not installed!")
29
+ xformers_attn_func = None
30
+
31
+ try:
32
+ # raise NotImplementedError
33
+ from flash_attn import flash_attn_varlen_func, flash_attn_func
34
+
35
+ print("Flash Attn is installed!")
36
+ except:
37
+ print("Flash Attn is not installed!")
38
+ flash_attn_varlen_func = None
39
+ flash_attn_func = None
40
+
41
+ try:
42
+ # raise NotImplementedError
43
+ from sageattention import sageattn_varlen, sageattn
44
+
45
+ print("Sage Attn is installed!")
46
+ except:
47
+ print("Sage Attn is not installed!")
48
+ sageattn_varlen = None
49
+ sageattn = None
50
+
51
+
52
+ import logging
53
+
54
+ logger = logging.getLogger(__name__)
55
+ logging.basicConfig(level=logging.INFO)
56
+
57
+ # region diffusers
58
+
59
+ # copied from diffusers with some modifications to minimize dependencies
60
+ # original code: https://github.com/huggingface/diffusers/
61
+ # original license: Apache-2.0
62
+
63
+ ACT2CLS = {
64
+ "swish": nn.SiLU,
65
+ "silu": nn.SiLU,
66
+ "mish": nn.Mish,
67
+ "gelu": nn.GELU,
68
+ "relu": nn.ReLU,
69
+ }
70
+
71
+
72
+ def get_activation(act_fn: str) -> nn.Module:
73
+ """Helper function to get activation function from string.
74
+
75
+ Args:
76
+ act_fn (str): Name of activation function.
77
+
78
+ Returns:
79
+ nn.Module: Activation function.
80
+ """
81
+
82
+ act_fn = act_fn.lower()
83
+ if act_fn in ACT2CLS:
84
+ return ACT2CLS[act_fn]()
85
+ else:
86
+ raise ValueError(f"activation function {act_fn} not found in ACT2CLS mapping {list(ACT2CLS.keys())}")
87
+
88
+
89
+ def get_timestep_embedding(
90
+ timesteps: torch.Tensor,
91
+ embedding_dim: int,
92
+ flip_sin_to_cos: bool = False,
93
+ downscale_freq_shift: float = 1,
94
+ scale: float = 1,
95
+ max_period: int = 10000,
96
+ ):
97
+ """
98
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
99
+
100
+ Args
101
+ timesteps (torch.Tensor):
102
+ a 1-D Tensor of N indices, one per batch element. These may be fractional.
103
+ embedding_dim (int):
104
+ the dimension of the output.
105
+ flip_sin_to_cos (bool):
106
+ Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
107
+ downscale_freq_shift (float):
108
+ Controls the delta between frequencies between dimensions
109
+ scale (float):
110
+ Scaling factor applied to the embeddings.
111
+ max_period (int):
112
+ Controls the maximum frequency of the embeddings
113
+ Returns
114
+ torch.Tensor: an [N x dim] Tensor of positional embeddings.
115
+ """
116
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
117
+
118
+ half_dim = embedding_dim // 2
119
+ exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
120
+ exponent = exponent / (half_dim - downscale_freq_shift)
121
+
122
+ emb = torch.exp(exponent)
123
+ emb = timesteps[:, None].float() * emb[None, :]
124
+
125
+ # scale embeddings
126
+ emb = scale * emb
127
+
128
+ # concat sine and cosine embeddings
129
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
130
+
131
+ # flip sine and cosine embeddings
132
+ if flip_sin_to_cos:
133
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
134
+
135
+ # zero pad
136
+ if embedding_dim % 2 == 1:
137
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
138
+ return emb
139
+
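# Shape sketch for the sinusoidal embedding above (illustrative only): for N timesteps and an even
# embedding_dim the result is (N, embedding_dim), with the two halves holding the sin and cos terms
# (swapped when flip_sin_to_cos=True, as the Timesteps module below configures it).
_t_emb = get_timestep_embedding(torch.tensor([0.0, 500.0]), 256, flip_sin_to_cos=True, downscale_freq_shift=0)
assert _t_emb.shape == (2, 256)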
140
+
141
+ class TimestepEmbedding(nn.Module):
142
+ def __init__(
143
+ self,
144
+ in_channels: int,
145
+ time_embed_dim: int,
146
+ act_fn: str = "silu",
147
+ out_dim: int = None,
148
+ post_act_fn: Optional[str] = None,
149
+ cond_proj_dim=None,
150
+ sample_proj_bias=True,
151
+ ):
152
+ super().__init__()
153
+
154
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
155
+
156
+ if cond_proj_dim is not None:
157
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
158
+ else:
159
+ self.cond_proj = None
160
+
161
+ self.act = get_activation(act_fn)
162
+
163
+ if out_dim is not None:
164
+ time_embed_dim_out = out_dim
165
+ else:
166
+ time_embed_dim_out = time_embed_dim
167
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
168
+
169
+ if post_act_fn is None:
170
+ self.post_act = None
171
+ else:
172
+ self.post_act = get_activation(post_act_fn)
173
+
174
+ def forward(self, sample, condition=None):
175
+ if condition is not None:
176
+ sample = sample + self.cond_proj(condition)
177
+ sample = self.linear_1(sample)
178
+
179
+ if self.act is not None:
180
+ sample = self.act(sample)
181
+
182
+ sample = self.linear_2(sample)
183
+
184
+ if self.post_act is not None:
185
+ sample = self.post_act(sample)
186
+ return sample
187
+
188
+
189
+ class Timesteps(nn.Module):
190
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
191
+ super().__init__()
192
+ self.num_channels = num_channels
193
+ self.flip_sin_to_cos = flip_sin_to_cos
194
+ self.downscale_freq_shift = downscale_freq_shift
195
+ self.scale = scale
196
+
197
+ def forward(self, timesteps):
198
+ t_emb = get_timestep_embedding(
199
+ timesteps,
200
+ self.num_channels,
201
+ flip_sin_to_cos=self.flip_sin_to_cos,
202
+ downscale_freq_shift=self.downscale_freq_shift,
203
+ scale=self.scale,
204
+ )
205
+ return t_emb
206
+
207
+
208
+ class FP32SiLU(nn.Module):
209
+ r"""
210
+ SiLU activation function with input upcasted to torch.float32.
211
+ """
212
+
213
+ def __init__(self):
214
+ super().__init__()
215
+
216
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
217
+ return F.silu(inputs.float(), inplace=False).to(inputs.dtype)
218
+
219
+
220
+ class GELU(nn.Module):
221
+ r"""
222
+ GELU activation function with tanh approximation support with `approximate="tanh"`.
223
+
224
+ Parameters:
225
+ dim_in (`int`): The number of channels in the input.
226
+ dim_out (`int`): The number of channels in the output.
227
+ approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
228
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
229
+ """
230
+
231
+ def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True):
232
+ super().__init__()
233
+ self.proj = nn.Linear(dim_in, dim_out, bias=bias)
234
+ self.approximate = approximate
235
+
236
+ def gelu(self, gate: torch.Tensor) -> torch.Tensor:
237
+ # if gate.device.type == "mps" and is_torch_version("<", "2.0.0"):
238
+ # # fp16 gelu not supported on mps before torch 2.0
239
+ # return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
240
+ return F.gelu(gate, approximate=self.approximate)
241
+
242
+ def forward(self, hidden_states):
243
+ hidden_states = self.proj(hidden_states)
244
+ hidden_states = self.gelu(hidden_states)
245
+ return hidden_states
246
+
247
+
248
+ class PixArtAlphaTextProjection(nn.Module):
249
+ """
250
+ Projects caption embeddings. Also handles dropout for classifier-free guidance.
251
+
252
+ Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
253
+ """
254
+
255
+ def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"):
256
+ super().__init__()
257
+ if out_features is None:
258
+ out_features = hidden_size
259
+ self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
260
+ if act_fn == "gelu_tanh":
261
+ self.act_1 = nn.GELU(approximate="tanh")
262
+ elif act_fn == "silu":
263
+ self.act_1 = nn.SiLU()
264
+ elif act_fn == "silu_fp32":
265
+ self.act_1 = FP32SiLU()
266
+ else:
267
+ raise ValueError(f"Unknown activation function: {act_fn}")
268
+ self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True)
269
+
270
+ def forward(self, caption):
271
+ hidden_states = self.linear_1(caption)
272
+ hidden_states = self.act_1(hidden_states)
273
+ hidden_states = self.linear_2(hidden_states)
274
+ return hidden_states
275
+
276
+
277
+ class LayerNormFramePack(nn.LayerNorm):
278
+ # casting to dtype of input tensor is added
279
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
280
+ return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps).to(x)
281
+
282
+
283
+ class FP32LayerNormFramePack(nn.LayerNorm):
284
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
285
+ origin_dtype = x.dtype
286
+ return torch.nn.functional.layer_norm(
287
+ x.float(),
288
+ self.normalized_shape,
289
+ self.weight.float() if self.weight is not None else None,
290
+ self.bias.float() if self.bias is not None else None,
291
+ self.eps,
292
+ ).to(origin_dtype)
293
+
294
+
295
+ class RMSNormFramePack(nn.Module):
296
+ r"""
297
+ RMS Norm as introduced in https://arxiv.org/abs/1910.07467 by Zhang et al.
298
+
299
+ Args:
300
+ dim (`int`): Number of dimensions to use for `weights`. Only effective when `elementwise_affine` is True.
301
+ eps (`float`): Small value to use when calculating the reciprocal of the square-root.
302
+ elementwise_affine (`bool`, defaults to `True`):
303
+ Boolean flag to denote if affine transformation should be applied.
304
+ bias (`bool`, defaults to `False`): Whether to also learn a `bias` parameter.
305
+ """
306
+
307
+ def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False):
308
+ super().__init__()
309
+
310
+ self.eps = eps
311
+ self.elementwise_affine = elementwise_affine
312
+
313
+ if isinstance(dim, numbers.Integral):
314
+ dim = (dim,)
315
+
316
+ self.dim = torch.Size(dim)
317
+
318
+ self.weight = None
319
+ self.bias = None
320
+
321
+ if elementwise_affine:
322
+ self.weight = nn.Parameter(torch.ones(dim))
323
+ if bias:
324
+ self.bias = nn.Parameter(torch.zeros(dim))
325
+
326
+ def forward(self, hidden_states):
327
+ input_dtype = hidden_states.dtype
328
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
329
+ hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
330
+
331
+ if self.weight is None:
332
+ return hidden_states.to(input_dtype)
333
+
334
+ return hidden_states.to(input_dtype) * self.weight.to(input_dtype)
335
+
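# Numerical sketch of the RMSNorm above (illustrative only): y = x * rsqrt(mean(x^2) + eps) * weight,
# with the statistics computed in float32 and the result cast back to the input dtype.
_rms = RMSNormFramePack(8, eps=1e-6)
_x = torch.randn(2, 4, 8)
_ref = _x * torch.rsqrt(_x.pow(2).mean(-1, keepdim=True) + 1e-6) * _rms.weight
assert torch.allclose(_rms(_x), _ref, atol=1e-6)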
336
+
337
+ class AdaLayerNormContinuousFramePack(nn.Module):
338
+ r"""
339
+ Adaptive normalization layer with a norm layer (layer_norm or rms_norm).
340
+
341
+ Args:
342
+ embedding_dim (`int`): Embedding dimension to use during projection.
343
+ conditioning_embedding_dim (`int`): Dimension of the input condition.
344
+ elementwise_affine (`bool`, defaults to `True`):
345
+ Boolean flag to denote if affine transformation should be applied.
346
+ eps (`float`, defaults to 1e-5): Epsilon factor.
347
+ bias (`bool`, defaults to `True`): Boolean flag to denote if bias should be used.
348
+ norm_type (`str`, defaults to `"layer_norm"`):
349
+ Normalization layer to use. Values supported: "layer_norm", "rms_norm".
350
+ """
351
+
352
+ def __init__(
353
+ self,
354
+ embedding_dim: int,
355
+ conditioning_embedding_dim: int,
356
+ # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
357
+ # because the output is immediately scaled and shifted by the projected conditioning embeddings.
358
+ # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
359
+ # However, this is how it was implemented in the original code, and it's rather likely you should
360
+ # set `elementwise_affine` to False.
361
+ elementwise_affine=True,
362
+ eps=1e-5,
363
+ bias=True,
364
+ norm_type="layer_norm",
365
+ ):
366
+ super().__init__()
367
+ self.silu = nn.SiLU()
368
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
369
+ if norm_type == "layer_norm":
370
+ self.norm = LayerNormFramePack(embedding_dim, eps, elementwise_affine, bias)
371
+ elif norm_type == "rms_norm":
372
+ self.norm = RMSNormFramePack(embedding_dim, eps, elementwise_affine)
373
+ else:
374
+ raise ValueError(f"unknown norm_type {norm_type}")
375
+
376
+ def forward(self, x, conditioning_embedding):
377
+ emb = self.linear(self.silu(conditioning_embedding))
378
+ scale, shift = emb.chunk(2, dim=1)
379
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
380
+ return x
381
+
382
+
383
+ class LinearActivation(nn.Module):
384
+ def __init__(self, dim_in: int, dim_out: int, bias: bool = True, activation: str = "silu"):
385
+ super().__init__()
386
+
387
+ self.proj = nn.Linear(dim_in, dim_out, bias=bias)
388
+ self.activation = get_activation(activation)
389
+
390
+ def forward(self, hidden_states):
391
+ hidden_states = self.proj(hidden_states)
392
+ return self.activation(hidden_states)
393
+
394
+
395
+ class FeedForward(nn.Module):
396
+ r"""
397
+ A feed-forward layer.
398
+
399
+ Parameters:
400
+ dim (`int`): The number of channels in the input.
401
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
402
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
403
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
404
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
405
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
406
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
407
+ """
408
+
409
+ def __init__(
410
+ self,
411
+ dim: int,
412
+ dim_out: Optional[int] = None,
413
+ mult: int = 4,
414
+ dropout: float = 0.0,
415
+ activation_fn: str = "geglu",
416
+ final_dropout: bool = False,
417
+ inner_dim=None,
418
+ bias: bool = True,
419
+ ):
420
+ super().__init__()
421
+ if inner_dim is None:
422
+ inner_dim = int(dim * mult)
423
+ dim_out = dim_out if dim_out is not None else dim
424
+
425
+ # if activation_fn == "gelu":
426
+ # act_fn = GELU(dim, inner_dim, bias=bias)
427
+ if activation_fn == "gelu-approximate":
428
+ act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
429
+ # elif activation_fn == "geglu":
430
+ # act_fn = GEGLU(dim, inner_dim, bias=bias)
431
+ # elif activation_fn == "geglu-approximate":
432
+ # act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
433
+ # elif activation_fn == "swiglu":
434
+ # act_fn = SwiGLU(dim, inner_dim, bias=bias)
435
+ elif activation_fn == "linear-silu":
436
+ act_fn = LinearActivation(dim, inner_dim, bias=bias, activation="silu")
437
+ else:
438
+ raise ValueError(f"Unknown activation function: {activation_fn}")
439
+
440
+ self.net = nn.ModuleList([])
441
+ # project in
442
+ self.net.append(act_fn)
443
+ # project dropout
444
+ self.net.append(nn.Dropout(dropout))
445
+ # project out
446
+ self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
447
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
448
+ if final_dropout:
449
+ self.net.append(nn.Dropout(dropout))
450
+
451
+ def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
452
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
453
+ # deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
454
+ # deprecate("scale", "1.0.0", deprecation_message)
455
+ raise ValueError("scale is not supported in this version. Please remove it.")
456
+ for module in self.net:
457
+ hidden_states = module(hidden_states)
458
+ return hidden_states
459
+
460
+
461
+ # @maybe_allow_in_graph
462
+ class Attention(nn.Module):
463
+ r"""
464
+ Minimal copy of Attention class from diffusers.
465
+ """
466
+
467
+ def __init__(
468
+ self,
469
+ query_dim: int,
470
+ cross_attention_dim: Optional[int] = None,
471
+ heads: int = 8,
472
+ dim_head: int = 64,
473
+ bias: bool = False,
474
+ qk_norm: Optional[str] = None,
475
+ added_kv_proj_dim: Optional[int] = None,
476
+ eps: float = 1e-5,
477
+ processor: Optional[any] = None,
478
+ out_dim: int = None,
479
+ context_pre_only=None,
480
+ pre_only=False,
481
+ ):
482
+ super().__init__()
483
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
484
+ self.inner_kv_dim = self.inner_dim # if kv_heads is None else dim_head * kv_heads
485
+ self.query_dim = query_dim
486
+ self.use_bias = bias
487
+ self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
488
+ self.out_dim = out_dim if out_dim is not None else query_dim
489
+ self.out_context_dim = query_dim
490
+ self.context_pre_only = context_pre_only
491
+ self.pre_only = pre_only
492
+
493
+ self.scale = dim_head**-0.5
494
+ self.heads = out_dim // dim_head if out_dim is not None else heads
495
+
496
+ self.added_kv_proj_dim = added_kv_proj_dim
497
+
498
+ if qk_norm is None:
499
+ self.norm_q = None
500
+ self.norm_k = None
501
+ elif qk_norm == "rms_norm":
502
+ self.norm_q = RMSNormFramePack(dim_head, eps=eps)
503
+ self.norm_k = RMSNormFramePack(dim_head, eps=eps)
504
+ else:
505
+ raise ValueError(
506
+ f"unknown qk_norm: {qk_norm}. Should be one of None, 'layer_norm', 'fp32_layer_norm', 'layer_norm_across_heads', 'rms_norm', 'rms_norm_across_heads', 'l2'."
507
+ )
508
+
509
+ self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
510
+ self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
511
+ self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
512
+
513
+ self.added_proj_bias = True # added_proj_bias
514
+ if self.added_kv_proj_dim is not None:
515
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=True)
516
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=True)
517
+ if self.context_pre_only is not None:
518
+ self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
519
+ else:
520
+ self.add_q_proj = None
521
+ self.add_k_proj = None
522
+ self.add_v_proj = None
523
+
524
+ if not self.pre_only:
525
+ self.to_out = nn.ModuleList([])
526
+ self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=True))
527
+ # self.to_out.append(nn.Dropout(dropout))
528
+ self.to_out.append(nn.Identity()) # dropout=0.0
529
+ else:
530
+ self.to_out = None
531
+
532
+ if self.context_pre_only is not None and not self.context_pre_only:
533
+ self.to_add_out = nn.Linear(self.inner_dim, self.out_context_dim, bias=True)
534
+ else:
535
+ self.to_add_out = None
536
+
537
+ if qk_norm is not None and added_kv_proj_dim is not None:
538
+ if qk_norm == "rms_norm":
539
+ self.norm_added_q = RMSNormFramePack(dim_head, eps=eps)
540
+ self.norm_added_k = RMSNormFramePack(dim_head, eps=eps)
541
+ else:
542
+ raise ValueError(f"unknown qk_norm: {qk_norm}. Should be one of `None,'layer_norm','fp32_layer_norm','rms_norm'`")
543
+ else:
544
+ self.norm_added_q = None
545
+ self.norm_added_k = None
546
+
547
+ # set attention processor
548
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
549
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
550
+ if processor is None:
551
+ processor = AttnProcessor2_0()
552
+ self.set_processor(processor)
553
+
554
+ def set_processor(self, processor: any) -> None:
555
+ self.processor = processor
556
+
557
+ def get_processor(self) -> any:
558
+ return self.processor
559
+
560
+ def forward(
561
+ self,
562
+ hidden_states: torch.Tensor,
563
+ encoder_hidden_states: Optional[torch.Tensor] = None,
564
+ attention_mask: Optional[torch.Tensor] = None,
565
+ **cross_attention_kwargs,
566
+ ) -> torch.Tensor:
567
+ return self.processor(
568
+ self,
569
+ hidden_states,
570
+ encoder_hidden_states=encoder_hidden_states,
571
+ attention_mask=attention_mask,
572
+ **cross_attention_kwargs,
573
+ )
574
+
575
+ def prepare_attention_mask(
576
+ self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
577
+ ) -> torch.Tensor:
578
+ r"""
579
+ Prepare the attention mask for the attention computation.
580
+
581
+ Args:
582
+ attention_mask (`torch.Tensor`):
583
+ The attention mask to prepare.
584
+ target_length (`int`):
585
+ The target length of the attention mask. This is the length of the attention mask after padding.
586
+ batch_size (`int`):
587
+ The batch size, which is used to repeat the attention mask.
588
+ out_dim (`int`, *optional*, defaults to `3`):
589
+ The output dimension of the attention mask. Can be either `3` or `4`.
590
+
591
+ Returns:
592
+ `torch.Tensor`: The prepared attention mask.
593
+ """
594
+ head_size = self.heads
595
+ if attention_mask is None:
596
+ return attention_mask
597
+
598
+ current_length: int = attention_mask.shape[-1]
599
+ if current_length != target_length:
600
+ if attention_mask.device.type == "mps":
601
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
602
+ # Instead, we can manually construct the padding tensor.
603
+ padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
604
+ padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
605
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
606
+ else:
607
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
608
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
609
+ # remaining_length: int = target_length - current_length
610
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
611
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
612
+
613
+ if out_dim == 3:
614
+ if attention_mask.shape[0] < batch_size * head_size:
615
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0, output_size=attention_mask.shape[0] * head_size)
616
+ elif out_dim == 4:
617
+ attention_mask = attention_mask.unsqueeze(1)
618
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1, output_size=attention_mask.shape[1] * head_size)
619
+
620
+ return attention_mask
621
+
622
+
623
+ class AttnProcessor2_0:
624
+ r"""
625
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
626
+ """
627
+
628
+ def __init__(self):
629
+ if not hasattr(F, "scaled_dot_product_attention"):
630
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
631
+
632
+ def __call__(
633
+ self,
634
+ attn: Attention,
635
+ hidden_states: torch.Tensor,
636
+ encoder_hidden_states: Optional[torch.Tensor] = None,
637
+ attention_mask: Optional[torch.Tensor] = None,
638
+ temb: Optional[torch.Tensor] = None,
639
+ *args,
640
+ **kwargs,
641
+ ) -> torch.Tensor:
642
+ input_ndim = hidden_states.ndim
643
+
644
+ if input_ndim == 4:
645
+ batch_size, channel, height, width = hidden_states.shape
646
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
647
+
648
+ batch_size, sequence_length, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
649
+
650
+ if attention_mask is not None:
651
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
652
+ # scaled_dot_product_attention expects attention_mask shape to be
653
+ # (batch, heads, source_length, target_length)
654
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
655
+
656
+ query = attn.to_q(hidden_states)
657
+ query_dtype = query.dtype # store dtype before potentially deleting query
658
+
659
+ if encoder_hidden_states is None:
660
+ encoder_hidden_states = hidden_states
661
+
662
+ key = attn.to_k(encoder_hidden_states)
663
+ value = attn.to_v(encoder_hidden_states)
664
+
665
+ inner_dim = key.shape[-1]
666
+ head_dim = inner_dim // attn.heads
667
+
668
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
669
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
670
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
671
+
672
+ if attn.norm_q is not None:
673
+ query = attn.norm_q(query)
674
+ if attn.norm_k is not None:
675
+ key = attn.norm_k(key)
676
+
677
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
678
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
679
+ del query, key, value, attention_mask # free memory
680
+
681
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
682
+ hidden_states = hidden_states.to(query_dtype) # use stored dtype
683
+
684
+ # linear proj
685
+ hidden_states = attn.to_out[0](hidden_states)
686
+ # dropout
687
+ hidden_states = attn.to_out[1](hidden_states)
688
+
689
+ if input_ndim == 4:
690
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
691
+
692
+ return hidden_states
693
+
694
+
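# Minimal smoke test for the trimmed-down Attention / AttnProcessor2_0 pair above (illustrative
# only; the token refiner below relies on this default SDPA processor, while the video transformer
# blocks swap in the FramePack-specific processors defined later in this file).
_attn = Attention(query_dim=128, heads=8, dim_head=16)
_tokens = torch.randn(2, 10, 128)
assert _attn(_tokens).shape == (2, 10, 128)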
695
+ # endregion diffusers
696
+
697
+
698
+ def pad_for_3d_conv(x, kernel_size):
699
+ b, c, t, h, w = x.shape
700
+ pt, ph, pw = kernel_size
701
+ pad_t = (pt - (t % pt)) % pt
702
+ pad_h = (ph - (h % ph)) % ph
703
+ pad_w = (pw - (w % pw)) % pw
704
+ return torch.nn.functional.pad(x, (0, pad_w, 0, pad_h, 0, pad_t), mode="replicate")
705
+
706
+
707
+ def center_down_sample_3d(x, kernel_size):
708
+ # pt, ph, pw = kernel_size
709
+ # cp = (pt * ph * pw) // 2
710
+ # xp = einops.rearrange(x, 'b c (t pt) (h ph) (w pw) -> (pt ph pw) b c t h w', pt=pt, ph=ph, pw=pw)
711
+ # xc = xp[cp]
712
+ # return xc
713
+ return torch.nn.functional.avg_pool3d(x, kernel_size, stride=kernel_size)
714
+
715
+
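# Padding sketch (illustrative only): pad_for_3d_conv replicate-pads T/H/W up to the next multiple
# of the kernel size, so center_down_sample_3d always pools over complete windows.
_x3d = torch.randn(1, 4, 5, 9, 9)
_p3d = pad_for_3d_conv(_x3d, (2, 4, 4))
assert _p3d.shape == (1, 4, 6, 12, 12)
assert center_down_sample_3d(_p3d, (2, 4, 4)).shape == (1, 4, 3, 3, 3)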
716
+ def get_cu_seqlens(text_mask, img_len):
717
+ batch_size = text_mask.shape[0]
718
+ text_len = text_mask.sum(dim=1)
719
+ max_len = text_mask.shape[1] + img_len
720
+
721
+ cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=text_mask.device) # ensure device match
722
+
723
+ for i in range(batch_size):
724
+ s = text_len[i] + img_len
725
+ s1 = i * max_len + s
726
+ s2 = (i + 1) * max_len
727
+ cu_seqlens[2 * i + 1] = s1
728
+ cu_seqlens[2 * i + 2] = s2
729
+
730
+ return cu_seqlens
731
+
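# Worked example (illustrative only): a batch of 2 prompts padded to 50 text tokens with 20 and 30
# valid tokens, plus 100 image tokens each. Every sample contributes two entries to the packed
# cu_seqlens layout: its valid (image + text) tokens and its padding tail.
_mask = torch.zeros(2, 50, dtype=torch.bool)
_mask[0, :20] = True
_mask[1, :30] = True
assert get_cu_seqlens(_mask, 100).tolist() == [0, 120, 150, 280, 300]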
732
+
733
+ def apply_rotary_emb_transposed(x, freqs_cis):
734
+ cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1)
735
+ del freqs_cis
736
+ x_real, x_imag = x.unflatten(-1, (-1, 2)).unbind(-1)
737
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
738
+ del x_real, x_imag
739
+ return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
740
+
741
+
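# Shape sketch for the transposed RoPE helper above (illustrative only): x is
# (batch, seq, heads, head_dim) and freqs_cis packs cos||sin along its last axis,
# i.e. (batch, seq, 2 * head_dim), broadcast over the head axis.
_xq = torch.randn(1, 8, 4, 6)
_freqs = torch.randn(1, 8, 12)
assert apply_rotary_emb_transposed(_xq, _freqs).shape == (1, 8, 4, 6)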
742
+ def attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=None, split_attn=False):
743
+ if cu_seqlens_q is None and cu_seqlens_kv is None and max_seqlen_q is None and max_seqlen_kv is None:
744
+ if attn_mode == "sageattn" or attn_mode is None and sageattn is not None:
745
+ x = sageattn(q, k, v, tensor_layout="NHD")
746
+ return x
747
+
748
+ if attn_mode == "flash" or attn_mode is None and flash_attn_func is not None:
749
+ x = flash_attn_func(q, k, v)
750
+ return x
751
+
752
+ if attn_mode == "xformers" or attn_mode is None and xformers_attn_func is not None:
753
+ x = xformers_attn_func(q, k, v)
754
+ return x
755
+
756
+ x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(
757
+ 1, 2
758
+ )
759
+ return x
760
+ if split_attn:
761
+ if attn_mode == "sageattn" or attn_mode is None and sageattn is not None:
762
+ x = torch.empty_like(q)
763
+ for i in range(q.size(0)):
764
+ x[i : i + 1] = sageattn(q[i : i + 1], k[i : i + 1], v[i : i + 1], tensor_layout="NHD")
765
+ return x
766
+
767
+ if attn_mode == "flash" or attn_mode is None and flash_attn_func is not None:
768
+ x = torch.empty_like(q)
769
+ for i in range(q.size(0)):
770
+ x[i : i + 1] = flash_attn_func(q[i : i + 1], k[i : i + 1], v[i : i + 1])
771
+ return x
772
+
773
+ if attn_mode == "xformers" or attn_mode is None and xformers_attn_func is not None:
774
+ x = torch.empty_like(q)
775
+ for i in range(q.size(0)):
776
+ x[i : i + 1] = xformers_attn_func(q[i : i + 1], k[i : i + 1], v[i : i + 1])
777
+ return x
778
+
779
+ q = q.transpose(1, 2)
780
+ k = k.transpose(1, 2)
781
+ v = v.transpose(1, 2)
782
+ x = torch.empty_like(q)
783
+ for i in range(q.size(0)):
784
+ x[i : i + 1] = torch.nn.functional.scaled_dot_product_attention(q[i : i + 1], k[i : i + 1], v[i : i + 1])
785
+ x = x.transpose(1, 2)
786
+ return x
787
+
788
+ batch_size = q.shape[0]
789
+ q = q.view(q.shape[0] * q.shape[1], *q.shape[2:])
790
+ k = k.view(k.shape[0] * k.shape[1], *k.shape[2:])
791
+ v = v.view(v.shape[0] * v.shape[1], *v.shape[2:])
792
+ if attn_mode == "sageattn" or attn_mode is None and sageattn_varlen is not None:
793
+ x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
794
+ del q, k, v # free memory
795
+ elif attn_mode == "flash" or attn_mode is None and flash_attn_varlen_func is not None:
796
+ x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
797
+ del q, k, v # free memory
798
+ else:
799
+ raise NotImplementedError("No Attn Installed!")
800
+ x = x.view(batch_size, max_seqlen_q, *x.shape[2:])
801
+ return x
802
+
803
+
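# Backend-selection sketch (illustrative only): with no cu_seqlens, attn_varlen_func tries
# SageAttention, then FlashAttention, then xformers, and finally falls back to PyTorch SDPA.
# Any attn_mode string other than the named backends (the hypothetical "torch" below) also
# lands on the SDPA fallback, regardless of which optional libraries are installed.
_q = torch.randn(1, 16, 8, 64)  # (batch, seq, heads, head_dim)
assert attn_varlen_func(_q, _q, _q, None, None, None, None, attn_mode="torch").shape == _q.shape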
804
+ class HunyuanAttnProcessorFlashAttnDouble:
805
+ def __call__(
806
+ self,
807
+ attn: Attention,
808
+ hidden_states,
809
+ encoder_hidden_states,
810
+ attention_mask,
811
+ image_rotary_emb,
812
+ attn_mode: Optional[str] = None,
813
+ split_attn: Optional[bool] = False,
814
+ ):
815
+ cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
816
+
817
+ # Project image latents
818
+ query = attn.to_q(hidden_states)
819
+ key = attn.to_k(hidden_states)
820
+ value = attn.to_v(hidden_states)
821
+ del hidden_states # free memory
822
+
823
+ query = query.unflatten(2, (attn.heads, -1))
824
+ key = key.unflatten(2, (attn.heads, -1))
825
+ value = value.unflatten(2, (attn.heads, -1))
826
+
827
+ query = attn.norm_q(query)
828
+ key = attn.norm_k(key)
829
+
830
+ query = apply_rotary_emb_transposed(query, image_rotary_emb)
831
+ key = apply_rotary_emb_transposed(key, image_rotary_emb)
832
+ del image_rotary_emb # free memory
833
+
834
+ # Project context (text/encoder) embeddings
835
+ encoder_query = attn.add_q_proj(encoder_hidden_states)
836
+ encoder_key = attn.add_k_proj(encoder_hidden_states)
837
+ encoder_value = attn.add_v_proj(encoder_hidden_states)
838
+ txt_length = encoder_hidden_states.shape[1] # store length before deleting
839
+ del encoder_hidden_states # free memory
840
+
841
+ encoder_query = encoder_query.unflatten(2, (attn.heads, -1))
842
+ encoder_key = encoder_key.unflatten(2, (attn.heads, -1))
843
+ encoder_value = encoder_value.unflatten(2, (attn.heads, -1))
844
+
845
+ encoder_query = attn.norm_added_q(encoder_query)
846
+ encoder_key = attn.norm_added_k(encoder_key)
847
+
848
+ # Concatenate image and context q, k, v
849
+ query = torch.cat([query, encoder_query], dim=1)
850
+ key = torch.cat([key, encoder_key], dim=1)
851
+ value = torch.cat([value, encoder_value], dim=1)
852
+ del encoder_query, encoder_key, encoder_value # free memory
853
+
854
+ hidden_states_attn = attn_varlen_func(
855
+ query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=attn_mode, split_attn=split_attn
856
+ )
857
+ del query, key, value # free memory
858
+ hidden_states_attn = hidden_states_attn.flatten(-2)
859
+
860
+ hidden_states, encoder_hidden_states = hidden_states_attn[:, :-txt_length], hidden_states_attn[:, -txt_length:]
861
+ del hidden_states_attn # free memory
862
+
863
+ # Apply output projections
864
+ hidden_states = attn.to_out[0](hidden_states)
865
+ hidden_states = attn.to_out[1](hidden_states) # Dropout/Identity
866
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
867
+
868
+ return hidden_states, encoder_hidden_states
869
+
870
+
871
+ class HunyuanAttnProcessorFlashAttnSingle:
872
+ def __call__(
873
+ self,
874
+ attn: Attention,
875
+ hidden_states,
876
+ encoder_hidden_states,
877
+ attention_mask,
878
+ image_rotary_emb,
879
+ attn_mode: Optional[str] = None,
880
+ split_attn: Optional[bool] = False,
881
+ ):
882
+ cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
883
+ txt_length = encoder_hidden_states.shape[1] # Store text length
884
+
885
+ # Concatenate image and context inputs
886
+ hidden_states_cat = torch.cat([hidden_states, encoder_hidden_states], dim=1)
887
+ del hidden_states, encoder_hidden_states # free memory
888
+
889
+ # Project concatenated inputs
890
+ query = attn.to_q(hidden_states_cat)
891
+ key = attn.to_k(hidden_states_cat)
892
+ value = attn.to_v(hidden_states_cat)
893
+ del hidden_states_cat # free memory
894
+
895
+ query = query.unflatten(2, (attn.heads, -1))
896
+ key = key.unflatten(2, (attn.heads, -1))
897
+ value = value.unflatten(2, (attn.heads, -1))
898
+
899
+ query = attn.norm_q(query)
900
+ key = attn.norm_k(key)
901
+
902
+ query = torch.cat([apply_rotary_emb_transposed(query[:, :-txt_length], image_rotary_emb), query[:, -txt_length:]], dim=1)
903
+ key = torch.cat([apply_rotary_emb_transposed(key[:, :-txt_length], image_rotary_emb), key[:, -txt_length:]], dim=1)
904
+ del image_rotary_emb # free memory
905
+
906
+ hidden_states = attn_varlen_func(
907
+ query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=attn_mode, split_attn=split_attn
908
+ )
909
+ del query, key, value # free memory
910
+ hidden_states = hidden_states.flatten(-2)
911
+
912
+ hidden_states, encoder_hidden_states = hidden_states[:, :-txt_length], hidden_states[:, -txt_length:]
913
+
914
+ return hidden_states, encoder_hidden_states
915
+
916
+
917
+ class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
918
+ def __init__(self, embedding_dim, pooled_projection_dim):
919
+ super().__init__()
920
+
921
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
922
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
923
+ self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
924
+ self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
925
+
926
+ def forward(self, timestep, guidance, pooled_projection):
927
+ timesteps_proj = self.time_proj(timestep)
928
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
929
+
930
+ guidance_proj = self.time_proj(guidance)
931
+ guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))
932
+
933
+ time_guidance_emb = timesteps_emb + guidance_emb
934
+
935
+ pooled_projections = self.text_embedder(pooled_projection)
936
+ conditioning = time_guidance_emb + pooled_projections
937
+
938
+ return conditioning
939
+
940
+
941
+ class CombinedTimestepTextProjEmbeddings(nn.Module):
942
+ def __init__(self, embedding_dim, pooled_projection_dim):
943
+ super().__init__()
944
+
945
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
946
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
947
+ self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
948
+
949
+ def forward(self, timestep, pooled_projection):
950
+ timesteps_proj = self.time_proj(timestep)
951
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
952
+
953
+ pooled_projections = self.text_embedder(pooled_projection)
954
+
955
+ conditioning = timesteps_emb + pooled_projections
956
+
957
+ return conditioning
958
+
959
+
960
+ class HunyuanVideoAdaNorm(nn.Module):
961
+ def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
962
+ super().__init__()
963
+
964
+ out_features = out_features or 2 * in_features
965
+ self.linear = nn.Linear(in_features, out_features)
966
+ self.nonlinearity = nn.SiLU()
967
+
968
+ def forward(self, temb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
969
+ temb = self.linear(self.nonlinearity(temb))
970
+ gate_msa, gate_mlp = temb.chunk(2, dim=-1)
971
+ gate_msa, gate_mlp = gate_msa.unsqueeze(1), gate_mlp.unsqueeze(1)
972
+ return gate_msa, gate_mlp
973
+
974
+
975
+ class HunyuanVideoIndividualTokenRefinerBlock(nn.Module):
976
+ def __init__(
977
+ self,
978
+ num_attention_heads: int,
979
+ attention_head_dim: int,
980
+ mlp_width_ratio: float = 4.0,
981
+ mlp_drop_rate: float = 0.0,
982
+ attention_bias: bool = True,
983
+ ) -> None:
984
+ super().__init__()
985
+
986
+ hidden_size = num_attention_heads * attention_head_dim
987
+
988
+ self.norm1 = LayerNormFramePack(hidden_size, elementwise_affine=True, eps=1e-6)
989
+ self.attn = Attention(
990
+ query_dim=hidden_size,
991
+ cross_attention_dim=None,
992
+ heads=num_attention_heads,
993
+ dim_head=attention_head_dim,
994
+ bias=attention_bias,
995
+ )
996
+
997
+ self.norm2 = LayerNormFramePack(hidden_size, elementwise_affine=True, eps=1e-6)
998
+ self.ff = FeedForward(hidden_size, mult=mlp_width_ratio, activation_fn="linear-silu", dropout=mlp_drop_rate)
999
+
1000
+ self.norm_out = HunyuanVideoAdaNorm(hidden_size, 2 * hidden_size)
1001
+
1002
+ def forward(
1003
+ self,
1004
+ hidden_states: torch.Tensor,
1005
+ temb: torch.Tensor,
1006
+ attention_mask: Optional[torch.Tensor] = None,
1007
+ ) -> torch.Tensor:
1008
+ norm_hidden_states = self.norm1(hidden_states)
1009
+
1010
+ # Self-attention
1011
+ attn_output = self.attn(
1012
+ hidden_states=norm_hidden_states,
1013
+ encoder_hidden_states=None,
1014
+ attention_mask=attention_mask,
1015
+ )
1016
+ del norm_hidden_states # free memory
1017
+
1018
+ gate_msa, gate_mlp = self.norm_out(temb)
1019
+ hidden_states = hidden_states + attn_output * gate_msa
1020
+ del attn_output, gate_msa # free memory
1021
+
1022
+ ff_output = self.ff(self.norm2(hidden_states))
1023
+ hidden_states = hidden_states + ff_output * gate_mlp
1024
+ del ff_output, gate_mlp # free memory
1025
+
1026
+ return hidden_states
1027
+
1028
+
1029
+ class HunyuanVideoIndividualTokenRefiner(nn.Module):
1030
+ def __init__(
1031
+ self,
1032
+ num_attention_heads: int,
1033
+ attention_head_dim: int,
1034
+ num_layers: int,
1035
+ mlp_width_ratio: float = 4.0,
1036
+ mlp_drop_rate: float = 0.0,
1037
+ attention_bias: bool = True,
1038
+ ) -> None:
1039
+ super().__init__()
1040
+
1041
+ self.refiner_blocks = nn.ModuleList(
1042
+ [
1043
+ HunyuanVideoIndividualTokenRefinerBlock(
1044
+ num_attention_heads=num_attention_heads,
1045
+ attention_head_dim=attention_head_dim,
1046
+ mlp_width_ratio=mlp_width_ratio,
1047
+ mlp_drop_rate=mlp_drop_rate,
1048
+ attention_bias=attention_bias,
1049
+ )
1050
+ for _ in range(num_layers)
1051
+ ]
1052
+ )
1053
+
1054
+ def forward(
1055
+ self,
1056
+ hidden_states: torch.Tensor,
1057
+ temb: torch.Tensor,
1058
+ attention_mask: Optional[torch.Tensor] = None,
1059
+ ) -> torch.Tensor:
1060
+ self_attn_mask = None
1061
+ if attention_mask is not None:
1062
+ batch_size = attention_mask.shape[0]
1063
+ seq_len = attention_mask.shape[1]
1064
+ attention_mask = attention_mask.to(hidden_states.device).bool()
1065
+ self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
1066
+ self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
1067
+ self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
1068
+ self_attn_mask[:, :, :, 0] = True
1069
+
1070
+ for block in self.refiner_blocks:
1071
+ hidden_states = block(hidden_states, temb, self_attn_mask)
1072
+
1073
+ return hidden_states
1074
+
1075
+
1076
+ class HunyuanVideoTokenRefiner(nn.Module):
1077
+ def __init__(
1078
+ self,
1079
+ in_channels: int,
1080
+ num_attention_heads: int,
1081
+ attention_head_dim: int,
1082
+ num_layers: int,
1083
+ mlp_ratio: float = 4.0,
1084
+ mlp_drop_rate: float = 0.0,
1085
+ attention_bias: bool = True,
1086
+ ) -> None:
1087
+ super().__init__()
1088
+
1089
+ hidden_size = num_attention_heads * attention_head_dim
1090
+
1091
+ self.time_text_embed = CombinedTimestepTextProjEmbeddings(embedding_dim=hidden_size, pooled_projection_dim=in_channels)
1092
+ self.proj_in = nn.Linear(in_channels, hidden_size, bias=True)
1093
+ self.token_refiner = HunyuanVideoIndividualTokenRefiner(
1094
+ num_attention_heads=num_attention_heads,
1095
+ attention_head_dim=attention_head_dim,
1096
+ num_layers=num_layers,
1097
+ mlp_width_ratio=mlp_ratio,
1098
+ mlp_drop_rate=mlp_drop_rate,
1099
+ attention_bias=attention_bias,
1100
+ )
1101
+
1102
+ def forward(
1103
+ self,
1104
+ hidden_states: torch.Tensor,
1105
+ timestep: torch.LongTensor,
1106
+ attention_mask: Optional[torch.LongTensor] = None,
1107
+ ) -> torch.Tensor:
1108
+ if attention_mask is None:
1109
+ pooled_projections = hidden_states.mean(dim=1)
1110
+ else:
1111
+ original_dtype = hidden_states.dtype
1112
+ mask_float = attention_mask.float().unsqueeze(-1)
1113
+ pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
1114
+ pooled_projections = pooled_projections.to(original_dtype)
1115
+
1116
+ temb = self.time_text_embed(timestep, pooled_projections)
1117
+ del pooled_projections # free memory
1118
+
1119
+ hidden_states = self.proj_in(hidden_states)
1120
+ hidden_states = self.token_refiner(hidden_states, temb, attention_mask)
1121
+ del temb, attention_mask # free memory
1122
+
1123
+ return hidden_states
1124
+
1125
+
1126
+ class HunyuanVideoRotaryPosEmbed(nn.Module):
1127
+ def __init__(self, rope_dim, theta):
1128
+ super().__init__()
1129
+ self.DT, self.DY, self.DX = rope_dim
1130
+ self.theta = theta
1131
+
1132
+ @torch.no_grad()
1133
+ def get_frequency(self, dim, pos):
1134
+ T, H, W = pos.shape
1135
+ freqs = 1.0 / (self.theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device)[: (dim // 2)] / dim))
1136
+ freqs = torch.outer(freqs, pos.reshape(-1)).unflatten(-1, (T, H, W)).repeat_interleave(2, dim=0)
1137
+ return freqs.cos(), freqs.sin()
1138
+
1139
+ @torch.no_grad()
1140
+ def forward_inner(self, frame_indices, height, width, device):
1141
+ GT, GY, GX = torch.meshgrid(
1142
+ frame_indices.to(device=device, dtype=torch.float32),
1143
+ torch.arange(0, height, device=device, dtype=torch.float32),
1144
+ torch.arange(0, width, device=device, dtype=torch.float32),
1145
+ indexing="ij",
1146
+ )
1147
+
1148
+ FCT, FST = self.get_frequency(self.DT, GT)
1149
+ del GT # free memory
1150
+ FCY, FSY = self.get_frequency(self.DY, GY)
1151
+ del GY # free memory
1152
+ FCX, FSX = self.get_frequency(self.DX, GX)
1153
+ del GX # free memory
1154
+
1155
+ result = torch.cat([FCT, FCY, FCX, FST, FSY, FSX], dim=0)
1156
+ del FCT, FCY, FCX, FST, FSY, FSX # free memory
1157
+
1158
+ # Return result already on the correct device
1159
+ return result # Shape (2 * total_dim / 2, T, H, W) -> (total_dim, T, H, W)
1160
+
1161
+ @torch.no_grad()
1162
+ def forward(self, frame_indices, height, width, device):
1163
+ frame_indices = frame_indices.unbind(0)
1164
+ results = [self.forward_inner(f, height, width, device) for f in frame_indices]
1165
+ results = torch.stack(results, dim=0)
1166
+ return results
1167
+
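# Shape sketch for the rotary position embedding above (illustrative rope_dim, not the model's
# real configuration): rope_dim = (DT, DY, DX) splits the head dimension across time/height/width,
# and the output stacks cos and sin parts, giving (batch, 2 * (DT + DY + DX), T, H, W).
_rope = HunyuanVideoRotaryPosEmbed((4, 8, 8), theta=256.0)
_freqs3d = _rope(torch.arange(3).unsqueeze(0), height=2, width=2, device="cpu")
assert _freqs3d.shape == (1, 40, 3, 2, 2)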
1168
+
1169
+ class AdaLayerNormZero(nn.Module):
1170
+ def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
1171
+ super().__init__()
1172
+ self.silu = nn.SiLU()
1173
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=bias)
1174
+ if norm_type == "layer_norm":
1175
+ self.norm = LayerNormFramePack(embedding_dim, elementwise_affine=False, eps=1e-6)
1176
+ else:
1177
+ raise ValueError(f"unknown norm_type {norm_type}")
1178
+
1179
+ def forward(
1180
+ self, x: torch.Tensor, emb: Optional[torch.Tensor] = None
1181
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
1182
+ emb = emb.unsqueeze(-2)
1183
+ emb = self.linear(self.silu(emb))
1184
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=-1)
1185
+ x = self.norm(x) * (1 + scale_msa) + shift_msa
1186
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
1187
+
1188
+
1189
+ class AdaLayerNormZeroSingle(nn.Module):
1190
+ def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
1191
+ super().__init__()
1192
+
1193
+ self.silu = nn.SiLU()
1194
+ self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)
1195
+ if norm_type == "layer_norm":
1196
+ self.norm = LayerNormFramePack(embedding_dim, elementwise_affine=False, eps=1e-6)
1197
+ else:
1198
+ raise ValueError(f"unknown norm_type {norm_type}")
1199
+
1200
+ def forward(
1201
+ self,
1202
+ x: torch.Tensor,
1203
+ emb: Optional[torch.Tensor] = None,
1204
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1205
+ emb = emb.unsqueeze(-2)
1206
+ emb = self.linear(self.silu(emb))
1207
+ shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=-1)
1208
+ x = self.norm(x) * (1 + scale_msa) + shift_msa
1209
+ return x, gate_msa
1210
+
1211
+
1212
+ class AdaLayerNormContinuous(nn.Module):
1213
+ def __init__(
1214
+ self,
1215
+ embedding_dim: int,
1216
+ conditioning_embedding_dim: int,
1217
+ elementwise_affine=True,
1218
+ eps=1e-5,
1219
+ bias=True,
1220
+ norm_type="layer_norm",
1221
+ ):
1222
+ super().__init__()
1223
+ self.silu = nn.SiLU()
1224
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
1225
+ if norm_type == "layer_norm":
1226
+ self.norm = LayerNormFramePack(embedding_dim, eps, elementwise_affine, bias)
1227
+ else:
1228
+ raise ValueError(f"unknown norm_type {norm_type}")
1229
+
1230
+ def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
1231
+ emb = emb.unsqueeze(-2)
1232
+ emb = self.linear(self.silu(emb))
1233
+ scale, shift = emb.chunk(2, dim=-1)
1234
+ del emb # free memory
1235
+ x = self.norm(x) * (1 + scale) + shift
1236
+ return x
1237
+
1238
+
1239
+ class HunyuanVideoSingleTransformerBlock(nn.Module):
1240
+ def __init__(
1241
+ self,
1242
+ num_attention_heads: int,
1243
+ attention_head_dim: int,
1244
+ mlp_ratio: float = 4.0,
1245
+ qk_norm: str = "rms_norm",
1246
+ attn_mode: Optional[str] = None,
1247
+ split_attn: Optional[bool] = False,
1248
+ ) -> None:
1249
+ super().__init__()
1250
+
1251
+ hidden_size = num_attention_heads * attention_head_dim
1252
+ mlp_dim = int(hidden_size * mlp_ratio)
1253
+ self.attn_mode = attn_mode
1254
+ self.split_attn = split_attn
1255
+
1256
+ # Attention layer (pre_only=True means no output projection in Attention module itself)
1257
+ self.attn = Attention(
1258
+ query_dim=hidden_size,
1259
+ cross_attention_dim=None,
1260
+ dim_head=attention_head_dim,
1261
+ heads=num_attention_heads,
1262
+ out_dim=hidden_size,
1263
+ bias=True,
1264
+ processor=HunyuanAttnProcessorFlashAttnSingle(),
1265
+ qk_norm=qk_norm,
1266
+ eps=1e-6,
1267
+ pre_only=True, # Crucial: Attn processor will return raw attention output
1268
+ )
1269
+
1270
+ self.norm = AdaLayerNormZeroSingle(hidden_size, norm_type="layer_norm")
1271
+ self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
1272
+ self.act_mlp = nn.GELU(approximate="tanh")
1273
+ self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)
1274
+
1275
+ def forward(
1276
+ self,
1277
+ hidden_states: torch.Tensor,
1278
+ encoder_hidden_states: torch.Tensor,
1279
+ temb: torch.Tensor,
1280
+ attention_mask: Optional[torch.Tensor] = None,
1281
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1282
+ ) -> torch.Tensor:
1283
+ text_seq_length = encoder_hidden_states.shape[1]
1284
+ hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
1285
+ del encoder_hidden_states # free memory
1286
+
1287
+ residual = hidden_states
1288
+
1289
+ # 1. Input normalization
1290
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
1291
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
1292
+
1293
+ norm_hidden_states, norm_encoder_hidden_states = (
1294
+ norm_hidden_states[:, :-text_seq_length, :],
1295
+ norm_hidden_states[:, -text_seq_length:, :],
1296
+ )
1297
+
1298
+ # 2. Attention
1299
+ attn_output, context_attn_output = self.attn(
1300
+ hidden_states=norm_hidden_states,
1301
+ encoder_hidden_states=norm_encoder_hidden_states,
1302
+ attention_mask=attention_mask,
1303
+ image_rotary_emb=image_rotary_emb,
1304
+ attn_mode=self.attn_mode,
1305
+ split_attn=self.split_attn,
1306
+ )
1307
+ attn_output = torch.cat([attn_output, context_attn_output], dim=1)
1308
+ del norm_hidden_states, norm_encoder_hidden_states, context_attn_output # free memory
1309
+ del image_rotary_emb
1310
+
1311
+ # 3. Modulation and residual connection
1312
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
1313
+ del attn_output, mlp_hidden_states # free memory
1314
+ hidden_states = gate * self.proj_out(hidden_states)
1315
+ hidden_states = hidden_states + residual
1316
+
1317
+ hidden_states, encoder_hidden_states = (
1318
+ hidden_states[:, :-text_seq_length, :],
1319
+ hidden_states[:, -text_seq_length:, :],
1320
+ )
1321
+ return hidden_states, encoder_hidden_states
1322
+
1323
+
1324
+ class HunyuanVideoTransformerBlock(nn.Module):
1325
+ def __init__(
1326
+ self,
1327
+ num_attention_heads: int,
1328
+ attention_head_dim: int,
1329
+ mlp_ratio: float,
1330
+ qk_norm: str = "rms_norm",
1331
+ attn_mode: Optional[str] = None,
1332
+ split_attn: Optional[bool] = False,
1333
+ ) -> None:
1334
+ super().__init__()
1335
+
1336
+ hidden_size = num_attention_heads * attention_head_dim
1337
+ self.attn_mode = attn_mode
1338
+ self.split_attn = split_attn
1339
+
1340
+ self.norm1 = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
1341
+ self.norm1_context = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
1342
+
1343
+ self.attn = Attention(
1344
+ query_dim=hidden_size,
1345
+ cross_attention_dim=None,
1346
+ added_kv_proj_dim=hidden_size,
1347
+ dim_head=attention_head_dim,
1348
+ heads=num_attention_heads,
1349
+ out_dim=hidden_size,
1350
+ context_pre_only=False,
1351
+ bias=True,
1352
+ processor=HunyuanAttnProcessorFlashAttnDouble(),
1353
+ qk_norm=qk_norm,
1354
+ eps=1e-6,
1355
+ )
1356
+
1357
+ self.norm2 = LayerNormFramePack(hidden_size, elementwise_affine=False, eps=1e-6)
1358
+ self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
1359
+
1360
+ self.norm2_context = LayerNormFramePack(hidden_size, elementwise_affine=False, eps=1e-6)
1361
+ self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
1362
+
1363
+ def forward(
1364
+ self,
1365
+ hidden_states: torch.Tensor,
1366
+ encoder_hidden_states: torch.Tensor,
1367
+ temb: torch.Tensor,
1368
+ attention_mask: Optional[torch.Tensor] = None,
1369
+ freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1370
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1371
+ # 1. Input normalization
1372
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
1373
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
1374
+ encoder_hidden_states, emb=temb
1375
+ )
1376
+
1377
+ # 2. Joint attention
1378
+ attn_output, context_attn_output = self.attn(
1379
+ hidden_states=norm_hidden_states,
1380
+ encoder_hidden_states=norm_encoder_hidden_states,
1381
+ attention_mask=attention_mask,
1382
+ image_rotary_emb=freqs_cis,
1383
+ attn_mode=self.attn_mode,
1384
+ split_attn=self.split_attn,
1385
+ )
1386
+ del norm_hidden_states, norm_encoder_hidden_states, freqs_cis # free memory
1387
+
1388
+ # 3. Modulation and residual connection
1389
+ hidden_states = hidden_states + attn_output * gate_msa
1390
+ del attn_output, gate_msa # free memory
1391
+ encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa
1392
+ del context_attn_output, c_gate_msa # free memory
1393
+
1394
+ norm_hidden_states = self.norm2(hidden_states)
1395
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
1396
+
1397
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
1398
+ del shift_mlp, scale_mlp # free memory
1399
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp
1400
+ del c_shift_mlp, c_scale_mlp # free memory
1401
+
1402
+ # 4. Feed-forward
1403
+ ff_output = self.ff(norm_hidden_states)
1404
+ del norm_hidden_states # free memory
1405
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
1406
+ del norm_encoder_hidden_states # free memory
1407
+
1408
+ hidden_states = hidden_states + gate_mlp * ff_output
1409
+ del ff_output, gate_mlp # free memory
1410
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
1411
+ del context_ff_output, c_gate_mlp # free memory
1412
+
1413
+ return hidden_states, encoder_hidden_states
1414
+
1415
+
1416
+ class ClipVisionProjection(nn.Module):
1417
+ def __init__(self, in_channels, out_channels):
1418
+ super().__init__()
1419
+ self.up = nn.Linear(in_channels, out_channels * 3)
1420
+ self.down = nn.Linear(out_channels * 3, out_channels)
1421
+
1422
+ def forward(self, x):
1423
+ projected_x = self.down(nn.functional.silu(self.up(x)))
1424
+ return projected_x
1425
+
1426
+
1427
+ class HunyuanVideoPatchEmbed(nn.Module):
1428
+ def __init__(self, patch_size, in_chans, embed_dim):
1429
+ super().__init__()
1430
+ self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
1431
+
1432
+
1433
+ class HunyuanVideoPatchEmbedForCleanLatents(nn.Module):
1434
+ def __init__(self, inner_dim):
1435
+ super().__init__()
1436
+ self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
1437
+ self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
1438
+ self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))
1439
+
1440
+ @torch.no_grad()
1441
+ def initialize_weight_from_another_conv3d(self, another_layer):
1442
+ weight = another_layer.weight.detach().clone()
1443
+ bias = another_layer.bias.detach().clone()
1444
+
1445
+ sd = {
1446
+ "proj.weight": weight.clone(),
1447
+ "proj.bias": bias.clone(),
1448
+ "proj_2x.weight": einops.repeat(weight, "b c t h w -> b c (t tk) (h hk) (w wk)", tk=2, hk=2, wk=2) / 8.0,
1449
+ "proj_2x.bias": bias.clone(),
1450
+ "proj_4x.weight": einops.repeat(weight, "b c t h w -> b c (t tk) (h hk) (w wk)", tk=4, hk=4, wk=4) / 64.0,
1451
+ "proj_4x.bias": bias.clone(),
1452
+ }
1453
+
1454
+ sd = {k: v.clone() for k, v in sd.items()}
1455
+
1456
+ self.load_state_dict(sd)
1457
+ return
1458
+
1459
+
1460
+ class HunyuanVideoTransformer3DModelPacked(nn.Module): # (PreTrainedModelMixin, GenerationMixin,
1461
+ # ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
1462
+ # @register_to_config
1463
+ def __init__(
1464
+ self,
1465
+ in_channels: int = 16,
1466
+ out_channels: int = 16,
1467
+ num_attention_heads: int = 24,
1468
+ attention_head_dim: int = 128,
1469
+ num_layers: int = 20,
1470
+ num_single_layers: int = 40,
1471
+ num_refiner_layers: int = 2,
1472
+ mlp_ratio: float = 4.0,
1473
+ patch_size: int = 2,
1474
+ patch_size_t: int = 1,
1475
+ qk_norm: str = "rms_norm",
1476
+ guidance_embeds: bool = True,
1477
+ text_embed_dim: int = 4096,
1478
+ pooled_projection_dim: int = 768,
1479
+ rope_theta: float = 256.0,
1480
+ rope_axes_dim: Tuple[int] = (16, 56, 56),
1481
+ has_image_proj=False,
1482
+ image_proj_dim=1152,
1483
+ has_clean_x_embedder=False,
1484
+ attn_mode: Optional[str] = None,
1485
+ split_attn: Optional[bool] = False,
1486
+ ) -> None:
1487
+ super().__init__()
1488
+
1489
+ inner_dim = num_attention_heads * attention_head_dim
1490
+ out_channels = out_channels or in_channels
1491
+ self.config_patch_size = patch_size
1492
+ self.config_patch_size_t = patch_size_t
1493
+
1494
+ # 1. Latent and condition embedders
1495
+ self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
1496
+ self.context_embedder = HunyuanVideoTokenRefiner(
1497
+ text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
1498
+ )
1499
+ self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, pooled_projection_dim)
1500
+
1501
+ self.clean_x_embedder = None
1502
+ self.image_projection = None
1503
+
1504
+ # 2. RoPE
1505
+ self.rope = HunyuanVideoRotaryPosEmbed(rope_axes_dim, rope_theta)
1506
+
1507
+ # 3. Dual stream transformer blocks
1508
+ self.transformer_blocks = nn.ModuleList(
1509
+ [
1510
+ HunyuanVideoTransformerBlock(
1511
+ num_attention_heads,
1512
+ attention_head_dim,
1513
+ mlp_ratio=mlp_ratio,
1514
+ qk_norm=qk_norm,
1515
+ attn_mode=attn_mode,
1516
+ split_attn=split_attn,
1517
+ )
1518
+ for _ in range(num_layers)
1519
+ ]
1520
+ )
1521
+
1522
+ # 4. Single stream transformer blocks
1523
+ self.single_transformer_blocks = nn.ModuleList(
1524
+ [
1525
+ HunyuanVideoSingleTransformerBlock(
1526
+ num_attention_heads,
1527
+ attention_head_dim,
1528
+ mlp_ratio=mlp_ratio,
1529
+ qk_norm=qk_norm,
1530
+ attn_mode=attn_mode,
1531
+ split_attn=split_attn,
1532
+ )
1533
+ for _ in range(num_single_layers)
1534
+ ]
1535
+ )
1536
+
1537
+ # 5. Output projection
1538
+ self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
1539
+ self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
1540
+
1541
+ self.inner_dim = inner_dim
1542
+ self.use_gradient_checkpointing = False
1543
+ self.enable_teacache = False
1544
+
1545
+ # if has_image_proj:
1546
+ # self.install_image_projection(image_proj_dim)
1547
+ self.image_projection = ClipVisionProjection(in_channels=image_proj_dim, out_channels=self.inner_dim)
1548
+ # self.config["has_image_proj"] = True
1549
+ # self.config["image_proj_dim"] = in_channels
1550
+
1551
+ # if has_clean_x_embedder:
1552
+ # self.install_clean_x_embedder()
1553
+ self.clean_x_embedder = HunyuanVideoPatchEmbedForCleanLatents(self.inner_dim)
1554
+ # self.config["has_clean_x_embedder"] = True
1555
+
1556
+ self.high_quality_fp32_output_for_inference = True  # default changed from False to True
1557
+
1558
+ # Block swapping attributes (initialized to None)
1559
+ self.blocks_to_swap = None
1560
+ self.offloader_double = None
1561
+ self.offloader_single = None
1562
+
1563
+ @property
1564
+ def device(self):
1565
+ return next(self.parameters()).device
1566
+
1567
+ @property
1568
+ def dtype(self):
1569
+ return next(self.parameters()).dtype
1570
+
1571
+ def enable_gradient_checkpointing(self):
1572
+ self.use_gradient_checkpointing = True
1573
+ print("Gradient checkpointing enabled for HunyuanVideoTransformer3DModelPacked.") # Logging
1574
+
1575
+ def disable_gradient_checkpointing(self):
1576
+ self.use_gradient_checkpointing = False
1577
+ print("Gradient checkpointing disabled for HunyuanVideoTransformer3DModelPacked.") # Logging
1578
+
1579
+ def initialize_teacache(self, enable_teacache=True, num_steps=25, rel_l1_thresh=0.15):
1580
+ self.enable_teacache = enable_teacache
1581
+ self.cnt = 0
1582
+ self.num_steps = num_steps
1583
+ self.rel_l1_thresh = rel_l1_thresh # 0.1 for 1.6x speedup, 0.15 for 2.1x speedup
1584
+ self.accumulated_rel_l1_distance = 0
1585
+ self.previous_modulated_input = None
1586
+ self.previous_residual = None
1587
+ self.teacache_rescale_func = np.poly1d([7.33226126e02, -4.01131952e02, 6.75869174e01, -3.14987800e00, 9.61237896e-02])
1588
+ if enable_teacache:
1589
+ print(f"TeaCache enabled: num_steps={num_steps}, rel_l1_thresh={rel_l1_thresh}")
1590
+ else:
1591
+ print("TeaCache disabled.")
1592
+
1593
+ def gradient_checkpointing_method(self, block, *args):
1594
+ if self.use_gradient_checkpointing:
1595
+ result = torch.utils.checkpoint.checkpoint(block, *args, use_reentrant=False)
1596
+ else:
1597
+ result = block(*args)
1598
+ return result
1599
+
1600
+ def enable_block_swap(self, num_blocks: int, device: torch.device, supports_backward: bool):
1601
+ self.blocks_to_swap = num_blocks
1602
+ self.num_double_blocks = len(self.transformer_blocks)
1603
+ self.num_single_blocks = len(self.single_transformer_blocks)
1604
+ double_blocks_to_swap = num_blocks // 2
1605
+ single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + 1
1606
+
1607
+ assert double_blocks_to_swap <= self.num_double_blocks - 1 and single_blocks_to_swap <= self.num_single_blocks - 1, (
1608
+ f"Cannot swap more than {self.num_double_blocks - 1} double blocks and {self.num_single_blocks - 1} single blocks. "
1609
+ f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks."
1610
+ )
1611
+
1612
+ self.offloader_double = ModelOffloader(
1613
+ "double",
1614
+ self.transformer_blocks,
1615
+ self.num_double_blocks,
1616
+ double_blocks_to_swap,
1617
+ supports_backward,
1618
+ device,
1619
+ # debug=True # Optional debugging
1620
+ )
1621
+ self.offloader_single = ModelOffloader(
1622
+ "single",
1623
+ self.single_transformer_blocks,
1624
+ self.num_single_blocks,
1625
+ single_blocks_to_swap,
1626
+ supports_backward,
1627
+ device, # , debug=True
1628
+ )
1629
+ print(
1630
+ f"HunyuanVideoTransformer3DModelPacked: Block swap enabled. Swapping {num_blocks} blocks, "
1631
+ + f"double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}, supports_backward: {supports_backward}."
1632
+ )
1633
+
1634
+ def switch_block_swap_for_inference(self):
1635
+ if self.blocks_to_swap and self.blocks_to_swap > 0:
1636
+ self.offloader_double.set_forward_only(True)
1637
+ self.offloader_single.set_forward_only(True)
1638
+ self.prepare_block_swap_before_forward()
1639
+ print(f"HunyuanVideoTransformer3DModelPacked: Block swap set to forward only.")
1640
+
1641
+ def switch_block_swap_for_training(self):
1642
+ if self.blocks_to_swap and self.blocks_to_swap > 0:
1643
+ self.offloader_double.set_forward_only(False)
1644
+ self.offloader_single.set_forward_only(False)
1645
+ self.prepare_block_swap_before_forward()
1646
+ print(f"HunyuanVideoTransformer3DModelPacked: Block swap set to forward and backward.")
1647
+
1648
+ def move_to_device_except_swap_blocks(self, device: torch.device):
1649
+ # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
1650
+ if self.blocks_to_swap:
1651
+ saved_double_blocks = self.transformer_blocks
1652
+ saved_single_blocks = self.single_transformer_blocks
1653
+ self.transformer_blocks = None
1654
+ self.single_transformer_blocks = None
1655
+
1656
+ self.to(device)
1657
+
1658
+ if self.blocks_to_swap:
1659
+ self.transformer_blocks = saved_double_blocks
1660
+ self.single_transformer_blocks = saved_single_blocks
1661
+
1662
+ def prepare_block_swap_before_forward(self):
1663
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
1664
+ return
1665
+ self.offloader_double.prepare_block_devices_before_forward(self.transformer_blocks)
1666
+ self.offloader_single.prepare_block_devices_before_forward(self.single_transformer_blocks)
1667
+
1668
+ def process_input_hidden_states(
1669
+ self,
1670
+ latents,
1671
+ latent_indices=None,
1672
+ clean_latents=None,
1673
+ clean_latent_indices=None,
1674
+ clean_latents_2x=None,
1675
+ clean_latent_2x_indices=None,
1676
+ clean_latents_4x=None,
1677
+ clean_latent_4x_indices=None,
1678
+ ):
1679
+ hidden_states = self.gradient_checkpointing_method(self.x_embedder.proj, latents)
1680
+ B, C, T, H, W = hidden_states.shape
1681
+
1682
+ if latent_indices is None:
1683
+ latent_indices = torch.arange(0, T).unsqueeze(0).expand(B, -1)
1684
+
1685
+ hidden_states = hidden_states.flatten(2).transpose(1, 2)
1686
+
1687
+ rope_freqs = self.rope(frame_indices=latent_indices, height=H, width=W, device=hidden_states.device)
1688
+ rope_freqs = rope_freqs.flatten(2).transpose(1, 2)
1689
+
1690
+ if clean_latents is not None and clean_latent_indices is not None:
1691
+ clean_latents = clean_latents.to(hidden_states)
1692
+ clean_latents = self.gradient_checkpointing_method(self.clean_x_embedder.proj, clean_latents)
1693
+ clean_latents = clean_latents.flatten(2).transpose(1, 2)
1694
+
1695
+ clean_latent_rope_freqs = self.rope(frame_indices=clean_latent_indices, height=H, width=W, device=clean_latents.device)
1696
+ clean_latent_rope_freqs = clean_latent_rope_freqs.flatten(2).transpose(1, 2)
1697
+
1698
+ hidden_states = torch.cat([clean_latents, hidden_states], dim=1)
1699
+ rope_freqs = torch.cat([clean_latent_rope_freqs, rope_freqs], dim=1)
1700
+
1701
+ if clean_latents_2x is not None and clean_latent_2x_indices is not None:
1702
+ clean_latents_2x = clean_latents_2x.to(hidden_states)
1703
+ clean_latents_2x = pad_for_3d_conv(clean_latents_2x, (2, 4, 4))
1704
+ clean_latents_2x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_2x, clean_latents_2x)
1705
+ clean_latents_2x = clean_latents_2x.flatten(2).transpose(1, 2)
1706
+
1707
+ clean_latent_2x_rope_freqs = self.rope(
1708
+ frame_indices=clean_latent_2x_indices, height=H, width=W, device=clean_latents_2x.device
1709
+ )
1710
+ clean_latent_2x_rope_freqs = pad_for_3d_conv(clean_latent_2x_rope_freqs, (2, 2, 2))
1711
+ clean_latent_2x_rope_freqs = center_down_sample_3d(clean_latent_2x_rope_freqs, (2, 2, 2))
1712
+ clean_latent_2x_rope_freqs = clean_latent_2x_rope_freqs.flatten(2).transpose(1, 2)
1713
+
1714
+ hidden_states = torch.cat([clean_latents_2x, hidden_states], dim=1)
1715
+ rope_freqs = torch.cat([clean_latent_2x_rope_freqs, rope_freqs], dim=1)
1716
+
1717
+ if clean_latents_4x is not None and clean_latent_4x_indices is not None:
1718
+ clean_latents_4x = clean_latents_4x.to(hidden_states)
1719
+ clean_latents_4x = pad_for_3d_conv(clean_latents_4x, (4, 8, 8))
1720
+ clean_latents_4x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_4x, clean_latents_4x)
1721
+ clean_latents_4x = clean_latents_4x.flatten(2).transpose(1, 2)
1722
+
1723
+ clean_latent_4x_rope_freqs = self.rope(
1724
+ frame_indices=clean_latent_4x_indices, height=H, width=W, device=clean_latents_4x.device
1725
+ )
1726
+ clean_latent_4x_rope_freqs = pad_for_3d_conv(clean_latent_4x_rope_freqs, (4, 4, 4))
1727
+ clean_latent_4x_rope_freqs = center_down_sample_3d(clean_latent_4x_rope_freqs, (4, 4, 4))
1728
+ clean_latent_4x_rope_freqs = clean_latent_4x_rope_freqs.flatten(2).transpose(1, 2)
1729
+
1730
+ hidden_states = torch.cat([clean_latents_4x, hidden_states], dim=1)
1731
+ rope_freqs = torch.cat([clean_latent_4x_rope_freqs, rope_freqs], dim=1)
1732
+
1733
+ return hidden_states, rope_freqs
1734
+
1735
+ def forward(
1736
+ self,
1737
+ hidden_states,
1738
+ timestep,
1739
+ encoder_hidden_states,
1740
+ encoder_attention_mask,
1741
+ pooled_projections,
1742
+ guidance,
1743
+ latent_indices=None,
1744
+ clean_latents=None,
1745
+ clean_latent_indices=None,
1746
+ clean_latents_2x=None,
1747
+ clean_latent_2x_indices=None,
1748
+ clean_latents_4x=None,
1749
+ clean_latent_4x_indices=None,
1750
+ image_embeddings=None,
1751
+ attention_kwargs=None,
1752
+ return_dict=True,
1753
+ ):
1754
+
1755
+ if attention_kwargs is None:
1756
+ attention_kwargs = {}
1757
+
1758
+ batch_size, num_channels, num_frames, height, width = hidden_states.shape
1759
+ p, p_t = self.config_patch_size, self.config_patch_size_t
1760
+ post_patch_num_frames = num_frames // p_t
1761
+ post_patch_height = height // p
1762
+ post_patch_width = width // p
1763
+ original_context_length = post_patch_num_frames * post_patch_height * post_patch_width
1764
+
1765
+ hidden_states, rope_freqs = self.process_input_hidden_states(
1766
+ hidden_states,
1767
+ latent_indices,
1768
+ clean_latents,
1769
+ clean_latent_indices,
1770
+ clean_latents_2x,
1771
+ clean_latent_2x_indices,
1772
+ clean_latents_4x,
1773
+ clean_latent_4x_indices,
1774
+ )
1775
+ del (
1776
+ latent_indices,
1777
+ clean_latents,
1778
+ clean_latent_indices,
1779
+ clean_latents_2x,
1780
+ clean_latent_2x_indices,
1781
+ clean_latents_4x,
1782
+ clean_latent_4x_indices,
1783
+ ) # free memory
1784
+
1785
+ temb = self.gradient_checkpointing_method(self.time_text_embed, timestep, guidance, pooled_projections)
1786
+ encoder_hidden_states = self.gradient_checkpointing_method(
1787
+ self.context_embedder, encoder_hidden_states, timestep, encoder_attention_mask
1788
+ )
1789
+
1790
+ if self.image_projection is not None:
1791
+ assert image_embeddings is not None, "You must use image embeddings!"
1792
+ extra_encoder_hidden_states = self.gradient_checkpointing_method(self.image_projection, image_embeddings)
1793
+ extra_attention_mask = torch.ones(
1794
+ (batch_size, extra_encoder_hidden_states.shape[1]),
1795
+ dtype=encoder_attention_mask.dtype,
1796
+ device=encoder_attention_mask.device,
1797
+ )
1798
+
1799
+ # must cat before (not after) encoder_hidden_states, due to attn masking
1800
+ encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
1801
+ encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
1802
+ del extra_encoder_hidden_states, extra_attention_mask # free memory
1803
+
1804
+ with torch.no_grad():
1805
+ if batch_size == 1:
1806
+ # When batch size is 1, we do not need any masks or var-len funcs, since cropping is mathematically the same as what we want
1807
+ # If they are not the same, then their implementations are wrong; ours is the correct one.
1808
+ text_len = encoder_attention_mask.sum().item()
1809
+ encoder_hidden_states = encoder_hidden_states[:, :text_len]
1810
+ attention_mask = None, None, None, None  # placeholder for (cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv); not needed when batch_size == 1
1811
+ else:
1812
+ img_seq_len = hidden_states.shape[1]
1813
+ txt_seq_len = encoder_hidden_states.shape[1]
1814
+
1815
+ cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
1816
+ cu_seqlens_kv = cu_seqlens_q
1817
+ max_seqlen_q = img_seq_len + txt_seq_len
1818
+ max_seqlen_kv = max_seqlen_q
1819
+
1820
+ attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
1821
+ del cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv # free memory
1822
+ del encoder_attention_mask # free memory
1823
+
1824
+ if self.enable_teacache:
1825
+ modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
1826
+
1827
+ if self.cnt == 0 or self.cnt == self.num_steps - 1:
1828
+ should_calc = True
1829
+ self.accumulated_rel_l1_distance = 0
1830
+ else:
1831
+ curr_rel_l1 = (
1832
+ ((modulated_inp - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean())
1833
+ .cpu()
1834
+ .item()
1835
+ )
1836
+ self.accumulated_rel_l1_distance += self.teacache_rescale_func(curr_rel_l1)
1837
+ should_calc = self.accumulated_rel_l1_distance >= self.rel_l1_thresh
1838
+
1839
+ if should_calc:
1840
+ self.accumulated_rel_l1_distance = 0
1841
+
1842
+ self.previous_modulated_input = modulated_inp
1843
+ self.cnt += 1
1844
+
1845
+ if self.cnt == self.num_steps:
1846
+ self.cnt = 0
1847
+
1848
+ if not should_calc:
1849
+ hidden_states = hidden_states + self.previous_residual
1850
+ else:
1851
+ ori_hidden_states = hidden_states.clone()
1852
+
1853
+ for block_id, block in enumerate(self.transformer_blocks):
1854
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1855
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1856
+ )
1857
+
1858
+ for block_id, block in enumerate(self.single_transformer_blocks):
1859
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1860
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1861
+ )
1862
+
1863
+ self.previous_residual = hidden_states - ori_hidden_states
1864
+ del ori_hidden_states # free memory
1865
+ else:
1866
+ for block_id, block in enumerate(self.transformer_blocks):
1867
+ if self.blocks_to_swap:
1868
+ self.offloader_double.wait_for_block(block_id)
1869
+
1870
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1871
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1872
+ )
1873
+
1874
+ if self.blocks_to_swap:
1875
+ self.offloader_double.submit_move_blocks_forward(self.transformer_blocks, block_id)
1876
+
1877
+ for block_id, block in enumerate(self.single_transformer_blocks):
1878
+ if self.blocks_to_swap:
1879
+ self.offloader_single.wait_for_block(block_id)
1880
+
1881
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1882
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1883
+ )
1884
+
1885
+ if self.blocks_to_swap:
1886
+ self.offloader_single.submit_move_blocks_forward(self.single_transformer_blocks, block_id)
1887
+
1888
+ del attention_mask, rope_freqs # free memory
1889
+ del encoder_hidden_states # free memory
1890
+
1891
+ hidden_states = self.gradient_checkpointing_method(self.norm_out, hidden_states, temb)
1892
+
1893
+ hidden_states = hidden_states[:, -original_context_length:, :]
1894
+
1895
+ if self.high_quality_fp32_output_for_inference:
1896
+ hidden_states = hidden_states.to(dtype=torch.float32)
1897
+ if self.proj_out.weight.dtype != torch.float32:
1898
+ self.proj_out.to(dtype=torch.float32)
1899
+
1900
+ hidden_states = self.gradient_checkpointing_method(self.proj_out, hidden_states)
1901
+
1902
+ hidden_states = einops.rearrange(
1903
+ hidden_states,
1904
+ "b (t h w) (c pt ph pw) -> b c (t pt) (h ph) (w pw)",
1905
+ t=post_patch_num_frames,
1906
+ h=post_patch_height,
1907
+ w=post_patch_width,
1908
+ pt=p_t,
1909
+ ph=p,
1910
+ pw=p,
1911
+ )
1912
+
1913
+ if return_dict:
1914
+ # return Transformer2DModelOutput(sample=hidden_states)
1915
+ return SimpleNamespace(sample=hidden_states)
1916
+
1917
+ return (hidden_states,)
1918
+
1919
+ def fp8_optimization(
1920
+ self, state_dict: dict[str, torch.Tensor], device: torch.device, move_to_device: bool, use_scaled_mm: bool = False
1921
+ ) -> dict[str, torch.Tensor]: # Return type hint added
1922
+ """
1923
+ Optimize the model state_dict with fp8.
1924
+
1925
+ Args:
1926
+ state_dict (dict[str, torch.Tensor]):
1927
+ The state_dict of the model.
1928
+ device (torch.device):
1929
+ The device to calculate the weight.
1930
+ move_to_device (bool):
1931
+ Whether to move the weight to the device after optimization.
1932
+ use_scaled_mm (bool):
1933
+ Whether to use scaled matrix multiplication for FP8.
1934
+ """
1935
+ TARGET_KEYS = ["transformer_blocks", "single_transformer_blocks"]
1936
+ EXCLUDE_KEYS = ["norm"] # Exclude norm layers (e.g., LayerNorm, RMSNorm) from FP8
1937
+
1938
+ # inplace optimization
1939
+ state_dict = optimize_state_dict_with_fp8(state_dict, device, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=move_to_device)
1940
+
1941
+ # apply monkey patching
1942
+ apply_fp8_monkey_patch(self, state_dict, use_scaled_mm=use_scaled_mm)
1943
+
1944
+ return state_dict
1945
+
1946
+
1947
+ def load_packed_model(
1948
+ device: Union[str, torch.device],
1949
+ dit_path: str,
1950
+ attn_mode: str,
1951
+ loading_device: Union[str, torch.device],
1952
+ fp8_scaled: bool = False,
1953
+ split_attn: bool = False,
1954
+ ) -> HunyuanVideoTransformer3DModelPacked:
1955
+ # TODO support split_attn
1956
+ device = torch.device(device)
1957
+ loading_device = torch.device(loading_device)
1958
+
1959
+ if os.path.isdir(dit_path):
1960
+ # we don't support from_pretrained for now, so loading safetensors directly
1961
+ safetensor_files = glob.glob(os.path.join(dit_path, "*.safetensors"))
1962
+ if len(safetensor_files) == 0:
1963
+ raise ValueError(f"Cannot find safetensors file in {dit_path}")
1964
+ # sort by name and take the first one
1965
+ safetensor_files.sort()
1966
+ dit_path = safetensor_files[0]
1967
+
1968
+ with init_empty_weights():
1969
+ logger.info(f"Creating HunyuanVideoTransformer3DModelPacked")
1970
+ model = HunyuanVideoTransformer3DModelPacked(
1971
+ attention_head_dim=128,
1972
+ guidance_embeds=True,
1973
+ has_clean_x_embedder=True,
1974
+ has_image_proj=True,
1975
+ image_proj_dim=1152,
1976
+ in_channels=16,
1977
+ mlp_ratio=4.0,
1978
+ num_attention_heads=24,
1979
+ num_layers=20,
1980
+ num_refiner_layers=2,
1981
+ num_single_layers=40,
1982
+ out_channels=16,
1983
+ patch_size=2,
1984
+ patch_size_t=1,
1985
+ pooled_projection_dim=768,
1986
+ qk_norm="rms_norm",
1987
+ rope_axes_dim=(16, 56, 56),
1988
+ rope_theta=256.0,
1989
+ text_embed_dim=4096,
1990
+ attn_mode=attn_mode,
1991
+ split_attn=split_attn,
1992
+ )
1993
+
1994
+ # if fp8_scaled, load model weights to CPU to reduce VRAM usage. Otherwise, load to the specified device (CPU for block swap or CUDA for others)
1995
+ dit_loading_device = torch.device("cpu") if fp8_scaled else loading_device
1996
+ logger.info(f"Loading DiT model from {dit_path}, device={dit_loading_device}")
1997
+
1998
+ # load model weights with the specified dtype or as is
1999
+ sd = load_split_weights(dit_path, device=dit_loading_device, disable_mmap=True)
2000
+
2001
+ if fp8_scaled:
2002
+ # fp8 optimization: calculate on CUDA, move back to CPU if loading_device is CPU (block swap)
2003
+ logger.info(f"Optimizing model weights to fp8. This may take a while.")
2004
+ sd = model.fp8_optimization(sd, device, move_to_device=loading_device.type == "cpu")
2005
+
2006
+ if loading_device.type != "cpu":
2007
+ # make sure all the model weights are on the loading_device
2008
+ logger.info(f"Moving weights to {loading_device}")
2009
+ for key in sd.keys():
2010
+ sd[key] = sd[key].to(loading_device)
2011
+
2012
+ info = model.load_state_dict(sd, strict=True, assign=True)
2013
+ logger.info(f"Loaded DiT model from {dit_path}, info={info}")
2014
+
2015
+ return model
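
For context on the loader above: `load_packed_model` builds the packed HunyuanVideo DiT on the meta device, loads the safetensors weights, and optionally applies fp8 scaling before returning the model. A minimal usage sketch follows; the checkpoint path and the `attn_mode` string are placeholder assumptions, not values shipped with this repo.

```python
import torch
from frame_pack.hunyuan_video_packed import load_packed_model

# "ckpts/framepack_dit.safetensors" is a hypothetical path; point it at your local DiT weights.
model = load_packed_model(
    device=torch.device("cuda"),                # device used for fp8 weight optimization
    dit_path="ckpts/framepack_dit.safetensors",
    attn_mode="torch",                          # assumed backend name; check the Attention implementation for accepted modes
    loading_device="cuda",                      # load weights directly onto the GPU
    fp8_scaled=False,
)
model.eval()
model.prepare_block_swap_before_forward()       # no-op unless enable_block_swap() was called
```
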
frame_pack/k_diffusion_hunyuan.py ADDED
@@ -0,0 +1,128 @@
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import torch
5
+ import math
6
+
7
+ # from diffusers_helper.k_diffusion.uni_pc_fm import sample_unipc
8
+ # from diffusers_helper.k_diffusion.wrapper import fm_wrapper
9
+ # from diffusers_helper.utils import repeat_to_batch_size
10
+ from frame_pack.uni_pc_fm import sample_unipc
11
+ from frame_pack.wrapper import fm_wrapper
12
+ from frame_pack.utils import repeat_to_batch_size
13
+
14
+
15
+ def flux_time_shift(t, mu=1.15, sigma=1.0):
16
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
17
+
18
+
19
+ def calculate_flux_mu(context_length, x1=256, y1=0.5, x2=4096, y2=1.15, exp_max=7.0):
20
+ k = (y2 - y1) / (x2 - x1)
21
+ b = y1 - k * x1
22
+ mu = k * context_length + b
23
+ mu = min(mu, math.log(exp_max))
24
+ return mu
25
+
26
+
27
+ def get_flux_sigmas_from_mu(n, mu):
28
+ sigmas = torch.linspace(1, 0, steps=n + 1)
29
+ sigmas = flux_time_shift(sigmas, mu=mu)
30
+ return sigmas
31
+
32
+
33
+ # @torch.inference_mode()
34
+ def sample_hunyuan(
35
+ transformer,
36
+ sampler="unipc",
37
+ initial_latent=None,
38
+ concat_latent=None,
39
+ strength=1.0,
40
+ width=512,
41
+ height=512,
42
+ frames=16,
43
+ real_guidance_scale=1.0,
44
+ distilled_guidance_scale=6.0,
45
+ guidance_rescale=0.0,
46
+ shift=None,
47
+ num_inference_steps=25,
48
+ batch_size=None,
49
+ generator=None,
50
+ prompt_embeds=None,
51
+ prompt_embeds_mask=None,
52
+ prompt_poolers=None,
53
+ negative_prompt_embeds=None,
54
+ negative_prompt_embeds_mask=None,
55
+ negative_prompt_poolers=None,
56
+ dtype=torch.bfloat16,
57
+ device=None,
58
+ negative_kwargs=None,
59
+ callback=None,
60
+ **kwargs,
61
+ ):
62
+ device = device or transformer.device
63
+
64
+ if batch_size is None:
65
+ batch_size = int(prompt_embeds.shape[0])
66
+
67
+ latents = torch.randn(
68
+ (batch_size, 16, (frames + 3) // 4, height // 8, width // 8), generator=generator, device=generator.device
69
+ ).to(device=device, dtype=torch.float32)
70
+
71
+ B, C, T, H, W = latents.shape
72
+ seq_length = T * H * W // 4 # 9*80*80//4 = 14400
73
+
74
+ if shift is None:
75
+ mu = calculate_flux_mu(seq_length, exp_max=7.0)  # clipped to log(7.0) ~= 1.9459 when seq_len is large
76
+ else:
77
+ mu = math.log(shift)
78
+
79
+ sigmas = get_flux_sigmas_from_mu(num_inference_steps, mu).to(device)
80
+
81
+ k_model = fm_wrapper(transformer)
82
+
83
+ if initial_latent is not None:
84
+ sigmas = sigmas * strength
85
+ first_sigma = sigmas[0].to(device=device, dtype=torch.float32)
86
+ initial_latent = initial_latent.to(device=device, dtype=torch.float32)
87
+ latents = initial_latent.float() * (1.0 - first_sigma) + latents.float() * first_sigma
88
+
89
+ if concat_latent is not None:
90
+ concat_latent = concat_latent.to(latents)
91
+
92
+ distilled_guidance = torch.tensor([distilled_guidance_scale * 1000.0] * batch_size).to(device=device, dtype=dtype)
93
+
94
+ prompt_embeds = repeat_to_batch_size(prompt_embeds, batch_size)
95
+ prompt_embeds_mask = repeat_to_batch_size(prompt_embeds_mask, batch_size)
96
+ prompt_poolers = repeat_to_batch_size(prompt_poolers, batch_size)
97
+ negative_prompt_embeds = repeat_to_batch_size(negative_prompt_embeds, batch_size)
98
+ negative_prompt_embeds_mask = repeat_to_batch_size(negative_prompt_embeds_mask, batch_size)
99
+ negative_prompt_poolers = repeat_to_batch_size(negative_prompt_poolers, batch_size)
100
+ concat_latent = repeat_to_batch_size(concat_latent, batch_size)
101
+
102
+ sampler_kwargs = dict(
103
+ dtype=dtype,
104
+ cfg_scale=real_guidance_scale,
105
+ cfg_rescale=guidance_rescale,
106
+ concat_latent=concat_latent,
107
+ positive=dict(
108
+ pooled_projections=prompt_poolers,
109
+ encoder_hidden_states=prompt_embeds,
110
+ encoder_attention_mask=prompt_embeds_mask,
111
+ guidance=distilled_guidance,
112
+ **kwargs,
113
+ ),
114
+ negative=dict(
115
+ pooled_projections=negative_prompt_poolers,
116
+ encoder_hidden_states=negative_prompt_embeds,
117
+ encoder_attention_mask=negative_prompt_embeds_mask,
118
+ guidance=distilled_guidance,
119
+ **(kwargs if negative_kwargs is None else {**kwargs, **negative_kwargs}),
120
+ ),
121
+ )
122
+
123
+ if sampler == "unipc":
124
+ results = sample_unipc(k_model, latents, sigmas, extra_args=sampler_kwargs, disable=False, callback=callback)
125
+ else:
126
+ raise NotImplementedError(f"Sampler {sampler} is not supported.")
127
+
128
+ return results
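
The scheduling math above is compact, so a worked example helps: `calculate_flux_mu` interpolates mu linearly between the anchor points (256 tokens, 0.5) and (4096 tokens, 1.15) and clips it at log(exp_max), and `get_flux_sigmas_from_mu` warps a uniform 1→0 grid with the Flux time shift so that more steps are spent at high noise. A small standalone check, assuming only that the repo root is on PYTHONPATH:

```python
import math
from frame_pack.k_diffusion_hunyuan import calculate_flux_mu, get_flux_sigmas_from_mu

# 9 latent frames at 80x80 latent resolution, packed 2x2 spatially -> 9*80*80//4 = 14400 tokens
seq_length = 9 * 80 * 80 // 4
mu = calculate_flux_mu(seq_length, exp_max=7.0)
print(mu, math.log(7.0))                    # the linear value (~2.89) is clipped to log(7.0) ~= 1.9459

sigmas = get_flux_sigmas_from_mu(25, mu)    # 26 values for 25 inference steps
print(sigmas[0].item(), sigmas[-1].item())  # 1.0 ... 0.0
print(sigmas[13].item())                    # the midpoint sits well above 0.5 because of the shift
```
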
frame_pack/uni_pc_fm.py ADDED
@@ -0,0 +1,142 @@
1
+ # Better Flow Matching UniPC by Lvmin Zhang
2
+ # (c) 2025
3
+ # CC BY-SA 4.0
4
+ # Attribution-ShareAlike 4.0 International Licence
5
+
6
+
7
+ import torch
8
+
9
+ from tqdm.auto import trange
10
+
11
+
12
+ def expand_dims(v, dims):
13
+ return v[(...,) + (None,) * (dims - 1)]
14
+
15
+
16
+ class FlowMatchUniPC:
17
+ def __init__(self, model, extra_args, variant='bh1'):
18
+ self.model = model
19
+ self.variant = variant
20
+ self.extra_args = extra_args
21
+
22
+ def model_fn(self, x, t):
23
+ return self.model(x, t, **self.extra_args)
24
+
25
+ def update_fn(self, x, model_prev_list, t_prev_list, t, order):
26
+ assert order <= len(model_prev_list)
27
+ dims = x.dim()
28
+
29
+ t_prev_0 = t_prev_list[-1]
30
+ lambda_prev_0 = - torch.log(t_prev_0)
31
+ lambda_t = - torch.log(t)
32
+ model_prev_0 = model_prev_list[-1]
33
+
34
+ h = lambda_t - lambda_prev_0
35
+
36
+ rks = []
37
+ D1s = []
38
+ for i in range(1, order):
39
+ t_prev_i = t_prev_list[-(i + 1)]
40
+ model_prev_i = model_prev_list[-(i + 1)]
41
+ lambda_prev_i = - torch.log(t_prev_i)
42
+ rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
43
+ rks.append(rk)
44
+ D1s.append((model_prev_i - model_prev_0) / rk)
45
+
46
+ rks.append(1.)
47
+ rks = torch.tensor(rks, device=x.device)
48
+
49
+ R = []
50
+ b = []
51
+
52
+ hh = -h[0]
53
+ h_phi_1 = torch.expm1(hh)
54
+ h_phi_k = h_phi_1 / hh - 1
55
+
56
+ factorial_i = 1
57
+
58
+ if self.variant == 'bh1':
59
+ B_h = hh
60
+ elif self.variant == 'bh2':
61
+ B_h = torch.expm1(hh)
62
+ else:
63
+ raise NotImplementedError('Bad variant!')
64
+
65
+ for i in range(1, order + 1):
66
+ R.append(torch.pow(rks, i - 1))
67
+ b.append(h_phi_k * factorial_i / B_h)
68
+ factorial_i *= (i + 1)
69
+ h_phi_k = h_phi_k / hh - 1 / factorial_i
70
+
71
+ R = torch.stack(R)
72
+ b = torch.tensor(b, device=x.device)
73
+
74
+ use_predictor = len(D1s) > 0
75
+
76
+ if use_predictor:
77
+ D1s = torch.stack(D1s, dim=1)
78
+ if order == 2:
79
+ rhos_p = torch.tensor([0.5], device=b.device)
80
+ else:
81
+ rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
82
+ else:
83
+ D1s = None
84
+ rhos_p = None
85
+
86
+ if order == 1:
87
+ rhos_c = torch.tensor([0.5], device=b.device)
88
+ else:
89
+ rhos_c = torch.linalg.solve(R, b)
90
+
91
+ x_t_ = expand_dims(t / t_prev_0, dims) * x - expand_dims(h_phi_1, dims) * model_prev_0
92
+
93
+ if use_predictor:
94
+ pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))
95
+ else:
96
+ pred_res = 0
97
+
98
+ x_t = x_t_ - expand_dims(B_h, dims) * pred_res
99
+ model_t = self.model_fn(x_t, t)
100
+
101
+ if D1s is not None:
102
+ corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))
103
+ else:
104
+ corr_res = 0
105
+
106
+ D1_t = (model_t - model_prev_0)
107
+ x_t = x_t_ - expand_dims(B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
108
+
109
+ return x_t, model_t
110
+
111
+ def sample(self, x, sigmas, callback=None, disable_pbar=False):
112
+ order = min(3, len(sigmas) - 2)
113
+ model_prev_list, t_prev_list = [], []
114
+ for i in trange(len(sigmas) - 1, disable=disable_pbar):
115
+ vec_t = sigmas[i].expand(x.shape[0])
116
+
117
+ with torch.no_grad():
118
+ if i == 0:
119
+ model_prev_list = [self.model_fn(x, vec_t)]
120
+ t_prev_list = [vec_t]
121
+ elif i < order:
122
+ init_order = i
123
+ x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, init_order)
124
+ model_prev_list.append(model_x)
125
+ t_prev_list.append(vec_t)
126
+ else:
127
+ x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, order)
128
+ model_prev_list.append(model_x)
129
+ t_prev_list.append(vec_t)
130
+
131
+ model_prev_list = model_prev_list[-order:]
132
+ t_prev_list = t_prev_list[-order:]
133
+
134
+ if callback is not None:
135
+ callback({'x': x, 'i': i, 'denoised': model_prev_list[-1]})
136
+
137
+ return model_prev_list[-1]
138
+
139
+
140
+ def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'):
141
+ assert variant in ['bh1', 'bh2']
142
+ return FlowMatchUniPC(model, extra_args=extra_args, variant=variant).sample(noise, sigmas=sigmas, callback=callback, disable_pbar=disable)
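
Because `sample_unipc` only requires the model callable to map `(x, t, **extra_args)` to a denoised prediction, the solver can be exercised in isolation. The sketch below uses a dummy denoiser that always predicts zeros, purely to illustrate the calling convention; it is not a real model.

```python
import torch
from frame_pack.uni_pc_fm import sample_unipc

def dummy_model(x, t, **kwargs):
    # Stand-in denoiser: always predicts an all-zero clean sample.
    return torch.zeros_like(x)

noise = torch.randn(1, 16, 2, 8, 8)     # toy latent; the shape is arbitrary here
sigmas = torch.linspace(1.0, 0.0, 11)   # 10 solver steps from pure noise to clean
out = sample_unipc(dummy_model, noise, sigmas, extra_args={}, disable=True)
print(out.shape)                        # same shape as the input latent
print(out.abs().max().item())           # 0.0 -- the output is the last denoised prediction
```
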
frame_pack/utils.py ADDED
@@ -0,0 +1,617 @@
1
+ import os
2
+ import cv2
3
+ import json
4
+ import random
5
+ import glob
6
+ import torch
7
+ import einops
8
+ import numpy as np
9
+ import datetime
10
+ import torchvision
11
+
12
+ import safetensors.torch as sf
13
+ from PIL import Image
14
+
15
+
16
+ def min_resize(x, m):
17
+ if x.shape[0] < x.shape[1]:
18
+ s0 = m
19
+ s1 = int(float(m) / float(x.shape[0]) * float(x.shape[1]))
20
+ else:
21
+ s0 = int(float(m) / float(x.shape[1]) * float(x.shape[0]))
22
+ s1 = m
23
+ new_max = max(s1, s0)
24
+ raw_max = max(x.shape[0], x.shape[1])
25
+ if new_max < raw_max:
26
+ interpolation = cv2.INTER_AREA
27
+ else:
28
+ interpolation = cv2.INTER_LANCZOS4
29
+ y = cv2.resize(x, (s1, s0), interpolation=interpolation)
30
+ return y
31
+
32
+
33
+ def d_resize(x, y):
34
+ H, W, C = y.shape
35
+ new_min = min(H, W)
36
+ raw_min = min(x.shape[0], x.shape[1])
37
+ if new_min < raw_min:
38
+ interpolation = cv2.INTER_AREA
39
+ else:
40
+ interpolation = cv2.INTER_LANCZOS4
41
+ y = cv2.resize(x, (W, H), interpolation=interpolation)
42
+ return y
43
+
44
+
45
+ def resize_and_center_crop(image, target_width, target_height):
46
+ if target_height == image.shape[0] and target_width == image.shape[1]:
47
+ return image
48
+
49
+ pil_image = Image.fromarray(image)
50
+ original_width, original_height = pil_image.size
51
+ scale_factor = max(target_width / original_width, target_height / original_height)
52
+ resized_width = int(round(original_width * scale_factor))
53
+ resized_height = int(round(original_height * scale_factor))
54
+ resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS)
55
+ left = (resized_width - target_width) / 2
56
+ top = (resized_height - target_height) / 2
57
+ right = (resized_width + target_width) / 2
58
+ bottom = (resized_height + target_height) / 2
59
+ cropped_image = resized_image.crop((left, top, right, bottom))
60
+ return np.array(cropped_image)
61
+
62
+
63
+ def resize_and_center_crop_pytorch(image, target_width, target_height):
64
+ B, C, H, W = image.shape
65
+
66
+ if H == target_height and W == target_width:
67
+ return image
68
+
69
+ scale_factor = max(target_width / W, target_height / H)
70
+ resized_width = int(round(W * scale_factor))
71
+ resized_height = int(round(H * scale_factor))
72
+
73
+ resized = torch.nn.functional.interpolate(image, size=(resized_height, resized_width), mode="bilinear", align_corners=False)
74
+
75
+ top = (resized_height - target_height) // 2
76
+ left = (resized_width - target_width) // 2
77
+ cropped = resized[:, :, top : top + target_height, left : left + target_width]
78
+
79
+ return cropped
80
+
81
+
82
+ def resize_without_crop(image, target_width, target_height):
83
+ if target_height == image.shape[0] and target_width == image.shape[1]:
84
+ return image
85
+
86
+ pil_image = Image.fromarray(image)
87
+ resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
88
+ return np.array(resized_image)
89
+
90
+
91
+ def just_crop(image, w, h):
92
+ if h == image.shape[0] and w == image.shape[1]:
93
+ return image
94
+
95
+ original_height, original_width = image.shape[:2]
96
+ k = min(original_height / h, original_width / w)
97
+ new_width = int(round(w * k))
98
+ new_height = int(round(h * k))
99
+ x_start = (original_width - new_width) // 2
100
+ y_start = (original_height - new_height) // 2
101
+ cropped_image = image[y_start : y_start + new_height, x_start : x_start + new_width]
102
+ return cropped_image
103
+
104
+
105
+ def write_to_json(data, file_path):
106
+ temp_file_path = file_path + ".tmp"
107
+ with open(temp_file_path, "wt", encoding="utf-8") as temp_file:
108
+ json.dump(data, temp_file, indent=4)
109
+ os.replace(temp_file_path, file_path)
110
+ return
111
+
112
+
113
+ def read_from_json(file_path):
114
+ with open(file_path, "rt", encoding="utf-8") as file:
115
+ data = json.load(file)
116
+ return data
117
+
118
+
119
+ def get_active_parameters(m):
120
+ return {k: v for k, v in m.named_parameters() if v.requires_grad}
121
+
122
+
123
+ def cast_training_params(m, dtype=torch.float32):
124
+ result = {}
125
+ for n, param in m.named_parameters():
126
+ if param.requires_grad:
127
+ param.data = param.to(dtype)
128
+ result[n] = param
129
+ return result
130
+
131
+
132
+ def separate_lora_AB(parameters, B_patterns=None):
133
+ parameters_normal = {}
134
+ parameters_B = {}
135
+
136
+ if B_patterns is None:
137
+ B_patterns = [".lora_B.", "__zero__"]
138
+
139
+ for k, v in parameters.items():
140
+ if any(B_pattern in k for B_pattern in B_patterns):
141
+ parameters_B[k] = v
142
+ else:
143
+ parameters_normal[k] = v
144
+
145
+ return parameters_normal, parameters_B
146
+
147
+
148
+ def set_attr_recursive(obj, attr, value):
149
+ attrs = attr.split(".")
150
+ for name in attrs[:-1]:
151
+ obj = getattr(obj, name)
152
+ setattr(obj, attrs[-1], value)
153
+ return
154
+
155
+
156
+ def print_tensor_list_size(tensors):
157
+ total_size = 0
158
+ total_elements = 0
159
+
160
+ if isinstance(tensors, dict):
161
+ tensors = tensors.values()
162
+
163
+ for tensor in tensors:
164
+ total_size += tensor.nelement() * tensor.element_size()
165
+ total_elements += tensor.nelement()
166
+
167
+ total_size_MB = total_size / (1024**2)
168
+ total_elements_B = total_elements / 1e9
169
+
170
+ print(f"Total number of tensors: {len(tensors)}")
171
+ print(f"Total size of tensors: {total_size_MB:.2f} MB")
172
+ print(f"Total number of parameters: {total_elements_B:.3f} billion")
173
+ return
174
+
175
+
176
+ @torch.no_grad()
177
+ def batch_mixture(a, b=None, probability_a=0.5, mask_a=None):
178
+ batch_size = a.size(0)
179
+
180
+ if b is None:
181
+ b = torch.zeros_like(a)
182
+
183
+ if mask_a is None:
184
+ mask_a = torch.rand(batch_size) < probability_a
185
+
186
+ mask_a = mask_a.to(a.device)
187
+ mask_a = mask_a.reshape((batch_size,) + (1,) * (a.dim() - 1))
188
+ result = torch.where(mask_a, a, b)
189
+ return result
190
+
191
+
192
+ @torch.no_grad()
193
+ def zero_module(module):
194
+ for p in module.parameters():
195
+ p.detach().zero_()
196
+ return module
197
+
198
+
199
+ @torch.no_grad()
200
+ def supress_lower_channels(m, k, alpha=0.01):
201
+ data = m.weight.data.clone()
202
+
203
+ assert int(data.shape[1]) >= k
204
+
205
+ data[:, :k] = data[:, :k] * alpha
206
+ m.weight.data = data.contiguous().clone()
207
+ return m
208
+
209
+
210
+ def freeze_module(m):
211
+ if not hasattr(m, "_forward_inside_frozen_module"):
212
+ m._forward_inside_frozen_module = m.forward
213
+ m.requires_grad_(False)
214
+ m.forward = torch.no_grad()(m.forward)
215
+ return m
216
+
217
+
218
+ def get_latest_safetensors(folder_path):
219
+ safetensors_files = glob.glob(os.path.join(folder_path, "*.safetensors"))
220
+
221
+ if not safetensors_files:
222
+ raise ValueError("No file to resume!")
223
+
224
+ latest_file = max(safetensors_files, key=os.path.getmtime)
225
+ latest_file = os.path.abspath(os.path.realpath(latest_file))
226
+ return latest_file
227
+
228
+
229
+ def generate_random_prompt_from_tags(tags_str, min_length=3, max_length=32):
230
+ tags = tags_str.split(", ")
231
+ tags = random.sample(tags, k=min(random.randint(min_length, max_length), len(tags)))
232
+ prompt = ", ".join(tags)
233
+ return prompt
234
+
235
+
236
+ def interpolate_numbers(a, b, n, round_to_int=False, gamma=1.0):
237
+ numbers = a + (b - a) * (np.linspace(0, 1, n) ** gamma)
238
+ if round_to_int:
239
+ numbers = np.round(numbers).astype(int)
240
+ return numbers.tolist()
241
+
242
+
243
+ def uniform_random_by_intervals(inclusive, exclusive, n, round_to_int=False):
244
+ edges = np.linspace(0, 1, n + 1)
245
+ points = np.random.uniform(edges[:-1], edges[1:])
246
+ numbers = inclusive + (exclusive - inclusive) * points
247
+ if round_to_int:
248
+ numbers = np.round(numbers).astype(int)
249
+ return numbers.tolist()
250
+
251
+
252
+ def soft_append_bcthw(history, current, overlap=0):
253
+ if overlap <= 0:
254
+ return torch.cat([history, current], dim=2)
255
+
256
+ assert history.shape[2] >= overlap, f"History length ({history.shape[2]}) must be >= overlap ({overlap})"
257
+ assert current.shape[2] >= overlap, f"Current length ({current.shape[2]}) must be >= overlap ({overlap})"
258
+
259
+ weights = torch.linspace(1, 0, overlap, dtype=history.dtype, device=history.device).view(1, 1, -1, 1, 1)
260
+ blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
261
+ output = torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)
262
+
263
+ return output.to(history)
264
+
265
+
266
+ def save_bcthw_as_mp4(x, output_filename, fps=10):
267
+ b, c, t, h, w = x.shape
268
+
269
+ per_row = b
270
+ for p in [6, 5, 4, 3, 2]:
271
+ if b % p == 0:
272
+ per_row = p
273
+ break
274
+
275
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
276
+ x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
277
+ x = x.detach().cpu().to(torch.uint8)
278
+ x = einops.rearrange(x, "(m n) c t h w -> t (m h) (n w) c", n=per_row)
279
+ torchvision.io.write_video(output_filename, x, fps=fps, video_codec="libx264", options={"crf": "0"})
280
+
281
+ # write tensor as .pt file
282
+ torch.save(x, output_filename.replace(".mp4", ".pt"))
283
+
284
+ return x
285
+
286
+
287
+ def save_bcthw_as_png(x, output_filename):
288
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
289
+ x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
290
+ x = x.detach().cpu().to(torch.uint8)
291
+ x = einops.rearrange(x, "b c t h w -> c (b h) (t w)")
292
+ torchvision.io.write_png(x, output_filename)
293
+ return output_filename
294
+
295
+
296
+ def save_bchw_as_png(x, output_filename):
297
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
298
+ x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
299
+ x = x.detach().cpu().to(torch.uint8)
300
+ x = einops.rearrange(x, "b c h w -> c h (b w)")
301
+ torchvision.io.write_png(x, output_filename)
302
+ return output_filename
303
+
304
+
305
+ def add_tensors_with_padding(tensor1, tensor2):
306
+ if tensor1.shape == tensor2.shape:
307
+ return tensor1 + tensor2
308
+
309
+ shape1 = tensor1.shape
310
+ shape2 = tensor2.shape
311
+
312
+ new_shape = tuple(max(s1, s2) for s1, s2 in zip(shape1, shape2))
313
+
314
+ padded_tensor1 = torch.zeros(new_shape)
315
+ padded_tensor2 = torch.zeros(new_shape)
316
+
317
+ padded_tensor1[tuple(slice(0, s) for s in shape1)] = tensor1
318
+ padded_tensor2[tuple(slice(0, s) for s in shape2)] = tensor2
319
+
320
+ result = padded_tensor1 + padded_tensor2
321
+ return result
322
+
323
+
324
+ def print_free_mem():
325
+ torch.cuda.empty_cache()
326
+ free_mem, total_mem = torch.cuda.mem_get_info(0)
327
+ free_mem_mb = free_mem / (1024**2)
328
+ total_mem_mb = total_mem / (1024**2)
329
+ print(f"Free memory: {free_mem_mb:.2f} MB")
330
+ print(f"Total memory: {total_mem_mb:.2f} MB")
331
+ return
332
+
333
+
334
+ def print_gpu_parameters(device, state_dict, log_count=1):
335
+ summary = {"device": device, "keys_count": len(state_dict)}
336
+
337
+ logged_params = {}
338
+ for i, (key, tensor) in enumerate(state_dict.items()):
339
+ if i >= log_count:
340
+ break
341
+ logged_params[key] = tensor.flatten()[:3].tolist()
342
+
343
+ summary["params"] = logged_params
344
+
345
+ print(str(summary))
346
+ return
347
+
348
+
349
+ def visualize_txt_as_img(width, height, text, font_path="font/DejaVuSans.ttf", size=18):
350
+ from PIL import Image, ImageDraw, ImageFont
351
+
352
+ txt = Image.new("RGB", (width, height), color="white")
353
+ draw = ImageDraw.Draw(txt)
354
+ font = ImageFont.truetype(font_path, size=size)
355
+
356
+ if text == "":
357
+ return np.array(txt)
358
+
359
+ # Split text into lines that fit within the image width
360
+ lines = []
361
+ words = text.split()
362
+ current_line = words[0]
363
+
364
+ for word in words[1:]:
365
+ line_with_word = f"{current_line} {word}"
366
+ if draw.textbbox((0, 0), line_with_word, font=font)[2] <= width:
367
+ current_line = line_with_word
368
+ else:
369
+ lines.append(current_line)
370
+ current_line = word
371
+
372
+ lines.append(current_line)
373
+
374
+ # Draw the text line by line
375
+ y = 0
376
+ line_height = draw.textbbox((0, 0), "A", font=font)[3]
377
+
378
+ for line in lines:
379
+ if y + line_height > height:
380
+ break # stop drawing if the next line will be outside the image
381
+ draw.text((0, y), line, fill="black", font=font)
382
+ y += line_height
383
+
384
+ return np.array(txt)
385
+
386
+
387
+ def blue_mark(x):
388
+ x = x.copy()
389
+ c = x[:, :, 2]
390
+ b = cv2.blur(c, (9, 9))
391
+ x[:, :, 2] = ((c - b) * 16.0 + b).clip(-1, 1)
392
+ return x
393
+
394
+
395
+ def green_mark(x):
396
+ x = x.copy()
397
+ x[:, :, 2] = -1
398
+ x[:, :, 0] = -1
399
+ return x
400
+
401
+
402
+ def frame_mark(x):
403
+ x = x.copy()
404
+ x[:64] = -1
405
+ x[-64:] = -1
406
+ x[:, :8] = 1
407
+ x[:, -8:] = 1
408
+ return x
409
+
410
+
411
+ @torch.inference_mode()
412
+ def pytorch2numpy(imgs):
413
+ results = []
414
+ for x in imgs:
415
+ y = x.movedim(0, -1)
416
+ y = y * 127.5 + 127.5
417
+ y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
418
+ results.append(y)
419
+ return results
420
+
421
+
422
+ @torch.inference_mode()
423
+ def numpy2pytorch(imgs):
424
+ h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
425
+ h = h.movedim(-1, 1)
426
+ return h
427
+
428
+
429
+ @torch.no_grad()
430
+ def duplicate_prefix_to_suffix(x, count, zero_out=False):
431
+ if zero_out:
432
+ return torch.cat([x, torch.zeros_like(x[:count])], dim=0)
433
+ else:
434
+ return torch.cat([x, x[:count]], dim=0)
435
+
436
+
437
+ def weighted_mse(a, b, weight):
438
+ return torch.mean(weight.float() * (a.float() - b.float()) ** 2)
439
+
440
+
441
+ def clamped_linear_interpolation(x, x_min, y_min, x_max, y_max, sigma=1.0):
442
+ x = (x - x_min) / (x_max - x_min)
443
+ x = max(0.0, min(x, 1.0))
444
+ x = x**sigma
445
+ return y_min + x * (y_max - y_min)
446
+
447
+
448
+ def expand_to_dims(x, target_dims):
449
+ return x.view(*x.shape, *([1] * max(0, target_dims - x.dim())))
450
+
451
+
452
+ def repeat_to_batch_size(tensor: torch.Tensor, batch_size: int):
453
+ if tensor is None:
454
+ return None
455
+
456
+ first_dim = tensor.shape[0]
457
+
458
+ if first_dim == batch_size:
459
+ return tensor
460
+
461
+ if batch_size % first_dim != 0:
462
+ raise ValueError(f"Cannot evenly repeat first dim {first_dim} to match batch_size {batch_size}.")
463
+
464
+ repeat_times = batch_size // first_dim
465
+
466
+ return tensor.repeat(repeat_times, *[1] * (tensor.dim() - 1))
467
+
468
+
469
+ def dim5(x):
470
+ return expand_to_dims(x, 5)
471
+
472
+
473
+ def dim4(x):
474
+ return expand_to_dims(x, 4)
475
+
476
+
477
+ def dim3(x):
478
+ return expand_to_dims(x, 3)
479
+
480
+
481
+ def crop_or_pad_yield_mask(x, length):
482
+ B, F, C = x.shape
483
+ device = x.device
484
+ dtype = x.dtype
485
+
486
+ if F < length:
487
+ y = torch.zeros((B, length, C), dtype=dtype, device=device)
488
+ mask = torch.zeros((B, length), dtype=torch.bool, device=device)
489
+ y[:, :F, :] = x
490
+ mask[:, :F] = True
491
+ return y, mask
492
+
493
+ return x[:, :length, :], torch.ones((B, length), dtype=torch.bool, device=device)
494
+
495
+
496
+ def extend_dim(x, dim, minimal_length, zero_pad=False):
497
+ original_length = int(x.shape[dim])
498
+
499
+ if original_length >= minimal_length:
500
+ return x
501
+
502
+ if zero_pad:
503
+ padding_shape = list(x.shape)
504
+ padding_shape[dim] = minimal_length - original_length
505
+ padding = torch.zeros(padding_shape, dtype=x.dtype, device=x.device)
506
+ else:
507
+ idx = (slice(None),) * dim + (slice(-1, None),) + (slice(None),) * (len(x.shape) - dim - 1)
508
+ last_element = x[idx]
509
+ padding = last_element.repeat_interleave(minimal_length - original_length, dim=dim)
510
+
511
+ return torch.cat([x, padding], dim=dim)
512
+
513
+
514
+ def lazy_positional_encoding(t, repeats=None):
515
+ if not isinstance(t, list):
516
+ t = [t]
517
+
518
+ from diffusers.models.embeddings import get_timestep_embedding
519
+
520
+ te = torch.tensor(t)
521
+ te = get_timestep_embedding(timesteps=te, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=1.0)
522
+
523
+ if repeats is None:
524
+ return te
525
+
526
+ te = te[:, None, :].expand(-1, repeats, -1)
527
+
528
+ return te
529
+
530
+
531
+ def state_dict_offset_merge(A, B, C=None):
532
+ result = {}
533
+ keys = A.keys()
534
+
535
+ for key in keys:
536
+ A_value = A[key]
537
+ B_value = B[key].to(A_value)
538
+
539
+ if C is None:
540
+ result[key] = A_value + B_value
541
+ else:
542
+ C_value = C[key].to(A_value)
543
+ result[key] = A_value + B_value - C_value
544
+
545
+ return result
546
+
547
+
548
+ def state_dict_weighted_merge(state_dicts, weights):
549
+ if len(state_dicts) != len(weights):
550
+ raise ValueError("Number of state dictionaries must match number of weights")
551
+
552
+ if not state_dicts:
553
+ return {}
554
+
555
+ total_weight = sum(weights)
556
+
557
+ if total_weight == 0:
558
+ raise ValueError("Sum of weights cannot be zero")
559
+
560
+ normalized_weights = [w / total_weight for w in weights]
561
+
562
+ keys = state_dicts[0].keys()
563
+ result = {}
564
+
565
+ for key in keys:
566
+ result[key] = state_dicts[0][key] * normalized_weights[0]
567
+
568
+ for i in range(1, len(state_dicts)):
569
+ state_dict_value = state_dicts[i][key].to(result[key])
570
+ result[key] += state_dict_value * normalized_weights[i]
571
+
572
+ return result
573
+
574
+
575
+ def group_files_by_folder(all_files):
576
+ grouped_files = {}
577
+
578
+ for file in all_files:
579
+ folder_name = os.path.basename(os.path.dirname(file))
580
+ if folder_name not in grouped_files:
581
+ grouped_files[folder_name] = []
582
+ grouped_files[folder_name].append(file)
583
+
584
+ list_of_lists = list(grouped_files.values())
585
+ return list_of_lists
586
+
587
+
588
+ def generate_timestamp():
589
+ now = datetime.datetime.now()
590
+ timestamp = now.strftime("%y%m%d_%H%M%S")
591
+ milliseconds = f"{int(now.microsecond / 1000):03d}"
592
+ random_number = random.randint(0, 9999)
593
+ return f"{timestamp}_{milliseconds}_{random_number}"
594
+
595
+
596
+ def write_PIL_image_with_png_info(image, metadata, path):
597
+ from PIL.PngImagePlugin import PngInfo
598
+
599
+ png_info = PngInfo()
600
+ for key, value in metadata.items():
601
+ png_info.add_text(key, value)
602
+
603
+ image.save(path, "PNG", pnginfo=png_info)
604
+ return image
605
+
606
+
607
+ def torch_safe_save(content, path):
608
+ torch.save(content, path + "_tmp")
609
+ os.replace(path + "_tmp", path)
610
+ return path
611
+
612
+
613
+ def move_optimizer_to_device(optimizer, device):
614
+ for state in optimizer.state.values():
615
+ for k, v in state.items():
616
+ if isinstance(v, torch.Tensor):
617
+ state[k] = v.to(device)
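
Of the helpers above, `soft_append_bcthw` is the one that stitches consecutively generated chunks along the time axis: the last `overlap` frames of the history are cross-faded into the first `overlap` frames of the new chunk with linearly decaying weights. A tiny numeric sketch:

```python
import torch
from frame_pack.utils import soft_append_bcthw

history = torch.zeros(1, 1, 4, 1, 1)   # 4 frames of value 0
current = torch.ones(1, 1, 4, 1, 1)    # 4 frames of value 1
out = soft_append_bcthw(history, current, overlap=3)

print(out.shape)              # torch.Size([1, 1, 5, 1, 1]) -> 4 + 4 - 3 frames
print(out[0, 0, :, 0, 0])     # tensor([0.0, 0.0, 0.5, 1.0, 1.0]) -- a linear ramp across the overlap
```
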
frame_pack/wrapper.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+
3
+
4
+ def append_dims(x, target_dims):
5
+ return x[(...,) + (None,) * (target_dims - x.ndim)]
6
+
7
+
8
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0):
9
+ if guidance_rescale == 0:
10
+ return noise_cfg
11
+
12
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
13
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
14
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
15
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1.0 - guidance_rescale) * noise_cfg
16
+ return noise_cfg
17
+
18
+
19
+ def fm_wrapper(transformer, t_scale=1000.0):
20
+ def k_model(x, sigma, **extra_args):
21
+ dtype = extra_args['dtype']
22
+ cfg_scale = extra_args['cfg_scale']
23
+ cfg_rescale = extra_args['cfg_rescale']
24
+ concat_latent = extra_args['concat_latent']
25
+
26
+ original_dtype = x.dtype
27
+ sigma = sigma.float()
28
+
29
+ x = x.to(dtype)
30
+ timestep = (sigma * t_scale).to(dtype)
31
+
32
+ if concat_latent is None:
33
+ hidden_states = x
34
+ else:
35
+ hidden_states = torch.cat([x, concat_latent.to(x)], dim=1)
36
+
37
+ pred_positive = transformer(hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['positive'])[0].float()
38
+
39
+ if cfg_scale == 1.0:
40
+ pred_negative = torch.zeros_like(pred_positive)
41
+ else:
42
+ pred_negative = transformer(hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['negative'])[0].float()
43
+
44
+ pred_cfg = pred_negative + cfg_scale * (pred_positive - pred_negative)
45
+ pred = rescale_noise_cfg(pred_cfg, pred_positive, guidance_rescale=cfg_rescale)
46
+
47
+ x0 = x.float() - pred.float() * append_dims(sigma, x.ndim)
48
+
49
+ return x0.to(dtype=original_dtype)
50
+
51
+ return k_model
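
The two small helpers above can be sanity-checked in isolation; `fm_wrapper` itself needs a loaded FramePack transformer, so this sketch (assuming `frame_pack.wrapper` is importable) only exercises `append_dims` and `rescale_noise_cfg`:

```python
# Sketch: append_dims right-pads singleton dims; rescale_noise_cfg matches the CFG
# prediction's per-sample std to the positive prediction's std when guidance_rescale=1.0.
import torch
from frame_pack.wrapper import append_dims, rescale_noise_cfg

sigma = torch.tensor([0.5, 0.25])
x = torch.randn(2, 4, 8, 8)
print(append_dims(sigma, x.ndim).shape)  # torch.Size([2, 1, 1, 1])

noise_cfg = torch.randn(2, 4, 8, 8) * 2.0
noise_text = torch.randn(2, 4, 8, 8)
rescaled = rescale_noise_cfg(noise_cfg, noise_text, guidance_rescale=1.0)
print(rescaled.std(dim=[1, 2, 3]))  # close to noise_text's per-sample std; 0.0 would return noise_cfg unchanged
```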
framepack_yichen_output/framepack-yichen-lora-000001.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:326b6106f35da477b51af1d7a0064745e839906b68cc6de7a181af8f24102969
3
+ size 275426472
framepack_yichen_output/framepack-yichen-lora-000002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6152c8741c1830db7238cd468f702925b1e32e9aaad47683d6f4a3a71b5f8a80
3
+ size 275426472
framepack_yichen_output/framepack-yichen-lora-000003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73b38a3bfcb16e58aab100f8737937ac498e905180ce7d21262821d624fc38e5
3
+ size 275426480
framepack_yichen_output/framepack-yichen-lora-000004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:000d4dbc25ae1af98e578d014593d777d6ea79cf298baea76a16acb19cfe1d48
3
+ size 275426472
framepack_yichen_output/framepack-yichen-lora-000005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bf8f8dd7a2a7568d2ddd7dcfffb7ef041b32981f4f8400ec6b0edd5c308449c
3
+ size 275426472
framepack_yichen_output/framepack-yichen-lora-000006.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f8672ad47082125de5898c45c9040a36549a75b419c1ba9a0f6efdd1775f79d
3
+ size 275426480
hunyuan_model/__init__.py ADDED
File without changes
hunyuan_model/activation_layers.py ADDED
@@ -0,0 +1,23 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def get_activation_layer(act_type):
5
+ """get activation layer
6
+
7
+ Args:
8
+ act_type (str): the activation type
9
+
10
+ Returns:
11
+ torch.nn.functional: the activation layer
12
+ """
13
+ if act_type == "gelu":
14
+ return lambda: nn.GELU()
15
+ elif act_type == "gelu_tanh":
16
+ # Approximate `tanh` requires torch >= 1.13
17
+ return lambda: nn.GELU(approximate="tanh")
18
+ elif act_type == "relu":
19
+ return nn.ReLU
20
+ elif act_type == "silu":
21
+ return nn.SiLU
22
+ else:
23
+ raise ValueError(f"Unknown activation type: {act_type}")
hunyuan_model/attention.py ADDED
@@ -0,0 +1,295 @@
1
+ import importlib.metadata
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ try:
9
+ import flash_attn
10
+ from flash_attn.flash_attn_interface import _flash_attn_forward
11
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
12
+ from flash_attn.flash_attn_interface import flash_attn_func
13
+ except ImportError:
14
+ flash_attn = None
15
+ flash_attn_varlen_func = None
16
+ _flash_attn_forward = None
17
+ flash_attn_func = None
18
+
19
+ try:
20
+ print(f"Trying to import sageattention")
21
+ from sageattention import sageattn_varlen, sageattn
22
+
23
+ print("Successfully imported sageattention")
24
+ except ImportError:
25
+ print(f"Failed to import sageattention")
26
+ sageattn_varlen = None
27
+ sageattn = None
28
+
29
+ try:
30
+ import xformers.ops as xops
31
+ except ImportError:
32
+ xops = None
33
+
34
+ MEMORY_LAYOUT = {
35
+ "flash": (
36
+ lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
37
+ lambda x: x,
38
+ ),
39
+ "flash_fixlen": (
40
+ lambda x: x,
41
+ lambda x: x,
42
+ ),
43
+ "sageattn": (
44
+ lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
45
+ lambda x: x,
46
+ ),
47
+ "sageattn_fixlen": (
48
+ lambda x: x.transpose(1, 2),
49
+ lambda x: x.transpose(1, 2),
50
+ ),
51
+ "torch": (
52
+ lambda x: x.transpose(1, 2),
53
+ lambda x: x.transpose(1, 2),
54
+ ),
55
+ "xformers": (
56
+ lambda x: x,
57
+ lambda x: x,
58
+ ),
59
+ "vanilla": (
60
+ lambda x: x.transpose(1, 2),
61
+ lambda x: x.transpose(1, 2),
62
+ ),
63
+ }
64
+
65
+
66
+ def get_cu_seqlens(text_mask, img_len):
67
+ """Calculate cu_seqlens_q, cu_seqlens_kv using text_mask and img_len
68
+
69
+ Args:
70
+ text_mask (torch.Tensor): the mask of text
71
+ img_len (int): the length of image
72
+
73
+ Returns:
74
+ torch.Tensor: the calculated cu_seqlens for flash attention
75
+ """
76
+ batch_size = text_mask.shape[0]
77
+ text_len = text_mask.sum(dim=1)
78
+ max_len = text_mask.shape[1] + img_len
79
+
80
+ cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda")
81
+
82
+ for i in range(batch_size):
83
+ s = text_len[i] + img_len
84
+ s1 = i * max_len + s
85
+ s2 = (i + 1) * max_len
86
+ cu_seqlens[2 * i + 1] = s1
87
+ cu_seqlens[2 * i + 2] = s2
88
+
89
+ return cu_seqlens
90
+
91
+
92
+ def attention(
93
+ q_or_qkv_list,
94
+ k=None,
95
+ v=None,
96
+ mode="flash",
97
+ drop_rate=0,
98
+ attn_mask=None,
99
+ total_len=None,
100
+ causal=False,
101
+ cu_seqlens_q=None,
102
+ cu_seqlens_kv=None,
103
+ max_seqlen_q=None,
104
+ max_seqlen_kv=None,
105
+ batch_size=1,
106
+ ):
107
+ """
108
+ Perform QKV self attention.
109
+
110
+ Args:
111
+ q_or_qkv_list (torch.Tensor or list): Query tensor with shape [b, s, a, d] (or a [q, k, v] list), where a is the number of heads.
112
+ k (torch.Tensor): Key tensor with shape [b, s1, a, d]
113
+ v (torch.Tensor): Value tensor with shape [b, s1, a, d]
114
+ mode (str): Attention mode. Choose from 'flash', 'torch', 'xformers', 'sageattn', and 'vanilla'.
115
+ drop_rate (float): Dropout rate in attention map. (default: 0)
116
+ attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
117
+ (default: None)
118
+ causal (bool): Whether to use causal attention. (default: False)
119
+ cu_seqlens_q (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
120
+ used to index into q.
121
+ cu_seqlens_kv (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
122
+ used to index into kv.
123
+ max_seqlen_q (int): The maximum sequence length in the batch of q.
124
+ max_seqlen_kv (int): The maximum sequence length in the batch of k and v.
125
+
126
+ Returns:
127
+ torch.Tensor: Output tensor after self attention with shape [b, s, ad]
128
+ """
129
+ q, k, v = q_or_qkv_list if type(q_or_qkv_list) == list else (q_or_qkv_list, k, v)
130
+ if type(q_or_qkv_list) == list:
131
+ q_or_qkv_list.clear()
132
+ split_attn = total_len is not None
133
+ if split_attn and mode == "sageattn":
134
+ mode = "sageattn_fixlen"
135
+ elif split_attn and mode == "flash":
136
+ mode = "flash_fixlen"
137
+ # print(f"Attention mode: {mode}, split_attn: {split_attn}")
138
+ pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
139
+
140
+ # trim the sequence length to the actual length instead of attn_mask
141
+ if split_attn:
142
+ trimmed_len = q.shape[1] - total_len
143
+ q = [q[i : i + 1, : total_len[i]] for i in range(len(q))]
144
+ k = [k[i : i + 1, : total_len[i]] for i in range(len(k))]
145
+ v = [v[i : i + 1, : total_len[i]] for i in range(len(v))]
146
+ q = [pre_attn_layout(q_i) for q_i in q]
147
+ k = [pre_attn_layout(k_i) for k_i in k]
148
+ v = [pre_attn_layout(v_i) for v_i in v]
149
+ # print(
150
+ # f"Trimming the sequence length to {total_len},trimmed_len: {trimmed_len}, q.shape: {[q_i.shape for q_i in q]}, mode: {mode}"
151
+ # )
152
+ else:
153
+ q = pre_attn_layout(q)
154
+ k = pre_attn_layout(k)
155
+ v = pre_attn_layout(v)
156
+
157
+ if mode == "torch":
158
+ if split_attn:
159
+ x = []
160
+ for i in range(len(q)):
161
+ x_i = F.scaled_dot_product_attention(q[i], k[i], v[i], dropout_p=drop_rate, is_causal=causal)
162
+ q[i], k[i], v[i] = None, None, None
163
+ x.append(x_i)
164
+ del q, k, v
165
+ else:
166
+ if attn_mask is not None and attn_mask.dtype != torch.bool:
167
+ attn_mask = attn_mask.to(q.dtype)
168
+ x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal)
169
+ del q, k, v
170
+ del attn_mask
171
+
172
+ elif mode == "xformers":
173
+ # B, M, H, K: M is the sequence length, H is the number of heads, K is the dimension of the heads -> it is same as input dimension
174
+ # currently only support batch_size = 1
175
+ assert split_attn, "Xformers only supports splitting"
176
+ x = []
177
+ for i in range(len(q)):
178
+ x_i = xops.memory_efficient_attention(q[i], k[i], v[i], p=drop_rate) # , causal=causal)
179
+ q[i], k[i], v[i] = None, None, None
180
+ x.append(x_i)
181
+ del q, k, v
182
+
183
+ elif mode == "flash":
184
+ x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
185
+ del q, k, v
186
+ # x with shape [(bxs), a, d]
187
+ x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]) # reshape x to [b, s, a, d]
188
+ elif mode == "flash_fixlen":
189
+ x = []
190
+ for i in range(len(q)):
191
+ # q: (batch_size, seqlen, nheads, headdim), k: (batch_size, seqlen, nheads_k, headdim), v: (batch_size, seqlen, nheads_k, headdim)
192
+ x_i = flash_attn_func(q[i], k[i], v[i], dropout_p=drop_rate, causal=causal)
193
+ q[i], k[i], v[i] = None, None, None
194
+ x.append(x_i)
195
+ del q, k, v
196
+ elif mode == "sageattn":
197
+ x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
198
+ del q, k, v
199
+ # x with shape [(bxs), a, d]
200
+ x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]) # reshape x to [b, s, a, d]
201
+ elif mode == "sageattn_fixlen":
202
+ x = []
203
+ for i in range(len(q)):
204
+ # HND seems to cause an error
205
+ x_i = sageattn(q[i], k[i], v[i]) # (batch_size, seq_len, head_num, head_dim)
206
+ q[i], k[i], v[i] = None, None, None
207
+ x.append(x_i)
208
+ del q, k, v
209
+ elif mode == "vanilla":
210
+ assert not split_attn, "Vanilla attention does not support trimming"
211
+ scale_factor = 1 / math.sqrt(q.size(-1))
212
+
213
+ b, a, s, _ = q.shape
214
+ s1 = k.size(2)
215
+ attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
216
+ if causal:
217
+ # Only applied to self attention
218
+ assert attn_mask is None, "Causal mask and attn_mask cannot be used together"
219
+ temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(diagonal=0)
220
+ attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
221
+ attn_bias.to(q.dtype)
222
+
223
+ if attn_mask is not None:
224
+ if attn_mask.dtype == torch.bool:
225
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
226
+ else:
227
+ attn_bias += attn_mask
228
+
229
+ # TODO: Maybe force q and k to be float32 to avoid numerical overflow
230
+ attn = (q @ k.transpose(-2, -1)) * scale_factor
231
+ attn += attn_bias
232
+ attn = attn.softmax(dim=-1)
233
+ attn = torch.dropout(attn, p=drop_rate, train=True)
234
+ x = attn @ v
235
+ else:
236
+ raise NotImplementedError(f"Unsupported attention mode: {mode}")
237
+
238
+ if split_attn:
239
+ x = [post_attn_layout(x_i) for x_i in x]
240
+ for i in range(len(x)):
241
+ x[i] = F.pad(x[i], (0, 0, 0, 0, 0, trimmed_len[i]))
242
+ x = torch.cat(x, dim=0)
243
+ else:
244
+ x = post_attn_layout(x)
245
+
246
+ b, s, a, d = x.shape
247
+ out = x.reshape(b, s, -1)
248
+ return out
249
+
250
+
251
+ def parallel_attention(hybrid_seq_parallel_attn, q, k, v, img_q_len, img_kv_len, cu_seqlens_q, cu_seqlens_kv):
252
+ attn1 = hybrid_seq_parallel_attn(
253
+ None,
254
+ q[:, :img_q_len, :, :],
255
+ k[:, :img_kv_len, :, :],
256
+ v[:, :img_kv_len, :, :],
257
+ dropout_p=0.0,
258
+ causal=False,
259
+ joint_tensor_query=q[:, img_q_len : cu_seqlens_q[1]],
260
+ joint_tensor_key=k[:, img_kv_len : cu_seqlens_kv[1]],
261
+ joint_tensor_value=v[:, img_kv_len : cu_seqlens_kv[1]],
262
+ joint_strategy="rear",
263
+ )
264
+ if flash_attn.__version__ >= "2.7.0":
265
+ attn2, *_ = _flash_attn_forward(
266
+ q[:, cu_seqlens_q[1] :],
267
+ k[:, cu_seqlens_kv[1] :],
268
+ v[:, cu_seqlens_kv[1] :],
269
+ dropout_p=0.0,
270
+ softmax_scale=q.shape[-1] ** (-0.5),
271
+ causal=False,
272
+ window_size_left=-1,
273
+ window_size_right=-1,
274
+ softcap=0.0,
275
+ alibi_slopes=None,
276
+ return_softmax=False,
277
+ )
278
+ else:
279
+ attn2, *_ = _flash_attn_forward(
280
+ q[:, cu_seqlens_q[1] :],
281
+ k[:, cu_seqlens_kv[1] :],
282
+ v[:, cu_seqlens_kv[1] :],
283
+ dropout_p=0.0,
284
+ softmax_scale=q.shape[-1] ** (-0.5),
285
+ causal=False,
286
+ window_size=(-1, -1),
287
+ softcap=0.0,
288
+ alibi_slopes=None,
289
+ return_softmax=False,
290
+ )
291
+ attn = torch.cat([attn1, attn2], dim=1)
292
+ b, s, a, d = attn.shape
293
+ attn = attn.reshape(b, s, -1)
294
+
295
+ return attn
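
The dispatcher above can be exercised without flash-attn, SageAttention, or xformers by selecting the pure-PyTorch path. A small shape check (a sketch, assuming `hunyuan_model.attention` is importable with only `torch` installed):

```python
# Sketch: calling attention() in "torch" (scaled_dot_product_attention) mode.
import torch
from hunyuan_model.attention import attention

b, s, heads, head_dim = 1, 16, 4, 32
q = torch.randn(b, s, heads, head_dim)
k = torch.randn(b, s, heads, head_dim)
v = torch.randn(b, s, heads, head_dim)

# Passing q/k/v as a list lets the function drop its references as early as possible.
out = attention([q, k, v], mode="torch")
print(out.shape)  # torch.Size([1, 16, 128]): heads and head_dim are flattened into the last axis
```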
hunyuan_model/autoencoder_kl_causal_3d.py ADDED
@@ -0,0 +1,609 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ #
16
+ # Modified from diffusers==0.29.2
17
+ #
18
+ # ==============================================================================
19
+ from typing import Dict, Optional, Tuple, Union
20
+ from dataclasses import dataclass
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+
25
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
26
+
27
+ # try:
28
+ # # This diffusers is modified and packed in the mirror.
29
+ # from diffusers.loaders import FromOriginalVAEMixin
30
+ # except ImportError:
31
+ # # Use this to be compatible with the original diffusers.
32
+ # from diffusers.loaders.single_file_model import FromOriginalModelMixin as FromOriginalVAEMixin
33
+ from diffusers.utils.accelerate_utils import apply_forward_hook
34
+ from diffusers.models.attention_processor import (
35
+ ADDED_KV_ATTENTION_PROCESSORS,
36
+ CROSS_ATTENTION_PROCESSORS,
37
+ Attention,
38
+ AttentionProcessor,
39
+ AttnAddedKVProcessor,
40
+ AttnProcessor,
41
+ )
42
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
43
+ from diffusers.models.modeling_utils import ModelMixin
44
+ from .vae import DecoderCausal3D, BaseOutput, DecoderOutput, DiagonalGaussianDistribution, EncoderCausal3D
45
+
46
+
47
+ @dataclass
48
+ class DecoderOutput2(BaseOutput):
49
+ sample: torch.FloatTensor
50
+ posterior: Optional[DiagonalGaussianDistribution] = None
51
+
52
+
53
+ class AutoencoderKLCausal3D(ModelMixin, ConfigMixin):
54
+ r"""
55
+ A VAE model with KL loss for encoding images/videos into latents and decoding latent representations into images/videos.
56
+
57
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
58
+ for all models (such as downloading or saving).
59
+ """
60
+
61
+ _supports_gradient_checkpointing = True
62
+
63
+ @register_to_config
64
+ def __init__(
65
+ self,
66
+ in_channels: int = 3,
67
+ out_channels: int = 3,
68
+ down_block_types: Tuple[str] = ("DownEncoderBlockCausal3D",),
69
+ up_block_types: Tuple[str] = ("UpDecoderBlockCausal3D",),
70
+ block_out_channels: Tuple[int] = (64,),
71
+ layers_per_block: int = 1,
72
+ act_fn: str = "silu",
73
+ latent_channels: int = 4,
74
+ norm_num_groups: int = 32,
75
+ sample_size: int = 32,
76
+ sample_tsize: int = 64,
77
+ scaling_factor: float = 0.18215,
78
+ force_upcast: float = True,
79
+ spatial_compression_ratio: int = 8,
80
+ time_compression_ratio: int = 4,
81
+ mid_block_add_attention: bool = True,
82
+ ):
83
+ super().__init__()
84
+
85
+ self.time_compression_ratio = time_compression_ratio
86
+
87
+ self.encoder = EncoderCausal3D(
88
+ in_channels=in_channels,
89
+ out_channels=latent_channels,
90
+ down_block_types=down_block_types,
91
+ block_out_channels=block_out_channels,
92
+ layers_per_block=layers_per_block,
93
+ act_fn=act_fn,
94
+ norm_num_groups=norm_num_groups,
95
+ double_z=True,
96
+ time_compression_ratio=time_compression_ratio,
97
+ spatial_compression_ratio=spatial_compression_ratio,
98
+ mid_block_add_attention=mid_block_add_attention,
99
+ )
100
+
101
+ self.decoder = DecoderCausal3D(
102
+ in_channels=latent_channels,
103
+ out_channels=out_channels,
104
+ up_block_types=up_block_types,
105
+ block_out_channels=block_out_channels,
106
+ layers_per_block=layers_per_block,
107
+ norm_num_groups=norm_num_groups,
108
+ act_fn=act_fn,
109
+ time_compression_ratio=time_compression_ratio,
110
+ spatial_compression_ratio=spatial_compression_ratio,
111
+ mid_block_add_attention=mid_block_add_attention,
112
+ )
113
+
114
+ self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
115
+ self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)
116
+
117
+ self.use_slicing = False
118
+ self.use_spatial_tiling = False
119
+ self.use_temporal_tiling = False
120
+
121
+ # only relevant if vae tiling is enabled
122
+ self.tile_sample_min_tsize = sample_tsize
123
+ self.tile_latent_min_tsize = sample_tsize // time_compression_ratio
124
+
125
+ self.tile_sample_min_size = self.config.sample_size
126
+ sample_size = self.config.sample_size[0] if isinstance(self.config.sample_size, (list, tuple)) else self.config.sample_size
127
+ self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
128
+ self.tile_overlap_factor = 0.25
129
+
130
+ def _set_gradient_checkpointing(self, module, value=False):
131
+ if isinstance(module, (EncoderCausal3D, DecoderCausal3D)):
132
+ module.gradient_checkpointing = value
133
+
134
+ def enable_temporal_tiling(self, use_tiling: bool = True):
135
+ self.use_temporal_tiling = use_tiling
136
+
137
+ def disable_temporal_tiling(self):
138
+ self.enable_temporal_tiling(False)
139
+
140
+ def enable_spatial_tiling(self, use_tiling: bool = True):
141
+ self.use_spatial_tiling = use_tiling
142
+
143
+ def disable_spatial_tiling(self):
144
+ self.enable_spatial_tiling(False)
145
+
146
+ def enable_tiling(self, use_tiling: bool = True):
147
+ r"""
148
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
149
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
150
+ processing larger videos.
151
+ """
152
+ self.enable_spatial_tiling(use_tiling)
153
+ self.enable_temporal_tiling(use_tiling)
154
+
155
+ def disable_tiling(self):
156
+ r"""
157
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
158
+ decoding in one step.
159
+ """
160
+ self.disable_spatial_tiling()
161
+ self.disable_temporal_tiling()
162
+
163
+ def enable_slicing(self):
164
+ r"""
165
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
166
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
167
+ """
168
+ self.use_slicing = True
169
+
170
+ def disable_slicing(self):
171
+ r"""
172
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
173
+ decoding in one step.
174
+ """
175
+ self.use_slicing = False
176
+
177
+ def set_chunk_size_for_causal_conv_3d(self, chunk_size: int):
178
+ # set chunk_size to CausalConv3d recursively
179
+ def set_chunk_size(module):
180
+ if hasattr(module, "chunk_size"):
181
+ module.chunk_size = chunk_size
182
+
183
+ self.apply(set_chunk_size)
184
+
185
+ @property
186
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
187
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
188
+ r"""
189
+ Returns:
190
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
191
+ indexed by its weight name.
192
+ """
193
+ # set recursively
194
+ processors = {}
195
+
196
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
197
+ if hasattr(module, "get_processor"):
198
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
199
+
200
+ for sub_name, child in module.named_children():
201
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
202
+
203
+ return processors
204
+
205
+ for name, module in self.named_children():
206
+ fn_recursive_add_processors(name, module, processors)
207
+
208
+ return processors
209
+
210
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
211
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False):
212
+ r"""
213
+ Sets the attention processor to use to compute attention.
214
+
215
+ Parameters:
216
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
217
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
218
+ for **all** `Attention` layers.
219
+
220
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
221
+ processor. This is strongly recommended when setting trainable attention processors.
222
+
223
+ """
224
+ count = len(self.attn_processors.keys())
225
+
226
+ if isinstance(processor, dict) and len(processor) != count:
227
+ raise ValueError(
228
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
229
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
230
+ )
231
+
232
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
233
+ if hasattr(module, "set_processor"):
234
+ if not isinstance(processor, dict):
235
+ module.set_processor(processor, _remove_lora=_remove_lora)
236
+ else:
237
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
238
+
239
+ for sub_name, child in module.named_children():
240
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
241
+
242
+ for name, module in self.named_children():
243
+ fn_recursive_attn_processor(name, module, processor)
244
+
245
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
246
+ def set_default_attn_processor(self):
247
+ """
248
+ Disables custom attention processors and sets the default attention implementation.
249
+ """
250
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
251
+ processor = AttnAddedKVProcessor()
252
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
253
+ processor = AttnProcessor()
254
+ else:
255
+ raise ValueError(
256
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
257
+ )
258
+
259
+ self.set_attn_processor(processor, _remove_lora=True)
260
+
261
+ @apply_forward_hook
262
+ def encode(
263
+ self, x: torch.FloatTensor, return_dict: bool = True
264
+ ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
265
+ """
266
+ Encode a batch of images/videos into latents.
267
+
268
+ Args:
269
+ x (`torch.FloatTensor`): Input batch of images/videos.
270
+ return_dict (`bool`, *optional*, defaults to `True`):
271
+ Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
272
+
273
+ Returns:
274
+ The latent representations of the encoded images/videos. If `return_dict` is True, a
275
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
276
+ """
277
+ assert len(x.shape) == 5, "The input tensor should have 5 dimensions."
278
+
279
+ if self.use_temporal_tiling and x.shape[2] > self.tile_sample_min_tsize:
280
+ return self.temporal_tiled_encode(x, return_dict=return_dict)
281
+
282
+ if self.use_spatial_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
283
+ return self.spatial_tiled_encode(x, return_dict=return_dict)
284
+
285
+ if self.use_slicing and x.shape[0] > 1:
286
+ encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
287
+ h = torch.cat(encoded_slices)
288
+ else:
289
+ h = self.encoder(x)
290
+
291
+ moments = self.quant_conv(h)
292
+ posterior = DiagonalGaussianDistribution(moments)
293
+
294
+ if not return_dict:
295
+ return (posterior,)
296
+
297
+ return AutoencoderKLOutput(latent_dist=posterior)
298
+
299
+ def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
300
+ assert len(z.shape) == 5, "The input tensor should have 5 dimensions."
301
+
302
+ if self.use_temporal_tiling and z.shape[2] > self.tile_latent_min_tsize:
303
+ return self.temporal_tiled_decode(z, return_dict=return_dict)
304
+
305
+ if self.use_spatial_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
306
+ return self.spatial_tiled_decode(z, return_dict=return_dict)
307
+
308
+ z = self.post_quant_conv(z)
309
+ dec = self.decoder(z)
310
+
311
+ if not return_dict:
312
+ return (dec,)
313
+
314
+ return DecoderOutput(sample=dec)
315
+
316
+ @apply_forward_hook
317
+ def decode(self, z: torch.FloatTensor, return_dict: bool = True, generator=None) -> Union[DecoderOutput, torch.FloatTensor]:
318
+ """
319
+ Decode a batch of images/videos.
320
+
321
+ Args:
322
+ z (`torch.FloatTensor`): Input batch of latent vectors.
323
+ return_dict (`bool`, *optional*, defaults to `True`):
324
+ Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
325
+
326
+ Returns:
327
+ [`~models.vae.DecoderOutput`] or `tuple`:
328
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
329
+ returned.
330
+
331
+ """
332
+ if self.use_slicing and z.shape[0] > 1:
333
+ decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
334
+ decoded = torch.cat(decoded_slices)
335
+ else:
336
+ decoded = self._decode(z).sample
337
+
338
+ if not return_dict:
339
+ return (decoded,)
340
+
341
+ return DecoderOutput(sample=decoded)
342
+
343
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
344
+ blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
345
+ for y in range(blend_extent):
346
+ b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
347
+ return b
348
+
349
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
350
+ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
351
+ for x in range(blend_extent):
352
+ b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
353
+ return b
354
+
355
+ def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
356
+ blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
357
+ for x in range(blend_extent):
358
+ b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (x / blend_extent)
359
+ return b
360
+
361
+ def spatial_tiled_encode(
362
+ self, x: torch.FloatTensor, return_dict: bool = True, return_moments: bool = False
363
+ ) -> AutoencoderKLOutput:
364
+ r"""Encode a batch of images/videos using a tiled encoder.
365
+
366
+ When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
367
+ steps. This is useful to keep memory use constant regardless of image/videos size. The end result of tiled encoding is
368
+ different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
369
+ tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
370
+ output, but they should be much less noticeable.
371
+
372
+ Args:
373
+ x (`torch.FloatTensor`): Input batch of images/videos.
374
+ return_dict (`bool`, *optional*, defaults to `True`):
375
+ Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
376
+
377
+ Returns:
378
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
379
+ If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
380
+ `tuple` is returned.
381
+ """
382
+ overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
383
+ blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
384
+ row_limit = self.tile_latent_min_size - blend_extent
385
+
386
+ # Split video into tiles and encode them separately.
387
+ rows = []
388
+ for i in range(0, x.shape[-2], overlap_size):
389
+ row = []
390
+ for j in range(0, x.shape[-1], overlap_size):
391
+ tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
392
+ tile = self.encoder(tile)
393
+ tile = self.quant_conv(tile)
394
+ row.append(tile)
395
+ rows.append(row)
396
+ result_rows = []
397
+ for i, row in enumerate(rows):
398
+ result_row = []
399
+ for j, tile in enumerate(row):
400
+ # blend the above tile and the left tile
401
+ # to the current tile and add the current tile to the result row
402
+ if i > 0:
403
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
404
+ if j > 0:
405
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
406
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
407
+ result_rows.append(torch.cat(result_row, dim=-1))
408
+
409
+ moments = torch.cat(result_rows, dim=-2)
410
+ if return_moments:
411
+ return moments
412
+
413
+ posterior = DiagonalGaussianDistribution(moments)
414
+ if not return_dict:
415
+ return (posterior,)
416
+
417
+ return AutoencoderKLOutput(latent_dist=posterior)
418
+
419
+ def spatial_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
420
+ r"""
421
+ Decode a batch of images/videos using a tiled decoder.
422
+
423
+ Args:
424
+ z (`torch.FloatTensor`): Input batch of latent vectors.
425
+ return_dict (`bool`, *optional*, defaults to `True`):
426
+ Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
427
+
428
+ Returns:
429
+ [`~models.vae.DecoderOutput`] or `tuple`:
430
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
431
+ returned.
432
+ """
433
+ overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
434
+ blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
435
+ row_limit = self.tile_sample_min_size - blend_extent
436
+
437
+ # Split z into overlapping tiles and decode them separately.
438
+ # The tiles have an overlap to avoid seams between tiles.
439
+ rows = []
440
+ for i in range(0, z.shape[-2], overlap_size):
441
+ row = []
442
+ for j in range(0, z.shape[-1], overlap_size):
443
+ tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
444
+ tile = self.post_quant_conv(tile)
445
+ decoded = self.decoder(tile)
446
+ row.append(decoded)
447
+ rows.append(row)
448
+ result_rows = []
449
+ for i, row in enumerate(rows):
450
+ result_row = []
451
+ for j, tile in enumerate(row):
452
+ # blend the above tile and the left tile
453
+ # to the current tile and add the current tile to the result row
454
+ if i > 0:
455
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
456
+ if j > 0:
457
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
458
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
459
+ result_rows.append(torch.cat(result_row, dim=-1))
460
+
461
+ dec = torch.cat(result_rows, dim=-2)
462
+ if not return_dict:
463
+ return (dec,)
464
+
465
+ return DecoderOutput(sample=dec)
466
+
467
+ def temporal_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
468
+
469
+ B, C, T, H, W = x.shape
470
+ overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))
471
+ blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)
472
+ t_limit = self.tile_latent_min_tsize - blend_extent
473
+
474
+ # Split the video into tiles and encode them separately.
475
+ row = []
476
+ for i in range(0, T, overlap_size):
477
+ tile = x[:, :, i : i + self.tile_sample_min_tsize + 1, :, :]
478
+ if self.use_spatial_tiling and (
479
+ tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size
480
+ ):
481
+ tile = self.spatial_tiled_encode(tile, return_moments=True)
482
+ else:
483
+ tile = self.encoder(tile)
484
+ tile = self.quant_conv(tile)
485
+ if i > 0:
486
+ tile = tile[:, :, 1:, :, :]
487
+ row.append(tile)
488
+ result_row = []
489
+ for i, tile in enumerate(row):
490
+ if i > 0:
491
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
492
+ result_row.append(tile[:, :, :t_limit, :, :])
493
+ else:
494
+ result_row.append(tile[:, :, : t_limit + 1, :, :])
495
+
496
+ moments = torch.cat(result_row, dim=2)
497
+ posterior = DiagonalGaussianDistribution(moments)
498
+
499
+ if not return_dict:
500
+ return (posterior,)
501
+
502
+ return AutoencoderKLOutput(latent_dist=posterior)
503
+
504
+ def temporal_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
505
+ # Split z into overlapping tiles and decode them separately.
506
+
507
+ B, C, T, H, W = z.shape
508
+ overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))
509
+ blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)
510
+ t_limit = self.tile_sample_min_tsize - blend_extent
511
+
512
+ row = []
513
+ for i in range(0, T, overlap_size):
514
+ tile = z[:, :, i : i + self.tile_latent_min_tsize + 1, :, :]
515
+ if self.use_spatial_tiling and (
516
+ tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size
517
+ ):
518
+ decoded = self.spatial_tiled_decode(tile, return_dict=True).sample
519
+ else:
520
+ tile = self.post_quant_conv(tile)
521
+ decoded = self.decoder(tile)
522
+ if i > 0:
523
+ decoded = decoded[:, :, 1:, :, :]
524
+ row.append(decoded)
525
+ result_row = []
526
+ for i, tile in enumerate(row):
527
+ if i > 0:
528
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
529
+ result_row.append(tile[:, :, :t_limit, :, :])
530
+ else:
531
+ result_row.append(tile[:, :, : t_limit + 1, :, :])
532
+
533
+ dec = torch.cat(result_row, dim=2)
534
+ if not return_dict:
535
+ return (dec,)
536
+
537
+ return DecoderOutput(sample=dec)
538
+
539
+ def forward(
540
+ self,
541
+ sample: torch.FloatTensor,
542
+ sample_posterior: bool = False,
543
+ return_dict: bool = True,
544
+ return_posterior: bool = False,
545
+ generator: Optional[torch.Generator] = None,
546
+ ) -> Union[DecoderOutput2, torch.FloatTensor]:
547
+ r"""
548
+ Args:
549
+ sample (`torch.FloatTensor`): Input sample.
550
+ sample_posterior (`bool`, *optional*, defaults to `False`):
551
+ Whether to sample from the posterior.
552
+ return_dict (`bool`, *optional*, defaults to `True`):
553
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
554
+ """
555
+ x = sample
556
+ posterior = self.encode(x).latent_dist
557
+ if sample_posterior:
558
+ z = posterior.sample(generator=generator)
559
+ else:
560
+ z = posterior.mode()
561
+ dec = self.decode(z).sample
562
+
563
+ if not return_dict:
564
+ if return_posterior:
565
+ return (dec, posterior)
566
+ else:
567
+ return (dec,)
568
+ if return_posterior:
569
+ return DecoderOutput2(sample=dec, posterior=posterior)
570
+ else:
571
+ return DecoderOutput2(sample=dec)
572
+
573
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
574
+ def fuse_qkv_projections(self):
575
+ """
576
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
577
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
578
+
579
+ <Tip warning={true}>
580
+
581
+ This API is 🧪 experimental.
582
+
583
+ </Tip>
584
+ """
585
+ self.original_attn_processors = None
586
+
587
+ for _, attn_processor in self.attn_processors.items():
588
+ if "Added" in str(attn_processor.__class__.__name__):
589
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
590
+
591
+ self.original_attn_processors = self.attn_processors
592
+
593
+ for module in self.modules():
594
+ if isinstance(module, Attention):
595
+ module.fuse_projections(fuse=True)
596
+
597
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
598
+ def unfuse_qkv_projections(self):
599
+ """Disables the fused QKV projection if enabled.
600
+
601
+ <Tip warning={true}>
602
+
603
+ This API is 🧪 experimental.
604
+
605
+ </Tip>
606
+
607
+ """
608
+ if self.original_attn_processors is not None:
609
+ self.set_attn_processor(self.original_attn_processors)
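
The thresholds used by the tiled encode/decode paths above follow directly from the config. The arithmetic below reproduces the `spatial_tiled_decode` bookkeeping for illustrative values (`sample_size=256`, four entries in `block_out_channels`, the default overlap factor of 0.25); these are not the model's real settings:

```python
# Pure-arithmetic sketch of the spatial tiling bookkeeping (illustrative values only).
sample_size = 256            # stands in for config.sample_size
num_blocks = 4               # stands in for len(config.block_out_channels)
tile_overlap_factor = 0.25

tile_sample_min_size = sample_size
tile_latent_min_size = int(sample_size / (2 ** (num_blocks - 1)))     # 32

overlap_size = int(tile_latent_min_size * (1 - tile_overlap_factor))  # stride over the latent grid: 24
blend_extent = int(tile_sample_min_size * tile_overlap_factor)        # rows/cols blended between decoded tiles: 64
row_limit = tile_sample_min_size - blend_extent                       # pixels kept from each decoded tile: 192
print(overlap_size, blend_extent, row_limit)
```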
hunyuan_model/embed_layers.py ADDED
@@ -0,0 +1,132 @@
1
+ import collections
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ from einops import rearrange, repeat
6
+
7
+ from .helpers import to_2tuple
8
+
9
+ class PatchEmbed(nn.Module):
10
+ """2D Image to Patch Embedding
11
+
12
+ Image/video to patch embedding using Conv3d
13
+
14
+ A convolution based approach to patchifying a 2D image w/ embedding projection.
15
+
16
+ Based on the impl in https://github.com/google-research/vision_transformer
17
+
18
+ Hacked together by / Copyright 2020 Ross Wightman
19
+
20
+ The _assert call was removed from forward() to stay compatible with multi-resolution images.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ patch_size=16,
26
+ in_chans=3,
27
+ embed_dim=768,
28
+ norm_layer=None,
29
+ flatten=True,
30
+ bias=True,
31
+ dtype=None,
32
+ device=None,
33
+ ):
34
+ factory_kwargs = {"dtype": dtype, "device": device}
35
+ super().__init__()
36
+ patch_size = to_2tuple(patch_size)
37
+ self.patch_size = patch_size
38
+ self.flatten = flatten
39
+
40
+ self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias, **factory_kwargs)
41
+ nn.init.xavier_uniform_(self.proj.weight.view(self.proj.weight.size(0), -1))
42
+ if bias:
43
+ nn.init.zeros_(self.proj.bias)
44
+
45
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
46
+
47
+ def forward(self, x):
48
+ x = self.proj(x)
49
+ if self.flatten:
50
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
51
+ x = self.norm(x)
52
+ return x
53
+
54
+
55
+ class TextProjection(nn.Module):
56
+ """
57
+ Projects text embeddings. Also handles dropout for classifier-free guidance.
58
+
59
+ Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
60
+ """
61
+
62
+ def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
63
+ factory_kwargs = {"dtype": dtype, "device": device}
64
+ super().__init__()
65
+ self.linear_1 = nn.Linear(in_features=in_channels, out_features=hidden_size, bias=True, **factory_kwargs)
66
+ self.act_1 = act_layer()
67
+ self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True, **factory_kwargs)
68
+
69
+ def forward(self, caption):
70
+ hidden_states = self.linear_1(caption)
71
+ hidden_states = self.act_1(hidden_states)
72
+ hidden_states = self.linear_2(hidden_states)
73
+ return hidden_states
74
+
75
+
76
+ def timestep_embedding(t, dim, max_period=10000):
77
+ """
78
+ Create sinusoidal timestep embeddings.
79
+
80
+ Args:
81
+ t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
82
+ dim (int): the dimension of the output.
83
+ max_period (int): controls the minimum frequency of the embeddings.
84
+
85
+ Returns:
86
+ embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.
87
+
88
+ .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
89
+ """
90
+ half = dim // 2
91
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(device=t.device)
92
+ args = t[:, None].float() * freqs[None]
93
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
94
+ if dim % 2:
95
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
96
+ return embedding
97
+
98
+
99
+ class TimestepEmbedder(nn.Module):
100
+ """
101
+ Embeds scalar timesteps into vector representations.
102
+ """
103
+
104
+ def __init__(
105
+ self,
106
+ hidden_size,
107
+ act_layer,
108
+ frequency_embedding_size=256,
109
+ max_period=10000,
110
+ out_size=None,
111
+ dtype=None,
112
+ device=None,
113
+ ):
114
+ factory_kwargs = {"dtype": dtype, "device": device}
115
+ super().__init__()
116
+ self.frequency_embedding_size = frequency_embedding_size
117
+ self.max_period = max_period
118
+ if out_size is None:
119
+ out_size = hidden_size
120
+
121
+ self.mlp = nn.Sequential(
122
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs),
123
+ act_layer(),
124
+ nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
125
+ )
126
+ nn.init.normal_(self.mlp[0].weight, std=0.02)
127
+ nn.init.normal_(self.mlp[2].weight, std=0.02)
128
+
129
+ def forward(self, t):
130
+ t_freq = timestep_embedding(t, self.frequency_embedding_size, self.max_period).type(self.mlp[0].weight.dtype)
131
+ t_emb = self.mlp(t_freq)
132
+ return t_emb
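
A quick shape check for the sinusoidal helper above (a sketch, assuming `hunyuan_model.embed_layers` and its dependencies such as `einops` are installed):

```python
# Sketch: timestep_embedding returns the cosine half followed by the sine half.
import torch
from hunyuan_model.embed_layers import timestep_embedding

t = torch.tensor([0.0, 250.0, 999.0])
emb = timestep_embedding(t, dim=256)
print(emb.shape)  # torch.Size([3, 256])
```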
hunyuan_model/fp8_optimization.py ADDED
@@ -0,0 +1,39 @@
1
+ # based on ComfyUI's and MinusZoneAI's fp8_linear optimization
2
+ # further borrowed from HunyuanVideoWrapper for Musubi Tuner
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ def fp8_linear_forward(cls, original_dtype, input):
7
+ weight_dtype = cls.weight.dtype
8
+ if weight_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
9
+ if len(input.shape) == 3:
10
+ target_dtype = torch.float8_e5m2 if weight_dtype == torch.float8_e4m3fn else torch.float8_e4m3fn
11
+ inn = input.reshape(-1, input.shape[2]).to(target_dtype)
12
+ w = cls.weight.t()
13
+
14
+ scale = torch.ones((1), device=input.device, dtype=torch.float32)
15
+ bias = cls.bias.to(original_dtype) if cls.bias is not None else None
16
+
17
+ if bias is not None:
18
+ o = torch._scaled_mm(inn, w, out_dtype=original_dtype, bias=bias, scale_a=scale, scale_b=scale)
19
+ else:
20
+ o = torch._scaled_mm(inn, w, out_dtype=original_dtype, scale_a=scale, scale_b=scale)
21
+
22
+ if isinstance(o, tuple):
23
+ o = o[0]
24
+
25
+ return o.reshape((-1, input.shape[1], cls.weight.shape[0]))
26
+ else:
27
+ return cls.original_forward(input.to(original_dtype))
28
+ else:
29
+ return cls.original_forward(input)
30
+
31
+ def convert_fp8_linear(module, original_dtype, params_to_keep={}):
32
+ setattr(module, "fp8_matmul_enabled", True)
33
+
34
+ for name, module in module.named_modules():
35
+ if not any(keyword in name for keyword in params_to_keep):
36
+ if isinstance(module, nn.Linear):
37
+ original_forward = module.forward
38
+ setattr(module, "original_forward", original_forward)
39
+ setattr(module, "forward", lambda input, m=module: fp8_linear_forward(m, original_dtype, input))
hunyuan_model/helpers.py ADDED
@@ -0,0 +1,40 @@
1
+ import collections.abc
2
+
3
+ from itertools import repeat
4
+
5
+
6
+ def _ntuple(n):
7
+ def parse(x):
8
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
9
+ x = tuple(x)
10
+ if len(x) == 1:
11
+ x = tuple(repeat(x[0], n))
12
+ return x
13
+ return tuple(repeat(x, n))
14
+ return parse
15
+
16
+
17
+ to_1tuple = _ntuple(1)
18
+ to_2tuple = _ntuple(2)
19
+ to_3tuple = _ntuple(3)
20
+ to_4tuple = _ntuple(4)
21
+
22
+
23
+ def as_tuple(x):
24
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
25
+ return tuple(x)
26
+ if x is None or isinstance(x, (int, float, str)):
27
+ return (x,)
28
+ else:
29
+ raise ValueError(f"Unknown type {type(x)}")
30
+
31
+
32
+ def as_list_of_2tuple(x):
33
+ x = as_tuple(x)
34
+ if len(x) == 1:
35
+ x = (x[0], x[0])
36
+ assert len(x) % 2 == 0, f"Expect even length, got {len(x)}."
37
+ lst = []
38
+ for i in range(0, len(x), 2):
39
+ lst.append((x[i], x[i + 1]))
40
+ return lst
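
A tiny demonstration of the tuple helpers above (assuming the module path `hunyuan_model.helpers`):

```python
# Sketch: to_2tuple broadcasts scalars, as_list_of_2tuple pairs up a flat sequence.
from hunyuan_model.helpers import to_2tuple, as_list_of_2tuple

print(to_2tuple(16))                     # (16, 16)
print(to_2tuple((3, 5)))                 # (3, 5)
print(as_list_of_2tuple(4))              # [(4, 4)]
print(as_list_of_2tuple((1, 2, 3, 4)))   # [(1, 2), (3, 4)]
```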
hunyuan_model/mlp_layers.py ADDED
@@ -0,0 +1,118 @@
1
+ # Modified from timm library:
2
+ # https://github.com/huggingface/pytorch-image-models/blob/648aaa41233ba83eb38faf5ba9d415d574823241/timm/layers/mlp.py#L13
3
+
4
+ from functools import partial
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from .modulate_layers import modulate
10
+ from .helpers import to_2tuple
11
+
12
+
13
+ class MLP(nn.Module):
14
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
15
+
16
+ def __init__(
17
+ self,
18
+ in_channels,
19
+ hidden_channels=None,
20
+ out_features=None,
21
+ act_layer=nn.GELU,
22
+ norm_layer=None,
23
+ bias=True,
24
+ drop=0.0,
25
+ use_conv=False,
26
+ device=None,
27
+ dtype=None,
28
+ ):
29
+ factory_kwargs = {"device": device, "dtype": dtype}
30
+ super().__init__()
31
+ out_features = out_features or in_channels
32
+ hidden_channels = hidden_channels or in_channels
33
+ bias = to_2tuple(bias)
34
+ drop_probs = to_2tuple(drop)
35
+ linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
36
+
37
+ self.fc1 = linear_layer(
38
+ in_channels, hidden_channels, bias=bias[0], **factory_kwargs
39
+ )
40
+ self.act = act_layer()
41
+ self.drop1 = nn.Dropout(drop_probs[0])
42
+ self.norm = (
43
+ norm_layer(hidden_channels, **factory_kwargs)
44
+ if norm_layer is not None
45
+ else nn.Identity()
46
+ )
47
+ self.fc2 = linear_layer(
48
+ hidden_channels, out_features, bias=bias[1], **factory_kwargs
49
+ )
50
+ self.drop2 = nn.Dropout(drop_probs[1])
51
+
52
+ def forward(self, x):
53
+ x = self.fc1(x)
54
+ x = self.act(x)
55
+ x = self.drop1(x)
56
+ x = self.norm(x)
57
+ x = self.fc2(x)
58
+ x = self.drop2(x)
59
+ return x
60
+
61
+
62
+ #
63
+ class MLPEmbedder(nn.Module):
64
+ """copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py"""
65
+ def __init__(self, in_dim: int, hidden_dim: int, device=None, dtype=None):
66
+ factory_kwargs = {"device": device, "dtype": dtype}
67
+ super().__init__()
68
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True, **factory_kwargs)
69
+ self.silu = nn.SiLU()
70
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True, **factory_kwargs)
71
+
72
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
73
+ return self.out_layer(self.silu(self.in_layer(x)))
74
+
75
+
76
+ class FinalLayer(nn.Module):
77
+ """The final layer of DiT."""
78
+
79
+ def __init__(
80
+ self, hidden_size, patch_size, out_channels, act_layer, device=None, dtype=None
81
+ ):
82
+ factory_kwargs = {"device": device, "dtype": dtype}
83
+ super().__init__()
84
+
85
+ # Just use LayerNorm for the final layer
86
+ self.norm_final = nn.LayerNorm(
87
+ hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
88
+ )
89
+ if isinstance(patch_size, int):
90
+ self.linear = nn.Linear(
91
+ hidden_size,
92
+ patch_size * patch_size * out_channels,
93
+ bias=True,
94
+ **factory_kwargs
95
+ )
96
+ else:
97
+ self.linear = nn.Linear(
98
+ hidden_size,
99
+ patch_size[0] * patch_size[1] * patch_size[2] * out_channels,
100
+ bias=True,
101
+ )
102
+ nn.init.zeros_(self.linear.weight)
103
+ nn.init.zeros_(self.linear.bias)
104
+
105
+ # Here we don't distinguish between the modulate types. Just use the simple one.
106
+ self.adaLN_modulation = nn.Sequential(
107
+ act_layer(),
108
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
109
+ )
110
+ # Zero-initialize the modulation
111
+ nn.init.zeros_(self.adaLN_modulation[1].weight)
112
+ nn.init.zeros_(self.adaLN_modulation[1].bias)
113
+
114
+ def forward(self, x, c):
115
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
116
+ x = modulate(self.norm_final(x), shift=shift, scale=scale)
117
+ x = self.linear(x)
118
+ return x
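
A quick forward-shape sketch for the blocks above (assuming `hunyuan_model.mlp_layers` is importable; the sizes are illustrative, not the model's real dimensions):

```python
# Sketch: forward shapes of MLP and MLPEmbedder with made-up sizes.
import torch
from hunyuan_model.mlp_layers import MLP, MLPEmbedder

mlp = MLP(in_channels=64, hidden_channels=256)
print(mlp(torch.randn(2, 10, 64)).shape)   # torch.Size([2, 10, 64])

emb = MLPEmbedder(in_dim=256, hidden_dim=3072)
print(emb(torch.randn(2, 256)).shape)      # torch.Size([2, 3072])
```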
hunyuan_model/models.py ADDED
@@ -0,0 +1,1044 @@
1
+ import os
2
+ from typing import Any, List, Tuple, Optional, Union, Dict
3
+ import accelerate
4
+ from einops import rearrange
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.utils.checkpoint import checkpoint
9
+
10
+ from .activation_layers import get_activation_layer
11
+ from .norm_layers import get_norm_layer
12
+ from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
13
+ from .attention import attention, parallel_attention, get_cu_seqlens
14
+ from .posemb_layers import apply_rotary_emb
15
+ from .mlp_layers import MLP, MLPEmbedder, FinalLayer
16
+ from .modulate_layers import ModulateDiT, modulate, apply_gate
17
+ from .token_refiner import SingleTokenRefiner
18
+ from modules.custom_offloading_utils import ModelOffloader, synchronize_device, clean_memory_on_device
19
+ from hunyuan_model.posemb_layers import get_nd_rotary_pos_embed
20
+
21
+ from utils.safetensors_utils import MemoryEfficientSafeOpen
22
+
23
+
24
+ class MMDoubleStreamBlock(nn.Module):
25
+ """
26
+ A multimodal DiT block with separate modulation for
27
+ text and image/video, see more details (SD3): https://arxiv.org/abs/2403.03206
28
+ (Flux.1): https://github.com/black-forest-labs/flux
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ hidden_size: int,
34
+ heads_num: int,
35
+ mlp_width_ratio: float,
36
+ mlp_act_type: str = "gelu_tanh",
37
+ qk_norm: bool = True,
38
+ qk_norm_type: str = "rms",
39
+ qkv_bias: bool = False,
40
+ dtype: Optional[torch.dtype] = None,
41
+ device: Optional[torch.device] = None,
42
+ attn_mode: str = "flash",
43
+ split_attn: bool = False,
44
+ ):
45
+ factory_kwargs = {"device": device, "dtype": dtype}
46
+ super().__init__()
47
+ self.attn_mode = attn_mode
48
+ self.split_attn = split_attn
49
+
50
+ self.deterministic = False
51
+ self.heads_num = heads_num
52
+ head_dim = hidden_size // heads_num
53
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
54
+
55
+ self.img_mod = ModulateDiT(
56
+ hidden_size,
57
+ factor=6,
58
+ act_layer=get_activation_layer("silu"),
59
+ **factory_kwargs,
60
+ )
61
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
62
+
63
+ self.img_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
64
+ qk_norm_layer = get_norm_layer(qk_norm_type)
65
+ self.img_attn_q_norm = (
66
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
67
+ )
68
+ self.img_attn_k_norm = (
69
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
70
+ )
71
+ self.img_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
72
+
73
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
74
+ self.img_mlp = MLP(
75
+ hidden_size,
76
+ mlp_hidden_dim,
77
+ act_layer=get_activation_layer(mlp_act_type),
78
+ bias=True,
79
+ **factory_kwargs,
80
+ )
81
+
82
+ self.txt_mod = ModulateDiT(
83
+ hidden_size,
84
+ factor=6,
85
+ act_layer=get_activation_layer("silu"),
86
+ **factory_kwargs,
87
+ )
88
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
89
+
90
+ self.txt_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
91
+ self.txt_attn_q_norm = (
92
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
93
+ )
94
+ self.txt_attn_k_norm = (
95
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
96
+ )
97
+ self.txt_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
98
+
99
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
100
+ self.txt_mlp = MLP(
101
+ hidden_size,
102
+ mlp_hidden_dim,
103
+ act_layer=get_activation_layer(mlp_act_type),
104
+ bias=True,
105
+ **factory_kwargs,
106
+ )
107
+ self.hybrid_seq_parallel_attn = None
108
+
109
+ self.gradient_checkpointing = False
110
+
111
+ def enable_deterministic(self):
112
+ self.deterministic = True
113
+
114
+ def disable_deterministic(self):
115
+ self.deterministic = False
116
+
117
+ def enable_gradient_checkpointing(self):
118
+ self.gradient_checkpointing = True
119
+
120
+ def disable_gradient_checkpointing(self):
121
+ self.gradient_checkpointing = False
122
+
123
+ def _forward(
124
+ self,
125
+ img: torch.Tensor,
126
+ txt: torch.Tensor,
127
+ vec: torch.Tensor,
128
+ attn_mask: Optional[torch.Tensor] = None,
129
+ total_len: Optional[torch.Tensor] = None,
130
+ cu_seqlens_q: Optional[torch.Tensor] = None,
131
+ cu_seqlens_kv: Optional[torch.Tensor] = None,
132
+ max_seqlen_q: Optional[int] = None,
133
+ max_seqlen_kv: Optional[int] = None,
134
+ freqs_cis: tuple = None,
135
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
136
+ (img_mod1_shift, img_mod1_scale, img_mod1_gate, img_mod2_shift, img_mod2_scale, img_mod2_gate) = self.img_mod(vec).chunk(
137
+ 6, dim=-1
138
+ )
139
+ (txt_mod1_shift, txt_mod1_scale, txt_mod1_gate, txt_mod2_shift, txt_mod2_scale, txt_mod2_gate) = self.txt_mod(vec).chunk(
140
+ 6, dim=-1
141
+ )
142
+
143
+ # Prepare image for attention.
144
+ img_modulated = self.img_norm1(img)
145
+ img_modulated = modulate(img_modulated, shift=img_mod1_shift, scale=img_mod1_scale)
146
+ img_qkv = self.img_attn_qkv(img_modulated)
147
+ img_modulated = None
148
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
149
+ img_qkv = None
150
+ # Apply QK-Norm if needed
151
+ img_q = self.img_attn_q_norm(img_q).to(img_v)
152
+ img_k = self.img_attn_k_norm(img_k).to(img_v)
153
+
154
+ # Apply RoPE if needed.
155
+ if freqs_cis is not None:
156
+ img_q_shape = img_q.shape
157
+ img_k_shape = img_k.shape
158
+ img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
159
+ assert (
160
+ img_q.shape == img_q_shape and img_k.shape == img_k_shape
161
+ ), f"img_q: {img_q.shape} (expected {img_q_shape}), img_k: {img_k.shape} (expected {img_k_shape})"
162
+ # img_q, img_k = img_qq, img_kk
163
+
164
+ # Prepare txt for attention.
165
+ txt_modulated = self.txt_norm1(txt)
166
+ txt_modulated = modulate(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale)
167
+ txt_qkv = self.txt_attn_qkv(txt_modulated)
168
+ txt_modulated = None
169
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
170
+ txt_qkv = None
171
+ # Apply QK-Norm if needed.
172
+ txt_q = self.txt_attn_q_norm(txt_q).to(txt_v)
173
+ txt_k = self.txt_attn_k_norm(txt_k).to(txt_v)
174
+
175
+ # Run actual attention.
176
+ img_q_len = img_q.shape[1]
177
+ img_kv_len = img_k.shape[1]
178
+ batch_size = img_k.shape[0]
179
+ q = torch.cat((img_q, txt_q), dim=1)
180
+ img_q = txt_q = None
181
+ k = torch.cat((img_k, txt_k), dim=1)
182
+ img_k = txt_k = None
183
+ v = torch.cat((img_v, txt_v), dim=1)
184
+ img_v = txt_v = None
185
+
186
+ assert (
187
+ cu_seqlens_q.shape[0] == 2 * img.shape[0] + 1
188
+ ), f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, img.shape[0]:{img.shape[0]}"
189
+
190
+ # attention computation start
191
+ if not self.hybrid_seq_parallel_attn:
192
+ l = [q, k, v]
193
+ q = k = v = None
194
+ attn = attention(
195
+ l,
196
+ mode=self.attn_mode,
197
+ attn_mask=attn_mask,
198
+ total_len=total_len,
199
+ cu_seqlens_q=cu_seqlens_q,
200
+ cu_seqlens_kv=cu_seqlens_kv,
201
+ max_seqlen_q=max_seqlen_q,
202
+ max_seqlen_kv=max_seqlen_kv,
203
+ batch_size=batch_size,
204
+ )
205
+ else:
206
+ attn = parallel_attention(
207
+ self.hybrid_seq_parallel_attn,
208
+ q,
209
+ k,
210
+ v,
211
+ img_q_len=img_q_len,
212
+ img_kv_len=img_kv_len,
213
+ cu_seqlens_q=cu_seqlens_q,
214
+ cu_seqlens_kv=cu_seqlens_kv,
215
+ )
216
+
217
+ # attention computation end
218
+
219
+ img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1] :]
220
+ attn = None
221
+
222
+ # Calculate the img blocks.
223
+ img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate)
224
+ img_attn = None
225
+ img = img + apply_gate(
226
+ self.img_mlp(modulate(self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale)),
227
+ gate=img_mod2_gate,
228
+ )
229
+
230
+ # Calculate the txt blocks.
231
+ txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate)
232
+ txt_attn = None
233
+ txt = txt + apply_gate(
234
+ self.txt_mlp(modulate(self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale)),
235
+ gate=txt_mod2_gate,
236
+ )
237
+
238
+ return img, txt
239
+
240
+ # def forward(
241
+ # self,
242
+ # img: torch.Tensor,
243
+ # txt: torch.Tensor,
244
+ # vec: torch.Tensor,
245
+ # attn_mask: Optional[torch.Tensor] = None,
246
+ # cu_seqlens_q: Optional[torch.Tensor] = None,
247
+ # cu_seqlens_kv: Optional[torch.Tensor] = None,
248
+ # max_seqlen_q: Optional[int] = None,
249
+ # max_seqlen_kv: Optional[int] = None,
250
+ # freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
251
+ # ) -> Tuple[torch.Tensor, torch.Tensor]:
252
+ def forward(self, *args, **kwargs):
253
+ if self.training and self.gradient_checkpointing:
254
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
255
+ else:
256
+ return self._forward(*args, **kwargs)
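
A brief usage note (editorial addition, not part of the committed file): the `forward` wrapper above only routes through `torch.utils.checkpoint` when the module is in training mode and checkpointing has been enabled. A minimal sketch, assuming the repository root is on `PYTHONPATH`; the small hidden size is illustrative:

```python
# Minimal sketch: enable activation checkpointing on a double-stream block so that
# _forward is recomputed during the backward pass to save memory.
import torch
from hunyuan_model.models import MMDoubleStreamBlock

block = MMDoubleStreamBlock(hidden_size=128, heads_num=8, mlp_width_ratio=4.0, attn_mode="torch")
block.enable_gradient_checkpointing()  # checkpoint() is used only while block.training is True
block.train()
```
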
257
+
258
+
259
+ class MMSingleStreamBlock(nn.Module):
260
+ """
261
+ A DiT block with parallel linear layers as described in
262
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
263
+ Also refer to (SD3): https://arxiv.org/abs/2403.03206
264
+ (Flux.1): https://github.com/black-forest-labs/flux
265
+ """
266
+
267
+ def __init__(
268
+ self,
269
+ hidden_size: int,
270
+ heads_num: int,
271
+ mlp_width_ratio: float = 4.0,
272
+ mlp_act_type: str = "gelu_tanh",
273
+ qk_norm: bool = True,
274
+ qk_norm_type: str = "rms",
275
+ qk_scale: float = None,
276
+ dtype: Optional[torch.dtype] = None,
277
+ device: Optional[torch.device] = None,
278
+ attn_mode: str = "flash",
279
+ split_attn: bool = False,
280
+ ):
281
+ factory_kwargs = {"device": device, "dtype": dtype}
282
+ super().__init__()
283
+ self.attn_mode = attn_mode
284
+ self.split_attn = split_attn
285
+
286
+ self.deterministic = False
287
+ self.hidden_size = hidden_size
288
+ self.heads_num = heads_num
289
+ head_dim = hidden_size // heads_num
290
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
291
+ self.mlp_hidden_dim = mlp_hidden_dim
292
+ self.scale = qk_scale or head_dim**-0.5
293
+
294
+ # qkv and mlp_in
295
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs)
296
+ # proj and mlp_out
297
+ self.linear2 = nn.Linear(hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs)
298
+
299
+ qk_norm_layer = get_norm_layer(qk_norm_type)
300
+ self.q_norm = qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
301
+ self.k_norm = qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
302
+
303
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
304
+
305
+ self.mlp_act = get_activation_layer(mlp_act_type)()
306
+ self.modulation = ModulateDiT(hidden_size, factor=3, act_layer=get_activation_layer("silu"), **factory_kwargs)
307
+ self.hybrid_seq_parallel_attn = None
308
+
309
+ self.gradient_checkpointing = False
310
+
311
+ def enable_deterministic(self):
312
+ self.deterministic = True
313
+
314
+ def disable_deterministic(self):
315
+ self.deterministic = False
316
+
317
+ def enable_gradient_checkpointing(self):
318
+ self.gradient_checkpointing = True
319
+
320
+ def disable_gradient_checkpointing(self):
321
+ self.gradient_checkpointing = False
322
+
323
+ def _forward(
324
+ self,
325
+ x: torch.Tensor,
326
+ vec: torch.Tensor,
327
+ txt_len: int,
328
+ attn_mask: Optional[torch.Tensor] = None,
329
+ total_len: Optional[torch.Tensor] = None,
330
+ cu_seqlens_q: Optional[torch.Tensor] = None,
331
+ cu_seqlens_kv: Optional[torch.Tensor] = None,
332
+ max_seqlen_q: Optional[int] = None,
333
+ max_seqlen_kv: Optional[int] = None,
334
+ freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
335
+ ) -> torch.Tensor:
336
+ mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1)
337
+ x_mod = modulate(self.pre_norm(x), shift=mod_shift, scale=mod_scale)
338
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
339
+ x_mod = None
340
+ # mlp = mlp.to("cpu", non_blocking=True)
341
+ # clean_memory_on_device(x.device)
342
+
343
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
344
+ qkv = None
345
+
346
+ # Apply QK-Norm if needed.
347
+ q = self.q_norm(q).to(v)
348
+ k = self.k_norm(k).to(v)
349
+
350
+ # Apply RoPE if needed.
351
+ if freqs_cis is not None:
352
+ img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :]
353
+ img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :]
354
+ q = k = None
355
+ img_q_shape = img_q.shape
356
+ img_k_shape = img_k.shape
357
+ img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
358
+ assert (
359
+ img_q.shape == img_q_shape and img_k_shape == img_k.shape
360
+ ), f"img_q: {img_q.shape} (expected {img_q_shape}), img_k: {img_k.shape} (expected {img_k_shape})"
361
+ # img_q, img_k = img_qq, img_kk
362
+ # del img_qq, img_kk
363
+ q = torch.cat((img_q, txt_q), dim=1)
364
+ k = torch.cat((img_k, txt_k), dim=1)
365
+ del img_q, txt_q, img_k, txt_k
366
+
367
+ # Compute attention.
368
+ assert cu_seqlens_q.shape[0] == 2 * x.shape[0] + 1, f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, x.shape[0]:{x.shape[0]}"
369
+
370
+ # attention computation start
371
+ if not self.hybrid_seq_parallel_attn:
372
+ l = [q, k, v]
373
+ q = k = v = None
374
+ attn = attention(
375
+ l,
376
+ mode=self.attn_mode,
377
+ attn_mask=attn_mask,
378
+ total_len=total_len,
379
+ cu_seqlens_q=cu_seqlens_q,
380
+ cu_seqlens_kv=cu_seqlens_kv,
381
+ max_seqlen_q=max_seqlen_q,
382
+ max_seqlen_kv=max_seqlen_kv,
383
+ batch_size=x.shape[0],
384
+ )
385
+ else:
386
+ attn = parallel_attention(
387
+ self.hybrid_seq_parallel_attn,
388
+ q,
389
+ k,
390
+ v,
391
+ img_q_len=img_q.shape[1],
392
+ img_kv_len=img_k.shape[1],
393
+ cu_seqlens_q=cu_seqlens_q,
394
+ cu_seqlens_kv=cu_seqlens_kv,
395
+ )
396
+ # attention computation end
397
+
398
+ # Compute activation in mlp stream, cat again and run second linear layer.
399
+ # mlp = mlp.to(x.device)
400
+ mlp = self.mlp_act(mlp)
401
+ attn_mlp = torch.cat((attn, mlp), 2)
402
+ attn = None
403
+ mlp = None
404
+ output = self.linear2(attn_mlp)
405
+ attn_mlp = None
406
+ return x + apply_gate(output, gate=mod_gate)
407
+
408
+ # def forward(
409
+ # self,
410
+ # x: torch.Tensor,
411
+ # vec: torch.Tensor,
412
+ # txt_len: int,
413
+ # attn_mask: Optional[torch.Tensor] = None,
414
+ # cu_seqlens_q: Optional[torch.Tensor] = None,
415
+ # cu_seqlens_kv: Optional[torch.Tensor] = None,
416
+ # max_seqlen_q: Optional[int] = None,
417
+ # max_seqlen_kv: Optional[int] = None,
418
+ # freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
419
+ # ) -> torch.Tensor:
420
+ def forward(self, *args, **kwargs):
421
+ if self.training and self.gradient_checkpointing:
422
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
423
+ else:
424
+ return self._forward(*args, **kwargs)
425
+
426
+
427
+ class HYVideoDiffusionTransformer(nn.Module): # ModelMixin, ConfigMixin):
428
+ """
429
+ HunyuanVideo Transformer backbone
430
+
431
+ Originally inherited from ModelMixin and ConfigMixin for compatibility with diffusers' StableDiffusionPipeline sampler; in this port the class subclasses nn.Module directly.
432
+
433
+ Reference:
434
+ [1] Flux.1: https://github.com/black-forest-labs/flux
435
+ [2] MMDiT: http://arxiv.org/abs/2403.03206
436
+
437
+ Parameters
438
+ ----------
439
+ args: argparse.Namespace
440
+ The arguments parsed by argparse.
441
+ patch_size: list
442
+ The size of the patch.
443
+ in_channels: int
444
+ The number of input channels.
445
+ out_channels: int
446
+ The number of output channels.
447
+ hidden_size: int
448
+ The hidden size of the transformer backbone.
449
+ heads_num: int
450
+ The number of attention heads.
451
+ mlp_width_ratio: float
452
+ The ratio of the hidden size of the MLP in the transformer block.
453
+ mlp_act_type: str
454
+ The activation function of the MLP in the transformer block.
455
+ depth_double_blocks: int
456
+ The number of transformer blocks in the double blocks.
457
+ depth_single_blocks: int
458
+ The number of transformer blocks in the single blocks.
459
+ rope_dim_list: list
460
+ The dimension of the rotary embedding for t, h, w.
461
+ qkv_bias: bool
462
+ Whether to use bias in the qkv linear layer.
463
+ qk_norm: bool
464
+ Whether to use qk norm.
465
+ qk_norm_type: str
466
+ The type of qk norm.
467
+ guidance_embed: bool
468
+ Whether to use guidance embedding for distillation.
469
+ text_projection: str
470
+ The type of the text projection, default is single_refiner.
471
+ use_attention_mask: bool
472
+ Whether to use attention mask for text encoder.
473
+ dtype: torch.dtype
474
+ The dtype of the model.
475
+ device: torch.device
476
+ The device of the model.
477
+ attn_mode: str
478
+ The mode of the attention, default is flash.
479
+ split_attn: bool
480
+ Whether to use split attention (make attention as batch size 1).
481
+ """
482
+
483
+ # @register_to_config
484
+ def __init__(
485
+ self,
486
+ text_states_dim: int,
487
+ text_states_dim_2: int,
488
+ patch_size: list = [1, 2, 2],
489
+ in_channels: int = 4, # Should be VAE.config.latent_channels.
490
+ out_channels: int = None,
491
+ hidden_size: int = 3072,
492
+ heads_num: int = 24,
493
+ mlp_width_ratio: float = 4.0,
494
+ mlp_act_type: str = "gelu_tanh",
495
+ mm_double_blocks_depth: int = 20,
496
+ mm_single_blocks_depth: int = 40,
497
+ rope_dim_list: List[int] = [16, 56, 56],
498
+ qkv_bias: bool = True,
499
+ qk_norm: bool = True,
500
+ qk_norm_type: str = "rms",
501
+ guidance_embed: bool = False, # For modulation.
502
+ text_projection: str = "single_refiner",
503
+ use_attention_mask: bool = True,
504
+ dtype: Optional[torch.dtype] = None,
505
+ device: Optional[torch.device] = None,
506
+ attn_mode: str = "flash",
507
+ split_attn: bool = False,
508
+ ):
509
+ factory_kwargs = {"device": device, "dtype": dtype}
510
+ super().__init__()
511
+
512
+ self.patch_size = patch_size
513
+ self.in_channels = in_channels
514
+ self.out_channels = in_channels if out_channels is None else out_channels
515
+ self.unpatchify_channels = self.out_channels
516
+ self.guidance_embed = guidance_embed
517
+ self.rope_dim_list = rope_dim_list
518
+
519
+ # Text projection. Default to linear projection.
520
+ # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
521
+ self.use_attention_mask = use_attention_mask
522
+ self.text_projection = text_projection
523
+
524
+ self.text_states_dim = text_states_dim
525
+ self.text_states_dim_2 = text_states_dim_2
526
+
527
+ if hidden_size % heads_num != 0:
528
+ raise ValueError(f"Hidden size {hidden_size} must be divisible by heads_num {heads_num}")
529
+ pe_dim = hidden_size // heads_num
530
+ if sum(rope_dim_list) != pe_dim:
531
+ raise ValueError(f"Got {rope_dim_list} but expected positional dim {pe_dim}")
532
+ self.hidden_size = hidden_size
533
+ self.heads_num = heads_num
534
+
535
+ self.attn_mode = attn_mode
536
+ self.split_attn = split_attn
537
+ print(f"Using {self.attn_mode} attention mode, split_attn: {self.split_attn}")
538
+
539
+ # image projection
540
+ self.img_in = PatchEmbed(self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs)
541
+
542
+ # text projection
543
+ if self.text_projection == "linear":
544
+ self.txt_in = TextProjection(
545
+ self.text_states_dim,
546
+ self.hidden_size,
547
+ get_activation_layer("silu"),
548
+ **factory_kwargs,
549
+ )
550
+ elif self.text_projection == "single_refiner":
551
+ self.txt_in = SingleTokenRefiner(self.text_states_dim, hidden_size, heads_num, depth=2, **factory_kwargs)
552
+ else:
553
+ raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")
554
+
555
+ # time modulation
556
+ self.time_in = TimestepEmbedder(self.hidden_size, get_activation_layer("silu"), **factory_kwargs)
557
+
558
+ # text modulation
559
+ self.vector_in = MLPEmbedder(self.text_states_dim_2, self.hidden_size, **factory_kwargs)
560
+
561
+ # guidance modulation
562
+ self.guidance_in = (
563
+ TimestepEmbedder(self.hidden_size, get_activation_layer("silu"), **factory_kwargs) if guidance_embed else None
564
+ )
565
+
566
+ # double blocks
567
+ self.double_blocks = nn.ModuleList(
568
+ [
569
+ MMDoubleStreamBlock(
570
+ self.hidden_size,
571
+ self.heads_num,
572
+ mlp_width_ratio=mlp_width_ratio,
573
+ mlp_act_type=mlp_act_type,
574
+ qk_norm=qk_norm,
575
+ qk_norm_type=qk_norm_type,
576
+ qkv_bias=qkv_bias,
577
+ attn_mode=attn_mode,
578
+ split_attn=split_attn,
579
+ **factory_kwargs,
580
+ )
581
+ for _ in range(mm_double_blocks_depth)
582
+ ]
583
+ )
584
+
585
+ # single blocks
586
+ self.single_blocks = nn.ModuleList(
587
+ [
588
+ MMSingleStreamBlock(
589
+ self.hidden_size,
590
+ self.heads_num,
591
+ mlp_width_ratio=mlp_width_ratio,
592
+ mlp_act_type=mlp_act_type,
593
+ qk_norm=qk_norm,
594
+ qk_norm_type=qk_norm_type,
595
+ attn_mode=attn_mode,
596
+ split_attn=split_attn,
597
+ **factory_kwargs,
598
+ )
599
+ for _ in range(mm_single_blocks_depth)
600
+ ]
601
+ )
602
+
603
+ self.final_layer = FinalLayer(
604
+ self.hidden_size,
605
+ self.patch_size,
606
+ self.out_channels,
607
+ get_activation_layer("silu"),
608
+ **factory_kwargs,
609
+ )
610
+
611
+ self.gradient_checkpointing = False
612
+ self.blocks_to_swap = None
613
+ self.offloader_double = None
614
+ self.offloader_single = None
615
+ self._enable_img_in_txt_in_offloading = False
616
+
617
+ @property
618
+ def device(self):
619
+ return next(self.parameters()).device
620
+
621
+ @property
622
+ def dtype(self):
623
+ return next(self.parameters()).dtype
624
+
625
+ def enable_gradient_checkpointing(self):
626
+ self.gradient_checkpointing = True
627
+
628
+ self.txt_in.enable_gradient_checkpointing()
629
+
630
+ for block in self.double_blocks + self.single_blocks:
631
+ block.enable_gradient_checkpointing()
632
+
633
+ print(f"HYVideoDiffusionTransformer: Gradient checkpointing enabled.")
634
+
635
+ def disable_gradient_checkpointing(self):
636
+ self.gradient_checkpointing = False
637
+
638
+ self.txt_in.disable_gradient_checkpointing()
639
+
640
+ for block in self.double_blocks + self.single_blocks:
641
+ block.disable_gradient_checkpointing()
642
+
643
+ print(f"HYVideoDiffusionTransformer: Gradient checkpointing disabled.")
644
+
645
+ def enable_img_in_txt_in_offloading(self):
646
+ self._enable_img_in_txt_in_offloading = True
647
+
648
+ def enable_block_swap(self, num_blocks: int, device: torch.device, supports_backward: bool):
649
+ self.blocks_to_swap = num_blocks
650
+ self.num_double_blocks = len(self.double_blocks)
651
+ self.num_single_blocks = len(self.single_blocks)
652
+ double_blocks_to_swap = num_blocks // 2
653
+ single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + 1
654
+
655
+ assert double_blocks_to_swap <= self.num_double_blocks - 1 and single_blocks_to_swap <= self.num_single_blocks - 1, (
656
+ f"Cannot swap more than {self.num_double_blocks - 1} double blocks and {self.num_single_blocks - 1} single blocks. "
657
+ f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks."
658
+ )
659
+
660
+ self.offloader_double = ModelOffloader(
661
+ "double", self.double_blocks, self.num_double_blocks, double_blocks_to_swap, supports_backward, device # , debug=True
662
+ )
663
+ self.offloader_single = ModelOffloader(
664
+ "single", self.single_blocks, self.num_single_blocks, single_blocks_to_swap, supports_backward, device # , debug=True
665
+ )
666
+ print(
667
+ f"HYVideoDiffusionTransformer: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}."
668
+ )
669
+
670
+ def switch_block_swap_for_inference(self):
671
+ if self.blocks_to_swap:
672
+ self.offloader_double.set_forward_only(True)
673
+ self.offloader_single.set_forward_only(True)
674
+ self.prepare_block_swap_before_forward()
675
+ print(f"HYVideoDiffusionTransformer: Block swap set to forward only.")
676
+
677
+ def switch_block_swap_for_training(self):
678
+ if self.blocks_to_swap:
679
+ self.offloader_double.set_forward_only(False)
680
+ self.offloader_single.set_forward_only(False)
681
+ self.prepare_block_swap_before_forward()
682
+ print(f"HYVideoDiffusionTransformer: Block swap set to forward and backward.")
683
+
684
+ def move_to_device_except_swap_blocks(self, device: torch.device):
685
+ # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
686
+ if self.blocks_to_swap:
687
+ save_double_blocks = self.double_blocks
688
+ save_single_blocks = self.single_blocks
689
+ self.double_blocks = None
690
+ self.single_blocks = None
691
+
692
+ self.to(device)
693
+
694
+ if self.blocks_to_swap:
695
+ self.double_blocks = save_double_blocks
696
+ self.single_blocks = save_single_blocks
697
+
698
+ def prepare_block_swap_before_forward(self):
699
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
700
+ return
701
+ self.offloader_double.prepare_block_devices_before_forward(self.double_blocks)
702
+ self.offloader_single.prepare_block_devices_before_forward(self.single_blocks)
703
+
704
+ def enable_deterministic(self):
705
+ for block in self.double_blocks:
706
+ block.enable_deterministic()
707
+ for block in self.single_blocks:
708
+ block.enable_deterministic()
709
+
710
+ def disable_deterministic(self):
711
+ for block in self.double_blocks:
712
+ block.disable_deterministic()
713
+ for block in self.single_blocks:
714
+ block.disable_deterministic()
715
+
716
+ def forward(
717
+ self,
718
+ x: torch.Tensor,
719
+ t: torch.Tensor, # Should be in range(0, 1000).
720
+ text_states: torch.Tensor = None,
721
+ text_mask: torch.Tensor = None, # Now we don't use it.
722
+ text_states_2: Optional[torch.Tensor] = None, # Text embedding for modulation.
723
+ freqs_cos: Optional[torch.Tensor] = None,
724
+ freqs_sin: Optional[torch.Tensor] = None,
725
+ guidance: torch.Tensor = None, # Guidance for modulation, should be cfg_scale x 1000.
726
+ return_dict: bool = True,
727
+ ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
728
+ out = {}
729
+ img = x
730
+ txt = text_states
731
+ _, _, ot, oh, ow = x.shape
732
+ tt, th, tw = (
733
+ ot // self.patch_size[0],
734
+ oh // self.patch_size[1],
735
+ ow // self.patch_size[2],
736
+ )
737
+
738
+ # Prepare modulation vectors.
739
+ vec = self.time_in(t)
740
+
741
+ # text modulation
742
+ vec = vec + self.vector_in(text_states_2)
743
+
744
+ # guidance modulation
745
+ if self.guidance_embed:
746
+ if guidance is None:
747
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
748
+
749
+ # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
750
+ vec = vec + self.guidance_in(guidance)
751
+
752
+ # Embed image and text.
753
+ if self._enable_img_in_txt_in_offloading:
754
+ self.img_in.to(x.device, non_blocking=True)
755
+ self.txt_in.to(x.device, non_blocking=True)
756
+ synchronize_device(x.device)
757
+
758
+ img = self.img_in(img)
759
+ if self.text_projection == "linear":
760
+ txt = self.txt_in(txt)
761
+ elif self.text_projection == "single_refiner":
762
+ txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
763
+ else:
764
+ raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")
765
+
766
+ if self._enable_img_in_txt_in_offloading:
767
+ self.img_in.to(torch.device("cpu"), non_blocking=True)
768
+ self.txt_in.to(torch.device("cpu"), non_blocking=True)
769
+ synchronize_device(x.device)
770
+ clean_memory_on_device(x.device)
771
+
772
+ txt_seq_len = txt.shape[1]
773
+ img_seq_len = img.shape[1]
774
+
775
+ # Compute cu_seqlens and max_seqlen for flash attention
776
+ cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
777
+ cu_seqlens_kv = cu_seqlens_q
778
+ max_seqlen_q = img_seq_len + txt_seq_len
779
+ max_seqlen_kv = max_seqlen_q
780
+
781
+ attn_mask = total_len = None
782
+ if self.split_attn or self.attn_mode == "torch":
783
+ # calculate text length and total length
784
+ text_len = text_mask.sum(dim=1) # (bs, )
785
+ total_len = img_seq_len + text_len # (bs, )
786
+ if self.attn_mode == "torch" and not self.split_attn:
787
+ # initialize attention mask: bool tensor for sdpa, (b, 1, n, n)
788
+ bs = img.shape[0]
789
+ attn_mask = torch.zeros((bs, 1, max_seqlen_q, max_seqlen_q), dtype=torch.bool, device=text_mask.device)
790
+
791
+ # set attention mask with total_len
792
+ for i in range(bs):
793
+ attn_mask[i, :, : total_len[i], : total_len[i]] = True
794
+ total_len = None # means we don't use split_attn
795
+
796
+ freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
797
+ # --------------------- Pass through DiT blocks ------------------------
798
+ for block_idx, block in enumerate(self.double_blocks):
799
+ double_block_args = [
800
+ img,
801
+ txt,
802
+ vec,
803
+ attn_mask,
804
+ total_len,
805
+ cu_seqlens_q,
806
+ cu_seqlens_kv,
807
+ max_seqlen_q,
808
+ max_seqlen_kv,
809
+ freqs_cis,
810
+ ]
811
+
812
+ if self.blocks_to_swap:
813
+ self.offloader_double.wait_for_block(block_idx)
814
+
815
+ img, txt = block(*double_block_args)
816
+
817
+ if self.blocks_to_swap:
818
+ self.offloader_double.submit_move_blocks_forward(self.double_blocks, block_idx)
819
+
820
+ # Merge txt and img to pass through single stream blocks.
821
+ x = torch.cat((img, txt), 1)
822
+ if self.blocks_to_swap:
823
+ # delete img, txt to reduce memory usage
824
+ del img, txt
825
+ clean_memory_on_device(x.device)
826
+
827
+ if len(self.single_blocks) > 0:
828
+ for block_idx, block in enumerate(self.single_blocks):
829
+ single_block_args = [
830
+ x,
831
+ vec,
832
+ txt_seq_len,
833
+ attn_mask,
834
+ total_len,
835
+ cu_seqlens_q,
836
+ cu_seqlens_kv,
837
+ max_seqlen_q,
838
+ max_seqlen_kv,
839
+ freqs_cis,
840
+ ]
841
+ if self.blocks_to_swap:
842
+ self.offloader_single.wait_for_block(block_idx)
843
+
844
+ x = block(*single_block_args)
845
+
846
+ if self.blocks_to_swap:
847
+ self.offloader_single.submit_move_blocks_forward(self.single_blocks, block_idx)
848
+
849
+ img = x[:, :img_seq_len, ...]
850
+ x = None
851
+
852
+ # ---------------------------- Final layer ------------------------------
853
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
854
+
855
+ img = self.unpatchify(img, tt, th, tw)
856
+ if return_dict:
857
+ out["x"] = img
858
+ return out
859
+ return img
860
+
861
+ def unpatchify(self, x, t, h, w):
862
+ """
863
+ x: (N, T, patch_size**2 * C)
864
+ imgs: (N, C, t * pt, h * ph, w * pw)
865
+ """
866
+ c = self.unpatchify_channels
867
+ pt, ph, pw = self.patch_size
868
+ assert t * h * w == x.shape[1]
869
+
870
+ x = x.reshape(shape=(x.shape[0], t, h, w, c, pt, ph, pw))
871
+ x = torch.einsum("nthwcopq->nctohpwq", x)
872
+ imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
873
+
874
+ return imgs
875
+
876
+ def params_count(self):
877
+ counts = {
878
+ "double": sum(
879
+ [
880
+ sum(p.numel() for p in block.img_attn_qkv.parameters())
881
+ + sum(p.numel() for p in block.img_attn_proj.parameters())
882
+ + sum(p.numel() for p in block.img_mlp.parameters())
883
+ + sum(p.numel() for p in block.txt_attn_qkv.parameters())
884
+ + sum(p.numel() for p in block.txt_attn_proj.parameters())
885
+ + sum(p.numel() for p in block.txt_mlp.parameters())
886
+ for block in self.double_blocks
887
+ ]
888
+ ),
889
+ "single": sum(
890
+ [
891
+ sum(p.numel() for p in block.linear1.parameters()) + sum(p.numel() for p in block.linear2.parameters())
892
+ for block in self.single_blocks
893
+ ]
894
+ ),
895
+ "total": sum(p.numel() for p in self.parameters()),
896
+ }
897
+ counts["attn+mlp"] = counts["double"] + counts["single"]
898
+ return counts
899
+
900
+
901
+ #################################################################################
902
+ # HunyuanVideo Configs #
903
+ #################################################################################
904
+
905
+ HUNYUAN_VIDEO_CONFIG = {
906
+ "HYVideo-T/2": {
907
+ "mm_double_blocks_depth": 20,
908
+ "mm_single_blocks_depth": 40,
909
+ "rope_dim_list": [16, 56, 56],
910
+ "hidden_size": 3072,
911
+ "heads_num": 24,
912
+ "mlp_width_ratio": 4,
913
+ },
914
+ "HYVideo-T/2-cfgdistill": {
915
+ "mm_double_blocks_depth": 20,
916
+ "mm_single_blocks_depth": 40,
917
+ "rope_dim_list": [16, 56, 56],
918
+ "hidden_size": 3072,
919
+ "heads_num": 24,
920
+ "mlp_width_ratio": 4,
921
+ "guidance_embed": True,
922
+ },
923
+ }
924
+
925
+
926
+ def load_dit_model(text_states_dim, text_states_dim_2, in_channels, out_channels, factor_kwargs):
927
+ """load hunyuan video model
928
+
929
+ NOTE: Only HYVideo-T/2-cfgdistill is supported for now.
930
+
931
+ Args:
932
+ text_states_dim (int): text state dimension
933
+ text_states_dim_2 (int): text state dimension 2
934
+ in_channels (int): input channels number
935
+ out_channels (int): output channels number
936
+ factor_kwargs (dict): factor kwargs
937
+
938
+ Returns:
939
+ model (nn.Module): The hunyuan video model
940
+ """
941
+ # if args.model in HUNYUAN_VIDEO_CONFIG.keys():
942
+ model = HYVideoDiffusionTransformer(
943
+ text_states_dim=text_states_dim,
944
+ text_states_dim_2=text_states_dim_2,
945
+ in_channels=in_channels,
946
+ out_channels=out_channels,
947
+ **HUNYUAN_VIDEO_CONFIG["HYVideo-T/2-cfgdistill"],
948
+ **factor_kwargs,
949
+ )
950
+ return model
951
+ # else:
952
+ # raise NotImplementedError()
953
+
954
+
955
+ def load_state_dict(model, model_path):
956
+ state_dict = torch.load(model_path, map_location=lambda storage, loc: storage, weights_only=True)
957
+
958
+ load_key = "module"
959
+ if load_key in state_dict:
960
+ state_dict = state_dict[load_key]
961
+ else:
962
+ raise KeyError(
963
+ f"Missing key: `{load_key}` in the checkpoint: {model_path}. The keys in the checkpoint "
964
+ f"are: {list(state_dict.keys())}."
965
+ )
966
+ model.load_state_dict(state_dict, strict=True, assign=True)
967
+ return model
968
+
969
+
970
+ def load_transformer(dit_path, attn_mode, split_attn, device, dtype, in_channels=16) -> HYVideoDiffusionTransformer:
971
+ # =========================== Build main model ===========================
972
+ factor_kwargs = {"device": device, "dtype": dtype, "attn_mode": attn_mode, "split_attn": split_attn}
973
+ latent_channels = 16
974
+ out_channels = latent_channels
975
+
976
+ with accelerate.init_empty_weights():
977
+ transformer = load_dit_model(
978
+ text_states_dim=4096,
979
+ text_states_dim_2=768,
980
+ in_channels=in_channels,
981
+ out_channels=out_channels,
982
+ factor_kwargs=factor_kwargs,
983
+ )
984
+
985
+ if os.path.splitext(dit_path)[-1] == ".safetensors":
986
+ # loading safetensors: may be already fp8
987
+ with MemoryEfficientSafeOpen(dit_path) as f:
988
+ state_dict = {}
989
+ for k in f.keys():
990
+ tensor = f.get_tensor(k)
991
+ tensor = tensor.to(device=device, dtype=dtype)
992
+ # TODO support comfy model
993
+ # if k.startswith("model.model."):
994
+ # k = convert_comfy_model_key(k)
995
+ state_dict[k] = tensor
996
+ transformer.load_state_dict(state_dict, strict=True, assign=True)
997
+ else:
998
+ transformer = load_state_dict(transformer, dit_path)
999
+
1000
+ return transformer
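
A hedged usage sketch of the loader above (editorial addition); the checkpoint path is a placeholder and `attn_mode="torch"` is chosen so no extra attention kernels are required:

```python
# Sketch: load the DiT weights from a local safetensors file (path is illustrative).
import torch
from hunyuan_model.models import load_transformer

transformer = load_transformer(
    dit_path="path/to/dit.safetensors",  # hypothetical checkpoint path
    attn_mode="torch",                   # SDPA; "flash" requires flash-attn to be installed
    split_attn=False,
    device=torch.device("cuda"),         # requires a CUDA device
    dtype=torch.bfloat16,
)
transformer.eval()
```
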
1001
+
1002
+
1003
+ def get_rotary_pos_embed_by_shape(model, latents_size):
1004
+ target_ndim = 3
1005
+ ndim = 5 - 2
1006
+
1007
+ if isinstance(model.patch_size, int):
1008
+ assert all(s % model.patch_size == 0 for s in latents_size), (
1009
+ f"Latent size (last {ndim} dimensions) should be divisible by patch size ({model.patch_size}), "
1010
+ f"but got {latents_size}."
1011
+ )
1012
+ rope_sizes = [s // model.patch_size for s in latents_size]
1013
+ elif isinstance(model.patch_size, list):
1014
+ assert all(s % model.patch_size[idx] == 0 for idx, s in enumerate(latents_size)), (
1015
+ f"Latent size (last {ndim} dimensions) should be divisible by patch size ({model.patch_size}), "
1016
+ f"but got {latents_size}."
1017
+ )
1018
+ rope_sizes = [s // model.patch_size[idx] for idx, s in enumerate(latents_size)]
1019
+
1020
+ if len(rope_sizes) != target_ndim:
1021
+ rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes # time axis
1022
+ head_dim = model.hidden_size // model.heads_num
1023
+ rope_dim_list = model.rope_dim_list
1024
+ if rope_dim_list is None:
1025
+ rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
1026
+ assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal the head_dim of the attention layer"
1027
+
1028
+ rope_theta = 256
1029
+ freqs_cos, freqs_sin = get_nd_rotary_pos_embed(
1030
+ rope_dim_list, rope_sizes, theta=rope_theta, use_real=True, theta_rescale_factor=1
1031
+ )
1032
+ return freqs_cos, freqs_sin
1033
+
1034
+
1035
+ def get_rotary_pos_embed(vae_name, model, video_length, height, width):
1036
+ # 884
1037
+ if "884" in vae_name:
1038
+ latents_size = [(video_length - 1) // 4 + 1, height // 8, width // 8]
1039
+ elif "888" in vae_name:
1040
+ latents_size = [(video_length - 1) // 8 + 1, height // 8, width // 8]
1041
+ else:
1042
+ latents_size = [video_length, height // 8, width // 8]
1043
+
1044
+ return get_rotary_pos_embed_by_shape(model, latents_size)
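
Continuing the loading sketch above, a hedged example of how the RoPE helpers are typically called; the VAE name and clip geometry are illustrative:

```python
# Sketch: RoPE tables for a 33-frame, 544x960 clip with an "884" VAE
# (4x temporal / 8x spatial compression). These become freqs_cos / freqs_sin in forward().
from hunyuan_model.models import get_rotary_pos_embed

freqs_cos, freqs_sin = get_rotary_pos_embed(
    vae_name="884-16c-hy",  # any name containing "884" selects the (t - 1) // 4 + 1 latent-frame formula
    model=transformer,       # transformer from the load_transformer sketch above
    video_length=33,         # -> (33 - 1) // 4 + 1 = 9 latent frames
    height=544,
    width=960,
)
# transformer(latents, t, text_states=..., text_mask=..., text_states_2=...,
#             freqs_cos=freqs_cos, freqs_sin=freqs_sin, guidance=guidance)
```
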
hunyuan_model/modulate_layers.py ADDED
@@ -0,0 +1,76 @@
1
+ from typing import Callable
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ class ModulateDiT(nn.Module):
8
+ """Modulation layer for DiT."""
9
+ def __init__(
10
+ self,
11
+ hidden_size: int,
12
+ factor: int,
13
+ act_layer: Callable,
14
+ dtype=None,
15
+ device=None,
16
+ ):
17
+ factory_kwargs = {"dtype": dtype, "device": device}
18
+ super().__init__()
19
+ self.act = act_layer()
20
+ self.linear = nn.Linear(
21
+ hidden_size, factor * hidden_size, bias=True, **factory_kwargs
22
+ )
23
+ # Zero-initialize the modulation
24
+ nn.init.zeros_(self.linear.weight)
25
+ nn.init.zeros_(self.linear.bias)
26
+
27
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
28
+ return self.linear(self.act(x))
29
+
30
+
31
+ def modulate(x, shift=None, scale=None):
32
+ """modulate by shift and scale
33
+
34
+ Args:
35
+ x (torch.Tensor): input tensor.
36
+ shift (torch.Tensor, optional): shift tensor. Defaults to None.
37
+ scale (torch.Tensor, optional): scale tensor. Defaults to None.
38
+
39
+ Returns:
40
+ torch.Tensor: the output tensor after modulation.
41
+ """
42
+ if scale is None and shift is None:
43
+ return x
44
+ elif shift is None:
45
+ return x * (1 + scale.unsqueeze(1))
46
+ elif scale is None:
47
+ return x + shift.unsqueeze(1)
48
+ else:
49
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
50
+
51
+
52
+ def apply_gate(x, gate=None, tanh=False):
53
+ """Apply an optional gating tensor to the input.
54
+
55
+ Args:
56
+ x (torch.Tensor): input tensor.
57
+ gate (torch.Tensor, optional): gate tensor. Defaults to None.
58
+ tanh (bool, optional): whether to use tanh function. Defaults to False.
59
+
60
+ Returns:
61
+ torch.Tensor: the output tensor after applying the gate.
62
+ """
63
+ if gate is None:
64
+ return x
65
+ if tanh:
66
+ return x * gate.unsqueeze(1).tanh()
67
+ else:
68
+ return x * gate.unsqueeze(1)
69
+
70
+
71
+ def ckpt_wrapper(module):
72
+ def ckpt_forward(*inputs):
73
+ outputs = module(*inputs)
74
+ return outputs
75
+
76
+ return ckpt_forward
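
A small self-contained sketch of the helpers above (editorial addition); shapes are illustrative:

```python
# Toy example: modulate() and apply_gate() broadcast per-sample (B, D) vectors
# over the sequence dimension of a (B, L, D) activation.
import torch
from hunyuan_model.modulate_layers import modulate, apply_gate

B, L, D = 2, 16, 8
x = torch.randn(B, L, D)
shift, scale, gate = torch.randn(B, D), torch.randn(B, D), torch.randn(B, D)

y = modulate(x, shift=shift, scale=scale)  # x * (1 + scale[:, None, :]) + shift[:, None, :]
out = apply_gate(y, gate=gate)             # y * gate[:, None, :]
assert out.shape == (B, L, D)
```
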
hunyuan_model/norm_layers.py ADDED
@@ -0,0 +1,79 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class RMSNorm(nn.Module):
6
+ def __init__(
7
+ self,
8
+ dim: int,
9
+ elementwise_affine=True,
10
+ eps: float = 1e-6,
11
+ device=None,
12
+ dtype=None,
13
+ ):
14
+ """
15
+ Initialize the RMSNorm normalization layer.
16
+
17
+ Args:
18
+ dim (int): The dimension of the input tensor.
19
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
20
+
21
+ Attributes:
22
+ eps (float): A small value added to the denominator for numerical stability.
23
+ weight (nn.Parameter): Learnable scaling parameter.
24
+
25
+ """
26
+ factory_kwargs = {"device": device, "dtype": dtype}
27
+ super().__init__()
28
+ self.eps = eps
29
+ if elementwise_affine:
30
+ self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))
31
+
32
+ def _norm(self, x):
33
+ """
34
+ Apply the RMSNorm normalization to the input tensor.
35
+
36
+ Args:
37
+ x (torch.Tensor): The input tensor.
38
+
39
+ Returns:
40
+ torch.Tensor: The normalized tensor.
41
+
42
+ """
43
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
44
+
45
+ def forward(self, x):
46
+ """
47
+ Forward pass through the RMSNorm layer.
48
+
49
+ Args:
50
+ x (torch.Tensor): The input tensor.
51
+
52
+ Returns:
53
+ torch.Tensor: The output tensor after applying RMSNorm.
54
+
55
+ """
56
+ output = self._norm(x.float()).type_as(x)
57
+ if hasattr(self, "weight"):
58
+ # output = output * self.weight
59
+ # support fp8
60
+ output = output * self.weight.to(output.dtype)
61
+ return output
62
+
63
+
64
+ def get_norm_layer(norm_layer):
65
+ """
66
+ Get the normalization layer.
67
+
68
+ Args:
69
+ norm_layer (str): The type of normalization layer.
70
+
71
+ Returns:
72
+ norm_layer (nn.Module): The normalization layer.
73
+ """
74
+ if norm_layer == "layer":
75
+ return nn.LayerNorm
76
+ elif norm_layer == "rms":
77
+ return RMSNorm
78
+ else:
79
+ raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
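
A brief sketch of how the attention blocks above use this factory for QK-Norm (editorial addition; the head dimension and tensor shapes are illustrative):

```python
# Sketch: build the per-head RMSNorm used as QK-Norm in the attention blocks.
import torch
from hunyuan_model.norm_layers import get_norm_layer

head_dim = 128
q_norm = get_norm_layer("rms")(head_dim, elementwise_affine=True, eps=1e-6)

q = torch.randn(2, 256, 24, head_dim)  # (B, L, heads, head_dim)
q = q_norm(q)                          # RMS-normalized over the last dimension
```
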
hunyuan_model/pipeline_hunyuan_video.py ADDED
@@ -0,0 +1,1100 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ #
16
+ # Modified from diffusers==0.29.2
17
+ #
18
+ # ==============================================================================
19
+ import inspect
20
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
21
+ import torch
22
+ import torch.distributed as dist
23
+ import numpy as np
24
+ from dataclasses import dataclass
25
+ from packaging import version
26
+
27
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
28
+ from diffusers.configuration_utils import FrozenDict
29
+ from diffusers.image_processor import VaeImageProcessor
30
+ from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
31
+ from diffusers.models import AutoencoderKL
32
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
33
+ from diffusers.schedulers import KarrasDiffusionSchedulers
34
+ from diffusers.utils import (
35
+ USE_PEFT_BACKEND,
36
+ deprecate,
37
+ logging,
38
+ replace_example_docstring,
39
+ scale_lora_layers,
40
+ unscale_lora_layers,
41
+ )
42
+ from diffusers.utils.torch_utils import randn_tensor
43
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
44
+ from diffusers.utils import BaseOutput
45
+
46
+ from ...constants import PRECISION_TO_TYPE
47
+ from ...vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
48
+ from ...text_encoder import TextEncoder
49
+ from ...modules import HYVideoDiffusionTransformer
50
+
51
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
52
+
53
+ EXAMPLE_DOC_STRING = """"""
54
+
55
+
56
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
57
+ """
58
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
59
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
60
+ """
61
+ std_text = noise_pred_text.std(
62
+ dim=list(range(1, noise_pred_text.ndim)), keepdim=True
63
+ )
64
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
65
+ # rescale the results from guidance (fixes overexposure)
66
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
67
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
68
+ noise_cfg = (
69
+ guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
70
+ )
71
+ return noise_cfg
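
A hedged sketch of where this rescale sits inside a classifier-free-guidance step (editorial addition); tensor shapes and guidance values are illustrative, and `rescale_noise_cfg` refers to the function defined above:

```python
# Sketch: standard CFG combination followed by the std rescale described above.
import torch

noise_pred_uncond = torch.randn(1, 16, 9, 68, 120)
noise_pred_text = torch.randn(1, 16, 9, 68, 120)
guidance_scale, guidance_rescale = 6.0, 0.7

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
if guidance_rescale > 0.0:
    # pull the guided prediction's std back toward the text-conditional std
    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
```
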
72
+
73
+
74
+ def retrieve_timesteps(
75
+ scheduler,
76
+ num_inference_steps: Optional[int] = None,
77
+ device: Optional[Union[str, torch.device]] = None,
78
+ timesteps: Optional[List[int]] = None,
79
+ sigmas: Optional[List[float]] = None,
80
+ **kwargs,
81
+ ):
82
+ """
83
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
84
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
85
+
86
+ Args:
87
+ scheduler (`SchedulerMixin`):
88
+ The scheduler to get timesteps from.
89
+ num_inference_steps (`int`):
90
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
91
+ must be `None`.
92
+ device (`str` or `torch.device`, *optional*):
93
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
94
+ timesteps (`List[int]`, *optional*):
95
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
96
+ `num_inference_steps` and `sigmas` must be `None`.
97
+ sigmas (`List[float]`, *optional*):
98
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
99
+ `num_inference_steps` and `timesteps` must be `None`.
100
+
101
+ Returns:
102
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
103
+ second element is the number of inference steps.
104
+ """
105
+ if timesteps is not None and sigmas is not None:
106
+ raise ValueError(
107
+ "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
108
+ )
109
+ if timesteps is not None:
110
+ accepts_timesteps = "timesteps" in set(
111
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
112
+ )
113
+ if not accepts_timesteps:
114
+ raise ValueError(
115
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
116
+ f" timestep schedules. Please check whether you are using the correct scheduler."
117
+ )
118
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
119
+ timesteps = scheduler.timesteps
120
+ num_inference_steps = len(timesteps)
121
+ elif sigmas is not None:
122
+ accept_sigmas = "sigmas" in set(
123
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
124
+ )
125
+ if not accept_sigmas:
126
+ raise ValueError(
127
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
128
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
129
+ )
130
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
131
+ timesteps = scheduler.timesteps
132
+ num_inference_steps = len(timesteps)
133
+ else:
134
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
135
+ timesteps = scheduler.timesteps
136
+ return timesteps, num_inference_steps
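
A hedged usage sketch of `retrieve_timesteps` with a stock diffusers scheduler (editorial addition); the scheduler class and step count are illustrative:

```python
# Sketch: default timestep schedule retrieval (retrieve_timesteps is defined above).
from diffusers import DDIMScheduler

scheduler = DDIMScheduler()
timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=30, device="cpu")
print(num_inference_steps, timesteps[:5])
```
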
137
+
138
+
139
+ @dataclass
140
+ class HunyuanVideoPipelineOutput(BaseOutput):
141
+ videos: Union[torch.Tensor, np.ndarray]
142
+
143
+
144
+ class HunyuanVideoPipeline(DiffusionPipeline):
145
+ r"""
146
+ Pipeline for text-to-video generation using HunyuanVideo.
147
+
148
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
149
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
150
+
151
+ Args:
152
+ vae ([`AutoencoderKL`]):
153
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
154
+ text_encoder ([`TextEncoder`]):
155
+ Frozen text-encoder.
156
+ text_encoder_2 ([`TextEncoder`]):
157
+ Frozen text-encoder_2.
158
+ transformer ([`HYVideoDiffusionTransformer`]):
159
+ A `HYVideoDiffusionTransformer` to denoise the encoded video latents.
160
+ scheduler ([`SchedulerMixin`]):
161
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
162
+ """
163
+
164
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
165
+ _optional_components = ["text_encoder_2"]
166
+ _exclude_from_cpu_offload = ["transformer"]
167
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
168
+
169
+ def __init__(
170
+ self,
171
+ vae: AutoencoderKL,
172
+ text_encoder: TextEncoder,
173
+ transformer: HYVideoDiffusionTransformer,
174
+ scheduler: KarrasDiffusionSchedulers,
175
+ text_encoder_2: Optional[TextEncoder] = None,
176
+ progress_bar_config: Dict[str, Any] = None,
177
+ args=None,
178
+ ):
179
+ super().__init__()
180
+
181
+ # ==========================================================================================
182
+ if progress_bar_config is None:
183
+ progress_bar_config = {}
184
+ if not hasattr(self, "_progress_bar_config"):
185
+ self._progress_bar_config = {}
186
+ self._progress_bar_config.update(progress_bar_config)
187
+
188
+ self.args = args
189
+ # ==========================================================================================
190
+
191
+ if (
192
+ hasattr(scheduler.config, "steps_offset")
193
+ and scheduler.config.steps_offset != 1
194
+ ):
195
+ deprecation_message = (
196
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
197
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
198
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
199
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
200
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
201
+ " file"
202
+ )
203
+ deprecate(
204
+ "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False
205
+ )
206
+ new_config = dict(scheduler.config)
207
+ new_config["steps_offset"] = 1
208
+ scheduler._internal_dict = FrozenDict(new_config)
209
+
210
+ if (
211
+ hasattr(scheduler.config, "clip_sample")
212
+ and scheduler.config.clip_sample is True
213
+ ):
214
+ deprecation_message = (
215
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
216
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
217
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
218
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
219
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
220
+ )
221
+ deprecate(
222
+ "clip_sample not set", "1.0.0", deprecation_message, standard_warn=False
223
+ )
224
+ new_config = dict(scheduler.config)
225
+ new_config["clip_sample"] = False
226
+ scheduler._internal_dict = FrozenDict(new_config)
227
+
228
+ self.register_modules(
229
+ vae=vae,
230
+ text_encoder=text_encoder,
231
+ transformer=transformer,
232
+ scheduler=scheduler,
233
+ text_encoder_2=text_encoder_2,
234
+ )
235
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
236
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
237
+
238
+ def encode_prompt(
239
+ self,
240
+ prompt,
241
+ device,
242
+ num_videos_per_prompt,
243
+ do_classifier_free_guidance,
244
+ negative_prompt=None,
245
+ prompt_embeds: Optional[torch.Tensor] = None,
246
+ attention_mask: Optional[torch.Tensor] = None,
247
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
248
+ negative_attention_mask: Optional[torch.Tensor] = None,
249
+ lora_scale: Optional[float] = None,
250
+ clip_skip: Optional[int] = None,
251
+ text_encoder: Optional[TextEncoder] = None,
252
+ data_type: Optional[str] = "image",
253
+ ):
254
+ r"""
255
+ Encodes the prompt into text encoder hidden states.
256
+
257
+ Args:
258
+ prompt (`str` or `List[str]`, *optional*):
259
+ prompt to be encoded
260
+ device: (`torch.device`):
261
+ torch device
262
+ num_videos_per_prompt (`int`):
263
+ number of videos that should be generated per prompt
264
+ do_classifier_free_guidance (`bool`):
265
+ whether to use classifier free guidance or not
266
+ negative_prompt (`str` or `List[str]`, *optional*):
267
+ The prompt or prompts not to guide the video generation. If not defined, one has to pass
268
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
269
+ less than `1`).
270
+ prompt_embeds (`torch.Tensor`, *optional*):
271
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
272
+ provided, text embeddings will be generated from `prompt` input argument.
273
+ attention_mask (`torch.Tensor`, *optional*):
274
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
275
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
276
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
277
+ argument.
278
+ negative_attention_mask (`torch.Tensor`, *optional*):
279
+ lora_scale (`float`, *optional*):
280
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
281
+ clip_skip (`int`, *optional*):
282
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
283
+ the output of the pre-final layer will be used for computing the prompt embeddings.
284
+ text_encoder (TextEncoder, *optional*):
285
+ data_type (`str`, *optional*):
286
+ """
287
+ if text_encoder is None:
288
+ text_encoder = self.text_encoder
289
+
290
+ # set lora scale so that monkey patched LoRA
291
+ # function of text encoder can correctly access it
292
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
293
+ self._lora_scale = lora_scale
294
+
295
+ # dynamically adjust the LoRA scale
296
+ if not USE_PEFT_BACKEND:
297
+ adjust_lora_scale_text_encoder(text_encoder.model, lora_scale)
298
+ else:
299
+ scale_lora_layers(text_encoder.model, lora_scale)
300
+
301
+ if prompt is not None and isinstance(prompt, str):
302
+ batch_size = 1
303
+ elif prompt is not None and isinstance(prompt, list):
304
+ batch_size = len(prompt)
305
+ else:
306
+ batch_size = prompt_embeds.shape[0]
307
+
308
+ if prompt_embeds is None:
309
+ # textual inversion: process multi-vector tokens if necessary
310
+ if isinstance(self, TextualInversionLoaderMixin):
311
+ prompt = self.maybe_convert_prompt(prompt, text_encoder.tokenizer)
312
+
313
+ text_inputs = text_encoder.text2tokens(prompt, data_type=data_type)
314
+
315
+ if clip_skip is None:
316
+ prompt_outputs = text_encoder.encode(
317
+ text_inputs, data_type=data_type, device=device
318
+ )
319
+ prompt_embeds = prompt_outputs.hidden_state
320
+ else:
321
+ prompt_outputs = text_encoder.encode(
322
+ text_inputs,
323
+ output_hidden_states=True,
324
+ data_type=data_type,
325
+ device=device,
326
+ )
327
+ # Access the `hidden_states` first, that contains a tuple of
328
+ # all the hidden states from the encoder layers. Then index into
329
+ # the tuple to access the hidden states from the desired layer.
330
+ prompt_embeds = prompt_outputs.hidden_states_list[-(clip_skip + 1)]
331
+ # We also need to apply the final LayerNorm here to not mess with the
332
+ # representations. The `last_hidden_states` that we typically use for
333
+ # obtaining the final prompt representations passes through the LayerNorm
334
+ # layer.
335
+ prompt_embeds = text_encoder.model.text_model.final_layer_norm(
336
+ prompt_embeds
337
+ )
338
+
339
+ attention_mask = prompt_outputs.attention_mask
340
+ if attention_mask is not None:
341
+ attention_mask = attention_mask.to(device)
342
+ bs_embed, seq_len = attention_mask.shape
343
+ attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
344
+ attention_mask = attention_mask.view(
345
+ bs_embed * num_videos_per_prompt, seq_len
346
+ )
347
+
348
+ if text_encoder is not None:
349
+ prompt_embeds_dtype = text_encoder.dtype
350
+ elif self.transformer is not None:
351
+ prompt_embeds_dtype = self.transformer.dtype
352
+ else:
353
+ prompt_embeds_dtype = prompt_embeds.dtype
354
+
355
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
356
+
357
+ if prompt_embeds.ndim == 2:
358
+ bs_embed, _ = prompt_embeds.shape
359
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
360
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
361
+ prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, -1)
362
+ else:
363
+ bs_embed, seq_len, _ = prompt_embeds.shape
364
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
365
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
366
+ prompt_embeds = prompt_embeds.view(
367
+ bs_embed * num_videos_per_prompt, seq_len, -1
368
+ )
369
+
370
+ # get unconditional embeddings for classifier free guidance
371
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
372
+ uncond_tokens: List[str]
373
+ if negative_prompt is None:
374
+ uncond_tokens = [""] * batch_size
375
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
376
+ raise TypeError(
377
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
378
+ f" {type(prompt)}."
379
+ )
380
+ elif isinstance(negative_prompt, str):
381
+ uncond_tokens = [negative_prompt]
382
+ elif batch_size != len(negative_prompt):
383
+ raise ValueError(
384
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
385
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
386
+ " the batch size of `prompt`."
387
+ )
388
+ else:
389
+ uncond_tokens = negative_prompt
390
+
391
+ # textual inversion: process multi-vector tokens if necessary
392
+ if isinstance(self, TextualInversionLoaderMixin):
393
+ uncond_tokens = self.maybe_convert_prompt(
394
+ uncond_tokens, text_encoder.tokenizer
395
+ )
396
+
397
+ # max_length = prompt_embeds.shape[1]
398
+ uncond_input = text_encoder.text2tokens(uncond_tokens, data_type=data_type)
399
+
400
+ negative_prompt_outputs = text_encoder.encode(
401
+ uncond_input, data_type=data_type, device=device
402
+ )
403
+ negative_prompt_embeds = negative_prompt_outputs.hidden_state
404
+
405
+ negative_attention_mask = negative_prompt_outputs.attention_mask
406
+ if negative_attention_mask is not None:
407
+ negative_attention_mask = negative_attention_mask.to(device)
408
+ _, seq_len = negative_attention_mask.shape
409
+ negative_attention_mask = negative_attention_mask.repeat(
410
+ 1, num_videos_per_prompt
411
+ )
412
+ negative_attention_mask = negative_attention_mask.view(
413
+ batch_size * num_videos_per_prompt, seq_len
414
+ )
415
+
416
+ if do_classifier_free_guidance:
417
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
418
+ seq_len = negative_prompt_embeds.shape[1]
419
+
420
+ negative_prompt_embeds = negative_prompt_embeds.to(
421
+ dtype=prompt_embeds_dtype, device=device
422
+ )
423
+
424
+ if negative_prompt_embeds.ndim == 2:
425
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
426
+ 1, num_videos_per_prompt
427
+ )
428
+ negative_prompt_embeds = negative_prompt_embeds.view(
429
+ batch_size * num_videos_per_prompt, -1
430
+ )
431
+ else:
432
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
433
+ 1, num_videos_per_prompt, 1
434
+ )
435
+ negative_prompt_embeds = negative_prompt_embeds.view(
436
+ batch_size * num_videos_per_prompt, seq_len, -1
437
+ )
438
+
439
+ if text_encoder is not None:
440
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
441
+ # Retrieve the original scale by scaling back the LoRA layers
442
+ unscale_lora_layers(text_encoder.model, lora_scale)
443
+
444
+ return (
445
+ prompt_embeds,
446
+ negative_prompt_embeds,
447
+ attention_mask,
448
+ negative_attention_mask,
449
+ )
450
+
451
+ def decode_latents(self, latents, enable_tiling=True):
452
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
453
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
454
+
455
+ latents = 1 / self.vae.config.scaling_factor * latents
456
+ if enable_tiling:
457
+ self.vae.enable_tiling()
458
+ image = self.vae.decode(latents, return_dict=False)[0]
459
+ else:
460
+ image = self.vae.decode(latents, return_dict=False)[0]
461
+ image = (image / 2 + 0.5).clamp(0, 1)
462
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
463
+ if image.ndim == 4:
464
+ image = image.cpu().permute(0, 2, 3, 1).float()
465
+ else:
466
+ image = image.cpu().float()
467
+ return image
468
+
469
+ def prepare_extra_func_kwargs(self, func, kwargs):
470
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
471
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
472
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
473
+ # and should be between [0, 1]
474
+ extra_step_kwargs = {}
475
+
476
+ for k, v in kwargs.items():
477
+ accepts = k in set(inspect.signature(func).parameters.keys())
478
+ if accepts:
479
+ extra_step_kwargs[k] = v
480
+ return extra_step_kwargs
481
+
482
+ def check_inputs(
483
+ self,
484
+ prompt,
485
+ height,
486
+ width,
487
+ video_length,
488
+ callback_steps,
489
+ negative_prompt=None,
490
+ prompt_embeds=None,
491
+ negative_prompt_embeds=None,
492
+ callback_on_step_end_tensor_inputs=None,
493
+ vae_ver="88-4c-sd",
494
+ ):
495
+ if height % 8 != 0 or width % 8 != 0:
496
+ raise ValueError(
497
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
498
+ )
499
+
500
+ if video_length is not None:
501
+ if "884" in vae_ver:
502
+ if video_length != 1 and (video_length - 1) % 4 != 0:
503
+ raise ValueError(
504
+ f"`video_length` has to be 1 or of the form 4k+1 (i.e. (video_length - 1) divisible by 4), but is {video_length}."
505
+ )
506
+ elif "888" in vae_ver:
507
+ if video_length != 1 and (video_length - 1) % 8 != 0:
508
+ raise ValueError(
509
+ f"`video_length` has to be 1 or of the form 8k+1 (i.e. (video_length - 1) divisible by 8), but is {video_length}."
510
+ )
511
+
512
+ if callback_steps is not None and (
513
+ not isinstance(callback_steps, int) or callback_steps <= 0
514
+ ):
515
+ raise ValueError(
516
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
517
+ f" {type(callback_steps)}."
518
+ )
519
+ if callback_on_step_end_tensor_inputs is not None and not all(
520
+ k in self._callback_tensor_inputs
521
+ for k in callback_on_step_end_tensor_inputs
522
+ ):
523
+ raise ValueError(
524
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
525
+ )
526
+
527
+ if prompt is not None and prompt_embeds is not None:
528
+ raise ValueError(
529
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
530
+ " only forward one of the two."
531
+ )
532
+ elif prompt is None and prompt_embeds is None:
533
+ raise ValueError(
534
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
535
+ )
536
+ elif prompt is not None and (
537
+ not isinstance(prompt, str) and not isinstance(prompt, list)
538
+ ):
539
+ raise ValueError(
540
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
541
+ )
542
+
543
+ if negative_prompt is not None and negative_prompt_embeds is not None:
544
+ raise ValueError(
545
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
546
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
547
+ )
548
+
549
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
550
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
551
+ raise ValueError(
552
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
553
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
554
+ f" {negative_prompt_embeds.shape}."
555
+ )
556
+
557
+
558
+ def prepare_latents(
559
+ self,
560
+ batch_size,
561
+ num_channels_latents,
562
+ height,
563
+ width,
564
+ video_length,
565
+ dtype,
566
+ device,
567
+ generator,
568
+ latents=None,
569
+ ):
570
+ shape = (
571
+ batch_size,
572
+ num_channels_latents,
573
+ video_length,
574
+ int(height) // self.vae_scale_factor,
575
+ int(width) // self.vae_scale_factor,
576
+ )
577
+ if isinstance(generator, list) and len(generator) != batch_size:
578
+ raise ValueError(
579
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
580
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
581
+ )
582
+
583
+ if latents is None:
584
+ latents = randn_tensor(
585
+ shape, generator=generator, device=device, dtype=dtype
586
+ )
587
+ else:
588
+ latents = latents.to(device)
589
+
590
+ # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
591
+ if hasattr(self.scheduler, "init_noise_sigma"):
592
+ # scale the initial noise by the standard deviation required by the scheduler
593
+ latents = latents * self.scheduler.init_noise_sigma
594
+ return latents
595
+
596
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
597
+ def get_guidance_scale_embedding(
598
+ self,
599
+ w: torch.Tensor,
600
+ embedding_dim: int = 512,
601
+ dtype: torch.dtype = torch.float32,
602
+ ) -> torch.Tensor:
603
+ """
604
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
605
+
606
+ Args:
607
+ w (`torch.Tensor`):
608
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
609
+ embedding_dim (`int`, *optional*, defaults to 512):
610
+ Dimension of the embeddings to generate.
611
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
612
+ Data type of the generated embeddings.
613
+
614
+ Returns:
615
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
616
+ """
617
+ assert len(w.shape) == 1
618
+ w = w * 1000.0
619
+
620
+ half_dim = embedding_dim // 2
621
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
622
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
623
+ emb = w.to(dtype)[:, None] * emb[None, :]
624
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
625
+ if embedding_dim % 2 == 1: # zero pad
626
+ emb = torch.nn.functional.pad(emb, (0, 1))
627
+ assert emb.shape == (w.shape[0], embedding_dim)
628
+ return emb
629
+
630
+ @property
631
+ def guidance_scale(self):
632
+ return self._guidance_scale
633
+
634
+ @property
635
+ def guidance_rescale(self):
636
+ return self._guidance_rescale
637
+
638
+ @property
639
+ def clip_skip(self):
640
+ return self._clip_skip
641
+
642
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
643
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
644
+ # corresponds to doing no classifier free guidance.
645
+ @property
646
+ def do_classifier_free_guidance(self):
647
+ # return self._guidance_scale > 1 and self.transformer.config.time_cond_proj_dim is None
648
+ return self._guidance_scale > 1
649
+
650
+ @property
651
+ def cross_attention_kwargs(self):
652
+ return self._cross_attention_kwargs
653
+
654
+ @property
655
+ def num_timesteps(self):
656
+ return self._num_timesteps
657
+
658
+ @property
659
+ def interrupt(self):
660
+ return self._interrupt
661
+
662
+ @torch.no_grad()
663
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
664
+ def __call__(
665
+ self,
666
+ prompt: Union[str, List[str]],
667
+ height: int,
668
+ width: int,
669
+ video_length: int,
670
+ data_type: str = "video",
671
+ num_inference_steps: int = 50,
672
+ timesteps: List[int] = None,
673
+ sigmas: List[float] = None,
674
+ guidance_scale: float = 7.5,
675
+ negative_prompt: Optional[Union[str, List[str]]] = None,
676
+ num_videos_per_prompt: Optional[int] = 1,
677
+ eta: float = 0.0,
678
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
679
+ latents: Optional[torch.Tensor] = None,
680
+ prompt_embeds: Optional[torch.Tensor] = None,
681
+ attention_mask: Optional[torch.Tensor] = None,
682
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
683
+ negative_attention_mask: Optional[torch.Tensor] = None,
684
+ output_type: Optional[str] = "pil",
685
+ return_dict: bool = True,
686
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
687
+ guidance_rescale: float = 0.0,
688
+ clip_skip: Optional[int] = None,
689
+ callback_on_step_end: Optional[
690
+ Union[
691
+ Callable[[int, int, Dict], None],
692
+ PipelineCallback,
693
+ MultiPipelineCallbacks,
694
+ ]
695
+ ] = None,
696
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
697
+ freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
698
+ vae_ver: str = "88-4c-sd",
699
+ enable_tiling: bool = False,
700
+ n_tokens: Optional[int] = None,
701
+ embedded_guidance_scale: Optional[float] = None,
702
+ **kwargs,
703
+ ):
704
+ r"""
705
+ The call function to the pipeline for generation.
706
+
707
+ Args:
708
+ prompt (`str` or `List[str]`):
709
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
710
+ height (`int`):
711
+ The height in pixels of the generated image.
712
+ width (`int`):
713
+ The width in pixels of the generated image.
714
+ video_length (`int`):
715
+ The number of frames in the generated video.
716
+ num_inference_steps (`int`, *optional*, defaults to 50):
717
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
718
+ expense of slower inference.
719
+ timesteps (`List[int]`, *optional*):
720
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
721
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
722
+ passed will be used. Must be in descending order.
723
+ sigmas (`List[float]`, *optional*):
724
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
725
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
726
+ will be used.
727
+ guidance_scale (`float`, *optional*, defaults to 7.5):
728
+ A higher guidance scale value encourages the model to generate images closely linked to the text
729
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
730
+ negative_prompt (`str` or `List[str]`, *optional*):
731
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
732
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale <= 1`).
733
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
734
+ The number of images to generate per prompt.
735
+ eta (`float`, *optional*, defaults to 0.0):
736
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
737
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
738
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
739
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
740
+ generation deterministic.
741
+ latents (`torch.Tensor`, *optional*):
742
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
743
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
744
+ tensor is generated by sampling using the supplied random `generator`.
745
+ prompt_embeds (`torch.Tensor`, *optional*):
746
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
747
+ provided, text embeddings are generated from the `prompt` input argument.
748
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
749
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
750
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
751
+
752
+ output_type (`str`, *optional*, defaults to `"pil"`):
753
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
754
+ return_dict (`bool`, *optional*, defaults to `True`):
755
+ Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a
756
+ plain tuple.
757
+ cross_attention_kwargs (`dict`, *optional*):
758
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
759
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
760
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
761
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
762
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
763
+ using zero terminal SNR.
764
+ clip_skip (`int`, *optional*):
765
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
766
+ the output of the pre-final layer will be used for computing the prompt embeddings.
767
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
768
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
769
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
770
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
771
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
772
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
773
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
774
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
775
+ `._callback_tensor_inputs` attribute of your pipeline class.
776
+
777
+ Examples:
778
+
779
+ Returns:
780
+ [`~HunyuanVideoPipelineOutput`] or `tuple`:
781
+ If `return_dict` is `True`, a [`HunyuanVideoPipelineOutput`] is returned;
+ otherwise, the generated videos are returned directly.
785
+ """
786
+ callback = kwargs.pop("callback", None)
787
+ callback_steps = kwargs.pop("callback_steps", None)
788
+
789
+ if callback is not None:
790
+ deprecate(
791
+ "callback",
792
+ "1.0.0",
793
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
794
+ )
795
+ if callback_steps is not None:
796
+ deprecate(
797
+ "callback_steps",
798
+ "1.0.0",
799
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
800
+ )
801
+
802
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
803
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
804
+
805
+ # 0. Default height and width to unet
806
+ # height = height or self.transformer.config.sample_size * self.vae_scale_factor
807
+ # width = width or self.transformer.config.sample_size * self.vae_scale_factor
808
+ # to deal with lora scaling and other possible forward hooks
809
+
810
+ # 1. Check inputs. Raise error if not correct
811
+ self.check_inputs(
812
+ prompt,
813
+ height,
814
+ width,
815
+ video_length,
816
+ callback_steps,
817
+ negative_prompt,
818
+ prompt_embeds,
819
+ negative_prompt_embeds,
820
+ callback_on_step_end_tensor_inputs,
821
+ vae_ver=vae_ver,
822
+ )
823
+
824
+ self._guidance_scale = guidance_scale
825
+ self._guidance_rescale = guidance_rescale
826
+ self._clip_skip = clip_skip
827
+ self._cross_attention_kwargs = cross_attention_kwargs
828
+ self._interrupt = False
829
+
830
+ # 2. Define call parameters
831
+ if prompt is not None and isinstance(prompt, str):
832
+ batch_size = 1
833
+ elif prompt is not None and isinstance(prompt, list):
834
+ batch_size = len(prompt)
835
+ else:
836
+ batch_size = prompt_embeds.shape[0]
837
+
838
+ device = torch.device(f"cuda:{dist.get_rank()}") if dist.is_initialized() else self._execution_device
839
+
840
+ # 3. Encode input prompt
841
+ lora_scale = (
842
+ self.cross_attention_kwargs.get("scale", None)
843
+ if self.cross_attention_kwargs is not None
844
+ else None
845
+ )
846
+
847
+ (
848
+ prompt_embeds,
849
+ negative_prompt_embeds,
850
+ prompt_mask,
851
+ negative_prompt_mask,
852
+ ) = self.encode_prompt(
853
+ prompt,
854
+ device,
855
+ num_videos_per_prompt,
856
+ self.do_classifier_free_guidance,
857
+ negative_prompt,
858
+ prompt_embeds=prompt_embeds,
859
+ attention_mask=attention_mask,
860
+ negative_prompt_embeds=negative_prompt_embeds,
861
+ negative_attention_mask=negative_attention_mask,
862
+ lora_scale=lora_scale,
863
+ clip_skip=self.clip_skip,
864
+ data_type=data_type,
865
+ )
866
+ if self.text_encoder_2 is not None:
867
+ (
868
+ prompt_embeds_2,
869
+ negative_prompt_embeds_2,
870
+ prompt_mask_2,
871
+ negative_prompt_mask_2,
872
+ ) = self.encode_prompt(
873
+ prompt,
874
+ device,
875
+ num_videos_per_prompt,
876
+ self.do_classifier_free_guidance,
877
+ negative_prompt,
878
+ prompt_embeds=None,
879
+ attention_mask=None,
880
+ negative_prompt_embeds=None,
881
+ negative_attention_mask=None,
882
+ lora_scale=lora_scale,
883
+ clip_skip=self.clip_skip,
884
+ text_encoder=self.text_encoder_2,
885
+ data_type=data_type,
886
+ )
887
+ else:
888
+ prompt_embeds_2 = None
889
+ negative_prompt_embeds_2 = None
890
+ prompt_mask_2 = None
891
+ negative_prompt_mask_2 = None
892
+
893
+ # For classifier free guidance, we need to do two forward passes.
894
+ # Here we concatenate the unconditional and text embeddings into a single batch
895
+ # to avoid doing two forward passes
896
+ if self.do_classifier_free_guidance:
897
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
898
+ if prompt_mask is not None:
899
+ prompt_mask = torch.cat([negative_prompt_mask, prompt_mask])
900
+ if prompt_embeds_2 is not None:
901
+ prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
902
+ if prompt_mask_2 is not None:
903
+ prompt_mask_2 = torch.cat([negative_prompt_mask_2, prompt_mask_2])
904
+
905
+
906
+ # 4. Prepare timesteps
907
+ extra_set_timesteps_kwargs = self.prepare_extra_func_kwargs(
908
+ self.scheduler.set_timesteps, {"n_tokens": n_tokens}
909
+ )
910
+ timesteps, num_inference_steps = retrieve_timesteps(
911
+ self.scheduler,
912
+ num_inference_steps,
913
+ device,
914
+ timesteps,
915
+ sigmas,
916
+ **extra_set_timesteps_kwargs,
917
+ )
918
+
919
+ if "884" in vae_ver:
920
+ video_length = (video_length - 1) // 4 + 1
921
+ elif "888" in vae_ver:
922
+ video_length = (video_length - 1) // 8 + 1
923
+ else:
924
+ video_length = video_length
925
+
926
+ # 5. Prepare latent variables
927
+ num_channels_latents = self.transformer.config.in_channels
928
+ latents = self.prepare_latents(
929
+ batch_size * num_videos_per_prompt,
930
+ num_channels_latents,
931
+ height,
932
+ width,
933
+ video_length,
934
+ prompt_embeds.dtype,
935
+ device,
936
+ generator,
937
+ latents,
938
+ )
939
+
940
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
941
+ extra_step_kwargs = self.prepare_extra_func_kwargs(
942
+ self.scheduler.step,
943
+ {"generator": generator, "eta": eta},
944
+ )
945
+
946
+ target_dtype = PRECISION_TO_TYPE[self.args.precision]
947
+ autocast_enabled = (
948
+ target_dtype != torch.float32
949
+ ) and not self.args.disable_autocast
950
+ vae_dtype = PRECISION_TO_TYPE[self.args.vae_precision]
951
+ vae_autocast_enabled = (
952
+ vae_dtype != torch.float32
953
+ ) and not self.args.disable_autocast
954
+
955
+ # 7. Denoising loop
956
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
957
+ self._num_timesteps = len(timesteps)
958
+
959
+ # if is_progress_bar:
960
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
961
+ for i, t in enumerate(timesteps):
962
+ if self.interrupt:
963
+ continue
964
+
965
+ # expand the latents if we are doing classifier free guidance
966
+ latent_model_input = (
967
+ torch.cat([latents] * 2)
968
+ if self.do_classifier_free_guidance
969
+ else latents
970
+ )
971
+ latent_model_input = self.scheduler.scale_model_input(
972
+ latent_model_input, t
973
+ )
974
+
975
+ t_expand = t.repeat(latent_model_input.shape[0])
976
+ guidance_expand = (
977
+ torch.tensor(
978
+ [embedded_guidance_scale] * latent_model_input.shape[0],
979
+ dtype=torch.float32,
980
+ device=device,
981
+ ).to(target_dtype)
982
+ * 1000.0
983
+ if embedded_guidance_scale is not None
984
+ else None
985
+ )
986
+
987
+ # predict the noise residual
988
+ with torch.autocast(
989
+ device_type="cuda", dtype=target_dtype, enabled=autocast_enabled
990
+ ):
991
+ noise_pred = self.transformer( # For an input image (129, 192, 336) (1, 256, 256)
992
+ latent_model_input, # [2, 16, 33, 24, 42]
993
+ t_expand, # [2]
994
+ text_states=prompt_embeds, # [2, 256, 4096]
995
+ text_mask=prompt_mask, # [2, 256]
996
+ text_states_2=prompt_embeds_2, # [2, 768]
997
+ freqs_cos=freqs_cis[0], # [seqlen, head_dim]
998
+ freqs_sin=freqs_cis[1], # [seqlen, head_dim]
999
+ guidance=guidance_expand,
1000
+ return_dict=True,
1001
+ )[
1002
+ "x"
1003
+ ]
1004
+
1005
+ # perform guidance
1006
+ if self.do_classifier_free_guidance:
1007
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1008
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
1009
+ noise_pred_text - noise_pred_uncond
1010
+ )
1011
+
1012
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
1013
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1014
+ noise_pred = rescale_noise_cfg(
1015
+ noise_pred,
1016
+ noise_pred_text,
1017
+ guidance_rescale=self.guidance_rescale,
1018
+ )
1019
+
1020
+ # compute the previous noisy sample x_t -> x_t-1
1021
+ latents = self.scheduler.step(
1022
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
1023
+ )[0]
1024
+
1025
+ if callback_on_step_end is not None:
1026
+ callback_kwargs = {}
1027
+ for k in callback_on_step_end_tensor_inputs:
1028
+ callback_kwargs[k] = locals()[k]
1029
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1030
+
1031
+ latents = callback_outputs.pop("latents", latents)
1032
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1033
+ negative_prompt_embeds = callback_outputs.pop(
1034
+ "negative_prompt_embeds", negative_prompt_embeds
1035
+ )
1036
+
1037
+ # call the callback, if provided
1038
+ if i == len(timesteps) - 1 or (
1039
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
1040
+ ):
1041
+ if progress_bar is not None:
1042
+ progress_bar.update()
1043
+ if callback is not None and i % callback_steps == 0:
1044
+ step_idx = i // getattr(self.scheduler, "order", 1)
1045
+ callback(step_idx, t, latents)
1046
+
1047
+ if not output_type == "latent":
1048
+ expand_temporal_dim = False
1049
+ if len(latents.shape) == 4:
1050
+ if isinstance(self.vae, AutoencoderKLCausal3D):
1051
+ latents = latents.unsqueeze(2)
1052
+ expand_temporal_dim = True
1053
+ elif len(latents.shape) == 5:
1054
+ pass
1055
+ else:
1056
+ raise ValueError(
1057
+ f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}."
1058
+ )
1059
+
1060
+ if (
1061
+ hasattr(self.vae.config, "shift_factor")
1062
+ and self.vae.config.shift_factor
1063
+ ):
1064
+ latents = (
1065
+ latents / self.vae.config.scaling_factor
1066
+ + self.vae.config.shift_factor
1067
+ )
1068
+ else:
1069
+ latents = latents / self.vae.config.scaling_factor
1070
+
1071
+ with torch.autocast(
1072
+ device_type="cuda", dtype=vae_dtype, enabled=vae_autocast_enabled
1073
+ ):
1074
+ if enable_tiling:
1075
+ self.vae.enable_tiling()
1076
+ image = self.vae.decode(
1077
+ latents, return_dict=False, generator=generator
1078
+ )[0]
1079
+ else:
1080
+ image = self.vae.decode(
1081
+ latents, return_dict=False, generator=generator
1082
+ )[0]
1083
+
1084
+ if expand_temporal_dim or image.shape[2] == 1:
1085
+ image = image.squeeze(2)
1086
+
1087
+ else:
1088
+ image = latents
1089
+
1090
+ image = (image / 2 + 0.5).clamp(0, 1)
1091
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
1092
+ image = image.cpu().float()
1093
+
1094
+ # Offload all models
1095
+ self.maybe_free_model_hooks()
1096
+
1097
+ if not return_dict:
1098
+ return image
1099
+
1100
+ return HunyuanVideoPipelineOutput(videos=image)
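For orientation, the guidance arithmetic in the denoising loop above reduces to the small helper sketched below. This is a hedged re-statement for readers, not the pipeline's actual `rescale_noise_cfg` import (which is defined elsewhere in the repository); the function name and shapes are illustrative.

```python
import torch

def cfg_combine(noise_pred_uncond, noise_pred_text, guidance_scale, guidance_rescale=0.0):
    # Classifier-free guidance: push the unconditional prediction toward the
    # text-conditioned one by `guidance_scale` (step 7 of __call__ above).
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    if guidance_rescale > 0.0:
        # Rescale toward the std of the text-conditioned prediction,
        # following Sec. 3.4 of arXiv:2305.08891 (zero-terminal-SNR fix).
        dims = list(range(1, noise_pred_text.ndim))
        std_text = noise_pred_text.std(dim=dims, keepdim=True)
        std_cfg = noise_pred.std(dim=dims, keepdim=True)
        rescaled = noise_pred * (std_text / std_cfg)
        noise_pred = guidance_rescale * rescaled + (1.0 - guidance_rescale) * noise_pred
    return noise_pred
```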
hunyuan_model/posemb_layers.py ADDED
@@ -0,0 +1,310 @@
1
+ import torch
2
+ from typing import Union, Tuple, List
3
+
4
+
5
+ def _to_tuple(x, dim=2):
6
+ if isinstance(x, int):
7
+ return (x,) * dim
8
+ elif len(x) == dim:
9
+ return x
10
+ else:
11
+ raise ValueError(f"Expected length {dim} or int, but got {x}")
12
+
13
+
14
+ def get_meshgrid_nd(start, *args, dim=2):
15
+ """
16
+ Get n-D meshgrid with start, stop and num.
17
+
18
+ Args:
19
+ start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop,
20
+ step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num
21
+ should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in
22
+ n-tuples.
23
+ *args: See above.
24
+ dim (int): Dimension of the meshgrid. Defaults to 2.
25
+
26
+ Returns:
27
+ grid (torch.Tensor): [dim, ...]
28
+ """
29
+ if len(args) == 0:
30
+ # start is grid_size
31
+ num = _to_tuple(start, dim=dim)
32
+ start = (0,) * dim
33
+ stop = num
34
+ elif len(args) == 1:
35
+ # start is start, args[0] is stop, step is 1
36
+ start = _to_tuple(start, dim=dim)
37
+ stop = _to_tuple(args[0], dim=dim)
38
+ num = [stop[i] - start[i] for i in range(dim)]
39
+ elif len(args) == 2:
40
+ # start is start, args[0] is stop, args[1] is num
41
+ start = _to_tuple(start, dim=dim) # Left-Top eg: 12,0
42
+ stop = _to_tuple(args[0], dim=dim) # Right-Bottom eg: 20,32
43
+ num = _to_tuple(args[1], dim=dim) # Target Size eg: 32,124
44
+ else:
45
+ raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
46
+
47
+ # PyTorch implement of np.linspace(start[i], stop[i], num[i], endpoint=False)
48
+ axis_grid = []
49
+ for i in range(dim):
50
+ a, b, n = start[i], stop[i], num[i]
51
+ g = torch.linspace(a, b, n + 1, dtype=torch.float32)[:n]
52
+ axis_grid.append(g)
53
+ grid = torch.meshgrid(*axis_grid, indexing="ij") # dim x [W, H, D]
54
+ grid = torch.stack(grid, dim=0) # [dim, W, H, D]
55
+
56
+ return grid
57
+
58
+
59
+ #################################################################################
60
+ # Rotary Positional Embedding Functions #
61
+ #################################################################################
62
+ # https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L80
63
+
64
+
65
+ def reshape_for_broadcast(
66
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
67
+ x: torch.Tensor,
68
+ head_first=False,
69
+ ):
70
+ """
71
+ Reshape frequency tensor for broadcasting it with another tensor.
72
+
73
+ This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
74
+ for the purpose of broadcasting the frequency tensor during element-wise operations.
75
+
76
+ Notes:
77
+ When using FlashMHAModified, head_first should be False.
78
+ When using Attention, head_first should be True.
79
+
80
+ Args:
81
+ freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped.
82
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
83
+ head_first (bool): head dimension first (except batch dim) or not.
84
+
85
+ Returns:
86
+ torch.Tensor: Reshaped frequency tensor.
87
+
88
+ Raises:
89
+ AssertionError: If the frequency tensor doesn't match the expected shape.
90
+ AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
91
+ """
92
+ ndim = x.ndim
93
+ assert 0 <= 1 < ndim
94
+
95
+ if isinstance(freqs_cis, tuple):
96
+ # freqs_cis: (cos, sin) in real space
97
+ if head_first:
98
+ assert freqs_cis[0].shape == (
99
+ x.shape[-2],
100
+ x.shape[-1],
101
+ ), f"freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}"
102
+ shape = [
103
+ d if i == ndim - 2 or i == ndim - 1 else 1
104
+ for i, d in enumerate(x.shape)
105
+ ]
106
+ else:
107
+ assert freqs_cis[0].shape == (
108
+ x.shape[1],
109
+ x.shape[-1],
110
+ ), f"freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}"
111
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
112
+ return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
113
+ else:
114
+ # freqs_cis: values in complex space
115
+ if head_first:
116
+ assert freqs_cis.shape == (
117
+ x.shape[-2],
118
+ x.shape[-1],
119
+ ), f"freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}"
120
+ shape = [
121
+ d if i == ndim - 2 or i == ndim - 1 else 1
122
+ for i, d in enumerate(x.shape)
123
+ ]
124
+ else:
125
+ assert freqs_cis.shape == (
126
+ x.shape[1],
127
+ x.shape[-1],
128
+ ), f"freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}"
129
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
130
+ return freqs_cis.view(*shape)
131
+
132
+
133
+ def rotate_half(x):
134
+ x_real, x_imag = (
135
+ x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
136
+ ) # [B, S, H, D//2]
137
+ return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
138
+
139
+
140
+ def apply_rotary_emb(
141
+ xq: torch.Tensor,
142
+ xk: torch.Tensor,
143
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
144
+ head_first: bool = False,
145
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
146
+ """
147
+ Apply rotary embeddings to input tensors using the given frequency tensor.
148
+
149
+ This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
150
+ frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
151
+ is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
152
+ returned as real tensors.
153
+
154
+ Args:
155
+ xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
156
+ xk (torch.Tensor): Key tensor to apply rotary embeddings. [B, S, H, D]
157
+ freqs_cis (torch.Tensor or tuple): Precomputed frequency tensor for complex exponential.
158
+ head_first (bool): head dimension first (except batch dim) or not.
159
+
160
+ Returns:
161
+ Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
162
+
163
+ """
164
+ xk_out = None
165
+ if isinstance(freqs_cis, tuple):
166
+ cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first) # [S, D]
167
+ cos, sin = cos.to(xq.device), sin.to(xq.device)
168
+ # real * cos - imag * sin
169
+ # imag * cos + real * sin
170
+ xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq)
171
+ xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk)
172
+ else:
173
+ # view_as_complex will pack [..., D/2, 2](real) to [..., D/2](complex)
174
+ xq_ = torch.view_as_complex(
175
+ xq.float().reshape(*xq.shape[:-1], -1, 2)
176
+ ) # [B, S, H, D//2]
177
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(
178
+ xq.device
179
+ ) # [S, D//2] --> [1, S, 1, D//2]
180
+ # (real, imag) * (cos, sin) = (real * cos - imag * sin, imag * cos + real * sin)
181
+ # view_as_real will expand [..., D/2](complex) to [..., D/2, 2](real)
182
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
183
+ xk_ = torch.view_as_complex(
184
+ xk.float().reshape(*xk.shape[:-1], -1, 2)
185
+ ) # [B, S, H, D//2]
186
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
187
+
188
+ return xq_out, xk_out
189
+
190
+
191
+ def get_nd_rotary_pos_embed(
192
+ rope_dim_list,
193
+ start,
194
+ *args,
195
+ theta=10000.0,
196
+ use_real=False,
197
+ theta_rescale_factor: Union[float, List[float]] = 1.0,
198
+ interpolation_factor: Union[float, List[float]] = 1.0,
199
+ ):
200
+ """
201
+ This is a n-d version of precompute_freqs_cis, which is a RoPE for tokens with n-d structure.
202
+
203
+ Args:
204
+ rope_dim_list (list of int): Dimension of each rope. len(rope_dim_list) should equal to n.
205
+ sum(rope_dim_list) should equal to head_dim of attention layer.
206
+ start (int | tuple of int | list of int): If len(args) == 0, start is num; If len(args) == 1, start is start,
207
+ args[0] is stop, step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num.
208
+ *args: See above.
209
+ theta (float): Scaling factor for frequency computation. Defaults to 10000.0.
210
+ use_real (bool): If True, return real part and imaginary part separately. Otherwise, return complex numbers.
211
+ Some libraries such as TensorRT does not support complex64 data type. So it is useful to provide a real
212
+ part and an imaginary part separately.
213
+ theta_rescale_factor (float): Rescale factor for theta. Defaults to 1.0.
214
+
215
+ Returns:
216
+ pos_embed (torch.Tensor): [HW, D/2]
217
+ """
218
+
219
+ grid = get_meshgrid_nd(
220
+ start, *args, dim=len(rope_dim_list)
221
+ ) # [3, W, H, D] / [2, W, H]
222
+
223
+ if isinstance(theta_rescale_factor, int) or isinstance(theta_rescale_factor, float):
224
+ theta_rescale_factor = [theta_rescale_factor] * len(rope_dim_list)
225
+ elif isinstance(theta_rescale_factor, list) and len(theta_rescale_factor) == 1:
226
+ theta_rescale_factor = [theta_rescale_factor[0]] * len(rope_dim_list)
227
+ assert len(theta_rescale_factor) == len(
228
+ rope_dim_list
229
+ ), "len(theta_rescale_factor) should equal to len(rope_dim_list)"
230
+
231
+ if isinstance(interpolation_factor, int) or isinstance(interpolation_factor, float):
232
+ interpolation_factor = [interpolation_factor] * len(rope_dim_list)
233
+ elif isinstance(interpolation_factor, list) and len(interpolation_factor) == 1:
234
+ interpolation_factor = [interpolation_factor[0]] * len(rope_dim_list)
235
+ assert len(interpolation_factor) == len(
236
+ rope_dim_list
237
+ ), "len(interpolation_factor) should equal to len(rope_dim_list)"
238
+
239
+ # use 1/ndim of dimensions to encode grid_axis
240
+ embs = []
241
+ for i in range(len(rope_dim_list)):
242
+ emb = get_1d_rotary_pos_embed(
243
+ rope_dim_list[i],
244
+ grid[i].reshape(-1),
245
+ theta,
246
+ use_real=use_real,
247
+ theta_rescale_factor=theta_rescale_factor[i],
248
+ interpolation_factor=interpolation_factor[i],
249
+ ) # 2 x [WHD, rope_dim_list[i]]
250
+ embs.append(emb)
251
+
252
+ if use_real:
253
+ cos = torch.cat([emb[0] for emb in embs], dim=1) # (WHD, D/2)
254
+ sin = torch.cat([emb[1] for emb in embs], dim=1) # (WHD, D/2)
255
+ return cos, sin
256
+ else:
257
+ emb = torch.cat(embs, dim=1) # (WHD, D/2)
258
+ return emb
259
+
260
+
261
+ def get_1d_rotary_pos_embed(
262
+ dim: int,
263
+ pos: Union[torch.FloatTensor, int],
264
+ theta: float = 10000.0,
265
+ use_real: bool = False,
266
+ theta_rescale_factor: float = 1.0,
267
+ interpolation_factor: float = 1.0,
268
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
269
+ """
270
+ Precompute the frequency tensor for complex exponential (cis) with given dimensions.
271
+ (Note: `cis` means `cos + i * sin`, where i is the imaginary unit.)
272
+
273
+ This function calculates a frequency tensor with complex exponential using the given dimension 'dim'
274
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
275
+ The returned tensor contains complex values in complex64 data type.
276
+
277
+ Args:
278
+ dim (int): Dimension of the frequency tensor.
279
+ pos (int or torch.FloatTensor): Position indices for the frequency tensor. [S] or scalar
280
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
281
+ use_real (bool, optional): If True, return real part and imaginary part separately.
282
+ Otherwise, return complex numbers.
283
+ theta_rescale_factor (float, optional): Rescale factor for theta. Defaults to 1.0.
284
+
285
+ Returns:
286
+ freqs_cis: Precomputed frequency tensor with complex exponential. [S, D/2]
287
+ freqs_cos, freqs_sin: Precomputed frequency tensor with real and imaginary parts separately. [S, D]
288
+ """
289
+ if isinstance(pos, int):
290
+ pos = torch.arange(pos).float()
291
+
292
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
293
+ # has some connection to NTK literature
294
+ if theta_rescale_factor != 1.0:
295
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
296
+
297
+ freqs = 1.0 / (
298
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
299
+ ) # [D/2]
300
+ # assert interpolation_factor == 1.0, f"interpolation_factor: {interpolation_factor}"
301
+ freqs = torch.outer(pos * interpolation_factor, freqs) # [S, D/2]
302
+ if use_real:
303
+ freqs_cos = freqs.cos().repeat_interleave(2, dim=1) # [S, D]
304
+ freqs_sin = freqs.sin().repeat_interleave(2, dim=1) # [S, D]
305
+ return freqs_cos, freqs_sin
306
+ else:
307
+ freqs_cis = torch.polar(
308
+ torch.ones_like(freqs), freqs
309
+ ) # complex64 # [S, D/2]
310
+ return freqs_cis
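A minimal usage sketch for the RoPE helpers above. The grid size, head count, and the 16/56/56 split of a 128-dim head are illustrative assumptions, not values read from this repository's configs.

```python
import torch
from hunyuan_model.posemb_layers import get_nd_rotary_pos_embed, apply_rotary_emb

rope_dim_list = [16, 56, 56]      # per-axis RoPE dims; sum = head_dim = 128 (assumed)
grid_size = (4, 16, 16)           # (T, H, W) latent token grid -> 1024 tokens (assumed)

freqs_cos, freqs_sin = get_nd_rotary_pos_embed(rope_dim_list, grid_size, use_real=True)
# freqs_cos, freqs_sin: [T*H*W, sum(rope_dim_list)] = [1024, 128]

q = torch.randn(1, 1024, 8, 128)  # [B, S, num_heads, head_dim], head_first=False
k = torch.randn(1, 1024, 8, 128)
q_rot, k_rot = apply_rotary_emb(q, k, (freqs_cos, freqs_sin), head_first=False)
print(q_rot.shape, k_rot.shape)   # both torch.Size([1, 1024, 8, 128])
```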
hunyuan_model/text_encoder.py ADDED
@@ -0,0 +1,710 @@
1
+ from dataclasses import dataclass
2
+ import json
3
+ import os
4
+ from typing import Optional, Tuple, Union
5
+ from copy import deepcopy
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import (
10
+ CLIPTextModel,
11
+ CLIPTokenizer,
12
+ AutoTokenizer,
13
+ AutoModel,
14
+ CLIPConfig,
15
+ LlamaForCausalLM,
16
+ LlamaConfig,
17
+ )
18
+ from transformers.utils import ModelOutput
19
+ from transformers.models.llama import LlamaModel
20
+ from safetensors.torch import load_file
21
+ from accelerate import init_empty_weights
22
+
23
+ import logging
24
+
25
+ logger = logging.getLogger(__name__)
26
+ logging.basicConfig(level=logging.INFO)
27
+
28
+
29
+ CLIP_L_HUGGINGFACE_MODEL_ID = "openai/clip-vit-large-patch14"
30
+ LLAVA_HUGGINGFACE_MODEL_ID = "xtuner/llava-llama-3-8b-v1_1-transformers"
31
+
32
+ CLIP_CONFIG = {
33
+ "_name_or_path": "clip-vit-large-patch14/",
34
+ "architectures": ["CLIPModel"],
35
+ "initializer_factor": 1.0,
36
+ "logit_scale_init_value": 2.6592,
37
+ "model_type": "clip",
38
+ "projection_dim": 768,
39
+ # "text_config": {
40
+ "_name_or_path": "",
41
+ "add_cross_attention": False,
42
+ "architectures": None,
43
+ "attention_dropout": 0.0,
44
+ "bad_words_ids": None,
45
+ "bos_token_id": 0,
46
+ "chunk_size_feed_forward": 0,
47
+ "cross_attention_hidden_size": None,
48
+ "decoder_start_token_id": None,
49
+ "diversity_penalty": 0.0,
50
+ "do_sample": False,
51
+ "dropout": 0.0,
52
+ "early_stopping": False,
53
+ "encoder_no_repeat_ngram_size": 0,
54
+ "eos_token_id": 2,
55
+ "finetuning_task": None,
56
+ "forced_bos_token_id": None,
57
+ "forced_eos_token_id": None,
58
+ "hidden_act": "quick_gelu",
59
+ "hidden_size": 768,
60
+ "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
61
+ "initializer_factor": 1.0,
62
+ "initializer_range": 0.02,
63
+ "intermediate_size": 3072,
64
+ "is_decoder": False,
65
+ "is_encoder_decoder": False,
66
+ "label2id": {"LABEL_0": 0, "LABEL_1": 1},
67
+ "layer_norm_eps": 1e-05,
68
+ "length_penalty": 1.0,
69
+ "max_length": 20,
70
+ "max_position_embeddings": 77,
71
+ "min_length": 0,
72
+ "model_type": "clip_text_model",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 12,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 12,
78
+ "num_return_sequences": 1,
79
+ "output_attentions": False,
80
+ "output_hidden_states": False,
81
+ "output_scores": False,
82
+ "pad_token_id": 1,
83
+ "prefix": None,
84
+ "problem_type": None,
85
+ "projection_dim": 768,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": False,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": True,
90
+ "return_dict_in_generate": False,
91
+ "sep_token_id": None,
92
+ "task_specific_params": None,
93
+ "temperature": 1.0,
94
+ "tie_encoder_decoder": False,
95
+ "tie_word_embeddings": True,
96
+ "tokenizer_class": None,
97
+ "top_k": 50,
98
+ "top_p": 1.0,
99
+ "torch_dtype": None,
100
+ "torchscript": False,
101
+ "transformers_version": "4.16.0.dev0",
102
+ "use_bfloat16": False,
103
+ "vocab_size": 49408,
104
+ # },
105
+ # "text_config_dict": {
106
+ "hidden_size": 768,
107
+ "intermediate_size": 3072,
108
+ "num_attention_heads": 12,
109
+ "num_hidden_layers": 12,
110
+ "projection_dim": 768,
111
+ # },
112
+ # "torch_dtype": "float32",
113
+ # "transformers_version": null
114
+ }
115
+
116
+ LLAMA_CONFIG = {
117
+ "architectures": ["LlamaForCausalLM"],
118
+ "attention_bias": False,
119
+ "attention_dropout": 0.0,
120
+ "bos_token_id": 128000,
121
+ "eos_token_id": 128001,
122
+ "head_dim": 128,
123
+ "hidden_act": "silu",
124
+ "hidden_size": 4096,
125
+ "initializer_range": 0.02,
126
+ "intermediate_size": 14336,
127
+ "max_position_embeddings": 8192,
128
+ "mlp_bias": False,
129
+ "model_type": "llama",
130
+ "num_attention_heads": 32,
131
+ "num_hidden_layers": 32,
132
+ "num_key_value_heads": 8,
133
+ "pretraining_tp": 1,
134
+ "rms_norm_eps": 1e-05,
135
+ "rope_scaling": None,
136
+ "rope_theta": 500000.0,
137
+ "tie_word_embeddings": False,
138
+ "torch_dtype": "float16",
139
+ "transformers_version": "4.46.3",
140
+ "use_cache": True,
141
+ "vocab_size": 128320,
142
+ }
143
+
144
+ # When using decoder-only models, we must provide a prompt template to instruct the text encoder
145
+ # on how to interpret the input text.
146
+ # --------------------------------------------------------------------
147
+ PROMPT_TEMPLATE_ENCODE = (
148
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
149
+ "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
150
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
151
+ )
152
+ PROMPT_TEMPLATE_ENCODE_VIDEO = (
153
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
154
+ "1. The main content and theme of the video."
155
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
156
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
157
+ "4. background environment, light, style and atmosphere."
158
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
159
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
160
+ )
161
+
162
+ NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
163
+
164
+ PROMPT_TEMPLATE = {
165
+ "dit-llm-encode": {
166
+ "template": PROMPT_TEMPLATE_ENCODE,
167
+ "crop_start": 36,
168
+ },
169
+ "dit-llm-encode-video": {
170
+ "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
171
+ "crop_start": 95,
172
+ },
173
+ }
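To illustrate how these templates are consumed (the user prompt below is made up): `TextEncoder.apply_text_to_template` further down in this file simply calls `template.format(text)`, and `crop_start` is presumably the number of leading template tokens later cropped from the LLM hidden states so that only the user text's embeddings remain.

```python
user_prompt = "a dancer spinning under neon lights"  # illustrative prompt
entry = PROMPT_TEMPLATE["dit-llm-encode-video"]
full_prompt = entry["template"].format(user_prompt)
# full_prompt now wraps the user text in the system/user chat markup above;
# entry["crop_start"] == 95 marks how much of that prefix is dropped downstream.
```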
174
+
175
+
176
+ def use_default(value, default):
177
+ return value if value is not None else default
178
+
179
+
180
+ def load_clip_l(text_encoder_path: str, dtype: Optional[Union[str, torch.dtype]] = None):
181
+ if os.path.isdir(text_encoder_path):
182
+ # load from directory, configs are in the directory
183
+ text_encoder = CLIPTextModel.from_pretrained(text_encoder_path, torch_dtype=dtype)
184
+ else:
185
+ # load from file, we create the model with the appropriate config
186
+ config = CLIPConfig(**CLIP_CONFIG)
187
+ with init_empty_weights():
188
+ text_encoder = CLIPTextModel._from_config(config, torch_dtype=dtype)
189
+
190
+ state_dict = load_file(text_encoder_path)
191
+
192
+ text_encoder.load_state_dict(state_dict, strict=True, assign=True)
193
+ # if dtype is not None:
194
+ # text_encoder.to(dtype=dtype)
195
+
196
+ return text_encoder
197
+
198
+
199
+ def load_clip_l_tokenizer(tokenizer_path: str):
200
+ if os.path.isdir(tokenizer_path):
201
+ tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path, max_length=77)
202
+ else:
203
+ # load from Hugging Face
204
+ logger.info(f"Loading tokenizer from Hugging Face: {CLIP_L_HUGGINGFACE_MODEL_ID}")
205
+ tokenizer = CLIPTokenizer.from_pretrained(CLIP_L_HUGGINGFACE_MODEL_ID, max_length=77)
206
+
207
+ return tokenizer
208
+
209
+
210
+ def load_llm(text_encoder_path: str, dtype: Optional[Union[str, torch.dtype]] = None):
211
+ if os.path.isdir(text_encoder_path):
212
+ # load from directory, configs are in the directory
213
+ text_encoder = AutoModel.from_pretrained(text_encoder_path, low_cpu_mem_usage=True, torch_dtype=dtype)
214
+ else:
215
+ # load from file, we create the model with the appropriate config
216
+ config = LlamaConfig(**LLAMA_CONFIG)
217
+ with init_empty_weights():
218
+ text_encoder = LlamaForCausalLM._from_config(config, torch_dtype=dtype)
219
+
220
+ state_dict = load_file(text_encoder_path)
221
+
222
+ # support weights from ComfyUI
223
+ if "tokenizer" in state_dict:
224
+ state_dict.pop("tokenizer")
225
+
226
+ text_encoder.load_state_dict(state_dict, strict=True, assign=True)
227
+
228
+ return text_encoder
229
+
230
+
231
+ def load_llm_tokenizer(tokenizer_path: str, padding_side="right"):
232
+ if os.path.isdir(tokenizer_path):
233
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
234
+ else:
235
+ # load from Hugging Face
236
+ logger.info(f"Loading tokenizer from Hugging Face: {LLAVA_HUGGINGFACE_MODEL_ID}")
237
+ tokenizer = AutoTokenizer.from_pretrained(LLAVA_HUGGINGFACE_MODEL_ID, padding_side=padding_side)
238
+
239
+ return tokenizer
240
+
241
+
242
+ def load_text_encoder(
243
+ text_encoder_type: str,
244
+ text_encoder_path: str,
245
+ text_encoder_dtype: Optional[Union[str, torch.dtype]] = None,
246
+ ):
247
+ logger.info(f"Loading text encoder model ({text_encoder_type}) from: {text_encoder_path}")
248
+
249
+ # reduce peak memory usage by specifying the dtype of the model
250
+ dtype = text_encoder_dtype
251
+ if text_encoder_type == "clipL":
252
+ text_encoder = load_clip_l(text_encoder_path, dtype=dtype)
253
+ text_encoder.final_layer_norm = text_encoder.text_model.final_layer_norm
254
+ elif text_encoder_type == "llm":
255
+ text_encoder = load_llm(text_encoder_path, dtype=dtype)
256
+ if hasattr(text_encoder, "norm"):
257
+ text_encoder.final_layer_norm = text_encoder.norm # by from_pretrained
258
+ else:
259
+ text_encoder.final_layer_norm = text_encoder.model.norm # by _from_config
260
+ else:
261
+ raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
262
+ # from_pretrained will ensure that the model is in eval mode.
263
+
264
+ if dtype is not None:
265
+ text_encoder = text_encoder.to(dtype=dtype)
266
+
267
+ text_encoder.requires_grad_(False)
268
+
269
+ logger.info(f"Text encoder to dtype: {text_encoder.dtype}")
270
+ return text_encoder, text_encoder_path
271
+
272
+
273
+ def load_tokenizer(tokenizer_type, tokenizer_path=None, padding_side="right"):
274
+ logger.info(f"Loading tokenizer ({tokenizer_type}) from: {tokenizer_path}")
275
+
276
+ if tokenizer_type == "clipL":
277
+ tokenizer = load_clip_l_tokenizer(tokenizer_path)
278
+ elif tokenizer_type == "llm":
279
+ tokenizer = load_llm_tokenizer(tokenizer_path, padding_side=padding_side)
280
+ else:
281
+ raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
282
+
283
+ return tokenizer, tokenizer_path
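A hedged loading sketch using the two helpers above; the file paths are placeholders. When a single `.safetensors` file is passed, these functions fall back to the built-in configs and the Hugging Face tokenizer IDs, as shown in their bodies.

```python
import torch

# CLIP-L text encoder and tokenizer (paths are placeholders)
clip_l, _ = load_text_encoder("clipL", "/path/to/clip_l.safetensors", torch.float16)
clip_tok, _ = load_tokenizer("clipL", "/path/to/clip_l.safetensors")

# LLaMA-based LLM text encoder and tokenizer
llm, _ = load_text_encoder("llm", "/path/to/llava_llama3_fp16.safetensors", torch.float16)
llm_tok, _ = load_tokenizer("llm", "/path/to/llava_llama3_fp16.safetensors", padding_side="right")
```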
284
+
285
+
286
+ @dataclass
287
+ class TextEncoderModelOutput(ModelOutput):
288
+ """
289
+ Base class for model's outputs that also contains a pooling of the last hidden states.
290
+
291
+ Args:
292
+ hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
293
+ Sequence of hidden-states at the output of the last layer of the model.
294
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
295
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
296
+ hidden_states_list (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed):
297
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
298
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
299
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
300
+ text_outputs (`list`, *optional*, returned when `return_texts=True` is passed):
301
+ List of decoded texts.
302
+ """
303
+
304
+ hidden_state: torch.FloatTensor = None
305
+ attention_mask: Optional[torch.LongTensor] = None
306
+ hidden_states_list: Optional[Tuple[torch.FloatTensor, ...]] = None
307
+ text_outputs: Optional[list] = None
308
+
309
+
310
+ class TextEncoder(nn.Module):
311
+ def __init__(
312
+ self,
313
+ text_encoder_type: str,
314
+ max_length: int,
315
+ text_encoder_dtype: Optional[Union[str, torch.dtype]] = None,
316
+ text_encoder_path: Optional[str] = None,
317
+ tokenizer_type: Optional[str] = None,
318
+ tokenizer_path: Optional[str] = None,
319
+ output_key: Optional[str] = None,
320
+ use_attention_mask: bool = True,
321
+ input_max_length: Optional[int] = None,
322
+ prompt_template: Optional[dict] = None,
323
+ prompt_template_video: Optional[dict] = None,
324
+ hidden_state_skip_layer: Optional[int] = None,
325
+ apply_final_norm: bool = False,
326
+ reproduce: bool = False,
327
+ ):
328
+ super().__init__()
329
+ self.text_encoder_type = text_encoder_type
330
+ self.max_length = max_length
331
+ # self.precision = text_encoder_precision
332
+ self.model_path = text_encoder_path
333
+ self.tokenizer_type = tokenizer_type if tokenizer_type is not None else text_encoder_type
334
+ self.tokenizer_path = tokenizer_path if tokenizer_path is not None else text_encoder_path
335
+ self.use_attention_mask = use_attention_mask
336
+ if prompt_template_video is not None:
337
+ assert use_attention_mask is True, "use_attention_mask must be True when a video prompt template is used."
338
+ self.input_max_length = input_max_length if input_max_length is not None else max_length
339
+ self.prompt_template = prompt_template
340
+ self.prompt_template_video = prompt_template_video
341
+ self.hidden_state_skip_layer = hidden_state_skip_layer
342
+ self.apply_final_norm = apply_final_norm
343
+ self.reproduce = reproduce
344
+
345
+ self.use_template = self.prompt_template is not None
346
+ if self.use_template:
347
+ assert (
348
+ isinstance(self.prompt_template, dict) and "template" in self.prompt_template
349
+ ), f"`prompt_template` must be a dictionary with a key 'template', got {self.prompt_template}"
350
+ assert "{}" in str(self.prompt_template["template"]), (
351
+ "`prompt_template['template']` must contain a placeholder `{}` for the input text, "
352
+ f"got {self.prompt_template['template']}"
353
+ )
354
+
355
+ self.use_video_template = self.prompt_template_video is not None
356
+ if self.use_video_template:
357
+ if self.prompt_template_video is not None:
358
+ assert (
359
+ isinstance(self.prompt_template_video, dict) and "template" in self.prompt_template_video
360
+ ), f"`prompt_template_video` must be a dictionary with a key 'template', got {self.prompt_template_video}"
361
+ assert "{}" in str(self.prompt_template_video["template"]), (
362
+ "`prompt_template_video['template']` must contain a placeholder `{}` for the input text, "
363
+ f"got {self.prompt_template_video['template']}"
364
+ )
365
+
366
+ if "t5" in text_encoder_type:
367
+ self.output_key = output_key or "last_hidden_state"
368
+ elif "clip" in text_encoder_type:
369
+ self.output_key = output_key or "pooler_output"
370
+ elif "llm" in text_encoder_type or "glm" in text_encoder_type:
371
+ self.output_key = output_key or "last_hidden_state"
372
+ else:
373
+ raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
374
+
375
+ self.model, self.model_path = load_text_encoder(
376
+ text_encoder_type=self.text_encoder_type, text_encoder_path=self.model_path, text_encoder_dtype=text_encoder_dtype
377
+ )
378
+ self.dtype = self.model.dtype
379
+
380
+ self.tokenizer, self.tokenizer_path = load_tokenizer(
381
+ tokenizer_type=self.tokenizer_type, tokenizer_path=self.tokenizer_path, padding_side="right"
382
+ )
383
+
384
+ def __repr__(self):
385
+ return f"{self.text_encoder_type} ({self.precision} - {self.model_path})"
386
+
387
+ @property
388
+ def device(self):
389
+ return self.model.device
390
+
391
+ @staticmethod
392
+ def apply_text_to_template(text, template, prevent_empty_text=True):
393
+ """
394
+ Apply text to template.
395
+
396
+ Args:
397
+ text (str): Input text.
398
+ template (str or list): Template string or list of chat conversation messages.
399
+ prevent_empty_text (bool): If True, prevent the user text from being empty
400
+ by adding a space. Defaults to True.
401
+ """
402
+ if isinstance(template, str):
403
+ # Will send string to tokenizer. Used for llm
404
+ return template.format(text)
405
+ else:
406
+ raise TypeError(f"Unsupported template type: {type(template)}")
407
+
408
+ def text2tokens(self, text, data_type="image"):
409
+ """
410
+ Tokenize the input text.
411
+
412
+ Args:
413
+ text (str or list): Input text.
414
+ """
415
+ tokenize_input_type = "str"
416
+ if self.use_template:
417
+ if data_type == "image":
418
+ prompt_template = self.prompt_template["template"]
419
+ elif data_type == "video":
420
+ prompt_template = self.prompt_template_video["template"]
421
+ else:
422
+ raise ValueError(f"Unsupported data type: {data_type}")
423
+ if isinstance(text, (list, tuple)):
424
+ text = [self.apply_text_to_template(one_text, prompt_template) for one_text in text]
425
+ if isinstance(text[0], list):
426
+ tokenize_input_type = "list"
427
+ elif isinstance(text, str):
428
+ text = self.apply_text_to_template(text, prompt_template)
429
+ if isinstance(text, list):
430
+ tokenize_input_type = "list"
431
+ else:
432
+ raise TypeError(f"Unsupported text type: {type(text)}")
433
+
434
+ kwargs = dict(
435
+ truncation=True,
436
+ max_length=self.max_length,
437
+ padding="max_length",
438
+ return_tensors="pt",
439
+ )
440
+ if tokenize_input_type == "str":
441
+ return self.tokenizer(
442
+ text,
443
+ return_length=False,
444
+ return_overflowing_tokens=False,
445
+ return_attention_mask=True,
446
+ **kwargs,
447
+ )
448
+ elif tokenize_input_type == "list":
449
+ return self.tokenizer.apply_chat_template(
450
+ text,
451
+ add_generation_prompt=True,
452
+ tokenize=True,
453
+ return_dict=True,
454
+ **kwargs,
455
+ )
456
+ else:
457
+ raise ValueError(f"Unsupported tokenize_input_type: {tokenize_input_type}")
458
+
459
+ def encode(
460
+ self,
461
+ batch_encoding,
462
+ use_attention_mask=None,
463
+ output_hidden_states=False,
464
+ do_sample=None,
465
+ hidden_state_skip_layer=None,
466
+ return_texts=False,
467
+ data_type="image",
468
+ device=None,
469
+ ):
470
+ """
471
+ Args:
472
+ batch_encoding (dict): Batch encoding from tokenizer.
473
+ use_attention_mask (bool): Whether to use attention mask. If None, use self.use_attention_mask.
474
+ Defaults to None.
475
+ output_hidden_states (bool): Whether to output hidden states. If False, return the value of
476
+ self.output_key. If True, return the entire output. If set self.hidden_state_skip_layer,
477
+ output_hidden_states will be set True. Defaults to False.
478
+ do_sample (bool): Whether to sample from the model. Used for Decoder-Only LLMs. Defaults to None.
479
+ When self.reproduce is False, do_sample is set to True by default.
480
+ hidden_state_skip_layer (int): Number of layers to skip from the end when selecting the hidden state; 0 means the last layer.
482
+ If None, self.hidden_state_skip_layer is used; if that is also None, the output selected by self.output_key is returned. Defaults to None.
482
+ return_texts (bool): Whether to return the decoded texts. Defaults to False.
483
+ """
484
+ device = self.model.device if device is None else device
485
+ use_attention_mask = use_default(use_attention_mask, self.use_attention_mask)
486
+ hidden_state_skip_layer = use_default(hidden_state_skip_layer, self.hidden_state_skip_layer)
487
+ do_sample = use_default(do_sample, not self.reproduce)
488
+ attention_mask = batch_encoding["attention_mask"].to(device) if use_attention_mask else None
489
+ outputs = self.model(
490
+ input_ids=batch_encoding["input_ids"].to(device),
491
+ attention_mask=attention_mask,
492
+ output_hidden_states=output_hidden_states or hidden_state_skip_layer is not None,
493
+ )
494
+ if hidden_state_skip_layer is not None:
495
+ last_hidden_state = outputs.hidden_states[-(hidden_state_skip_layer + 1)]
496
+ # Real last hidden state already has layer norm applied. So here we only apply it
497
+ # for intermediate layers.
498
+ if hidden_state_skip_layer > 0 and self.apply_final_norm:
499
+ last_hidden_state = self.model.final_layer_norm(last_hidden_state)
500
+ else:
501
+ last_hidden_state = outputs[self.output_key]
502
+
503
+ # Remove hidden states of instruction tokens, only keep prompt tokens.
504
+ if self.use_template:
505
+ if data_type == "image":
506
+ crop_start = self.prompt_template.get("crop_start", -1)
507
+ elif data_type == "video":
508
+ crop_start = self.prompt_template_video.get("crop_start", -1)
509
+ else:
510
+ raise ValueError(f"Unsupported data type: {data_type}")
511
+ if crop_start > 0:
512
+ last_hidden_state = last_hidden_state[:, crop_start:]
513
+ attention_mask = attention_mask[:, crop_start:] if use_attention_mask else None
514
+
515
+ if output_hidden_states:
516
+ return TextEncoderModelOutput(last_hidden_state, attention_mask, outputs.hidden_states)
517
+ return TextEncoderModelOutput(last_hidden_state, attention_mask)
518
+
519
+ def forward(
520
+ self,
521
+ text,
522
+ use_attention_mask=None,
523
+ output_hidden_states=False,
524
+ do_sample=False,
525
+ hidden_state_skip_layer=None,
526
+ return_texts=False,
527
+ ):
528
+ batch_encoding = self.text2tokens(text)
529
+ return self.encode(
530
+ batch_encoding,
531
+ use_attention_mask=use_attention_mask,
532
+ output_hidden_states=output_hidden_states,
533
+ do_sample=do_sample,
534
+ hidden_state_skip_layer=hidden_state_skip_layer,
535
+ return_texts=return_texts,
536
+ )
537
+
538
+
539
+ # region HunyuanVideo architecture
540
+
541
+
542
+ def load_text_encoder_1(
543
+ text_encoder_dir: str, device: torch.device, fp8_llm: bool, dtype: Optional[Union[str, torch.dtype]] = None
544
+ ) -> TextEncoder:
545
+ text_encoder_dtype = dtype or torch.float16
546
+ text_encoder_type = "llm"
547
+ text_len = 256
548
+ hidden_state_skip_layer = 2
549
+ apply_final_norm = False
550
+ reproduce = False
551
+
552
+ prompt_template = "dit-llm-encode"
553
+ prompt_template = PROMPT_TEMPLATE[prompt_template]
554
+ prompt_template_video = "dit-llm-encode-video"
555
+ prompt_template_video = PROMPT_TEMPLATE[prompt_template_video]
556
+
557
+ crop_start = prompt_template_video["crop_start"] # .get("crop_start", 0)
558
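+ # the template prefix (crop_start tokens) is cropped off after encoding, so max_length covers the template plus 256 prompt tokens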
+ max_length = text_len + crop_start
559
+
560
+ text_encoder_1 = TextEncoder(
561
+ text_encoder_type=text_encoder_type,
562
+ max_length=max_length,
563
+ text_encoder_dtype=text_encoder_dtype,
564
+ text_encoder_path=text_encoder_dir,
565
+ tokenizer_type=text_encoder_type,
566
+ prompt_template=prompt_template,
567
+ prompt_template_video=prompt_template_video,
568
+ hidden_state_skip_layer=hidden_state_skip_layer,
569
+ apply_final_norm=apply_final_norm,
570
+ reproduce=reproduce,
571
+ )
572
+ text_encoder_1.eval()
573
+
574
+ if fp8_llm:
575
+ org_dtype = text_encoder_1.dtype
576
+ logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
577
+ text_encoder_1.to(device=device, dtype=torch.float8_e4m3fn)
578
+
579
+ # prepare LLM for fp8
580
+ def prepare_fp8(llama_model: LlamaModel, target_dtype):
581
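+ # After the whole model is cast to fp8, move Embedding back to the original dtype and patch
+ # LlamaRMSNorm to compute in fp32, so only the Linear weights remain in fp8.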
+ def forward_hook(module):
582
+ def forward(hidden_states):
583
+ input_dtype = hidden_states.dtype
584
+ hidden_states = hidden_states.to(torch.float32)
585
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
586
+ hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
587
+ return module.weight.to(input_dtype) * hidden_states.to(input_dtype)
588
+
589
+ return forward
590
+
591
+ for module in llama_model.modules():
592
+ if module.__class__.__name__ in ["Embedding"]:
593
+ # print("set", module.__class__.__name__, "to", target_dtype)
594
+ module.to(target_dtype)
595
+ if module.__class__.__name__ in ["LlamaRMSNorm"]:
596
+ # print("set", module.__class__.__name__, "hooks")
597
+ module.forward = forward_hook(module)
598
+
599
+ prepare_fp8(text_encoder_1.model, org_dtype)
600
+ else:
601
+ text_encoder_1.to(device=device)
602
+
603
+ return text_encoder_1
604
+
605
+
606
+ def load_text_encoder_2(
607
+ text_encoder_dir: str, device: torch.device, dtype: Optional[Union[str, torch.dtype]] = None
608
+ ) -> TextEncoder:
609
+ text_encoder_dtype = dtype or torch.float16
610
+ reproduce = False
611
+
612
+ text_encoder_2_type = "clipL"
613
+ text_len_2 = 77
614
+
615
+ text_encoder_2 = TextEncoder(
616
+ text_encoder_type=text_encoder_2_type,
617
+ max_length=text_len_2,
618
+ text_encoder_dtype=text_encoder_dtype,
619
+ text_encoder_path=text_encoder_dir,
620
+ tokenizer_type=text_encoder_2_type,
621
+ reproduce=reproduce,
622
+ )
623
+ text_encoder_2.eval()
624
+
625
+ text_encoder_2.to(device=device)
626
+
627
+ return text_encoder_2
628
+
629
+
630
+ # endregion
631
+
632
+
633
+ if __name__ == "__main__":
634
+ import argparse
635
+ from utils.model_utils import str_to_dtype
636
+
637
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
638
+
639
+ parser = argparse.ArgumentParser()
640
+ parser.add_argument("type", type=str, help="Text Encoder type")
641
+ parser.add_argument("path1", type=str, help="Text Encoder directory or file 1")
642
+ parser.add_argument("path2", type=str, help="Text Encoder directory or file 2")
643
+ parser.add_argument("--dtype", type=str, default=None, help="Data type for Text Encoder")
644
+ args = parser.parse_args()
645
+
646
+ dtype = str_to_dtype(args.dtype) if args.dtype is not None else torch.float16
647
+
648
+ """
649
+ if args.type == "clipL":
650
+ text_encoder_1st = load_clip_l(args.path1, dtype=dtype)
651
+ tokenizer_1st = load_clip_l_tokenizer(args.path1)
652
+ text_encoder_2nd = load_clip_l(args.path2, dtype=dtype)
653
+ tokenizer_2nd = load_clip_l_tokenizer(args.path2)
654
+ elif args.type == "llm":
655
+ text_encoder_1st = load_llm(args.path1, dtype=dtype)
656
+ tokenizer_1st = load_llm_tokenizer(args.path1)
657
+ text_encoder_2nd = load_llm(args.path2, dtype=dtype)
658
+ tokenizer_2nd = load_llm_tokenizer(args.path2)
659
+
660
+ print(f"1st Text Encoder dtype: {text_encoder_1st.dtype}")
661
+ print(f"2nd Text Encoder dtype: {text_encoder_2nd.dtype}")
662
+
663
+ text_encoder_1st.to(device=device)
664
+ text_encoder_2nd.to(device=device)
665
+
666
+ test_text = "A cat sitting on a table"
667
+ token_ids_1st = tokenizer_1st(test_text, return_tensors="pt")["input_ids"]
668
+ token_ids_2nd = tokenizer_2nd(test_text, return_tensors="pt")["input_ids"]
669
+ assert torch.allclose(token_ids_1st, token_ids_2nd)
670
+ print(f"Token IDs are the same: {token_ids_1st}")
671
+
672
+ with torch.no_grad():
673
+ text_encoder_1st_output = text_encoder_1st(token_ids_1st.to(device), output_hidden_states=True)
674
+ text_encoder_2nd_output = text_encoder_2nd(token_ids_2nd.to(device), output_hidden_states=True)
675
+ print(f"1st Text Encoder output keys: {text_encoder_1st_output.keys()}")
676
+ print(f"2nd Text Encoder output keys: {text_encoder_2nd_output.keys()}")
677
+ for key in text_encoder_1st_output:
678
+ print(f"Checking output: {key}")
679
+ assert key in text_encoder_2nd_output, f"Key {key} not in 2nd Text Encoder output"
680
+ assert torch.allclose(text_encoder_1st_output[key], text_encoder_2nd_output[key])
681
+ print(f"Outputs are the same: {key}")
682
+ print("All outputs are the same.")
683
+ """
684
+
685
+ if args.type == "clipL":
686
+ text_encoder_1st = load_text_encoder_2(args.path1, device, dtype)
687
+ text_encoder_2nd = load_text_encoder_2(args.path2, device, dtype)
688
+ elif args.type == "llm":
689
+ text_encoder_1st = load_text_encoder_1(args.path1, device, False, dtype)
690
+ text_encoder_2nd = load_text_encoder_1(args.path2, device, False, dtype)
691
+ print(f"1st Text Encoder dtype: {text_encoder_1st.dtype}")
692
+ print(f"2nd Text Encoder dtype: {text_encoder_2nd.dtype}")
693
+
694
+ prompt = "A cat sitting on a table"
695
+ data_type = "video" # video only, image is not supported
696
+ text_inputs_1st = text_encoder_1st.text2tokens(prompt, data_type=data_type)
697
+ text_inputs_2nd = text_encoder_2nd.text2tokens(prompt, data_type=data_type)
698
+ print(text_inputs_1st)
699
+ assert torch.allclose(text_inputs_1st["input_ids"], text_inputs_2nd["input_ids"])
700
+
701
+ with torch.no_grad():
702
+ prompt_outputs_1st = text_encoder_1st.encode(text_inputs_1st, data_type=data_type)
703
+ prompt_outputs_2nd = text_encoder_2nd.encode(text_inputs_2nd, data_type=data_type)
704
+
705
+ # prompt_outputs.hidden_state, prompt_outputs.attention_mask
706
+ assert torch.allclose(prompt_outputs_1st.hidden_state, prompt_outputs_2nd.hidden_state)
707
+ print("Hidden states are the same.")
708
+ assert torch.allclose(prompt_outputs_1st.attention_mask, prompt_outputs_2nd.attention_mask)
709
+ print("Attention masks are the same.")
710
+ print("All outputs are the same.")
hunyuan_model/token_refiner.py ADDED
@@ -0,0 +1,245 @@
1
+ from typing import Optional
2
+
3
+ from einops import rearrange
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.utils.checkpoint import checkpoint
7
+
8
+ from .activation_layers import get_activation_layer
9
+ from .attention import attention
10
+ from .norm_layers import get_norm_layer
11
+ from .embed_layers import TimestepEmbedder, TextProjection
12
+ from .mlp_layers import MLP
13
+ from .modulate_layers import modulate, apply_gate
14
+
15
+
16
+ class IndividualTokenRefinerBlock(nn.Module):
17
+ def __init__(
18
+ self,
19
+ hidden_size,
20
+ heads_num,
21
+ mlp_width_ratio: float = 4.0,
22
+ mlp_drop_rate: float = 0.0,
23
+ act_type: str = "silu",
24
+ qk_norm: bool = False,
25
+ qk_norm_type: str = "layer",
26
+ qkv_bias: bool = True,
27
+ dtype: Optional[torch.dtype] = None,
28
+ device: Optional[torch.device] = None,
29
+ ):
30
+ factory_kwargs = {"device": device, "dtype": dtype}
31
+ super().__init__()
32
+ self.heads_num = heads_num
33
+ head_dim = hidden_size // heads_num
34
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
35
+
36
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
37
+ self.self_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
38
+ qk_norm_layer = get_norm_layer(qk_norm_type)
39
+ self.self_attn_q_norm = (
40
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
41
+ )
42
+ self.self_attn_k_norm = (
43
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
44
+ )
45
+ self.self_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
46
+
47
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
48
+ act_layer = get_activation_layer(act_type)
49
+ self.mlp = MLP(
50
+ in_channels=hidden_size,
51
+ hidden_channels=mlp_hidden_dim,
52
+ act_layer=act_layer,
53
+ drop=mlp_drop_rate,
54
+ **factory_kwargs,
55
+ )
56
+
57
+ self.adaLN_modulation = nn.Sequential(
58
+ act_layer(),
59
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
60
+ )
61
+ # Zero-initialize the modulation
62
+ nn.init.zeros_(self.adaLN_modulation[1].weight)
63
+ nn.init.zeros_(self.adaLN_modulation[1].bias)
64
+
65
+ self.gradient_checkpointing = False
66
+
67
+ def enable_gradient_checkpointing(self):
68
+ self.gradient_checkpointing = True
69
+
70
+ def disable_gradient_checkpointing(self):
71
+ self.gradient_checkpointing = False
72
+
73
+ def _forward(
74
+ self,
75
+ x: torch.Tensor,
76
+ c: torch.Tensor, # timestep_aware_representations + context_aware_representations
77
+ attn_mask: torch.Tensor = None,
78
+ ):
79
+ gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
80
+
81
+ norm_x = self.norm1(x)
82
+ qkv = self.self_attn_qkv(norm_x)
83
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
84
+ # Apply QK-Norm if needed
85
+ q = self.self_attn_q_norm(q).to(v)
86
+ k = self.self_attn_k_norm(k).to(v)
87
+
88
+ # Self-Attention
89
+ attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
90
+
91
+ x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
92
+
93
+ # FFN Layer
94
+ x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)
95
+
96
+ return x
97
+
98
+ def forward(self, *args, **kwargs):
99
+ if self.training and self.gradient_checkpointing:
100
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
101
+ else:
102
+ return self._forward(*args, **kwargs)
103
+
104
+
105
+ class IndividualTokenRefiner(nn.Module):
106
+ def __init__(
107
+ self,
108
+ hidden_size,
109
+ heads_num,
110
+ depth,
111
+ mlp_width_ratio: float = 4.0,
112
+ mlp_drop_rate: float = 0.0,
113
+ act_type: str = "silu",
114
+ qk_norm: bool = False,
115
+ qk_norm_type: str = "layer",
116
+ qkv_bias: bool = True,
117
+ dtype: Optional[torch.dtype] = None,
118
+ device: Optional[torch.device] = None,
119
+ ):
120
+ factory_kwargs = {"device": device, "dtype": dtype}
121
+ super().__init__()
122
+ self.blocks = nn.ModuleList(
123
+ [
124
+ IndividualTokenRefinerBlock(
125
+ hidden_size=hidden_size,
126
+ heads_num=heads_num,
127
+ mlp_width_ratio=mlp_width_ratio,
128
+ mlp_drop_rate=mlp_drop_rate,
129
+ act_type=act_type,
130
+ qk_norm=qk_norm,
131
+ qk_norm_type=qk_norm_type,
132
+ qkv_bias=qkv_bias,
133
+ **factory_kwargs,
134
+ )
135
+ for _ in range(depth)
136
+ ]
137
+ )
138
+
139
+ def enable_gradient_checkpointing(self):
140
+ for block in self.blocks:
141
+ block.enable_gradient_checkpointing()
142
+
143
+ def disable_gradient_checkpointing(self):
144
+ for block in self.blocks:
145
+ block.disable_gradient_checkpointing()
146
+
147
+ def forward(
148
+ self,
149
+ x: torch.Tensor,
150
+ c: torch.LongTensor,
151
+ mask: Optional[torch.Tensor] = None,
152
+ ):
153
+ self_attn_mask = None
154
+ if mask is not None:
155
+ batch_size = mask.shape[0]
156
+ seq_len = mask.shape[1]
157
+ mask = mask.to(x.device)
158
+ # batch_size x 1 x seq_len x seq_len
159
+ self_attn_mask_1 = mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
160
+ # batch_size x 1 x seq_len x seq_len
161
+ self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
162
+ # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of heads_num
163
+ self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
164
+ # avoids self-attention weight being NaN for padding tokens
165
+ self_attn_mask[:, :, :, 0] = True
166
+
167
+ for block in self.blocks:
168
+ x = block(x, c, self_attn_mask)
169
+ return x
170
+
171
+
172
+ class SingleTokenRefiner(nn.Module):
173
+ """
174
+ A single token refiner block for refining LLM text embeddings.
175
+ """
176
+
177
+ def __init__(
178
+ self,
179
+ in_channels,
180
+ hidden_size,
181
+ heads_num,
182
+ depth,
183
+ mlp_width_ratio: float = 4.0,
184
+ mlp_drop_rate: float = 0.0,
185
+ act_type: str = "silu",
186
+ qk_norm: bool = False,
187
+ qk_norm_type: str = "layer",
188
+ qkv_bias: bool = True,
189
+ attn_mode: str = "torch",
190
+ dtype: Optional[torch.dtype] = None,
191
+ device: Optional[torch.device] = None,
192
+ ):
193
+ factory_kwargs = {"device": device, "dtype": dtype}
194
+ super().__init__()
195
+ self.attn_mode = attn_mode
196
+ assert self.attn_mode == "torch", "Only support 'torch' mode for token refiner."
197
+
198
+ self.input_embedder = nn.Linear(in_channels, hidden_size, bias=True, **factory_kwargs)
199
+
200
+ act_layer = get_activation_layer(act_type)
201
+ # Build timestep embedding layer
202
+ self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
203
+ # Build context embedding layer
204
+ self.c_embedder = TextProjection(in_channels, hidden_size, act_layer, **factory_kwargs)
205
+
206
+ self.individual_token_refiner = IndividualTokenRefiner(
207
+ hidden_size=hidden_size,
208
+ heads_num=heads_num,
209
+ depth=depth,
210
+ mlp_width_ratio=mlp_width_ratio,
211
+ mlp_drop_rate=mlp_drop_rate,
212
+ act_type=act_type,
213
+ qk_norm=qk_norm,
214
+ qk_norm_type=qk_norm_type,
215
+ qkv_bias=qkv_bias,
216
+ **factory_kwargs,
217
+ )
218
+
219
+ def enable_gradient_checkpointing(self):
220
+ self.individual_token_refiner.enable_gradient_checkpointing()
221
+
222
+ def disable_gradient_checkpointing(self):
223
+ self.individual_token_refiner.disable_gradient_checkpointing()
224
+
225
+ def forward(
226
+ self,
227
+ x: torch.Tensor,
228
+ t: torch.LongTensor,
229
+ mask: Optional[torch.LongTensor] = None,
230
+ ):
231
+ timestep_aware_representations = self.t_embedder(t)
232
+
233
+ if mask is None:
234
+ context_aware_representations = x.mean(dim=1)
235
+ else:
236
+ mask_float = mask.float().unsqueeze(-1) # [b, s1, 1]
237
+ context_aware_representations = (x * mask_float).sum(dim=1) / mask_float.sum(dim=1)
238
+ context_aware_representations = self.c_embedder(context_aware_representations)
239
+ c = timestep_aware_representations + context_aware_representations
240
+
241
+ x = self.input_embedder(x)
242
+
243
+ x = self.individual_token_refiner(x, c, mask)
244
+
245
+ return x
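
To make the tensor shapes in `SingleTokenRefiner` concrete, here is a small self-contained shape check. The hyperparameters are illustrative only (they are not the values used by the HunyuanVideo checkpoints), and the import assumes this repository layout.

```python
# Hedged shape-check sketch for hunyuan_model/token_refiner.py (illustrative sizes).
import torch
from hunyuan_model.token_refiner import SingleTokenRefiner

refiner = SingleTokenRefiner(in_channels=4096, hidden_size=1024, heads_num=8, depth=2)

x = torch.randn(2, 256, 4096)                # [batch, seq_len, in_channels] LLM text embeddings
t = torch.zeros(2)                           # one timestep per sample
mask = torch.ones(2, 256, dtype=torch.long)  # 1 = real token, 0 = padding
mask[:, 200:] = 0                            # pretend the tail is padding

refined = refiner(x, t, mask)                # masked mean + timestep drive the adaLN gates; output [2, 256, 1024]
print(refined.shape)
```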