svjack committed on
Commit ef46f0f · verified · 1 Parent(s): c31c4a4

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .gitignore +8 -0
  2. .ipynb_checkpoints/README-checkpoint.md +203 -0
  3. .python-version +1 -0
  4. README.md +203 -0
  5. cache_latents.py +339 -0
  6. cache_text_encoder_outputs.py +214 -0
  7. convert_lora.py +137 -0
  8. dataset/__init__.py +0 -0
  9. dataset/config_utils.py +381 -0
  10. dataset/dataset_config.md +461 -0
  11. dataset/image_video_dataset.py +1726 -0
  12. docs/advanced_config.md +316 -0
  13. docs/framepack.md +331 -0
  14. docs/sampling_during_training.md +116 -0
  15. docs/wan.md +531 -0
  16. fpack_cache_latents.py +381 -0
  17. fpack_cache_text_encoder_outputs.py +110 -0
  18. fpack_generate_video.py +1149 -0
  19. fpack_train_network.py +410 -0
  20. frame_pack/__init__.py +0 -0
  21. frame_pack/bucket_tools.py +30 -0
  22. frame_pack/clip_vision.py +14 -0
  23. frame_pack/framepack_utils.py +273 -0
  24. frame_pack/hunyuan.py +116 -0
  25. frame_pack/hunyuan_video_packed.py +2015 -0
  26. frame_pack/k_diffusion_hunyuan.py +128 -0
  27. frame_pack/uni_pc_fm.py +142 -0
  28. frame_pack/utils.py +617 -0
  29. frame_pack/wrapper.py +51 -0
  30. framepack_yichen_output/framepack-yichen-lora-000001.safetensors +3 -0
  31. framepack_yichen_output/framepack-yichen-lora-000002.safetensors +3 -0
  32. framepack_yichen_output/framepack-yichen-lora-000003.safetensors +3 -0
  33. framepack_yichen_output/framepack-yichen-lora-000004.safetensors +3 -0
  34. framepack_yichen_output/framepack-yichen-lora-000005.safetensors +3 -0
  35. framepack_yichen_output/framepack-yichen-lora-000006.safetensors +3 -0
  36. hunyuan_model/__init__.py +0 -0
  37. hunyuan_model/activation_layers.py +23 -0
  38. hunyuan_model/attention.py +295 -0
  39. hunyuan_model/autoencoder_kl_causal_3d.py +609 -0
  40. hunyuan_model/embed_layers.py +132 -0
  41. hunyuan_model/fp8_optimization.py +39 -0
  42. hunyuan_model/helpers.py +40 -0
  43. hunyuan_model/mlp_layers.py +118 -0
  44. hunyuan_model/models.py +1044 -0
  45. hunyuan_model/modulate_layers.py +76 -0
  46. hunyuan_model/norm_layers.py +79 -0
  47. hunyuan_model/pipeline_hunyuan_video.py +1100 -0
  48. hunyuan_model/posemb_layers.py +310 -0
  49. hunyuan_model/text_encoder.py +710 -0
  50. hunyuan_model/token_refiner.py +245 -0
.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ __pycache__/
2
+ .venv
3
+ venv/
4
+ logs/
5
+ uv.lock
6
+ main.exp
7
+ main.lib
8
+ main.obj
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,203 @@
1
+ # FramePack Dancing Image-to-Video Generation
2
+
3
+ This repository contains the steps and scripts needed to generate videos with the Yi Chen Dancing FramePack image-to-video LoRA. The LoRA (Low-Rank Adaptation) weights are applied on top of pre-trained FramePack/HunyuanVideo components to create anime-style dance videos from a source image and a text prompt.
4
+
5
+ ## Prerequisites
6
+
7
+ Before proceeding, ensure that you have the following installed on your system:
8
+
9
+ • **Ubuntu** (or a compatible Linux distribution)
10
+ • **Python 3.10** (the version pinned in `.python-version`)
11
+ • **pip** (Python package manager)
12
+ • **Git**
13
+ • **Git LFS** (Git Large File Storage)
14
+ • **FFmpeg**
15
+
16
+ ## Installation
17
+
18
+ 1. **Update and Install Dependencies**
19
+
20
+ ```bash
21
+ sudo apt-get update && sudo apt-get install cbm git-lfs ffmpeg
22
+ ```
23
+
24
+ 2. **Clone the Repository**
25
+
26
+ ```bash
27
+ git clone https://huggingface.co/svjack/YiChen_FramePack_lora_early
28
+ cd YiChen_FramePack_lora_early
29
+ ```
30
+
31
+ 3. **Install Python Dependencies**
32
+
33
+ ```bash
34
+ pip install torch torchvision
35
+ pip install -r requirements.txt
36
+ pip install ascii-magic matplotlib tensorboard huggingface_hub datasets
37
+ pip install moviepy==1.0.3
38
+ pip install sageattention==1.0.6
39
+ ```
40
+
41
+ 4. **Download Model Weights**
42
+
43
+ ```bash
44
+ git clone https://huggingface.co/lllyasviel/FramePackI2V_HY
45
+ git clone https://huggingface.co/hunyuanvideo-community/HunyuanVideo
46
+ git clone https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged
47
+ git clone https://huggingface.co/Comfy-Org/sigclip_vision_384
48
+ ```
49
+
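+ Alternatively, the same repositories can be fetched with the `huggingface_hub` Python API installed above. This is an optional sketch, not required by the scripts; the repo IDs mirror the `git clone` commands, and the `local_dir` values match the paths used in the examples below.
+
+ ```python
+ # Optional sketch: download the weight repositories with huggingface_hub
+ # instead of git clone. Repo IDs and target folders mirror the commands above.
+ from huggingface_hub import snapshot_download
+
+ for repo_id, local_dir in [
+     ("lllyasviel/FramePackI2V_HY", "FramePackI2V_HY"),
+     ("hunyuanvideo-community/HunyuanVideo", "HunyuanVideo"),
+     ("Comfy-Org/HunyuanVideo_repackaged", "HunyuanVideo_repackaged"),
+     ("Comfy-Org/sigclip_vision_384", "sigclip_vision_384"),
+ ]:
+     snapshot_download(repo_id=repo_id, local_dir=local_dir)
+ ```
+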
50
+ ## Usage
51
+
52
+ To generate a video, use the `fpack_generate_video.py` script with the appropriate parameters. Below are examples of how to generate videos using the Dancing model.
53
+
54
+
55
+
56
+ ### 1. Furina
57
+ - Source Image
58
+
59
+
60
+ ```bash
61
+ python fpack_generate_video.py \
62
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
63
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
64
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
65
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
66
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
67
+ --image_path fln.png \
68
+ --prompt "In the style of Yi Chen Dancing White Background , The character's movements shift dynamically throughout the video, transitioning from poised stillness to lively dance steps. Her expressions evolve seamlessly—starting with focused determination, then flashing surprise as she executes a quick spin, before breaking into a joyful smile mid-leap. Her hands flow through choreographed positions, sometimes extending gracefully like unfolding wings, other times clapping rhythmically against her wrists. During a dramatic hip sway, her fingers fan open near her cheek, then sweep downward as her whole body dips into a playful crouch, the sequins on her costume catching the light with every motion." \
69
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
70
+ --attn_mode sdpa --fp8_scaled \
71
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
72
+ --save_path save --output_type both \
73
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
74
+
75
+
76
+ ```
77
+
78
+ - Without Lora
79
+
80
+ - With Lora
81
+
82
+
83
+ ### 2. Roper
84
+ - Source Image
85
+
86
+
87
+
88
+ ```bash
89
+ python fpack_generate_video.py \
90
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
91
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
92
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
93
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
94
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
95
+ --image_path shengjiang.png \
96
+ --prompt "In the style of Yi Chen Dancing White Background , The character's movements shift dynamically throughout the video, transitioning from poised stillness to lively dance steps. Her expressions evolve seamlessly—starting with focused determination, then flashing surprise as she executes a quick spin, before breaking into a joyful smile mid-leap. Her hands flow through choreographed positions, sometimes extending gracefully like unfolding wings, other times clapping rhythmically against her wrists. During a dramatic hip sway, her fingers fan open near her cheek, then sweep downward as her whole body dips into a playful crouch, the sequins on her costume catching the light with every motion." \
97
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
98
+ --attn_mode sdpa --fp8_scaled \
99
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
100
+ --save_path save --output_type both \
101
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
102
+
103
+ ```
104
+
105
+ - With Lora
106
+
107
+
108
+
109
+ ### 3. Varesa
110
+ - Source Image
111
+
112
+
113
+ ```bash
114
+ python fpack_generate_video.py \
115
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
116
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
117
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
118
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
119
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
120
+ --image_path waliesha.jpg \
121
+ --prompt "In the style of Yi Chen Dancing White Background , The dancer’s energy pulses in waves—one moment a statue, poised and precise, the next a whirl of motion as her feet flicker across the floor. Her face tells its own story: brows knit in concentration, then eyes widening mid-turn as if startled by her own speed, before dissolving into laughter as she springs upward, weightless. Her arms carve the air—now arcing like ribbons unfurling, now snapping sharp as a whip’s crack, palms meeting wrists in staccato beats. A roll of her hips sends her fingers fluttering near her temple, then cascading down as she folds into a teasing dip, the beads on her dress scattering light like sparks." \
122
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
123
+ --attn_mode sdpa --fp8_scaled \
124
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
125
+ --save_path save --output_type both \
126
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
127
+
128
+ ```
129
+ - With Lora
130
+
131
+
132
+
133
+ ### 4. Scaramouche
134
+ - Source Image
135
+
136
+
137
+ ```bash
138
+ python fpack_generate_video.py \
139
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
140
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
141
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
142
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
143
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
144
+ --image_path shanbing.jpg \
145
+ --prompt "In the style of Yi Chen Dancing White Background , The dancer’s energy pulses in waves—one moment a statue, poised and precise, the next a whirl of motion as her feet flicker across the floor. Her face tells its own story: brows knit in concentration, then eyes widening mid-turn as if startled by her own speed, before dissolving into laughter as she springs upward, weightless. Her arms carve the air—now arcing like ribbons unfurling, now snapping sharp as a whip’s crack, palms meeting wrists in staccato beats. A roll of her hips sends her fingers fluttering near her temple, then cascading down as she folds into a teasing dip, the beads on her dress scattering light like sparks." \
146
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
147
+ --attn_mode sdpa --fp8_scaled \
148
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
149
+ --save_path save --output_type both \
150
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
151
+
152
+ ```
153
+
154
+ - With Lora
155
+
156
+
157
+
158
+
159
+ ## Parameters
160
+
161
+ * `--dit`: Path to the FramePack DiT (diffusion transformer) weights.
+ * `--vae`: Path to the HunyuanVideo VAE weights.
+ * `--text_encoder1`: Path to Text Encoder 1 (LLaVA-LLaMA-3) weights.
+ * `--text_encoder2`: Path to Text Encoder 2 (CLIP-L) weights.
+ * `--image_encoder`: Path to the SigLIP vision encoder weights.
+ * `--image_path`: Source image for image-to-video generation.
+ * `--prompt`: Textual prompt describing the desired motion.
+ * `--video_size`: Resolution of the generated video (e.g., `960 544`).
+ * `--video_seconds`: Length of the generated video in seconds.
+ * `--fps`: Frame rate of the generated video.
+ * `--infer_steps`: Number of inference steps.
+ * `--attn_mode`: Attention implementation (e.g., `sdpa`).
+ * `--fp8_scaled`: Use scaled FP8 precision for the DiT to reduce VRAM usage (optional).
+ * `--vae_chunk_size`: Chunk size for the causal 3D VAE.
+ * `--vae_spatial_tile_sample_min_size`: Minimum spatial tile size for VAE tiling.
+ * `--save_path`: Directory to save the generated video.
+ * `--output_type`: Output type (e.g., `both` for video and frames).
+ * `--seed`: Random seed for reproducible generation.
+ * `--lora_weight`: Path to the LoRA weights.
+ * `--lora_multiplier`: Multiplier (strength) for the LoRA weights.
175
+
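+ For batch generation over several source images, the flags above can be assembled programmatically. The following is a hypothetical helper (not part of this repository) that simply shells out to `fpack_generate_video.py` with the same settings as the examples; adjust the prompt and paths to your setup.
+
+ ```python
+ # Hypothetical batch driver: calls fpack_generate_video.py once per source image
+ # with the same flags as the examples above. Prompt and paths are placeholders.
+ import subprocess
+
+ PROMPT = "In the style of Yi Chen Dancing White Background , ..."  # paste a full prompt here
+ LORA = "framepack_yichen_output/framepack-yichen-lora-000006.safetensors"
+
+ def generate(image_path: str, seed: int = 1234) -> None:
+     cmd = [
+         "python", "fpack_generate_video.py",
+         "--dit", "FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors",
+         "--vae", "HunyuanVideo/vae/diffusion_pytorch_model.safetensors",
+         "--text_encoder1", "HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors",
+         "--text_encoder2", "HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors",
+         "--image_encoder", "sigclip_vision_384/sigclip_vision_patch14_384.safetensors",
+         "--image_path", image_path, "--prompt", PROMPT,
+         "--video_size", "960", "544", "--video_seconds", "3", "--fps", "30", "--infer_steps", "25",
+         "--attn_mode", "sdpa", "--fp8_scaled",
+         "--vae_chunk_size", "32", "--vae_spatial_tile_sample_min_size", "128",
+         "--save_path", "save", "--output_type", "both",
+         "--seed", str(seed), "--lora_multiplier", "1.0", "--lora_weight", LORA,
+     ]
+     subprocess.run(cmd, check=True)
+
+ for img in ["fln.png", "shengjiang.png"]:  # source images from the examples above
+     generate(img)
+ ```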
176
+
177
+
178
+ ## Output
179
+
180
+ The generated video and frames will be saved in the specified `save_path` directory.
181
+
182
+ ## Troubleshooting
183
+
184
+ • Ensure all dependencies are correctly installed.
185
+ • Verify that the model weights are downloaded and placed in the correct locations (a quick check is sketched below).
186
+ • Check for any missing Python packages and install them using `pip`.
187
+
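+ A quick way to verify the second point is to confirm that every weight file referenced by the example commands exists on disk. The sketch below assumes the default clone locations used in this README.
+
+ ```python
+ # Sketch: check that the weight files used in the example commands are present.
+ import os
+
+ weights = [
+     "FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors",
+     "HunyuanVideo/vae/diffusion_pytorch_model.safetensors",
+     "HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors",
+     "HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors",
+     "sigclip_vision_384/sigclip_vision_patch14_384.safetensors",
+     "framepack_yichen_output/framepack-yichen-lora-000006.safetensors",
+ ]
+ for path in weights:
+     print(("OK      " if os.path.exists(path) else "MISSING ") + path)
+ ```
+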
188
+ ## License
189
+
190
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
191
+
192
+ ## Acknowledgments
193
+
194
+ • **Hugging Face** for hosting the model weights.
+ • **lllyasviel** for the FramePackI2V_HY model.
+ • **Tencent Hunyuan** and the **hunyuanvideo-community** for the HunyuanVideo components.
+ • **Comfy-Org** for the repackaged text encoders and the SigLIP vision weights.
197
+
198
+ ## Contact
199
+
200
+ For any questions or issues, please open an issue on the repository or contact the maintainer.
201
+
202
+ ---
203
+
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.10
README.md ADDED
@@ -0,0 +1,203 @@
1
+ # FramePack Dancing Image-to-Video Generation
2
+
3
+ This repository contains the steps and scripts needed to generate videos with the Yi Chen Dancing FramePack image-to-video LoRA. The LoRA (Low-Rank Adaptation) weights are applied on top of pre-trained FramePack/HunyuanVideo components to create anime-style dance videos from a source image and a text prompt.
4
+
5
+ ## Prerequisites
6
+
7
+ Before proceeding, ensure that you have the following installed on your system:
8
+
9
+ • **Ubuntu** (or a compatible Linux distribution)
10
+ • **Python 3.10** (the version pinned in `.python-version`)
11
+ • **pip** (Python package manager)
12
+ • **Git**
13
+ • **Git LFS** (Git Large File Storage)
14
+ • **FFmpeg**
15
+
16
+ ## Installation
17
+
18
+ 1. **Update and Install Dependencies**
19
+
20
+ ```bash
21
+ sudo apt-get update && sudo apt-get install cbm git-lfs ffmpeg
22
+ ```
23
+
24
+ 2. **Clone the Repository**
25
+
26
+ ```bash
27
+ git clone https://huggingface.co/svjack/YiChen_FramePack_lora_early
28
+ cd YiChen_FramePack_lora_early
29
+ ```
30
+
31
+ 3. **Install Python Dependencies**
32
+
33
+ ```bash
34
+ pip install torch torchvision
35
+ pip install -r requirements.txt
36
+ pip install ascii-magic matplotlib tensorboard huggingface_hub datasets
37
+ pip install moviepy==1.0.3
38
+ pip install sageattention==1.0.6
39
+ ```
40
+
41
+ 4. **Download Model Weights**
42
+
43
+ ```bash
44
+ git clone https://huggingface.co/lllyasviel/FramePackI2V_HY
45
+ git clone https://huggingface.co/hunyuanvideo-community/HunyuanVideo
46
+ git clone https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged
47
+ git clone https://huggingface.co/Comfy-Org/sigclip_vision_384
48
+ ```
49
+
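+ Alternatively, the same repositories can be fetched with the `huggingface_hub` Python API installed above. This is an optional sketch, not required by the scripts; the repo IDs mirror the `git clone` commands, and the `local_dir` values match the paths used in the examples below.
+
+ ```python
+ # Optional sketch: download the weight repositories with huggingface_hub
+ # instead of git clone. Repo IDs and target folders mirror the commands above.
+ from huggingface_hub import snapshot_download
+
+ for repo_id, local_dir in [
+     ("lllyasviel/FramePackI2V_HY", "FramePackI2V_HY"),
+     ("hunyuanvideo-community/HunyuanVideo", "HunyuanVideo"),
+     ("Comfy-Org/HunyuanVideo_repackaged", "HunyuanVideo_repackaged"),
+     ("Comfy-Org/sigclip_vision_384", "sigclip_vision_384"),
+ ]:
+     snapshot_download(repo_id=repo_id, local_dir=local_dir)
+ ```
+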
50
+ ## Usage
51
+
52
+ To generate a video, use the `fpack_generate_video.py` script with the appropriate parameters. Below are examples of how to generate videos using the Dancing model.
53
+
54
+
55
+
56
+ ### 1. Furina
57
+ - Source Image
58
+
59
+
60
+ ```bash
61
+ python fpack_generate_video.py \
62
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
63
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
64
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
65
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
66
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
67
+ --image_path fln.png \
68
+ --prompt "In the style of Yi Chen Dancing White Background , The character's movements shift dynamically throughout the video, transitioning from poised stillness to lively dance steps. Her expressions evolve seamlessly—starting with focused determination, then flashing surprise as she executes a quick spin, before breaking into a joyful smile mid-leap. Her hands flow through choreographed positions, sometimes extending gracefully like unfolding wings, other times clapping rhythmically against her wrists. During a dramatic hip sway, her fingers fan open near her cheek, then sweep downward as her whole body dips into a playful crouch, the sequins on her costume catching the light with every motion." \
69
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
70
+ --attn_mode sdpa --fp8_scaled \
71
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
72
+ --save_path save --output_type both \
73
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
74
+
75
+
76
+ ```
77
+
78
+ - Without Lora
79
+
80
+ - With Lora
81
+
82
+
83
+ ### 2. Roper
84
+ - Source Image
85
+
86
+
87
+
88
+ ```bash
89
+ python fpack_generate_video.py \
90
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
91
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
92
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
93
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
94
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
95
+ --image_path shengjiang.png \
96
+ --prompt "In the style of Yi Chen Dancing White Background , The character's movements shift dynamically throughout the video, transitioning from poised stillness to lively dance steps. Her expressions evolve seamlessly—starting with focused determination, then flashing surprise as she executes a quick spin, before breaking into a joyful smile mid-leap. Her hands flow through choreographed positions, sometimes extending gracefully like unfolding wings, other times clapping rhythmically against her wrists. During a dramatic hip sway, her fingers fan open near her cheek, then sweep downward as her whole body dips into a playful crouch, the sequins on her costume catching the light with every motion." \
97
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
98
+ --attn_mode sdpa --fp8_scaled \
99
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
100
+ --save_path save --output_type both \
101
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
102
+
103
+ ```
104
+
105
+ - With Lora
106
+
107
+
108
+
109
+ ### 3. Varesa
110
+ - Source Image
111
+
112
+
113
+ ```bash
114
+ python fpack_generate_video.py \
115
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
116
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
117
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
118
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
119
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
120
+ --image_path waliesha.jpg \
121
+ --prompt "In the style of Yi Chen Dancing White Background , The dancer’s energy pulses in waves—one moment a statue, poised and precise, the next a whirl of motion as her feet flicker across the floor. Her face tells its own story: brows knit in concentration, then eyes widening mid-turn as if startled by her own speed, before dissolving into laughter as she springs upward, weightless. Her arms carve the air—now arcing like ribbons unfurling, now snapping sharp as a whip’s crack, palms meeting wrists in staccato beats. A roll of her hips sends her fingers fluttering near her temple, then cascading down as she folds into a teasing dip, the beads on her dress scattering light like sparks." \
122
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
123
+ --attn_mode sdpa --fp8_scaled \
124
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
125
+ --save_path save --output_type both \
126
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
127
+
128
+ ```
129
+ - With Lora
130
+
131
+
132
+
133
+ ### 4. Scaramouche
134
+ - Source Image
135
+
136
+
137
+ ```bash
138
+ python fpack_generate_video.py \
139
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
140
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
141
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
142
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
143
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
144
+ --image_path shanbing.jpg \
145
+ --prompt "In the style of Yi Chen Dancing White Background , The dancer’s energy pulses in waves—one moment a statue, poised and precise, the next a whirl of motion as her feet flicker across the floor. Her face tells its own story: brows knit in concentration, then eyes widening mid-turn as if startled by her own speed, before dissolving into laughter as she springs upward, weightless. Her arms carve the air—now arcing like ribbons unfurling, now snapping sharp as a whip’s crack, palms meeting wrists in staccato beats. A roll of her hips sends her fingers fluttering near her temple, then cascading down as she folds into a teasing dip, the beads on her dress scattering light like sparks." \
146
+ --video_size 960 544 --video_seconds 3 --fps 30 --infer_steps 25 \
147
+ --attn_mode sdpa --fp8_scaled \
148
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
149
+ --save_path save --output_type both \
150
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_yichen_output/framepack-yichen-lora-000006.safetensors
151
+
152
+ ```
153
+
154
+ - With Lora
155
+
156
+
157
+
158
+
159
+ ## Parameters
160
+
161
+ * `--dit`: Path to the FramePack DiT (diffusion transformer) weights.
+ * `--vae`: Path to the HunyuanVideo VAE weights.
+ * `--text_encoder1`: Path to Text Encoder 1 (LLaVA-LLaMA-3) weights.
+ * `--text_encoder2`: Path to Text Encoder 2 (CLIP-L) weights.
+ * `--image_encoder`: Path to the SigLIP vision encoder weights.
+ * `--image_path`: Source image for image-to-video generation.
+ * `--prompt`: Textual prompt describing the desired motion.
+ * `--video_size`: Resolution of the generated video (e.g., `960 544`).
+ * `--video_seconds`: Length of the generated video in seconds.
+ * `--fps`: Frame rate of the generated video.
+ * `--infer_steps`: Number of inference steps.
+ * `--attn_mode`: Attention implementation (e.g., `sdpa`).
+ * `--fp8_scaled`: Use scaled FP8 precision for the DiT to reduce VRAM usage (optional).
+ * `--vae_chunk_size`: Chunk size for the causal 3D VAE.
+ * `--vae_spatial_tile_sample_min_size`: Minimum spatial tile size for VAE tiling.
+ * `--save_path`: Directory to save the generated video.
+ * `--output_type`: Output type (e.g., `both` for video and frames).
+ * `--seed`: Random seed for reproducible generation.
+ * `--lora_weight`: Path to the LoRA weights.
+ * `--lora_multiplier`: Multiplier (strength) for the LoRA weights.
175
+
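+ For batch generation over several source images, the flags above can be assembled programmatically. The following is a hypothetical helper (not part of this repository) that simply shells out to `fpack_generate_video.py` with the same settings as the examples; adjust the prompt and paths to your setup.
+
+ ```python
+ # Hypothetical batch driver: calls fpack_generate_video.py once per source image
+ # with the same flags as the examples above. Prompt and paths are placeholders.
+ import subprocess
+
+ PROMPT = "In the style of Yi Chen Dancing White Background , ..."  # paste a full prompt here
+ LORA = "framepack_yichen_output/framepack-yichen-lora-000006.safetensors"
+
+ def generate(image_path: str, seed: int = 1234) -> None:
+     cmd = [
+         "python", "fpack_generate_video.py",
+         "--dit", "FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors",
+         "--vae", "HunyuanVideo/vae/diffusion_pytorch_model.safetensors",
+         "--text_encoder1", "HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors",
+         "--text_encoder2", "HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors",
+         "--image_encoder", "sigclip_vision_384/sigclip_vision_patch14_384.safetensors",
+         "--image_path", image_path, "--prompt", PROMPT,
+         "--video_size", "960", "544", "--video_seconds", "3", "--fps", "30", "--infer_steps", "25",
+         "--attn_mode", "sdpa", "--fp8_scaled",
+         "--vae_chunk_size", "32", "--vae_spatial_tile_sample_min_size", "128",
+         "--save_path", "save", "--output_type", "both",
+         "--seed", str(seed), "--lora_multiplier", "1.0", "--lora_weight", LORA,
+     ]
+     subprocess.run(cmd, check=True)
+
+ for img in ["fln.png", "shengjiang.png"]:  # source images from the examples above
+     generate(img)
+ ```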
176
+
177
+
178
+ ## Output
179
+
180
+ The generated video and frames will be saved in the specified `save_path` directory.
181
+
182
+ ## Troubleshooting
183
+
184
+ • Ensure all dependencies are correctly installed.
185
+ • Verify that the model weights are downloaded and placed in the correct locations (a quick check is sketched below).
186
+ • Check for any missing Python packages and install them using `pip`.
187
+
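+ A quick way to verify the second point is to confirm that every weight file referenced by the example commands exists on disk. The sketch below assumes the default clone locations used in this README.
+
+ ```python
+ # Sketch: check that the weight files used in the example commands are present.
+ import os
+
+ weights = [
+     "FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors",
+     "HunyuanVideo/vae/diffusion_pytorch_model.safetensors",
+     "HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors",
+     "HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors",
+     "sigclip_vision_384/sigclip_vision_patch14_384.safetensors",
+     "framepack_yichen_output/framepack-yichen-lora-000006.safetensors",
+ ]
+ for path in weights:
+     print(("OK      " if os.path.exists(path) else "MISSING ") + path)
+ ```
+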
188
+ ## License
189
+
190
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
191
+
192
+ ## Acknowledgments
193
+
194
+ • **Hugging Face** for hosting the model weights.
+ • **lllyasviel** for the FramePackI2V_HY model.
+ • **Tencent Hunyuan** and the **hunyuanvideo-community** for the HunyuanVideo components.
+ • **Comfy-Org** for the repackaged text encoders and the SigLIP vision weights.
197
+
198
+ ## Contact
199
+
200
+ For any questions or issues, please open an issue on the repository or contact the maintainer.
201
+
202
+ ---
203
+
cache_latents.py ADDED
@@ -0,0 +1,339 @@
1
+ import argparse
2
+ import os
3
+ import glob
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from dataset import config_utils
11
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
12
+ from PIL import Image
13
+
14
+ import logging
15
+
16
+ from dataset.image_video_dataset import BaseDataset, ItemInfo, save_latent_cache, ARCHITECTURE_HUNYUAN_VIDEO
17
+ from hunyuan_model.vae import load_vae
18
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
19
+ from utils.model_utils import str_to_dtype
20
+
21
+ logger = logging.getLogger(__name__)
22
+ logging.basicConfig(level=logging.INFO)
23
+
24
+
25
+ def show_image(image: Union[list[Union[Image.Image, np.ndarray]], Image.Image, np.ndarray]) -> int:
26
+ import cv2
27
+
28
+ imgs = (
29
+ [image]
30
+ if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image)
31
+ else [image[0], image[-1]]
32
+ )
33
+ if len(imgs) > 1:
34
+ print(f"Number of images: {len(image)}")
35
+ for i, img in enumerate(imgs):
36
+ if len(imgs) > 1:
37
+ print(f"{'First' if i == 0 else 'Last'} image: {img.shape}")
38
+ else:
39
+ print(f"Image: {img.shape}")
40
+ cv2_img = np.array(img) if isinstance(img, Image.Image) else img
41
+ cv2_img = cv2.cvtColor(cv2_img, cv2.COLOR_RGB2BGR)
42
+ cv2.imshow("image", cv2_img)
43
+ k = cv2.waitKey(0)
44
+ cv2.destroyAllWindows()
45
+ if k == ord("q") or k == ord("d"):
46
+ return k
47
+ return k
48
+
49
+
50
+ def show_console(
51
+ image: Union[list[Union[Image.Image, np.ndarray]], Image.Image, np.ndarray],
52
+ width: int,
53
+ back: str,
54
+ interactive: bool = False,
55
+ ) -> int:
56
+ from ascii_magic import from_pillow_image, Back
57
+
58
+ back = None
59
+ if back is not None:
60
+ back = getattr(Back, back.upper())
61
+
62
+ k = None
63
+ imgs = (
64
+ [image]
65
+ if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image)
66
+ else [image[0], image[-1]]
67
+ )
68
+ if len(imgs) > 1:
69
+ print(f"Number of images: {len(image)}")
70
+ for i, img in enumerate(imgs):
71
+ if len(imgs) > 1:
72
+ print(f"{'First' if i == 0 else 'Last'} image: {img.shape}")
73
+ else:
74
+ print(f"Image: {img.shape}")
75
+ pil_img = img if isinstance(img, Image.Image) else Image.fromarray(img)
76
+ ascii_img = from_pillow_image(pil_img)
77
+ ascii_img.to_terminal(columns=width, back=back)
78
+
79
+ if interactive:
80
+ k = input("Press q to quit, d to next dataset, other key to next: ")
81
+ if k == "q" or k == "d":
82
+ return ord(k)
83
+
84
+ if not interactive:
85
+ return ord(" ")
86
+ return ord(k) if k else ord(" ")
87
+
88
+
89
+ def save_video(image: Union[list[Union[Image.Image, np.ndarray]], Image.Image, np.ndarray], cache_path: str, fps: int = 24):
90
+ import av
91
+
92
+ directory = os.path.dirname(cache_path)
93
+ if not os.path.exists(directory):
94
+ os.makedirs(directory)
95
+
96
+ if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image):
97
+ # save image
98
+ image_path = cache_path.replace(".safetensors", ".jpg")
99
+ img = image if isinstance(image, Image.Image) else Image.fromarray(image)
100
+ img.save(image_path)
101
+ print(f"Saved image: {image_path}")
102
+ else:
103
+ imgs = image
104
+ print(f"Number of images: {len(imgs)}")
105
+ # save video
106
+ video_path = cache_path.replace(".safetensors", ".mp4")
107
+ height, width = imgs[0].shape[0:2]
108
+
109
+ # create output container
110
+ container = av.open(video_path, mode="w")
111
+
112
+ # create video stream
113
+ codec = "libx264"
114
+ pixel_format = "yuv420p"
115
+ stream = container.add_stream(codec, rate=fps)
116
+ stream.width = width
117
+ stream.height = height
118
+ stream.pix_fmt = pixel_format
119
+ stream.bit_rate = 1000000 # 1Mbit/s for preview quality
120
+
121
+ for frame_img in imgs:
122
+ if isinstance(frame_img, Image.Image):
123
+ frame = av.VideoFrame.from_image(frame_img)
124
+ else:
125
+ frame = av.VideoFrame.from_ndarray(frame_img, format="rgb24")
126
+ packets = stream.encode(frame)
127
+ for packet in packets:
128
+ container.mux(packet)
129
+
130
+ for packet in stream.encode():
131
+ container.mux(packet)
132
+
133
+ container.close()
134
+
135
+ print(f"Saved video: {video_path}")
136
+
137
+
138
+ def show_datasets(
139
+ datasets: list[BaseDataset],
140
+ debug_mode: str,
141
+ console_width: int,
142
+ console_back: str,
143
+ console_num_images: Optional[int],
144
+ fps: int = 24,
145
+ ):
146
+ if debug_mode != "video":
147
+ print(f"d: next dataset, q: quit")
148
+
149
+ num_workers = max(1, os.cpu_count() - 1)
150
+ for i, dataset in enumerate(datasets):
151
+ print(f"Dataset [{i}]")
152
+ batch_index = 0
153
+ num_images_to_show = console_num_images
154
+ k = None
155
+ for key, batch in dataset.retrieve_latent_cache_batches(num_workers):
156
+ print(f"bucket resolution: {key}, count: {len(batch)}")
157
+ for j, item_info in enumerate(batch):
158
+ item_info: ItemInfo
159
+ print(f"{batch_index}-{j}: {item_info}")
160
+ if debug_mode == "image":
161
+ k = show_image(item_info.content)
162
+ elif debug_mode == "console":
163
+ k = show_console(item_info.content, console_width, console_back, console_num_images is None)
164
+ if num_images_to_show is not None:
165
+ num_images_to_show -= 1
166
+ if num_images_to_show == 0:
167
+ k = ord("d") # next dataset
168
+ elif debug_mode == "video":
169
+ save_video(item_info.content, item_info.latent_cache_path, fps)
170
+ k = None # save next video
171
+
172
+ if k == ord("q"):
173
+ return
174
+ elif k == ord("d"):
175
+ break
176
+ if k == ord("d"):
177
+ break
178
+ batch_index += 1
179
+
180
+
181
+ def encode_and_save_batch(vae: AutoencoderKLCausal3D, batch: list[ItemInfo]):
182
+ contents = torch.stack([torch.from_numpy(item.content) for item in batch])
183
+ if len(contents.shape) == 4:
184
+ contents = contents.unsqueeze(1) # B, H, W, C -> B, F, H, W, C
185
+
186
+ contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
187
+ contents = contents.to(vae.device, dtype=vae.dtype)
188
+ contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
189
+
190
+ h, w = contents.shape[3], contents.shape[4]
191
+ if h < 8 or w < 8:
192
+ item = batch[0] # other items should have the same size
193
+ raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
194
+
195
+ # print(f"encode batch: {contents.shape}")
196
+ with torch.no_grad():
197
+ latent = vae.encode(contents).latent_dist.sample()
198
+ # latent = latent * vae.config.scaling_factor
199
+
200
+ # # debug: decode and save
201
+ # with torch.no_grad():
202
+ # latent_to_decode = latent / vae.config.scaling_factor
203
+ # images = vae.decode(latent_to_decode, return_dict=False)[0]
204
+ # images = (images / 2 + 0.5).clamp(0, 1)
205
+ # images = images.cpu().float().numpy()
206
+ # images = (images * 255).astype(np.uint8)
207
+ # images = images.transpose(0, 2, 3, 4, 1) # B, C, F, H, W -> B, F, H, W, C
208
+ # for b in range(images.shape[0]):
209
+ # for f in range(images.shape[1]):
210
+ # fln = os.path.splitext(os.path.basename(batch[b].item_key))[0]
211
+ # img = Image.fromarray(images[b, f])
212
+ # img.save(f"./logs/decode_{fln}_{b}_{f:03d}.jpg")
213
+
214
+ for item, l in zip(batch, latent):
215
+ # print(f"save latent cache: {item.latent_cache_path}, latent shape: {l.shape}")
216
+ save_latent_cache(item, l)
217
+
218
+
219
+ def encode_datasets(datasets: list[BaseDataset], encode: callable, args: argparse.Namespace):
220
+ num_workers = args.num_workers if args.num_workers is not None else max(1, os.cpu_count() - 1)
221
+ for i, dataset in enumerate(datasets):
222
+ logger.info(f"Encoding dataset [{i}]")
223
+ all_latent_cache_paths = []
224
+ for _, batch in tqdm(dataset.retrieve_latent_cache_batches(num_workers)):
225
+ all_latent_cache_paths.extend([item.latent_cache_path for item in batch])
226
+
227
+ if args.skip_existing:
228
+ filtered_batch = [item for item in batch if not os.path.exists(item.latent_cache_path)]
229
+ if len(filtered_batch) == 0:
230
+ continue
231
+ batch = filtered_batch
232
+
233
+ bs = args.batch_size if args.batch_size is not None else len(batch)
234
+ for i in range(0, len(batch), bs):
235
+ encode(batch[i : i + bs])
236
+
237
+ # normalize paths
238
+ all_latent_cache_paths = [os.path.normpath(p) for p in all_latent_cache_paths]
239
+ all_latent_cache_paths = set(all_latent_cache_paths)
240
+
241
+ # remove old cache files not in the dataset
242
+ all_cache_files = dataset.get_all_latent_cache_files()
243
+ for cache_file in all_cache_files:
244
+ if os.path.normpath(cache_file) not in all_latent_cache_paths:
245
+ if args.keep_cache:
246
+ logger.info(f"Keep cache file not in the dataset: {cache_file}")
247
+ else:
248
+ os.remove(cache_file)
249
+ logger.info(f"Removed old cache file: {cache_file}")
250
+
251
+
252
+ def main(args):
253
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
254
+ device = torch.device(device)
255
+
256
+ # Load dataset config
257
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
258
+ logger.info(f"Load dataset config from {args.dataset_config}")
259
+ user_config = config_utils.load_user_config(args.dataset_config)
260
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
261
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
262
+
263
+ datasets = train_dataset_group.datasets
264
+
265
+ if args.debug_mode is not None:
266
+ show_datasets(datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images)
267
+ return
268
+
269
+ assert args.vae is not None, "vae checkpoint is required"
270
+
271
+ # Load VAE model: HunyuanVideo VAE model is float16
272
+ vae_dtype = torch.float16 if args.vae_dtype is None else str_to_dtype(args.vae_dtype)
273
+ vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device=device, vae_path=args.vae)
274
+ vae.eval()
275
+ logger.info(f"Loaded VAE: {vae.config}, dtype: {vae.dtype}")
276
+
277
+ if args.vae_chunk_size is not None:
278
+ vae.set_chunk_size_for_causal_conv_3d(args.vae_chunk_size)
279
+ logger.info(f"Set chunk_size to {args.vae_chunk_size} for CausalConv3d in VAE")
280
+ if args.vae_spatial_tile_sample_min_size is not None:
281
+ vae.enable_spatial_tiling(True)
282
+ vae.tile_sample_min_size = args.vae_spatial_tile_sample_min_size
283
+ vae.tile_latent_min_size = args.vae_spatial_tile_sample_min_size // 8
284
+ elif args.vae_tiling:
285
+ vae.enable_spatial_tiling(True)
286
+
287
+ # Encode images
288
+ def encode(one_batch: list[ItemInfo]):
289
+ encode_and_save_batch(vae, one_batch)
290
+
291
+ encode_datasets(datasets, encode, args)
292
+
293
+
294
+ def setup_parser_common() -> argparse.ArgumentParser:
295
+ parser = argparse.ArgumentParser()
296
+
297
+ parser.add_argument("--dataset_config", type=str, required=True, help="path to dataset config .toml file")
298
+ parser.add_argument("--vae", type=str, required=False, default=None, help="path to vae checkpoint")
299
+ parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is float16")
300
+ parser.add_argument("--device", type=str, default=None, help="device to use, default is cuda if available")
301
+ parser.add_argument(
302
+ "--batch_size", type=int, default=None, help="batch size, override dataset config if dataset batch size > this"
303
+ )
304
+ parser.add_argument("--num_workers", type=int, default=None, help="number of workers for dataset. default is cpu count-1")
305
+ parser.add_argument("--skip_existing", action="store_true", help="skip existing cache files")
306
+ parser.add_argument("--keep_cache", action="store_true", help="keep cache files not in dataset")
307
+ parser.add_argument("--debug_mode", type=str, default=None, choices=["image", "console", "video"], help="debug mode")
308
+ parser.add_argument("--console_width", type=int, default=80, help="debug mode: console width")
309
+ parser.add_argument(
310
+ "--console_back", type=str, default=None, help="debug mode: console background color, one of ascii_magic.Back"
311
+ )
312
+ parser.add_argument(
313
+ "--console_num_images",
314
+ type=int,
315
+ default=None,
316
+ help="debug mode: not interactive, number of images to show for each dataset",
317
+ )
318
+ return parser
319
+
320
+
321
+ def hv_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
322
+ parser.add_argument(
323
+ "--vae_tiling",
324
+ action="store_true",
325
+ help="enable spatial tiling for VAE, default is False. If vae_spatial_tile_sample_min_size is set, this is automatically enabled",
326
+ )
327
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
328
+ parser.add_argument(
329
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
330
+ )
331
+ return parser
332
+
333
+
334
+ if __name__ == "__main__":
335
+ parser = setup_parser_common()
336
+ parser = hv_setup_parser(parser)
337
+
338
+ args = parser.parse_args()
339
+ main(args)
cache_text_encoder_outputs.py ADDED
@@ -0,0 +1,214 @@
1
+ import argparse
2
+ import os
3
+ from typing import Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+ from dataset import config_utils
10
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
11
+ import accelerate
12
+
13
+ from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO, BaseDataset, ItemInfo, save_text_encoder_output_cache
14
+ from hunyuan_model import text_encoder as text_encoder_module
15
+ from hunyuan_model.text_encoder import TextEncoder
16
+
17
+ import logging
18
+
19
+ from utils.model_utils import str_to_dtype
20
+
21
+ logger = logging.getLogger(__name__)
22
+ logging.basicConfig(level=logging.INFO)
23
+
24
+
25
+ def encode_prompt(text_encoder: TextEncoder, prompt: Union[str, list[str]]):
26
+ data_type = "video" # video only, image is not supported
27
+ text_inputs = text_encoder.text2tokens(prompt, data_type=data_type)
28
+
29
+ with torch.no_grad():
30
+ prompt_outputs = text_encoder.encode(text_inputs, data_type=data_type)
31
+
32
+ return prompt_outputs.hidden_state, prompt_outputs.attention_mask
33
+
34
+
35
+ def encode_and_save_batch(
36
+ text_encoder: TextEncoder, batch: list[ItemInfo], is_llm: bool, accelerator: Optional[accelerate.Accelerator]
37
+ ):
38
+ prompts = [item.caption for item in batch]
39
+ # print(prompts)
40
+
41
+ # encode prompt
42
+ if accelerator is not None:
43
+ with accelerator.autocast():
44
+ prompt_embeds, prompt_mask = encode_prompt(text_encoder, prompts)
45
+ else:
46
+ prompt_embeds, prompt_mask = encode_prompt(text_encoder, prompts)
47
+
48
+ # # convert to fp16 if needed
49
+ # if prompt_embeds.dtype == torch.float32 and text_encoder.dtype != torch.float32:
50
+ # prompt_embeds = prompt_embeds.to(text_encoder.dtype)
51
+
52
+ # save prompt cache
53
+ for item, embed, mask in zip(batch, prompt_embeds, prompt_mask):
54
+ save_text_encoder_output_cache(item, embed, mask, is_llm)
55
+
56
+
57
+ def prepare_cache_files_and_paths(datasets: list[BaseDataset]):
58
+ all_cache_files_for_dataset = [] # existing cache files
59
+ all_cache_paths_for_dataset = [] # all cache paths in the dataset
60
+ for dataset in datasets:
61
+ all_cache_files = [os.path.normpath(file) for file in dataset.get_all_text_encoder_output_cache_files()]
62
+ all_cache_files = set(all_cache_files)
63
+ all_cache_files_for_dataset.append(all_cache_files)
64
+
65
+ all_cache_paths_for_dataset.append(set())
66
+ return all_cache_files_for_dataset, all_cache_paths_for_dataset
67
+
68
+
69
+ def process_text_encoder_batches(
70
+ num_workers: Optional[int],
71
+ skip_existing: bool,
72
+ batch_size: int,
73
+ datasets: list[BaseDataset],
74
+ all_cache_files_for_dataset: list[set],
75
+ all_cache_paths_for_dataset: list[set],
76
+ encode: callable,
77
+ ):
78
+ num_workers = num_workers if num_workers is not None else max(1, os.cpu_count() - 1)
79
+ for i, dataset in enumerate(datasets):
80
+ logger.info(f"Encoding dataset [{i}]")
81
+ all_cache_files = all_cache_files_for_dataset[i]
82
+ all_cache_paths = all_cache_paths_for_dataset[i]
83
+ for batch in tqdm(dataset.retrieve_text_encoder_output_cache_batches(num_workers)):
84
+ # update cache files (it's ok if we update it multiple times)
85
+ all_cache_paths.update([os.path.normpath(item.text_encoder_output_cache_path) for item in batch])
86
+
87
+ # skip existing cache files
88
+ if skip_existing:
89
+ filtered_batch = [
90
+ item for item in batch if not os.path.normpath(item.text_encoder_output_cache_path) in all_cache_files
91
+ ]
92
+ # print(f"Filtered {len(batch) - len(filtered_batch)} existing cache files")
93
+ if len(filtered_batch) == 0:
94
+ continue
95
+ batch = filtered_batch
96
+
97
+ bs = batch_size if batch_size is not None else len(batch)
98
+ for i in range(0, len(batch), bs):
99
+ encode(batch[i : i + bs])
100
+
101
+
102
+ def post_process_cache_files(
103
+ datasets: list[BaseDataset], all_cache_files_for_dataset: list[set], all_cache_paths_for_dataset: list[set]
104
+ ):
105
+ for i, dataset in enumerate(datasets):
106
+ all_cache_files = all_cache_files_for_dataset[i]
107
+ all_cache_paths = all_cache_paths_for_dataset[i]
108
+ for cache_file in all_cache_files:
109
+ if cache_file not in all_cache_paths:
110
+ if args.keep_cache:
111
+ logger.info(f"Keep cache file not in the dataset: {cache_file}")
112
+ else:
113
+ os.remove(cache_file)
114
+ logger.info(f"Removed old cache file: {cache_file}")
115
+
116
+
117
+ def main(args):
118
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
119
+ device = torch.device(device)
120
+
121
+ # Load dataset config
122
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
123
+ logger.info(f"Load dataset config from {args.dataset_config}")
124
+ user_config = config_utils.load_user_config(args.dataset_config)
125
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
126
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
127
+
128
+ datasets = train_dataset_group.datasets
129
+
130
+ # define accelerator for fp8 inference
131
+ accelerator = None
132
+ if args.fp8_llm:
133
+ accelerator = accelerate.Accelerator(mixed_precision="fp16")
134
+
135
+ # prepare cache files and paths: all_cache_files_for_dataset = exisiting cache files, all_cache_paths_for_dataset = all cache paths in the dataset
136
+ all_cache_files_for_dataset, all_cache_paths_for_dataset = prepare_cache_files_and_paths(datasets)
137
+
138
+ # Load Text Encoder 1
139
+ text_encoder_dtype = torch.float16 if args.text_encoder_dtype is None else str_to_dtype(args.text_encoder_dtype)
140
+ logger.info(f"loading text encoder 1: {args.text_encoder1}")
141
+ text_encoder_1 = text_encoder_module.load_text_encoder_1(args.text_encoder1, device, args.fp8_llm, text_encoder_dtype)
142
+ text_encoder_1.to(device=device)
143
+
144
+ # Encode with Text Encoder 1 (LLM)
145
+ logger.info("Encoding with Text Encoder 1")
146
+
147
+ def encode_for_text_encoder_1(batch: list[ItemInfo]):
148
+ encode_and_save_batch(text_encoder_1, batch, is_llm=True, accelerator=accelerator)
149
+
150
+ process_text_encoder_batches(
151
+ args.num_workers,
152
+ args.skip_existing,
153
+ args.batch_size,
154
+ datasets,
155
+ all_cache_files_for_dataset,
156
+ all_cache_paths_for_dataset,
157
+ encode_for_text_encoder_1,
158
+ )
159
+ del text_encoder_1
160
+
161
+ # Load Text Encoder 2
162
+ logger.info(f"loading text encoder 2: {args.text_encoder2}")
163
+ text_encoder_2 = text_encoder_module.load_text_encoder_2(args.text_encoder2, device, text_encoder_dtype)
164
+ text_encoder_2.to(device=device)
165
+
166
+ # Encode with Text Encoder 2
167
+ logger.info("Encoding with Text Encoder 2")
168
+
169
+ def encode_for_text_encoder_2(batch: list[ItemInfo]):
170
+ encode_and_save_batch(text_encoder_2, batch, is_llm=False, accelerator=None)
171
+
172
+ process_text_encoder_batches(
173
+ args.num_workers,
174
+ args.skip_existing,
175
+ args.batch_size,
176
+ datasets,
177
+ all_cache_files_for_dataset,
178
+ all_cache_paths_for_dataset,
179
+ encode_for_text_encoder_2,
180
+ )
181
+ del text_encoder_2
182
+
183
+ # remove cache files not in dataset
184
+ post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset)
185
+
186
+
187
+ def setup_parser_common():
188
+ parser = argparse.ArgumentParser()
189
+
190
+ parser.add_argument("--dataset_config", type=str, required=True, help="path to dataset config .toml file")
191
+ parser.add_argument("--device", type=str, default=None, help="device to use, default is cuda if available")
192
+ parser.add_argument(
193
+ "--batch_size", type=int, default=None, help="batch size, override dataset config if dataset batch size > this"
194
+ )
195
+ parser.add_argument("--num_workers", type=int, default=None, help="number of workers for dataset. default is cpu count-1")
196
+ parser.add_argument("--skip_existing", action="store_true", help="skip existing cache files")
197
+ parser.add_argument("--keep_cache", action="store_true", help="keep cache files not in dataset")
198
+ return parser
199
+
200
+
201
+ def hv_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
202
+ parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory")
203
+ parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory")
204
+ parser.add_argument("--text_encoder_dtype", type=str, default=None, help="data type for Text Encoder, default is float16")
205
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
206
+ return parser
207
+
208
+
209
+ if __name__ == "__main__":
210
+ parser = setup_parser_common()
211
+ parser = hv_setup_parser(parser)
212
+
213
+ args = parser.parse_args()
214
+ main(args)
convert_lora.py ADDED
@@ -0,0 +1,137 @@
1
+ import argparse
2
+
3
+ import torch
4
+ from safetensors.torch import load_file, save_file
5
+ from safetensors import safe_open
6
+ from utils import model_utils
7
+
8
+ import logging
9
+
10
+
11
+ logger = logging.getLogger(__name__)
12
+ logging.basicConfig(level=logging.INFO)
13
+
14
+
15
+ def convert_from_diffusers(prefix, weights_sd):
16
+ # convert from diffusers(?) to default LoRA
17
+ # Diffusers format: {"diffusion_model.module.name.lora_A.weight": weight, "diffusion_model.module.name.lora_B.weight": weight, ...}
18
+ # default LoRA format: {"prefix_module_name.lora_down.weight": weight, "prefix_module_name.lora_up.weight": weight, ...}
19
+
20
+ # note: Diffusers has no alpha, so alpha is set to rank
21
+ new_weights_sd = {}
22
+ lora_dims = {}
23
+ for key, weight in weights_sd.items():
24
+ diffusers_prefix, key_body = key.split(".", 1)
25
+ if diffusers_prefix != "diffusion_model" and diffusers_prefix != "transformer":
26
+ logger.warning(f"unexpected key: {key} in diffusers format")
27
+ continue
28
+
29
+ new_key = f"{prefix}{key_body}".replace(".", "_").replace("_lora_A_", ".lora_down.").replace("_lora_B_", ".lora_up.")
30
+ new_weights_sd[new_key] = weight
31
+
32
+ lora_name = new_key.split(".")[0] # before first dot
33
+ if lora_name not in lora_dims and "lora_down" in new_key:
34
+ lora_dims[lora_name] = weight.shape[0]
35
+
36
+ # add alpha with rank
37
+ for lora_name, dim in lora_dims.items():
38
+ new_weights_sd[f"{lora_name}.alpha"] = torch.tensor(dim)
39
+
40
+ return new_weights_sd
41
+
42
+
43
+ def convert_to_diffusers(prefix, weights_sd):
44
+ # convert from default LoRA to diffusers
45
+
46
+ # get alphas
47
+ lora_alphas = {}
48
+ for key, weight in weights_sd.items():
49
+ if key.startswith(prefix):
50
+ lora_name = key.split(".", 1)[0] # before first dot
51
+ if lora_name not in lora_alphas and "alpha" in key:
52
+ lora_alphas[lora_name] = weight
53
+
54
+ new_weights_sd = {}
55
+ for key, weight in weights_sd.items():
56
+ if key.startswith(prefix):
57
+ if "alpha" in key:
58
+ continue
59
+
60
+ lora_name = key.split(".", 1)[0] # before first dot
61
+
62
+ module_name = lora_name[len(prefix) :] # remove "lora_unet_"
63
+ module_name = module_name.replace("_", ".") # replace "_" with "."
64
+ if ".cross.attn." in module_name or ".self.attn." in module_name:
65
+ # Wan2.1 lora name to module name: ugly but works
66
+ module_name = module_name.replace("cross.attn", "cross_attn") # fix cross attn
67
+ module_name = module_name.replace("self.attn", "self_attn") # fix self attn
68
+ module_name = module_name.replace("k.img", "k_img") # fix k img
69
+ module_name = module_name.replace("v.img", "v_img") # fix v img
70
+ else:
71
+ # HunyuanVideo lora name to module name: ugly but works
72
+ module_name = module_name.replace("double.blocks.", "double_blocks.") # fix double blocks
73
+ module_name = module_name.replace("single.blocks.", "single_blocks.") # fix single blocks
74
+ module_name = module_name.replace("img.", "img_") # fix img
75
+ module_name = module_name.replace("txt.", "txt_") # fix txt
76
+ module_name = module_name.replace("attn.", "attn_") # fix attn
77
+
78
+ diffusers_prefix = "diffusion_model"
79
+ if "lora_down" in key:
80
+ new_key = f"{diffusers_prefix}.{module_name}.lora_A.weight"
81
+ dim = weight.shape[0]
82
+ elif "lora_up" in key:
83
+ new_key = f"{diffusers_prefix}.{module_name}.lora_B.weight"
84
+ dim = weight.shape[1]
85
+ else:
86
+ logger.warning(f"unexpected key: {key} in default LoRA format")
87
+ continue
88
+
89
+ # scale weight by alpha
90
+ if lora_name in lora_alphas:
91
+ # we scale both down and up, so scale is sqrt
92
+ scale = lora_alphas[lora_name] / dim
93
+ scale = scale.sqrt()
94
+ weight = weight * scale
95
+ else:
96
+ logger.warning(f"missing alpha for {lora_name}")
97
+
98
+ new_weights_sd[new_key] = weight
99
+
100
+ return new_weights_sd
101
+
102
+
103
+ def convert(input_file, output_file, target_format):
104
+ logger.info(f"loading {input_file}")
105
+ weights_sd = load_file(input_file)
106
+ with safe_open(input_file, framework="pt") as f:
107
+ metadata = f.metadata()
108
+
109
+ logger.info(f"converting to {target_format}")
110
+ prefix = "lora_unet_"
111
+ if target_format == "default":
112
+ new_weights_sd = convert_from_diffusers(prefix, weights_sd)
113
+ metadata = metadata or {}
114
+ model_utils.precalculate_safetensors_hashes(new_weights_sd, metadata)
115
+ elif target_format == "other":
116
+ new_weights_sd = convert_to_diffusers(prefix, weights_sd)
117
+ else:
118
+ raise ValueError(f"unknown target format: {target_format}")
119
+
120
+ logger.info(f"saving to {output_file}")
121
+ save_file(new_weights_sd, output_file, metadata=metadata)
122
+
123
+ logger.info("done")
124
+
125
+
126
+ def parse_args():
127
+ parser = argparse.ArgumentParser(description="Convert LoRA weights between default and other formats")
128
+ parser.add_argument("--input", type=str, required=True, help="input model file")
129
+ parser.add_argument("--output", type=str, required=True, help="output model file")
130
+ parser.add_argument("--target", type=str, required=True, choices=["other", "default"], help="target format")
131
+ args = parser.parse_args()
132
+ return args
133
+
134
+
135
+ if __name__ == "__main__":
136
+ args = parse_args()
137
+ convert(args.input, args.output, args.target)
dataset/__init__.py ADDED
File without changes
dataset/config_utils.py ADDED
@@ -0,0 +1,381 @@
1
+ import argparse
2
+ from dataclasses import (
3
+ asdict,
4
+ dataclass,
5
+ )
6
+ import functools
7
+ import random
8
+ from textwrap import dedent, indent
9
+ import json
10
+ from pathlib import Path
11
+
12
+ # from toolz import curry
13
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
14
+
15
+ import toml
16
+ import voluptuous
17
+ from voluptuous import Any, ExactSequence, MultipleInvalid, Object, Schema
18
+
19
+ from .image_video_dataset import DatasetGroup, ImageDataset, VideoDataset
20
+
21
+ import logging
22
+
23
+ logger = logging.getLogger(__name__)
24
+ logging.basicConfig(level=logging.INFO)
25
+
26
+
27
+ @dataclass
28
+ class BaseDatasetParams:
29
+ resolution: Tuple[int, int] = (960, 544)
30
+ enable_bucket: bool = False
31
+ bucket_no_upscale: bool = False
32
+ caption_extension: Optional[str] = None
33
+ batch_size: int = 1
34
+ num_repeats: int = 1
35
+ cache_directory: Optional[str] = None
36
+ debug_dataset: bool = False
37
+ architecture: str = "no_default" # short style like "hv" or "wan"
38
+
39
+
40
+ @dataclass
41
+ class ImageDatasetParams(BaseDatasetParams):
42
+ image_directory: Optional[str] = None
43
+ image_jsonl_file: Optional[str] = None
44
+
45
+
46
+ @dataclass
47
+ class VideoDatasetParams(BaseDatasetParams):
48
+ video_directory: Optional[str] = None
49
+ video_jsonl_file: Optional[str] = None
50
+ control_directory: Optional[str] = None
51
+ target_frames: Sequence[int] = (1,)
52
+ frame_extraction: Optional[str] = "head"
53
+ frame_stride: Optional[int] = 1
54
+ frame_sample: Optional[int] = 1
55
+ max_frames: Optional[int] = 129
56
+ source_fps: Optional[float] = None
57
+
58
+
59
+ @dataclass
60
+ class DatasetBlueprint:
61
+ is_image_dataset: bool
62
+ params: Union[ImageDatasetParams, VideoDatasetParams]
63
+
64
+
65
+ @dataclass
66
+ class DatasetGroupBlueprint:
67
+ datasets: Sequence[DatasetBlueprint]
68
+
69
+
70
+ @dataclass
71
+ class Blueprint:
72
+ dataset_group: DatasetGroupBlueprint
73
+
74
+
75
+ class ConfigSanitizer:
76
+ # @curry
77
+ @staticmethod
78
+ def __validate_and_convert_twodim(klass, value: Sequence) -> Tuple:
79
+ Schema(ExactSequence([klass, klass]))(value)
80
+ return tuple(value)
81
+
82
+ # @curry
83
+ @staticmethod
84
+ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]) -> Tuple:
85
+ Schema(Any(klass, ExactSequence([klass, klass])))(value)
86
+ try:
87
+ Schema(klass)(value)
88
+ return (value, value)
89
+ except:
90
+ return ConfigSanitizer.__validate_and_convert_twodim(klass, value)
91
+
92
+ # datasets schema
93
+ DATASET_ASCENDABLE_SCHEMA = {
94
+ "caption_extension": str,
95
+ "batch_size": int,
96
+ "num_repeats": int,
97
+ "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int),
98
+ "enable_bucket": bool,
99
+ "bucket_no_upscale": bool,
100
+ }
101
+ IMAGE_DATASET_DISTINCT_SCHEMA = {
102
+ "image_directory": str,
103
+ "image_jsonl_file": str,
104
+ "cache_directory": str,
105
+ }
106
+ VIDEO_DATASET_DISTINCT_SCHEMA = {
107
+ "video_directory": str,
108
+ "video_jsonl_file": str,
109
+ "control_directory": str,
110
+ "target_frames": [int],
111
+ "frame_extraction": str,
112
+ "frame_stride": int,
113
+ "frame_sample": int,
114
+ "max_frames": int,
115
+ "cache_directory": str,
116
+ "source_fps": float,
117
+ }
118
+
119
+ # options handled by argparse but not handled by user config
120
+ ARGPARSE_SPECIFIC_SCHEMA = {
121
+ "debug_dataset": bool,
122
+ }
123
+
124
+ def __init__(self) -> None:
125
+ self.image_dataset_schema = self.__merge_dict(
126
+ self.DATASET_ASCENDABLE_SCHEMA,
127
+ self.IMAGE_DATASET_DISTINCT_SCHEMA,
128
+ )
129
+ self.video_dataset_schema = self.__merge_dict(
130
+ self.DATASET_ASCENDABLE_SCHEMA,
131
+ self.VIDEO_DATASET_DISTINCT_SCHEMA,
132
+ )
133
+
134
+ def validate_flex_dataset(dataset_config: dict):
135
+ if "video_directory" in dataset_config or "video_jsonl_file" in dataset_config:
136
+ return Schema(self.video_dataset_schema)(dataset_config)
137
+ else:
138
+ return Schema(self.image_dataset_schema)(dataset_config)
139
+
140
+ self.dataset_schema = validate_flex_dataset
141
+
142
+ self.general_schema = self.__merge_dict(
143
+ self.DATASET_ASCENDABLE_SCHEMA,
144
+ )
145
+ self.user_config_validator = Schema(
146
+ {
147
+ "general": self.general_schema,
148
+ "datasets": [self.dataset_schema],
149
+ }
150
+ )
151
+ self.argparse_schema = self.__merge_dict(
152
+ self.ARGPARSE_SPECIFIC_SCHEMA,
153
+ )
154
+ self.argparse_config_validator = Schema(Object(self.argparse_schema), extra=voluptuous.ALLOW_EXTRA)
155
+
156
+ def sanitize_user_config(self, user_config: dict) -> dict:
157
+ try:
158
+ return self.user_config_validator(user_config)
159
+ except MultipleInvalid:
160
+ # TODO: clarify the error message
161
+ logger.error("Invalid user config / ユーザ設定の形式が正しくないようです")
162
+ raise
163
+
164
+ # NOTE: Strictly speaking, the argparse result does not need to be sanitized,
+ # but doing so helps us detect bugs in the program early.
166
+ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> argparse.Namespace:
167
+ try:
168
+ return self.argparse_config_validator(argparse_namespace)
169
+ except MultipleInvalid:
170
+ # XXX: this should be a bug
171
+ logger.error(
172
+ "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。"
173
+ )
174
+ raise
175
+
176
+ # NOTE: a value is overwritten by a later dict if the same key already exists
177
+ @staticmethod
178
+ def __merge_dict(*dict_list: dict) -> dict:
179
+ merged = {}
180
+ for schema in dict_list:
181
+ # merged |= schema
182
+ for k, v in schema.items():
183
+ merged[k] = v
184
+ return merged
185
+
186
+
187
+ class BlueprintGenerator:
188
+ BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME = {}
189
+
190
+ def __init__(self, sanitizer: ConfigSanitizer):
191
+ self.sanitizer = sanitizer
192
+
193
+ # runtime_params is for parameters that are only configurable at runtime, such as the tokenizer
194
+ def generate(self, user_config: dict, argparse_namespace: argparse.Namespace, **runtime_params) -> Blueprint:
195
+ sanitized_user_config = self.sanitizer.sanitize_user_config(user_config)
196
+ sanitized_argparse_namespace = self.sanitizer.sanitize_argparse_namespace(argparse_namespace)
197
+
198
+ argparse_config = {k: v for k, v in vars(sanitized_argparse_namespace).items() if v is not None}
199
+ general_config = sanitized_user_config.get("general", {})
200
+
201
+ dataset_blueprints = []
202
+ for dataset_config in sanitized_user_config.get("datasets", []):
203
+ is_image_dataset = "image_directory" in dataset_config or "image_jsonl_file" in dataset_config
204
+ if is_image_dataset:
205
+ dataset_params_klass = ImageDatasetParams
206
+ else:
207
+ dataset_params_klass = VideoDatasetParams
208
+
209
+ params = self.generate_params_by_fallbacks(
210
+ dataset_params_klass, [dataset_config, general_config, argparse_config, runtime_params]
211
+ )
212
+ dataset_blueprints.append(DatasetBlueprint(is_image_dataset, params))
213
+
214
+ dataset_group_blueprint = DatasetGroupBlueprint(dataset_blueprints)
215
+
216
+ return Blueprint(dataset_group_blueprint)
217
+
218
+ @staticmethod
219
+ def generate_params_by_fallbacks(param_klass, fallbacks: Sequence[dict]):
220
+ name_map = BlueprintGenerator.BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME
221
+ search_value = BlueprintGenerator.search_value
222
+ default_params = asdict(param_klass())
223
+ param_names = default_params.keys()
224
+
225
+ params = {name: search_value(name_map.get(name, name), fallbacks, default_params.get(name)) for name in param_names}
226
+
227
+ return param_klass(**params)
228
+
229
+ @staticmethod
230
+ def search_value(key: str, fallbacks: Sequence[dict], default_value=None):
231
+ for cand in fallbacks:
232
+ value = cand.get(key)
233
+ if value is not None:
234
+ return value
235
+
236
+ return default_value
237
+
238
+
239
+ # if training is True, it will return a dataset group for training, otherwise for caching
240
+ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint, training: bool = False) -> DatasetGroup:
241
+ datasets: List[Union[ImageDataset, VideoDataset]] = []
242
+
243
+ for dataset_blueprint in dataset_group_blueprint.datasets:
244
+ if dataset_blueprint.is_image_dataset:
245
+ dataset_klass = ImageDataset
246
+ else:
247
+ dataset_klass = VideoDataset
248
+
249
+ dataset = dataset_klass(**asdict(dataset_blueprint.params))
250
+ datasets.append(dataset)
251
+
252
+ # assertion
253
+ cache_directories = [dataset.cache_directory for dataset in datasets]
254
+ num_of_unique_cache_directories = len(set(cache_directories))
255
+ if num_of_unique_cache_directories != len(cache_directories):
256
+ raise ValueError(
257
+ "cache directory should be unique for each dataset (note that cache directory is image/video directory if not specified)"
258
+ + " / cache directory は各データセットごとに異なる必要があります(指定されていない場合はimage/video directoryが使われるので注意)"
259
+ )
260
+
261
+ # print info
262
+ info = ""
263
+ for i, dataset in enumerate(datasets):
264
+ is_image_dataset = isinstance(dataset, ImageDataset)
265
+ info += dedent(
266
+ f"""\
267
+ [Dataset {i}]
268
+ is_image_dataset: {is_image_dataset}
269
+ resolution: {dataset.resolution}
270
+ batch_size: {dataset.batch_size}
271
+ num_repeats: {dataset.num_repeats}
272
+ caption_extension: "{dataset.caption_extension}"
273
+ enable_bucket: {dataset.enable_bucket}
274
+ bucket_no_upscale: {dataset.bucket_no_upscale}
275
+ cache_directory: "{dataset.cache_directory}"
276
+ debug_dataset: {dataset.debug_dataset}
277
+ """
278
+ )
279
+
280
+ if is_image_dataset:
281
+ info += indent(
282
+ dedent(
283
+ f"""\
284
+ image_directory: "{dataset.image_directory}"
285
+ image_jsonl_file: "{dataset.image_jsonl_file}"
286
+ \n"""
287
+ ),
288
+ " ",
289
+ )
290
+ else:
291
+ info += indent(
292
+ dedent(
293
+ f"""\
294
+ video_directory: "{dataset.video_directory}"
295
+ video_jsonl_file: "{dataset.video_jsonl_file}"
296
+ control_directory: "{dataset.control_directory}"
297
+ target_frames: {dataset.target_frames}
298
+ frame_extraction: {dataset.frame_extraction}
299
+ frame_stride: {dataset.frame_stride}
300
+ frame_sample: {dataset.frame_sample}
301
+ max_frames: {dataset.max_frames}
302
+ source_fps: {dataset.source_fps}
303
+ \n"""
304
+ ),
305
+ " ",
306
+ )
307
+ logger.info(f"{info}")
308
+
309
+ # make buckets first because it determines the length of dataset
310
+ # and set the same seed for all datasets
311
+ seed = random.randint(0, 2**31) # actual seed is seed + epoch_no
312
+ for i, dataset in enumerate(datasets):
313
+ # logger.info(f"[Dataset {i}]")
314
+ dataset.set_seed(seed)
315
+ if training:
316
+ dataset.prepare_for_training()
317
+
318
+ return DatasetGroup(datasets)
319
+
320
+
321
+ def load_user_config(file: str) -> dict:
322
+ file: Path = Path(file)
323
+ if not file.is_file():
324
+ raise ValueError(f"file not found / ファイルが見つかりません: {file}")
325
+
326
+ if file.name.lower().endswith(".json"):
327
+ try:
328
+ with open(file, "r", encoding="utf-8") as f:
329
+ config = json.load(f)
330
+ except Exception:
331
+ logger.error(
332
+ f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
333
+ )
334
+ raise
335
+ elif file.name.lower().endswith(".toml"):
336
+ try:
337
+ config = toml.load(file)
338
+ except Exception:
339
+ logger.error(
340
+ f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
341
+ )
342
+ raise
343
+ else:
344
+ raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}")
345
+
346
+ return config
347
+
348
+
349
+ # for config test
350
+ if __name__ == "__main__":
351
+ parser = argparse.ArgumentParser()
352
+ parser.add_argument("dataset_config")
353
+ config_args, remain = parser.parse_known_args()
354
+
355
+ parser = argparse.ArgumentParser()
356
+ parser.add_argument("--debug_dataset", action="store_true")
357
+ argparse_namespace = parser.parse_args(remain)
358
+
359
+ logger.info("[argparse_namespace]")
360
+ logger.info(f"{vars(argparse_namespace)}")
361
+
362
+ user_config = load_user_config(config_args.dataset_config)
363
+
364
+ logger.info("")
365
+ logger.info("[user_config]")
366
+ logger.info(f"{user_config}")
367
+
368
+ sanitizer = ConfigSanitizer()
369
+ sanitized_user_config = sanitizer.sanitize_user_config(user_config)
370
+
371
+ logger.info("")
372
+ logger.info("[sanitized_user_config]")
373
+ logger.info(f"{sanitized_user_config}")
374
+
375
+ blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace)
376
+
377
+ logger.info("")
378
+ logger.info("[blueprint]")
379
+ logger.info(f"{blueprint}")
380
+
381
+ dataset_group = generate_dataset_group_by_blueprint(blueprint.dataset_group)
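For reference, the snippet below sketches how this configuration module can be driven from code rather than from the command line, mirroring the `__main__` block above. The config path and the `debug_dataset` flag are placeholders, and it assumes the repository root is on `PYTHONPATH` so that `dataset.config_utils` is importable.

```python
# Minimal sketch: build a DatasetGroup from a TOML dataset config in code.
import argparse

from dataset.config_utils import (
    BlueprintGenerator,
    ConfigSanitizer,
    generate_dataset_group_by_blueprint,
    load_user_config,
)

user_config = load_user_config("dataset_config.toml")  # placeholder path
args = argparse.Namespace(debug_dataset=False)  # normally produced by the training script's parser

sanitizer = ConfigSanitizer()
blueprint = BlueprintGenerator(sanitizer).generate(user_config, args)

# training=True additionally calls prepare_for_training() on each dataset
dataset_group = generate_dataset_group_by_blueprint(blueprint.dataset_group, training=True)
```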
dataset/dataset_config.md ADDED
@@ -0,0 +1,461 @@
1
+ > 📝 Click on the language section to expand / 言語をクリックして展開
2
+
3
+ ## Dataset Configuration
4
+
5
+ Please create a TOML file for dataset configuration.
6
+
7
+ Image and video datasets are supported. The configuration file can include multiple datasets, either image or video datasets, with caption text files or metadata JSONL files.
8
+
9
+ The cache directory must be different for each dataset.
10
+
11
+ Each video is extracted frame by frame without additional processing and used for training. It is recommended to use videos with a frame rate of 24fps for HunyuanVideo, 16fps for Wan2.1, and 30fps for FramePack. You can check the videos that will be used for training with `--debug_mode video` when caching latents (see [here](/README.md#latent-caching)).
12
+ <details>
13
+ <summary>日本語</summary>
14
+
15
+ データセットの設定を行うためのTOMLファイルを作成してください。
16
+
17
+ 画像データセットと動画データセットがサポートされています。設定ファイルには、画像または動画データセットを複数含めることができます。キャプションテキストファイルまたはメタデータJSONLファイルを使用できます。
18
+
19
+ キャッシュディレクトリは、各データセットごとに異なるディレクトリである必要があります。
20
+
21
+ 動画は追加のプロセスなしでフレームごとに抽出され、学習に用いられます。そのため、HunyuanVideoは24fps、Wan2.1は16fps、FramePackは30fpsのフレームレートの動画を使用することをお勧めします。latentキャッシュ時の`--debug_mode video`を使用すると、学習される動画を確認できます([こちら](/README.ja.md#latentの事前キャッシュ)を参照)。
22
+ </details>
23
+
24
+ ### Sample for Image Dataset with Caption Text Files
25
+
26
+ ```toml
27
+ # resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
28
+ # otherwise, the default values will be used for each item
29
+
30
+ # general configurations
31
+ [general]
32
+ resolution = [960, 544]
33
+ caption_extension = ".txt"
34
+ batch_size = 1
35
+ enable_bucket = true
36
+ bucket_no_upscale = false
37
+
38
+ [[datasets]]
39
+ image_directory = "/path/to/image_dir"
40
+ cache_directory = "/path/to/cache_directory"
41
+ num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
42
+
43
+ # other datasets can be added here. each dataset can have different configurations
44
+ ```
45
+
46
+ `cache_directory` is optional; the default is None, which means the image directory itself is used. However, we recommend setting a dedicated cache directory to avoid accidentally sharing cache files between different datasets.
47
+
48
+ `num_repeats` is also available. It is optional; the default is 1 (no repetition). It simply repeats each image (or video) that many times to expand the dataset. For example, with `num_repeats = 2` and 20 images in the dataset, each image appears twice (with the same caption), for a total of 40 images. This is useful for balancing multiple datasets of different sizes.
49
+
50
+ <details>
51
+ <summary>日本語</summary>
52
+
53
+ `cache_directory` はオプションです。デフォルトは画像ディレクトリと同じディレクトリに設定されます。ただし、異なるデータセット間でキャッシュファイルが共有されるのを防ぐために、明示的に別のキャッシュディレクトリを設定することをお勧めします。
54
+
55
+ `num_repeats` はオプションで、デフォルトは 1 です(繰り返しなし)。画像(や動画)を、その回数だけ単純に繰り返してデータセットを拡張します。たとえば`num_repeats = 2`としたとき、画像20枚のデータセットなら、各画像が2枚ずつ(同一のキャプションで)計40枚存在した場合と同じになります。異なるデータ数のデータセット間でバランスを取るために使用可能です。
56
+
57
+ resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。省略時は各項目のデフォルト値が使用されます。
58
+
59
+ `[[datasets]]`以下を追加することで、他のデータセットを追加できます。各データセットには異なる設定を持てます。
60
+ </details>
61
+
62
+ ### Sample for Image Dataset with Metadata JSONL File
63
+
64
+ ```toml
65
+ # resolution, batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
66
+ # caption_extension is not required for metadata jsonl file
67
+ # cache_directory is required for each dataset with metadata jsonl file
68
+
69
+ # general configurations
70
+ [general]
71
+ resolution = [960, 544]
72
+ batch_size = 1
73
+ enable_bucket = true
74
+ bucket_no_upscale = false
75
+
76
+ [[datasets]]
77
+ image_jsonl_file = "/path/to/metadata.jsonl"
78
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
79
+ num_repeats = 1 # optional, default is 1. Same as above.
80
+
81
+ # other datasets can be added here. each dataset can have different configurations
82
+ ```
83
+
84
+ JSONL file format for metadata:
85
+
86
+ ```json
87
+ {"image_path": "/path/to/image1.jpg", "caption": "A caption for image1"}
88
+ {"image_path": "/path/to/image2.jpg", "caption": "A caption for image2"}
89
+ ```
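A metadata JSONL file in this format can be generated from an existing folder of images and caption text files with a few lines of Python. The sketch below is only illustrative; the directory path, the `.jpg`/`.txt` layout, and the output filename are assumptions.

```python
# Sketch: build metadata.jsonl from images with matching .txt caption files (assumed layout).
import json
from pathlib import Path

image_dir = Path("/path/to/image_dir")  # placeholder
with open("metadata.jsonl", "w", encoding="utf-8") as f:
    for image_path in sorted(image_dir.glob("*.jpg")):
        caption_path = image_path.with_suffix(".txt")
        if not caption_path.exists():
            continue  # skip images without a caption
        caption = caption_path.read_text(encoding="utf-8").strip()
        f.write(json.dumps({"image_path": str(image_path), "caption": caption}, ensure_ascii=False) + "\n")
```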
90
+
91
+ <details>
92
+ <summary>日本語</summary>
93
+
94
+ resolution, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。省略時は各項目のデフォルト値が使用されます。
95
+
96
+ metadata jsonl ファイルを使用する場合、caption_extension は必要ありません。また、cache_directory は必須です。
97
+
98
+ キャプションによるデータセットと同様に、複数のデータセットを追加できます。各データセットには異なる設定を持てます。
99
+ </details>
100
+
101
+
102
+ ### Sample for Video Dataset with Caption Text Files
103
+
104
+ ```toml
105
+ # Common parameters (resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)
106
+ # can be set in either general or datasets sections
107
+ # Video-specific parameters (target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)
108
+ # must be set in each datasets section
109
+
110
+ # general configurations
111
+ [general]
112
+ resolution = [960, 544]
113
+ caption_extension = ".txt"
114
+ batch_size = 1
115
+ enable_bucket = true
116
+ bucket_no_upscale = false
117
+
118
+ [[datasets]]
119
+ video_directory = "/path/to/video_dir"
120
+ cache_directory = "/path/to/cache_directory" # recommended to set cache directory
121
+ target_frames = [1, 25, 45]
122
+ frame_extraction = "head"
123
+ source_fps = 30.0 # optional, source fps for videos in the directory, decimal number
124
+
125
+ [[datasets]]
126
+ video_directory = "/path/to/video_dir2"
127
+ cache_directory = "/path/to/cache_directory2" # recommended to set cache directory
128
+ frame_extraction = "full"
129
+ max_frames = 45
130
+
131
+ # other datasets can be added here. each dataset can have different configurations
132
+ ```
133
+
134
+ __In HunyuanVideo and Wan2.1, each value in `target_frames` must be "N\*4+1" (N=0,1,2,...).__ Otherwise, it will be truncated to the nearest "N*4+1".
135
+
136
+ In FramePack, it is recommended to set `frame_extraction` to `full` and `max_frames` to a sufficiently large value, as it can handle longer videos. However, if the video is too long, an Out of Memory error may occur during VAE encoding. The videos in FramePack are trimmed to "N * latent_window_size * 4 + 1" frames (for example, 37, 73, 109... if `latent_window_size` is 9).
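These frame-count rules can be illustrated with a small sketch. This is an assumption-level illustration of the behavior described above (the helper names are hypothetical), not the actual training code.

```python
# Illustrative sketch of the frame-count rules described above.

def truncate_to_n4p1(frames: int) -> int:
    """HunyuanVideo / Wan2.1: a target_frames value is truncated down to the nearest N*4+1."""
    return (frames - 1) // 4 * 4 + 1

def framepack_trim_lengths(latent_window_size: int = 9, max_sections: int = 3) -> list[int]:
    """FramePack: videos are trimmed to N * latent_window_size * 4 + 1 frames."""
    return [n * latent_window_size * 4 + 1 for n in range(1, max_sections + 1)]

print(truncate_to_n4p1(48))       # 45
print(framepack_trim_lengths())   # [37, 73, 109]
```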
137
+
138
+ If the `source_fps` is specified, the videos in the directory are considered to be at this frame rate, and some frames will be skipped to match the model's frame rate (24 for HunyuanVideo and 16 for Wan2.1). __The value must be a decimal number, for example, `30.0` instead of `30`.__ The skipping is done automatically and does not consider the content of the images. Please check if the converted data is correct using `--debug_mode video`.
139
+
140
+ If `source_fps` is not specified (default), all frames of the video will be used regardless of the video's frame rate.
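The skipping rule is a simple fractional step over the source frames; the sketch below mirrors the logic of `load_video()` in `dataset/image_video_dataset.py`.

```python
# Which source frame indices are kept when a source_fps video is mapped to the model fps.
def select_frame_indices(num_frames: int, source_fps: float, target_fps: float) -> list[int]:
    delta = target_fps / source_fps  # e.g. 16 / 30 = 0.5333
    kept, position, previous = [], 0.0, -1
    for i in range(num_frames):
        target_index = int(position)
        position += delta
        if target_index == previous:
            continue  # this source frame maps to an already-filled target slot, so drop it
        previous = target_index
        kept.append(i)
    return kept

print(select_frame_indices(10, source_fps=30.0, target_fps=16.0))  # [0, 2, 4, 6, 8]
```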
141
+
142
+ <details>
143
+ <summary>日本語</summary>
144
+
145
+ 共通パラメータ(resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)は、generalまたはdatasetsのいずれかに設定できます。
146
+ 動画固有のパラメータ(target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)は、各datasetsセクションに設定する必要があります。
147
+
148
+ __HunyuanVideoおよびWan2.1では、target_framesの数値は「N\*4+1」である必要があります。__ これ以外の値の場合は、最も近いN\*4+1の値に切り捨てられます。
149
+
150
+ FramePackでも同様ですが、FramePackでは動画が長くても学習可能なため、 `frame_extraction`に`full` を指定し、`max_frames`を十分に大きな値に設定することをお勧めします。ただし、あまりにも長すぎるとVAEのencodeでOut of Memoryエラーが発生する可能性があります。FramePackの動画は、「N * latent_window_size * 4 + 1」フレームにトリミングされます(latent_window_sizeが9の場合、37、73、109……)。
151
+
152
+ `source_fps`を指定した場合、ディレクトリ内の動画をこのフレームレートとみなして、モデルのフレームレートにあうようにいくつかのフレームをスキップします(HunyuanVideoは24、Wan2.1は16)。__小数点を含む数値で指定してください。__ 例:`30`ではなく`30.0`。スキップは機械的に行われ、画像の内容は考慮しません。変換後のデータが正しいか、`--debug_mode video`で確認してください。
153
+
154
+ `source_fps`を指定しない場合、動画のフレームは(動画自体のフレームレートに関係なく)すべて使用されます。
155
+
156
+ 他の注意事項は画像データセットと同様です。
157
+ </details>
158
+
159
+ ### Sample for Video Dataset with Metadata JSONL File
160
+
161
+ ```toml
162
+ # Common parameters (resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)
163
+ # can be set in either general or datasets sections
164
+ # Video-specific parameters (target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)
165
+ # must be set in each datasets section
166
+
167
+ # caption_extension is not required for metadata jsonl file
168
+ # cache_directory is required for each dataset with metadata jsonl file
169
+
170
+ # general configurations
171
+ [general]
172
+ resolution = [960, 544]
173
+ batch_size = 1
174
+ enable_bucket = true
175
+ bucket_no_upscale = false
176
+
177
+ [[datasets]]
178
+ video_jsonl_file = "/path/to/metadata.jsonl"
179
+ target_frames = [1, 25, 45]
180
+ frame_extraction = "head"
181
+ cache_directory = "/path/to/cache_directory_head"
182
+ source_fps = 30.0 # optional, source fps for videos in the jsonl file
183
+ # same metadata jsonl file can be used for multiple datasets
184
+ [[datasets]]
185
+ video_jsonl_file = "/path/to/metadata.jsonl"
186
+ target_frames = [1]
187
+ frame_stride = 10
188
+ cache_directory = "/path/to/cache_directory_stride"
189
+
190
+ # other datasets can be added here. each dataset can have different configurations
191
+ ```
192
+
193
+ JSONL file format for metadata:
194
+
195
+ ```json
196
+ {"video_path": "/path/to/video1.mp4", "caption": "A caption for video1"}
197
+ {"video_path": "/path/to/video2.mp4", "caption": "A caption for video2"}
198
+ ```
199
+
200
+ `video_path` can be a directory containing multiple images.
201
+
202
+ <details>
203
+ <summary>日本語</summary>
204
+ metadata jsonl ファイルを使用する場合、caption_extension は必要ありません。また、cache_directory は必須です。
205
+
206
+ `video_path`は、複数の画像を含むディレクトリのパスでも構いません。
207
+
208
+ 他の注意事項は今までのデータセットと同様です。
209
+ </details>
210
+
211
+ ### frame_extraction Options
212
+
213
+ - `head`: Extract the first N frames from the video.
214
+ - `chunk`: Extract frames by splitting the video into chunks of N frames.
215
+ - `slide`: Extract frames from the video with a stride of `frame_stride`.
216
+ - `uniform`: Extract `frame_sample` samples uniformly from the video.
217
+ - `full`: Extract all frames from the video.
218
+
219
+ In the case of `full`, the entire video is used, but it is trimmed to "N*4+1" frames. It is also trimmed to `max_frames` if it exceeds that value. To avoid Out of Memory errors, please set `max_frames`.
220
+
221
+ The frame extraction methods other than `full` are recommended when the video contains repeated actions. `full` is recommended when each video represents a single complete motion.
222
+
223
+ For example, consider a video with 40 frames. The following diagrams illustrate each extraction:
224
+
225
+ <details>
226
+ <summary>日本語</summary>
227
+
228
+ - `head`: 動画から最初のNフレームを抽出します。
229
+ - `chunk`: 動画をNフレームずつに分割してフレームを抽出します。
230
+ - `slide`: `frame_stride`に指定したフレームごとに動画からNフレームを抽出します。
231
+ - `uniform`: 動画から一定間隔で、`frame_sample`個のNフレームを抽出します。
232
+ - `full`: 動画から全てのフレームを抽出します。
233
+
234
+ `full`の場合、各動画の全体を用いますが、「N*4+1」のフレーム数にトリミングされます。また`max_frames`を超える場合もその値にトリミングされます。Out of Memoryエラーを避けるために、`max_frames`を設定してください。
235
+
236
+ `full`以外の抽出方法は、動画が特定の動作を繰り返している場合にお勧めします。`full`はそれぞれの動画がひとつの完結したモーションの場合にお勧めします。
237
+
238
+ 例えば、40フレームの動画を例とした抽出について、以下の図で説明します。
239
+ </details>
240
+
241
+ ```
242
+ Original Video, 40 frames: x = frame, o = no frame
243
+ oooooooooooooooooooooooooooooooooooooooo
244
+
245
+ head, target_frames = [1, 13, 25] -> extract head frames:
246
+ xooooooooooooooooooooooooooooooooooooooo
247
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
248
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
249
+
250
+ chunk, target_frames = [13, 25] -> extract frames by splitting into chunks, into 13 and 25 frames:
251
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
252
+ oooooooooooooxxxxxxxxxxxxxoooooooooooooo
253
+ ooooooooooooooooooooooooooxxxxxxxxxxxxxo
254
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
255
+
256
+ NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will cause all frames to be extracted.
257
+ 注: frame_extraction "chunk" を使用する場合、target_frames に 1 を含めないでください。全てのフレームが抽出されてしまいます。
258
+
259
+ slide, target_frames = [1, 13, 25], frame_stride = 10 -> extract N frames with a stride of 10:
260
+ xooooooooooooooooooooooooooooooooooooooo
261
+ ooooooooooxooooooooooooooooooooooooooooo
262
+ ooooooooooooooooooooxooooooooooooooooooo
263
+ ooooooooooooooooooooooooooooooxooooooooo
264
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
265
+ ooooooooooxxxxxxxxxxxxxooooooooooooooooo
266
+ ooooooooooooooooooooxxxxxxxxxxxxxooooooo
267
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
268
+ ooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxooooo
269
+
270
+ uniform, target_frames =[1, 13, 25], frame_sample = 4 -> extract `frame_sample` samples uniformly, N frames each:
271
+ xooooooooooooooooooooooooooooooooooooooo
272
+ oooooooooooooxoooooooooooooooooooooooooo
273
+ oooooooooooooooooooooooooxoooooooooooooo
274
+ ooooooooooooooooooooooooooooooooooooooox
275
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
276
+ oooooooooxxxxxxxxxxxxxoooooooooooooooooo
277
+ ooooooooooooooooooxxxxxxxxxxxxxooooooooo
278
+ oooooooooooooooooooooooooooxxxxxxxxxxxxx
279
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
280
+ oooooxxxxxxxxxxxxxxxxxxxxxxxxxoooooooooo
281
+ ooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxooooo
282
+ oooooooooooooooxxxxxxxxxxxxxxxxxxxxxxxxx
283
+
284
+ Three Original Videos, 20, 25, 35 frames: x = frame, o = no frame
285
+
286
+ full, max_frames = 31 -> extract all frames (trimmed to the maximum length):
287
+ video1: xxxxxxxxxxxxxxxxx (trimmed to 17 frames)
288
+ video2: xxxxxxxxxxxxxxxxxxxxxxxxx (25 frames)
289
+ video3: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx (trimmed to 31 frames)
290
+ ```
291
+
292
+ ### Sample for Video Dataset with Control Images
293
+
294
+ The dataset with control videos is used for training ControlNet models.
295
+
296
+ The dataset configuration with caption text files is similar to the video dataset, but with an additional `control_directory` parameter.
297
+
298
+ The control video for each video is taken from `control_directory`, using the same filename as the video (the extension may differ), for example, `video_dir/video1.mp4` and `control_dir/video1.mp4` or `control_dir/video1.mov`. The control video can also be a directory without an extension, for example, `video_dir/video1.mp4` and `control_dir/video1`. A small sketch of this lookup rule follows the example below.
299
+
300
+ ```toml
301
+ [[datasets]]
302
+ video_directory = "/path/to/video_dir"
303
+ control_directory = "/path/to/control_dir" # required for dataset with control videos
304
+ cache_directory = "/path/to/cache_directory" # recommended to set cache directory
305
+ target_frames = [1, 25, 45]
306
+ frame_extraction = "head"
307
+ ```
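The lookup rule can be pictured as below. This is an illustrative sketch of the documented matching behavior; the function name is hypothetical and this is not the actual dataset code.

```python
# Sketch: locate the control counterpart of a video file under control_directory.
from pathlib import Path
from typing import Optional

def find_control_path(video_path: str, control_directory: str) -> Optional[Path]:
    stem = Path(video_path).stem  # "video1" for "video_dir/video1.mp4"
    control_dir = Path(control_directory)
    candidates = sorted(control_dir.glob(stem + ".*"))  # same filename, any extension
    if candidates:
        return candidates[0]
    as_directory = control_dir / stem  # a directory of frame images is also allowed
    return as_directory if as_directory.is_dir() else None
```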
308
+
309
+ The dataset configuration with a metadata JSONL file is the same as for the video dataset, but the metadata JSONL file must also include the control video paths. The control video path can be a directory containing multiple images.
310
+
311
+ ```json
312
+ {"video_path": "/path/to/video1.mp4", "control_path": "/path/to/control1.mp4", "caption": "A caption for video1"}
313
+ {"video_path": "/path/to/video2.mp4", "control_path": "/path/to/control2.mp4", "caption": "A caption for video2"}
314
+ ```
315
+
316
+ <details>
317
+ <summary>日本語</summary>
318
+ 制御動画を持つデータセットです。ControlNetモデルの学習に使用します。
319
+
320
+ キャプションを用いる場合のデータセット設定は動画データセットと似ていますが、`control_directory`パラメータが追加されています。上にある例を参照してください。ある動画に対する制御用動画として、動画と同じファイル名(または拡張子のみが異なるファイル名)の、`control_directory`にある動画が使用されます(例:`video_dir/video1.mp4`と`control_dir/video1.mp4`または`control_dir/video1.mov`)。また、拡張子なしのディレクトリ内の、複数枚の画像を制御用動画として使用することもできます(例:`video_dir/video1.mp4`と`control_dir/video1`)。
321
+
322
+ データセット設定でメタデータJSONLファイルを使用する場合は、動画と制御用動画のパスを含める必要があります。制御用動画のパスは、複数枚の画像を含むディレクトリのパスでも構いません。
323
+ </details>
324
+
325
+ ## Specifications
326
+
327
+ ```toml
328
+ # general configurations
329
+ [general]
330
+ resolution = [960, 544] # optional, [W, H], default is [960, 544]. This is the default resolution for all datasets
331
+ caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
332
+ batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
333
+ num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
334
+ enable_bucket = true # optional, default is false. Enable bucketing for datasets
335
+ bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false
336
+
337
+ ### Image Dataset
338
+
339
+ # sample image dataset with caption text files
340
+ [[datasets]]
341
+ image_directory = "/path/to/image_dir"
342
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
343
+ resolution = [960, 544] # required if general resolution is not set
344
+ batch_size = 4 # optional, overwrite the default batch size
345
+ num_repeats = 1 # optional, overwrite the default num_repeats
346
+ enable_bucket = false # optional, overwrite the default bucketing setting
347
+ bucket_no_upscale = true # optional, overwrite the default bucketing setting
348
+ cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
349
+
350
+ # sample image dataset with metadata **jsonl** file
351
+ [[datasets]]
352
+ image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and captions
353
+ resolution = [960, 544] # required if general resolution is not set
354
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
355
+ # caption_extension is not required for metadata jsonl file
356
+ # batch_size, num_repeats, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
357
+
358
+ ### Video Dataset
359
+
360
+ # sample video dataset with caption text files
361
+ [[datasets]]
362
+ video_directory = "/path/to/video_dir"
363
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
364
+ resolution = [960, 544] # required if general resolution is not set
365
+
366
+ control_directory = "/path/to/control_dir" # optional, required for dataset with control images
367
+
368
+ # following configurations must be set in each [[datasets]] section for video datasets
369
+
370
+ target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
371
+
372
+ # NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will cause all frames to be extracted.
373
+
374
+ frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
375
+ frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
376
+ frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
377
+ max_frames = 129 # optional, default is 129. Maximum number of frames to extract, available for "full" frame extraction
378
+ # batch_size, num_repeats, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
379
+
380
+ # sample video dataset with metadata jsonl file
381
+ [[datasets]]
382
+ video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
383
+
384
+ target_frames = [1, 79]
385
+
386
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
387
+ # frame_extraction, frame_stride, frame_sample, max_frames are also available for metadata jsonl file
388
+ ```
389
+
390
+ <!--
391
+ # sample image dataset with lance
392
+ [[datasets]]
393
+ image_lance_dataset = "/path/to/lance_dataset"
394
+ resolution = [960, 544] # required if general resolution is not set
395
+ # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
396
+ -->
397
+
398
+ Metadata in a .json file will be supported in the near future.
399
+
400
+
401
+
402
+ <!--
403
+
404
+ ```toml
405
+ # general configurations
406
+ [general]
407
+ resolution = [960, 544] # optional, [W, H], default is None. This is the default resolution for all datasets
408
+ caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
409
+ batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
410
+ enable_bucket = true # optional, default is false. Enable bucketing for datasets
411
+ bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false
412
+
413
+ # sample image dataset with caption text files
414
+ [[datasets]]
415
+ image_directory = "/path/to/image_dir"
416
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
417
+ resolution = [960, 544] # required if general resolution is not set
418
+ batch_size = 4 # optional, overwrite the default batch size
419
+ enable_bucket = false # optional, overwrite the default bucketing setting
420
+ bucket_no_upscale = true # optional, overwrite the default bucketing setting
421
+ cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
422
+
423
+ # sample image dataset with metadata **jsonl** file
424
+ [[datasets]]
425
+ image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and captions
426
+ resolution = [960, 544] # required if general resolution is not set
427
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
428
+ # caption_extension is not required for metadata jsonl file
429
+ # batch_size, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
430
+
431
+ # sample video dataset with caption text files
432
+ [[datasets]]
433
+ video_directory = "/path/to/video_dir"
434
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
435
+ resolution = [960, 544] # required if general resolution is not set
436
+ target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
437
+ frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
438
+ frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
439
+ frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
440
+ # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
441
+
442
+ # sample video dataset with metadata jsonl file
443
+ [[datasets]]
444
+ video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
445
+ target_frames = [1, 79]
446
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
447
+ # frame_extraction, frame_stride, frame_sample are also available for metadata jsonl file
448
+ ```
449
+
450
+ # sample image dataset with lance
451
+ [[datasets]]
452
+ image_lance_dataset = "/path/to/lance_dataset"
453
+ resolution = [960, 544] # required if general resolution is not set
454
+ # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
455
+
456
+ The metadata with .json file will be supported in the near future.
457
+
458
+
459
+
460
+
461
+ -->
dataset/image_video_dataset.py ADDED
@@ -0,0 +1,1726 @@
1
+ from concurrent.futures import ThreadPoolExecutor
2
+ import glob
3
+ import json
4
+ import math
5
+ import os
6
+ import random
7
+ import time
8
+ from typing import Optional, Sequence, Tuple, Union
9
+
10
+ import numpy as np
11
+ import torch
12
+ from safetensors.torch import save_file, load_file
13
+ from safetensors import safe_open
14
+ from PIL import Image
15
+ import cv2
16
+ import av
17
+
18
+ from utils import safetensors_utils
19
+ from utils.model_utils import dtype_to_str
20
+
21
+ import logging
22
+
23
+ logger = logging.getLogger(__name__)
24
+ logging.basicConfig(level=logging.INFO)
25
+
26
+
27
+ IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".PNG", ".JPG", ".JPEG", ".WEBP", ".BMP"]
28
+
29
+ try:
30
+ import pillow_avif
31
+
32
+ IMAGE_EXTENSIONS.extend([".avif", ".AVIF"])
33
+ except Exception:
34
+ pass
35
+
36
+ # JPEG-XL on Linux
37
+ try:
38
+ from jxlpy import JXLImagePlugin
39
+
40
+ IMAGE_EXTENSIONS.extend([".jxl", ".JXL"])
41
+ except Exception:
42
+ pass
43
+
44
+ # JPEG-XL on Windows
45
+ try:
46
+ import pillow_jxl
47
+
48
+ IMAGE_EXTENSIONS.extend([".jxl", ".JXL"])
49
+ except Exception:
50
+ pass
51
+
52
+ VIDEO_EXTENSIONS = [
53
+ ".mp4",
54
+ ".webm",
55
+ ".avi",
56
+ ".mkv",
57
+ ".mov",
58
+ ".flv",
59
+ ".wmv",
60
+ ".m4v",
61
+ ".mpg",
62
+ ".mpeg",
63
+ ".MP4",
64
+ ".WEBM",
65
+ ".AVI",
66
+ ".MKV",
67
+ ".MOV",
68
+ ".FLV",
69
+ ".WMV",
70
+ ".M4V",
71
+ ".MPG",
72
+ ".MPEG",
73
+ ] # some of them are not tested
74
+
75
+ ARCHITECTURE_HUNYUAN_VIDEO = "hv"
76
+ ARCHITECTURE_HUNYUAN_VIDEO_FULL = "hunyuan_video"
77
+ ARCHITECTURE_WAN = "wan"
78
+ ARCHITECTURE_WAN_FULL = "wan"
79
+ ARCHITECTURE_FRAMEPACK = "fp"
80
+ ARCHITECTURE_FRAMEPACK_FULL = "framepack"
81
+
82
+
83
+ def glob_images(directory, base="*"):
84
+ img_paths = []
85
+ for ext in IMAGE_EXTENSIONS:
86
+ if base == "*":
87
+ img_paths.extend(glob.glob(os.path.join(glob.escape(directory), base + ext)))
88
+ else:
89
+ img_paths.extend(glob.glob(glob.escape(os.path.join(directory, base + ext))))
90
+ img_paths = list(set(img_paths)) # remove duplicates
91
+ img_paths.sort()
92
+ return img_paths
93
+
94
+
95
+ def glob_videos(directory, base="*"):
96
+ video_paths = []
97
+ for ext in VIDEO_EXTENSIONS:
98
+ if base == "*":
99
+ video_paths.extend(glob.glob(os.path.join(glob.escape(directory), base + ext)))
100
+ else:
101
+ video_paths.extend(glob.glob(glob.escape(os.path.join(directory, base + ext))))
102
+ video_paths = list(set(video_paths)) # remove duplicates
103
+ video_paths.sort()
104
+ return video_paths
105
+
106
+
107
+ def divisible_by(num: int, divisor: int) -> int:
108
+ return num - num % divisor
109
+
110
+
111
+ def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: tuple[int, int]) -> np.ndarray:
112
+ """
113
+ Resize the image to the bucket resolution.
114
+
115
+ bucket_reso: **(width, height)**
116
+ """
117
+ is_pil_image = isinstance(image, Image.Image)
118
+ if is_pil_image:
119
+ image_width, image_height = image.size
120
+ else:
121
+ image_height, image_width = image.shape[:2]
122
+
123
+ if bucket_reso == (image_width, image_height):
124
+ return np.array(image) if is_pil_image else image
125
+
126
+ bucket_width, bucket_height = bucket_reso
127
+ if bucket_width == image_width or bucket_height == image_height:
128
+ image = np.array(image) if is_pil_image else image
129
+ else:
130
+ # resize the image so that it covers the bucket resolution (scale to match on the relatively shorter side)
131
+ scale_width = bucket_width / image_width
132
+ scale_height = bucket_height / image_height
133
+ scale = max(scale_width, scale_height)
134
+ image_width = int(image_width * scale + 0.5)
135
+ image_height = int(image_height * scale + 0.5)
136
+
137
+ if scale > 1:
138
+ image = Image.fromarray(image) if not is_pil_image else image
139
+ image = image.resize((image_width, image_height), Image.LANCZOS)
140
+ image = np.array(image)
141
+ else:
142
+ image = np.array(image) if is_pil_image else image
143
+ image = cv2.resize(image, (image_width, image_height), interpolation=cv2.INTER_AREA)
144
+
145
+ # crop the image to the bucket resolution
146
+ crop_left = (image_width - bucket_width) // 2
147
+ crop_top = (image_height - bucket_height) // 2
148
+ image = image[crop_top : crop_top + bucket_height, crop_left : crop_left + bucket_width]
149
+ return image
150
+
151
+
152
+ class ItemInfo:
153
+ def __init__(
154
+ self,
155
+ item_key: str,
156
+ caption: str,
157
+ original_size: tuple[int, int],
158
+ bucket_size: Optional[Union[tuple[int, int], tuple[int, int, int]]] = None,
159
+ frame_count: Optional[int] = None,
160
+ content: Optional[np.ndarray] = None,
161
+ latent_cache_path: Optional[str] = None,
162
+ ) -> None:
163
+ self.item_key = item_key
164
+ self.caption = caption
165
+ self.original_size = original_size
166
+ self.bucket_size = bucket_size
167
+ self.frame_count = frame_count
168
+ self.content = content
169
+ self.latent_cache_path = latent_cache_path
170
+ self.text_encoder_output_cache_path: Optional[str] = None
171
+ self.control_content: Optional[np.ndarray] = None
172
+
173
+ def __str__(self) -> str:
174
+ return (
175
+ f"ItemInfo(item_key={self.item_key}, caption={self.caption}, "
176
+ + f"original_size={self.original_size}, bucket_size={self.bucket_size}, "
177
+ + f"frame_count={self.frame_count}, latent_cache_path={self.latent_cache_path}, content={self.content.shape if self.content is not None else None})"
178
+ )
179
+
180
+
181
+ # We use a simple if-else approach to support multiple architectures.
182
+ # Maybe we can use a plugin system in the future.
183
+
184
+ # the keys of the dict are `<content_type>_FxHxW_<dtype>` for latents
185
+ # and `<content_type>_<dtype|mask>` for other tensors
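+ # For example (shapes are illustrative):
+ #   latents_9x60x104_<dtype>  -> latent tensor with F=9, H=60, W=104
+ #   llm_<dtype>, llm_mask     -> HunyuanVideo LLM text embedding and its attention mask
+ #   varlen_t5_<dtype>         -> Wan2.1 T5 text encoder embedding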
186
+
187
+
188
+ def save_latent_cache(item_info: ItemInfo, latent: torch.Tensor):
189
+ """HunyuanVideo architecture only. HunyuanVideo doesn't support I2V and control latents"""
190
+ assert latent.dim() == 4, "latent should be 4D tensor (channel, frame, height, width)"
191
+
192
+ _, F, H, W = latent.shape
193
+ dtype_str = dtype_to_str(latent.dtype)
194
+ sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu()}
195
+
196
+ save_latent_cache_common(item_info, sd, ARCHITECTURE_HUNYUAN_VIDEO_FULL)
197
+
198
+
199
+ def save_latent_cache_wan(
200
+ item_info: ItemInfo,
201
+ latent: torch.Tensor,
202
+ clip_embed: Optional[torch.Tensor],
203
+ image_latent: Optional[torch.Tensor],
204
+ control_latent: Optional[torch.Tensor],
205
+ ):
206
+ """Wan architecture only"""
207
+ assert latent.dim() == 4, "latent should be 4D tensor (channel, frame, height, width)"
208
+
209
+ _, F, H, W = latent.shape
210
+ dtype_str = dtype_to_str(latent.dtype)
211
+ sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu()}
212
+
213
+ if clip_embed is not None:
214
+ sd[f"clip_{dtype_str}"] = clip_embed.detach().cpu()
215
+
216
+ if image_latent is not None:
217
+ sd[f"latents_image_{F}x{H}x{W}_{dtype_str}"] = image_latent.detach().cpu()
218
+
219
+ if control_latent is not None:
220
+ sd[f"latents_control_{F}x{H}x{W}_{dtype_str}"] = control_latent.detach().cpu()
221
+
222
+ save_latent_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
223
+
224
+
225
+ def save_latent_cache_framepack(
226
+ item_info: ItemInfo,
227
+ latent: torch.Tensor,
228
+ latent_indices: torch.Tensor,
229
+ clean_latents: torch.Tensor,
230
+ clean_latent_indices: torch.Tensor,
231
+ clean_latents_2x: torch.Tensor,
232
+ clean_latent_2x_indices: torch.Tensor,
233
+ clean_latents_4x: torch.Tensor,
234
+ clean_latent_4x_indices: torch.Tensor,
235
+ image_embeddings: torch.Tensor,
236
+ ):
237
+ """FramePack architecture only"""
238
+ assert latent.dim() == 4, "latent should be 4D tensor (channel, frame, height, width)"
239
+
240
+ _, F, H, W = latent.shape
241
+ dtype_str = dtype_to_str(latent.dtype)
242
+ sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu().contiguous()}
243
+
244
+ # `latents_xxx` must have {F, H, W} suffix
245
+ indices_dtype_str = dtype_to_str(latent_indices.dtype)
246
+ sd[f"image_embeddings_{dtype_str}"] = image_embeddings.detach().cpu() # image embeddings dtype is same as latents dtype
247
+ sd[f"latent_indices_{indices_dtype_str}"] = latent_indices.detach().cpu()
248
+ sd[f"clean_latent_indices_{indices_dtype_str}"] = clean_latent_indices.detach().cpu()
249
+ sd[f"clean_latent_2x_indices_{indices_dtype_str}"] = clean_latent_2x_indices.detach().cpu()
250
+ sd[f"clean_latent_4x_indices_{indices_dtype_str}"] = clean_latent_4x_indices.detach().cpu()
251
+ sd[f"latents_clean_{F}x{H}x{W}_{dtype_str}"] = clean_latents.detach().cpu().contiguous()
252
+ sd[f"latents_clean_2x_{F}x{H}x{W}_{dtype_str}"] = clean_latents_2x.detach().cpu().contiguous()
253
+ sd[f"latents_clean_4x_{F}x{H}x{W}_{dtype_str}"] = clean_latents_4x.detach().cpu().contiguous()
254
+
255
+ # for key, value in sd.items():
256
+ # print(f"{key}: {value.shape}")
257
+ save_latent_cache_common(item_info, sd, ARCHITECTURE_FRAMEPACK_FULL)
258
+
259
+
260
+ def save_latent_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
261
+ metadata = {
262
+ "architecture": arch_fullname,
263
+ "width": f"{item_info.original_size[0]}",
264
+ "height": f"{item_info.original_size[1]}",
265
+ "format_version": "1.0.1",
266
+ }
267
+ if item_info.frame_count is not None:
268
+ metadata["frame_count"] = f"{item_info.frame_count}"
269
+
270
+ for key, value in sd.items():
271
+ # NaN check and show warning, replace NaN with 0
272
+ if torch.isnan(value).any():
273
+ logger.warning(f"{key} tensor has NaN: {item_info.item_key}, replace NaN with 0")
274
+ value[torch.isnan(value)] = 0
275
+
276
+ latent_dir = os.path.dirname(item_info.latent_cache_path)
277
+ os.makedirs(latent_dir, exist_ok=True)
278
+
279
+ save_file(sd, item_info.latent_cache_path, metadata=metadata)
280
+
281
+
282
+ def save_text_encoder_output_cache(item_info: ItemInfo, embed: torch.Tensor, mask: Optional[torch.Tensor], is_llm: bool):
283
+ """HunyuanVideo architecture only"""
284
+ assert (
285
+ embed.dim() == 1 or embed.dim() == 2
286
+ ), f"embed should be a 1D tensor (hidden_size,) or a 2D tensor (feature, hidden_size), got {embed.shape}"
287
+ assert mask is None or mask.dim() == 1, f"mask should be 1D tensor (feature), got {mask.shape}"
288
+
289
+ sd = {}
290
+ dtype_str = dtype_to_str(embed.dtype)
291
+ text_encoder_type = "llm" if is_llm else "clipL"
292
+ sd[f"{text_encoder_type}_{dtype_str}"] = embed.detach().cpu()
293
+ if mask is not None:
294
+ sd[f"{text_encoder_type}_mask"] = mask.detach().cpu()
295
+
296
+ save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_HUNYUAN_VIDEO_FULL)
297
+
298
+
299
+ def save_text_encoder_output_cache_wan(item_info: ItemInfo, embed: torch.Tensor):
300
+ """Wan architecture only. Wan2.1 only has a single text encoder"""
301
+
302
+ sd = {}
303
+ dtype_str = dtype_to_str(embed.dtype)
304
+ text_encoder_type = "t5"
305
+ sd[f"varlen_{text_encoder_type}_{dtype_str}"] = embed.detach().cpu()
306
+
307
+ save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
308
+
309
+
310
+ def save_text_encoder_output_cache_framepack(
311
+ item_info: ItemInfo, llama_vec: torch.Tensor, llama_attention_mask: torch.Tensor, clip_l_pooler: torch.Tensor
312
+ ):
313
+ """FramePack architecture only."""
314
+ sd = {}
315
+ dtype_str = dtype_to_str(llama_vec.dtype)
316
+ sd[f"llama_vec_{dtype_str}"] = llama_vec.detach().cpu()
317
+ sd["llama_attention_mask"] = llama_attention_mask.detach().cpu()
318
+ dtype_str = dtype_to_str(clip_l_pooler.dtype)
319
+ sd[f"clip_l_pooler_{dtype_str}"] = clip_l_pooler.detach().cpu()
320
+
321
+ save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_FRAMEPACK_FULL)
322
+
323
+
324
+ def save_text_encoder_output_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
325
+ for key, value in sd.items():
326
+ # NaN check and show warning, replace NaN with 0
327
+ if torch.isnan(value).any():
328
+ logger.warning(f"{key} tensor has NaN: {item_info.item_key}, replace NaN with 0")
329
+ value[torch.isnan(value)] = 0
330
+
331
+ metadata = {
332
+ "architecture": arch_fullname,
333
+ "caption1": item_info.caption,
334
+ "format_version": "1.0.1",
335
+ }
336
+
337
+ if os.path.exists(item_info.text_encoder_output_cache_path):
338
+ # load existing cache and update metadata
339
+ with safetensors_utils.MemoryEfficientSafeOpen(item_info.text_encoder_output_cache_path) as f:
340
+ existing_metadata = f.metadata()
341
+ for key in f.keys():
342
+ if key not in sd: # avoid overwriting by existing cache, we keep the new one
343
+ sd[key] = f.get_tensor(key)
344
+
345
+ assert existing_metadata["architecture"] == metadata["architecture"], "architecture mismatch"
346
+ if existing_metadata["caption1"] != metadata["caption1"]:
347
+ logger.warning(f"caption mismatch: existing={existing_metadata['caption1']}, new={metadata['caption1']}, overwrite")
348
+ # TODO verify format_version
349
+
350
+ existing_metadata.pop("caption1", None)
351
+ existing_metadata.pop("format_version", None)
352
+ metadata.update(existing_metadata) # copy existing metadata except caption and format_version
353
+ else:
354
+ text_encoder_output_dir = os.path.dirname(item_info.text_encoder_output_cache_path)
355
+ os.makedirs(text_encoder_output_dir, exist_ok=True)
356
+
357
+ safetensors_utils.mem_eff_save_file(sd, item_info.text_encoder_output_cache_path, metadata=metadata)
358
+
359
+
360
+ class BucketSelector:
361
+ RESOLUTION_STEPS_HUNYUAN = 16
362
+ RESOLUTION_STEPS_WAN = 16
363
+ RESOLUTION_STEPS_FRAMEPACK = 16
364
+
365
+ def __init__(
366
+ self, resolution: Tuple[int, int], enable_bucket: bool = True, no_upscale: bool = False, architecture: str = "no_default"
367
+ ):
368
+ self.resolution = resolution
369
+ self.bucket_area = resolution[0] * resolution[1]
370
+ self.architecture = architecture
371
+
372
+ if self.architecture == ARCHITECTURE_HUNYUAN_VIDEO:
373
+ self.reso_steps = BucketSelector.RESOLUTION_STEPS_HUNYUAN
374
+ elif self.architecture == ARCHITECTURE_WAN:
375
+ self.reso_steps = BucketSelector.RESOLUTION_STEPS_WAN
376
+ elif self.architecture == ARCHITECTURE_FRAMEPACK:
377
+ self.reso_steps = BucketSelector.RESOLUTION_STEPS_FRAMEPACK
378
+ else:
379
+ raise ValueError(f"Invalid architecture: {self.architecture}")
380
+
381
+ if not enable_bucket:
382
+ # only define one bucket
383
+ self.bucket_resolutions = [resolution]
384
+ self.no_upscale = False
385
+ else:
386
+ # prepare bucket resolution
387
+ self.no_upscale = no_upscale
388
+ sqrt_size = int(math.sqrt(self.bucket_area))
389
+ min_size = divisible_by(sqrt_size // 2, self.reso_steps)
390
+ self.bucket_resolutions = []
391
+ for w in range(min_size, sqrt_size + self.reso_steps, self.reso_steps):
392
+ h = divisible_by(self.bucket_area // w, self.reso_steps)
393
+ self.bucket_resolutions.append((w, h))
394
+ self.bucket_resolutions.append((h, w))
395
+
396
+ self.bucket_resolutions = list(set(self.bucket_resolutions))
397
+ self.bucket_resolutions.sort()
398
+
399
+ # calculate aspect ratio to find the nearest resolution
400
+ self.aspect_ratios = np.array([w / h for w, h in self.bucket_resolutions])
401
+
402
+ def get_bucket_resolution(self, image_size: tuple[int, int]) -> tuple[int, int]:
403
+ """
404
+ return the bucket resolution for the given image size, (width, height)
405
+ """
406
+ area = image_size[0] * image_size[1]
407
+ if self.no_upscale and area <= self.bucket_area:
408
+ w, h = image_size
409
+ w = divisible_by(w, self.reso_steps)
410
+ h = divisible_by(h, self.reso_steps)
411
+ return w, h
412
+
413
+ aspect_ratio = image_size[0] / image_size[1]
414
+ ar_errors = self.aspect_ratios - aspect_ratio
415
+ bucket_id = np.abs(ar_errors).argmin()
416
+ return self.bucket_resolutions[bucket_id]
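+ # Usage example (illustrative): with the default resolution (960, 544) and bucketing enabled,
+ # BucketSelector((960, 544), architecture=ARCHITECTURE_HUNYUAN_VIDEO).get_bucket_resolution((1920, 1080))
+ # returns the bucket whose aspect ratio is closest to 16:9, which is (960, 544) here.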
417
+
418
+
419
+ def load_video(
420
+ video_path: str,
421
+ start_frame: Optional[int] = None,
422
+ end_frame: Optional[int] = None,
423
+ bucket_selector: Optional[BucketSelector] = None,
424
+ bucket_reso: Optional[tuple[int, int]] = None,
425
+ source_fps: Optional[float] = None,
426
+ target_fps: Optional[float] = None,
427
+ ) -> list[np.ndarray]:
428
+ """
429
+ bucket_reso: if given, resize the video to the bucket resolution, (width, height)
430
+ """
431
+ if source_fps is None or target_fps is None:
432
+ if os.path.isfile(video_path):
433
+ container = av.open(video_path)
434
+ video = []
435
+ for i, frame in enumerate(container.decode(video=0)):
436
+ if start_frame is not None and i < start_frame:
437
+ continue
438
+ if end_frame is not None and i >= end_frame:
439
+ break
440
+ frame = frame.to_image()
441
+
442
+ if bucket_selector is not None and bucket_reso is None:
443
+ bucket_reso = bucket_selector.get_bucket_resolution(frame.size) # calc resolution from first frame
444
+
445
+ if bucket_reso is not None:
446
+ frame = resize_image_to_bucket(frame, bucket_reso)
447
+ else:
448
+ frame = np.array(frame)
449
+
450
+ video.append(frame)
451
+ container.close()
452
+ else:
453
+ # load images in the directory
454
+ image_files = glob_images(video_path)
455
+ image_files.sort()
456
+ video = []
457
+ for i in range(len(image_files)):
458
+ if start_frame is not None and i < start_frame:
459
+ continue
460
+ if end_frame is not None and i >= end_frame:
461
+ break
462
+
463
+ image_file = image_files[i]
464
+ image = Image.open(image_file).convert("RGB")
465
+
466
+ if bucket_selector is not None and bucket_reso is None:
467
+ bucket_reso = bucket_selector.get_bucket_resolution(image.size) # calc resolution from first frame
468
+ image = np.array(image)
469
+ if bucket_reso is not None:
470
+ image = resize_image_to_bucket(image, bucket_reso)
471
+
472
+ video.append(image)
473
+ else:
474
+ # drop frames to match the target fps. TODO: merge this with the branch above once this approach is verified
475
+ frame_index_delta = target_fps / source_fps # example: 16 / 30 = 0.5333
476
+ if os.path.isfile(video_path):
477
+ container = av.open(video_path)
478
+ video = []
479
+ frame_index_with_fraction = 0.0
480
+ previous_frame_index = -1
481
+ for i, frame in enumerate(container.decode(video=0)):
482
+ target_frame_index = int(frame_index_with_fraction)
483
+ frame_index_with_fraction += frame_index_delta
484
+
485
+ if target_frame_index == previous_frame_index: # drop this frame
486
+ continue
487
+
488
+ # accept this frame
489
+ previous_frame_index = target_frame_index
490
+
491
+ if start_frame is not None and target_frame_index < start_frame:
492
+ continue
493
+ if end_frame is not None and target_frame_index >= end_frame:
494
+ break
495
+ frame = frame.to_image()
496
+
497
+ if bucket_selector is not None and bucket_reso is None:
498
+ bucket_reso = bucket_selector.get_bucket_resolution(frame.size) # calc resolution from first frame
499
+
500
+ if bucket_reso is not None:
501
+ frame = resize_image_to_bucket(frame, bucket_reso)
502
+ else:
503
+ frame = np.array(frame)
504
+
505
+ video.append(frame)
506
+ container.close()
507
+ else:
508
+ # load images in the directory
509
+ image_files = glob_images(video_path)
510
+ image_files.sort()
511
+ video = []
512
+ frame_index_with_fraction = 0.0
513
+ previous_frame_index = -1
514
+ for i in range(len(image_files)):
515
+ target_frame_index = int(frame_index_with_fraction)
516
+ frame_index_with_fraction += frame_index_delta
517
+
518
+ if target_frame_index == previous_frame_index: # drop this frame
519
+ continue
520
+
521
+ # accept this frame
522
+ previous_frame_index = target_frame_index
523
+
524
+ if start_frame is not None and target_frame_index < start_frame:
525
+ continue
526
+ if end_frame is not None and target_frame_index >= end_frame:
527
+ break
528
+
529
+ image_file = image_files[i]
530
+ image = Image.open(image_file).convert("RGB")
531
+
532
+ if bucket_selector is not None and bucket_reso is None:
533
+ bucket_reso = bucket_selector.get_bucket_resolution(image.size) # calc resolution from first frame
534
+ image = np.array(image)
535
+ if bucket_reso is not None:
536
+ image = resize_image_to_bucket(image, bucket_reso)
537
+
538
+ video.append(image)
539
+
540
+ return video
541
+
542
+
543
+ class BucketBatchManager:
544
+
545
+ def __init__(self, bucketed_item_info: dict[Union[tuple[int, int], tuple[int, int, int]], list[ItemInfo]], batch_size: int):
546
+ self.batch_size = batch_size
547
+ self.buckets = bucketed_item_info
548
+ self.bucket_resos = list(self.buckets.keys())
549
+ self.bucket_resos.sort()
550
+
551
+ # indices for enumerating batches. each batch is reso + batch_idx. reso is (width, height) or (width, height, frames)
552
+ self.bucket_batch_indices: list[tuple[Union[tuple[int, int], tuple[int, int, int], int]]] = []
553
+ for bucket_reso in self.bucket_resos:
554
+ bucket = self.buckets[bucket_reso]
555
+ num_batches = math.ceil(len(bucket) / self.batch_size)
556
+ for i in range(num_batches):
557
+ self.bucket_batch_indices.append((bucket_reso, i))
558
+
559
+ self.shuffle()
560
+
561
+ def show_bucket_info(self):
562
+ for bucket_reso in self.bucket_resos:
563
+ bucket = self.buckets[bucket_reso]
564
+ logger.info(f"bucket: {bucket_reso}, count: {len(bucket)}")
565
+
566
+ logger.info(f"total batches: {len(self)}")
567
+
568
+ def shuffle(self):
569
+ # shuffle each bucket
570
+ for bucket in self.buckets.values():
571
+ random.shuffle(bucket)
572
+
573
+ # shuffle the order of batches
574
+ random.shuffle(self.bucket_batch_indices)
575
+
576
+ def __len__(self):
577
+ return len(self.bucket_batch_indices)
578
+
579
+ def __getitem__(self, idx):
580
+ bucket_reso, batch_idx = self.bucket_batch_indices[idx]
581
+ bucket = self.buckets[bucket_reso]
582
+ start = batch_idx * self.batch_size
583
+ end = min(start + self.batch_size, len(bucket))
584
+
585
+ batch_tensor_data = {}
586
+ varlen_keys = set()
587
+ for item_info in bucket[start:end]:
588
+ sd_latent = load_file(item_info.latent_cache_path)
589
+ sd_te = load_file(item_info.text_encoder_output_cache_path)
590
+ sd = {**sd_latent, **sd_te}
591
+
592
+ # TODO refactor this
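+ # cache keys are "<name>_<dtype>" (latents additionally carry a size token, e.g. "latents_FxHxW_<dtype>");
+ # keys ending in "_mask" keep their name, and a "varlen_" prefix marks variable-length tensors that are not stacked.
+ # the loop below strips these prefixes/suffixes so tensors are grouped under a common content key.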
593
+ for key in sd.keys():
594
+ is_varlen_key = key.startswith("varlen_") # varlen keys are not stacked
595
+ content_key = key
596
+
597
+ if is_varlen_key:
598
+ content_key = content_key.replace("varlen_", "")
599
+
600
+ if content_key.endswith("_mask"):
601
+ pass
602
+ else:
603
+ content_key = content_key.rsplit("_", 1)[0] # remove dtype
604
+ if content_key.startswith("latents_"):
605
+ content_key = content_key.rsplit("_", 1)[0] # remove FxHxW
606
+
607
+ if content_key not in batch_tensor_data:
608
+ batch_tensor_data[content_key] = []
609
+ batch_tensor_data[content_key].append(sd[key])
610
+
611
+ if is_varlen_key:
612
+ varlen_keys.add(content_key)
613
+
614
+ for key in batch_tensor_data.keys():
615
+ if key not in varlen_keys:
616
+ batch_tensor_data[key] = torch.stack(batch_tensor_data[key])
617
+
618
+ return batch_tensor_data
619
+
620
+
621
+ class ContentDatasource:
622
+ def __init__(self):
623
+ self.caption_only = False # set to True to only fetch caption for Text Encoder caching
624
+ self.has_control = False
625
+
626
+ def set_caption_only(self, caption_only: bool):
627
+ self.caption_only = caption_only
628
+
629
+ def is_indexable(self):
630
+ return False
631
+
632
+ def get_caption(self, idx: int) -> tuple[str, str]:
633
+ """
634
+ Returns caption. May not be called if is_indexable() returns False.
635
+ """
636
+ raise NotImplementedError
637
+
638
+ def __len__(self):
639
+ raise NotImplementedError
640
+
641
+ def __iter__(self):
642
+ raise NotImplementedError
643
+
644
+ def __next__(self):
645
+ raise NotImplementedError
646
+
647
+
648
+ class ImageDatasource(ContentDatasource):
649
+ def __init__(self):
650
+ super().__init__()
651
+
652
+ def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
653
+ """
654
+ Returns image data as a tuple of image path, image, and caption for the given index.
655
+ Key must be unique and valid as a file name.
656
+ May not be called if is_indexable() returns False.
657
+ """
658
+ raise NotImplementedError
659
+
660
+
661
+ class ImageDirectoryDatasource(ImageDatasource):
662
+ def __init__(self, image_directory: str, caption_extension: Optional[str] = None):
663
+ super().__init__()
664
+ self.image_directory = image_directory
665
+ self.caption_extension = caption_extension
666
+ self.current_idx = 0
667
+
668
+ # glob images
669
+ logger.info(f"glob images in {self.image_directory}")
670
+ self.image_paths = glob_images(self.image_directory)
671
+ logger.info(f"found {len(self.image_paths)} images")
672
+
673
+ def is_indexable(self):
674
+ return True
675
+
676
+ def __len__(self):
677
+ return len(self.image_paths)
678
+
679
+ def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
680
+ image_path = self.image_paths[idx]
681
+ image = Image.open(image_path).convert("RGB")
682
+
683
+ _, caption = self.get_caption(idx)
684
+
685
+ return image_path, image, caption
686
+
687
+ def get_caption(self, idx: int) -> tuple[str, str]:
688
+ image_path = self.image_paths[idx]
689
+ caption_path = os.path.splitext(image_path)[0] + self.caption_extension if self.caption_extension else ""
690
+ with open(caption_path, "r", encoding="utf-8") as f:
691
+ caption = f.read().strip()
692
+ return image_path, caption
693
+
694
+ def __iter__(self):
695
+ self.current_idx = 0
696
+ return self
697
+
698
+ def __next__(self) -> callable:
699
+ """
700
+ Returns a fetcher function that returns image data.
701
+ """
702
+ if self.current_idx >= len(self.image_paths):
703
+ raise StopIteration
704
+
705
+ if self.caption_only:
706
+
707
+ def create_caption_fetcher(index):
708
+ return lambda: self.get_caption(index)
709
+
710
+ fetcher = create_caption_fetcher(self.current_idx)
711
+ else:
712
+
713
+ def create_image_fetcher(index):
714
+ return lambda: self.get_image_data(index)
715
+
716
+ fetcher = create_image_fetcher(self.current_idx)
717
+
718
+ self.current_idx += 1
719
+ return fetcher
720
+
721
+
722
+ class ImageJsonlDatasource(ImageDatasource):
723
+ def __init__(self, image_jsonl_file: str):
724
+ super().__init__()
725
+ self.image_jsonl_file = image_jsonl_file
726
+ self.current_idx = 0
727
+
728
+ # load jsonl
729
+ logger.info(f"load image jsonl from {self.image_jsonl_file}")
730
+ self.data = []
731
+ with open(self.image_jsonl_file, "r", encoding="utf-8") as f:
732
+ for line in f:
733
+ try:
734
+ data = json.loads(line)
735
+ except json.JSONDecodeError:
736
+ logger.error(f"failed to load json: {line} @ {self.image_jsonl_file}")
737
+ raise
738
+ self.data.append(data)
739
+ logger.info(f"loaded {len(self.data)} images")
740
+
741
+ def is_indexable(self):
742
+ return True
743
+
744
+ def __len__(self):
745
+ return len(self.data)
746
+
747
+ def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
748
+ data = self.data[idx]
749
+ image_path = data["image_path"]
750
+ image = Image.open(image_path).convert("RGB")
751
+
752
+ caption = data["caption"]
753
+
754
+ return image_path, image, caption
755
+
756
+ def get_caption(self, idx: int) -> tuple[str, str]:
757
+ data = self.data[idx]
758
+ image_path = data["image_path"]
759
+ caption = data["caption"]
760
+ return image_path, caption
761
+
762
+ def __iter__(self):
763
+ self.current_idx = 0
764
+ return self
765
+
766
+ def __next__(self) -> callable:
767
+ if self.current_idx >= len(self.data):
768
+ raise StopIteration
769
+
770
+ if self.caption_only:
771
+
772
+ def create_caption_fetcher(index):
773
+ return lambda: self.get_caption(index)
774
+
775
+ fetcher = create_caption_fetcher(self.current_idx)
776
+
777
+ else:
778
+
779
+ def create_fetcher(index):
780
+ return lambda: self.get_image_data(index)
781
+
782
+ fetcher = create_fetcher(self.current_idx)
783
+
784
+ self.current_idx += 1
785
+ return fetcher
786
+
787
+
788
+ class VideoDatasource(ContentDatasource):
789
+ def __init__(self):
790
+ super().__init__()
791
+
792
+ # None means all frames
793
+ self.start_frame = None
794
+ self.end_frame = None
795
+
796
+ self.bucket_selector = None
797
+
798
+ self.source_fps = None
799
+ self.target_fps = None
800
+
801
+ def __len__(self):
802
+ raise NotImplementedError
803
+
804
+ def get_video_data_from_path(
805
+ self,
806
+ video_path: str,
807
+ start_frame: Optional[int] = None,
808
+ end_frame: Optional[int] = None,
809
+ bucket_selector: Optional[BucketSelector] = None,
810
+ ) -> tuple[str, list[Image.Image], str]:
811
+ # this method can resize the video if bucket_selector is given to reduce the memory usage
812
+
813
+ start_frame = start_frame if start_frame is not None else self.start_frame
814
+ end_frame = end_frame if end_frame is not None else self.end_frame
815
+ bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector
816
+
817
+ video = load_video(
818
+ video_path, start_frame, end_frame, bucket_selector, source_fps=self.source_fps, target_fps=self.target_fps
819
+ )
820
+ return video
821
+
822
+ def get_control_data_from_path(
823
+ self,
824
+ control_path: str,
825
+ start_frame: Optional[int] = None,
826
+ end_frame: Optional[int] = None,
827
+ bucket_selector: Optional[BucketSelector] = None,
828
+ ) -> list[Image.Image]:
829
+ start_frame = start_frame if start_frame is not None else self.start_frame
830
+ end_frame = end_frame if end_frame is not None else self.end_frame
831
+ bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector
832
+
833
+ control = load_video(
834
+ control_path, start_frame, end_frame, bucket_selector, source_fps=self.source_fps, target_fps=self.target_fps
835
+ )
836
+ return control
837
+
838
+ def set_start_and_end_frame(self, start_frame: Optional[int], end_frame: Optional[int]):
839
+ self.start_frame = start_frame
840
+ self.end_frame = end_frame
841
+
842
+ def set_bucket_selector(self, bucket_selector: BucketSelector):
843
+ self.bucket_selector = bucket_selector
844
+
845
+ def set_source_and_target_fps(self, source_fps: Optional[float], target_fps: Optional[float]):
846
+ self.source_fps = source_fps
847
+ self.target_fps = target_fps
848
+
849
+ def __iter__(self):
850
+ raise NotImplementedError
851
+
852
+ def __next__(self):
853
+ raise NotImplementedError
854
+
855
+
856
+ class VideoDirectoryDatasource(VideoDatasource):
857
+ def __init__(self, video_directory: str, caption_extension: Optional[str] = None, control_directory: Optional[str] = None):
858
+ super().__init__()
859
+ self.video_directory = video_directory
860
+ self.caption_extension = caption_extension
861
+ self.control_directory = control_directory # newly added: control image directory
862
+ self.current_idx = 0
863
+
864
+ # glob videos
865
+ logger.info(f"glob videos in {self.video_directory}")
866
+ self.video_paths = glob_videos(self.video_directory)
867
+ logger.info(f"found {len(self.video_paths)} videos")
868
+
869
+ # glob control images if specified
870
+ if self.control_directory is not None:
871
+ logger.info(f"glob control videos in {self.control_directory}")
872
+ self.has_control = True
873
+ self.control_paths = {}
874
+ for video_path in self.video_paths:
875
+ video_basename = os.path.basename(video_path)
876
+ # construct control path from video path
877
+ # for example: video_path = "vid/video.mp4" -> control_path = "control/video.mp4"
878
+ control_path = os.path.join(self.control_directory, video_basename)
879
+ if os.path.exists(control_path):
880
+ self.control_paths[video_path] = control_path
881
+ else:
882
+ # use the same base name for control path
883
+ base_name = os.path.splitext(video_basename)[0]
884
+
885
+ # directory with images. for example: video_path = "vid/video.mp4" -> control_path = "control/video"
886
+ potential_path = os.path.join(self.control_directory, base_name) # no extension
887
+ if os.path.isdir(potential_path):
888
+ self.control_paths[video_path] = potential_path
889
+ else:
890
+ # another extension for control path
891
+ # for example: video_path = "vid/video.mp4" -> control_path = "control/video.mov"
892
+ for ext in VIDEO_EXTENSIONS:
893
+ potential_path = os.path.join(self.control_directory, base_name + ext)
894
+ if os.path.exists(potential_path):
895
+ self.control_paths[video_path] = potential_path
896
+ break
897
+
898
+ logger.info(f"found {len(self.control_paths)} matching control videos/images")
899
+ # check if all videos have matching control paths, if not, raise an error
900
+ missing_controls = len(self.video_paths) - len(self.control_paths)
901
+ if missing_controls > 0:
902
+ # logger.warning(f"Could not find matching control videos/images for {missing_controls} videos")
903
+ missing_controls_videos = [video_path for video_path in self.video_paths if video_path not in self.control_paths]
904
+ logger.error(
905
+ f"Could not find matching control videos/images for {missing_controls} videos: {missing_controls_videos}"
906
+ )
907
+ raise ValueError(f"Could not find matching control videos/images for {missing_controls} videos")
908
+
909
+ def is_indexable(self):
910
+ return True
911
+
912
+ def __len__(self):
913
+ return len(self.video_paths)
914
+
915
+ def get_video_data(
916
+ self,
917
+ idx: int,
918
+ start_frame: Optional[int] = None,
919
+ end_frame: Optional[int] = None,
920
+ bucket_selector: Optional[BucketSelector] = None,
921
+ ) -> tuple[str, list[Image.Image], str, Optional[list[Image.Image]]]:
922
+ video_path = self.video_paths[idx]
923
+ video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
924
+
925
+ _, caption = self.get_caption(idx)
926
+
927
+ control = None
928
+ if self.control_directory is not None and video_path in self.control_paths:
929
+ control_path = self.control_paths[video_path]
930
+ control = self.get_control_data_from_path(control_path, start_frame, end_frame, bucket_selector)
931
+
932
+ return video_path, video, caption, control
933
+
934
+ def get_caption(self, idx: int) -> tuple[str, str]:
935
+ video_path = self.video_paths[idx]
936
+ caption_path = os.path.splitext(video_path)[0] + self.caption_extension if self.caption_extension else ""
937
+ with open(caption_path, "r", encoding="utf-8") as f:
938
+ caption = f.read().strip()
939
+ return video_path, caption
940
+
941
+ def __iter__(self):
942
+ self.current_idx = 0
943
+ return self
944
+
945
+ def __next__(self):
946
+ if self.current_idx >= len(self.video_paths):
947
+ raise StopIteration
948
+
949
+ if self.caption_only:
950
+
951
+ def create_caption_fetcher(index):
952
+ return lambda: self.get_caption(index)
953
+
954
+ fetcher = create_caption_fetcher(self.current_idx)
955
+
956
+ else:
957
+
958
+ def create_fetcher(index):
959
+ return lambda: self.get_video_data(index)
960
+
961
+ fetcher = create_fetcher(self.current_idx)
962
+
963
+ self.current_idx += 1
964
+ return fetcher
965
+
966
+
967
+ class VideoJsonlDatasource(VideoDatasource):
968
+ def __init__(self, video_jsonl_file: str):
969
+ super().__init__()
970
+ self.video_jsonl_file = video_jsonl_file
971
+ self.current_idx = 0
972
+
973
+ # load jsonl
974
+ logger.info(f"load video jsonl from {self.video_jsonl_file}")
975
+ self.data = []
976
+ with open(self.video_jsonl_file, "r", encoding="utf-8") as f:
977
+ for line in f:
978
+ data = json.loads(line)
979
+ self.data.append(data)
980
+ logger.info(f"loaded {len(self.data)} videos")
981
+
982
+ # Check if there are control paths in the JSONL
983
+ self.has_control = any("control_path" in item for item in self.data)
984
+ if self.has_control:
985
+ control_count = sum(1 for item in self.data if "control_path" in item)
986
+ if control_count < len(self.data):
987
+ missing_control_videos = [item["video_path"] for item in self.data if "control_path" not in item]
988
+ logger.error(f"Some videos do not have control paths in JSONL data: {missing_control_videos}")
989
+ raise ValueError(f"Some videos do not have control paths in JSONL data: {missing_control_videos}")
990
+ logger.info(f"found {control_count} control videos/images in JSONL data")
991
+
992
+ def is_indexable(self):
993
+ return True
994
+
995
+ def __len__(self):
996
+ return len(self.data)
997
+
998
+ def get_video_data(
999
+ self,
1000
+ idx: int,
1001
+ start_frame: Optional[int] = None,
1002
+ end_frame: Optional[int] = None,
1003
+ bucket_selector: Optional[BucketSelector] = None,
1004
+ ) -> tuple[str, list[Image.Image], str, Optional[list[Image.Image]]]:
1005
+ data = self.data[idx]
1006
+ video_path = data["video_path"]
1007
+ video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
1008
+
1009
+ caption = data["caption"]
1010
+
1011
+ control = None
1012
+ if "control_path" in data and data["control_path"]:
1013
+ control_path = data["control_path"]
1014
+ control = self.get_control_data_from_path(control_path, start_frame, end_frame, bucket_selector)
1015
+
1016
+ return video_path, video, caption, control
1017
+
1018
+ def get_caption(self, idx: int) -> tuple[str, str]:
1019
+ data = self.data[idx]
1020
+ video_path = data["video_path"]
1021
+ caption = data["caption"]
1022
+ return video_path, caption
1023
+
1024
+ def __iter__(self):
1025
+ self.current_idx = 0
1026
+ return self
1027
+
1028
+ def __next__(self):
1029
+ if self.current_idx >= len(self.data):
1030
+ raise StopIteration
1031
+
1032
+ if self.caption_only:
1033
+
1034
+ def create_caption_fetcher(index):
1035
+ return lambda: self.get_caption(index)
1036
+
1037
+ fetcher = create_caption_fetcher(self.current_idx)
1038
+
1039
+ else:
1040
+
1041
+ def create_fetcher(index):
1042
+ return lambda: self.get_video_data(index)
1043
+
1044
+ fetcher = create_fetcher(self.current_idx)
1045
+
1046
+ self.current_idx += 1
1047
+ return fetcher
1048
+
1049
+
1050
+ class BaseDataset(torch.utils.data.Dataset):
1051
+ def __init__(
1052
+ self,
1053
+ resolution: Tuple[int, int] = (960, 544),
1054
+ caption_extension: Optional[str] = None,
1055
+ batch_size: int = 1,
1056
+ num_repeats: int = 1,
1057
+ enable_bucket: bool = False,
1058
+ bucket_no_upscale: bool = False,
1059
+ cache_directory: Optional[str] = None,
1060
+ debug_dataset: bool = False,
1061
+ architecture: str = "no_default",
1062
+ ):
1063
+ self.resolution = resolution
1064
+ self.caption_extension = caption_extension
1065
+ self.batch_size = batch_size
1066
+ self.num_repeats = num_repeats
1067
+ self.enable_bucket = enable_bucket
1068
+ self.bucket_no_upscale = bucket_no_upscale
1069
+ self.cache_directory = cache_directory
1070
+ self.debug_dataset = debug_dataset
1071
+ self.architecture = architecture
1072
+ self.seed = None
1073
+ self.current_epoch = 0
1074
+
1075
+ if not self.enable_bucket:
1076
+ self.bucket_no_upscale = False
1077
+
1078
+ def get_metadata(self) -> dict:
1079
+ metadata = {
1080
+ "resolution": self.resolution,
1081
+ "caption_extension": self.caption_extension,
1082
+ "batch_size_per_device": self.batch_size,
1083
+ "num_repeats": self.num_repeats,
1084
+ "enable_bucket": bool(self.enable_bucket),
1085
+ "bucket_no_upscale": bool(self.bucket_no_upscale),
1086
+ }
1087
+ return metadata
1088
+
1089
+ def get_all_latent_cache_files(self):
1090
+ return glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))
1091
+
1092
+ def get_all_text_encoder_output_cache_files(self):
1093
+ return glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}_te.safetensors"))
1094
+
1095
+ def get_latent_cache_path(self, item_info: ItemInfo) -> str:
1096
+ """
1097
+ Returns the cache path for the latent tensor.
1098
+
1099
+ item_info: ItemInfo object
1100
+
1101
+ Returns:
1102
+ str: cache path
1103
+
1104
+ cache_path is based on the item_key and the resolution.
1105
+ """
1106
+ w, h = item_info.original_size
1107
+ basename = os.path.splitext(os.path.basename(item_info.item_key))[0]
1108
+ assert self.cache_directory is not None, "cache_directory is required / cache_directoryは必須です"
1109
+ return os.path.join(self.cache_directory, f"{basename}_{w:04d}x{h:04d}_{self.architecture}.safetensors")
1110
+
1111
+ def get_text_encoder_output_cache_path(self, item_info: ItemInfo) -> str:
1112
+ basename = os.path.splitext(os.path.basename(item_info.item_key))[0]
1113
+ assert self.cache_directory is not None, "cache_directory is required / cache_directoryは必須です"
1114
+ return os.path.join(self.cache_directory, f"{basename}_{self.architecture}_te.safetensors")
1115
+
1116
+ def retrieve_latent_cache_batches(self, num_workers: int):
1117
+ raise NotImplementedError
1118
+
1119
+ def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
1120
+ raise NotImplementedError
1121
+
1122
+ def prepare_for_training(self):
1123
+ pass
1124
+
1125
+ def set_seed(self, seed: int):
1126
+ self.seed = seed
1127
+
1128
+ def set_current_epoch(self, epoch):
1129
+ if not self.current_epoch == epoch: # shuffle buckets when epoch is incremented
1130
+ if epoch > self.current_epoch:
1131
+ logger.info("epoch is incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch))
1132
+ num_epochs = epoch - self.current_epoch
1133
+ for _ in range(num_epochs):
1134
+ self.current_epoch += 1
1135
+ self.shuffle_buckets()
1136
+ # self.current_epoch seems to be reset to 0 in the next epoch; this may be caused by skipped_dataloader?
1137
+ else:
1138
+ logger.warning("epoch is not incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch))
1139
+ self.current_epoch = epoch
1140
+
1141
+ def set_current_step(self, step):
1142
+ self.current_step = step
1143
+
1144
+ def set_max_train_steps(self, max_train_steps):
1145
+ self.max_train_steps = max_train_steps
1146
+
1147
+ def shuffle_buckets(self):
1148
+ raise NotImplementedError
1149
+
1150
+ def __len__(self):
1151
+ raise NotImplementedError
1152
+
1153
+ def __getitem__(self, idx):
1154
+ raise NotImplementedError
1155
+
1156
+ def _default_retrieve_text_encoder_output_cache_batches(self, datasource: ContentDatasource, batch_size: int, num_workers: int):
1157
+ datasource.set_caption_only(True)
1158
+ executor = ThreadPoolExecutor(max_workers=num_workers)
1159
+
1160
+ data: list[ItemInfo] = []
1161
+ futures = []
1162
+
1163
+ def aggregate_future(consume_all: bool = False):
1164
+ while len(futures) >= num_workers or (consume_all and len(futures) > 0):
1165
+ completed_futures = [future for future in futures if future.done()]
1166
+ if len(completed_futures) == 0:
1167
+ if len(futures) >= num_workers or consume_all: # to avoid adding too many futures
1168
+ time.sleep(0.1)
1169
+ continue
1170
+ else:
1171
+ break # submit batch if possible
1172
+
1173
+ for future in completed_futures:
1174
+ item_key, caption = future.result()
1175
+ item_info = ItemInfo(item_key, caption, (0, 0), (0, 0))
1176
+ item_info.text_encoder_output_cache_path = self.get_text_encoder_output_cache_path(item_info)
1177
+ data.append(item_info)
1178
+
1179
+ futures.remove(future)
1180
+
1181
+ def submit_batch(flush: bool = False):
1182
+ nonlocal data
1183
+ if len(data) >= batch_size or (len(data) > 0 and flush):
1184
+ batch = data[0:batch_size]
1185
+ if len(data) > batch_size:
1186
+ data = data[batch_size:]
1187
+ else:
1188
+ data = []
1189
+ return batch
1190
+ return None
1191
+
1192
+ for fetch_op in datasource:
1193
+ future = executor.submit(fetch_op)
1194
+ futures.append(future)
1195
+ aggregate_future()
1196
+ while True:
1197
+ batch = submit_batch()
1198
+ if batch is None:
1199
+ break
1200
+ yield batch
1201
+
1202
+ aggregate_future(consume_all=True)
1203
+ while True:
1204
+ batch = submit_batch(flush=True)
1205
+ if batch is None:
1206
+ break
1207
+ yield batch
1208
+
1209
+ executor.shutdown()
1210
+
1211
+
1212
+ class ImageDataset(BaseDataset):
1213
+ def __init__(
1214
+ self,
1215
+ resolution: Tuple[int, int],
1216
+ caption_extension: Optional[str],
1217
+ batch_size: int,
1218
+ num_repeats: int,
1219
+ enable_bucket: bool,
1220
+ bucket_no_upscale: bool,
1221
+ image_directory: Optional[str] = None,
1222
+ image_jsonl_file: Optional[str] = None,
1223
+ cache_directory: Optional[str] = None,
1224
+ debug_dataset: bool = False,
1225
+ architecture: str = "no_default",
1226
+ ):
1227
+ super(ImageDataset, self).__init__(
1228
+ resolution,
1229
+ caption_extension,
1230
+ batch_size,
1231
+ num_repeats,
1232
+ enable_bucket,
1233
+ bucket_no_upscale,
1234
+ cache_directory,
1235
+ debug_dataset,
1236
+ architecture,
1237
+ )
1238
+ self.image_directory = image_directory
1239
+ self.image_jsonl_file = image_jsonl_file
1240
+ if image_directory is not None:
1241
+ self.datasource = ImageDirectoryDatasource(image_directory, caption_extension)
1242
+ elif image_jsonl_file is not None:
1243
+ self.datasource = ImageJsonlDatasource(image_jsonl_file)
1244
+ else:
1245
+ raise ValueError("image_directory or image_jsonl_file must be specified")
1246
+
1247
+ if self.cache_directory is None:
1248
+ self.cache_directory = self.image_directory
1249
+
1250
+ self.batch_manager = None
1251
+ self.num_train_items = 0
1252
+
1253
+ def get_metadata(self):
1254
+ metadata = super().get_metadata()
1255
+ if self.image_directory is not None:
1256
+ metadata["image_directory"] = os.path.basename(self.image_directory)
1257
+ if self.image_jsonl_file is not None:
1258
+ metadata["image_jsonl_file"] = os.path.basename(self.image_jsonl_file)
1259
+ return metadata
1260
+
1261
+ def get_total_image_count(self):
1262
+ return len(self.datasource) if self.datasource.is_indexable() else None
1263
+
1264
+ def retrieve_latent_cache_batches(self, num_workers: int):
1265
+ bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
1266
+ executor = ThreadPoolExecutor(max_workers=num_workers)
1267
+
1268
+ batches: dict[tuple[int, int], list[ItemInfo]] = {} # (width, height) -> [ItemInfo]
1269
+ futures = []
1270
+
1271
+ # aggregate futures and sort by bucket resolution
1272
+ def aggregate_future(consume_all: bool = False):
1273
+ while len(futures) >= num_workers or (consume_all and len(futures) > 0):
1274
+ completed_futures = [future for future in futures if future.done()]
1275
+ if len(completed_futures) == 0:
1276
+ if len(futures) >= num_workers or consume_all: # to avoid adding too many futures
1277
+ time.sleep(0.1)
1278
+ continue
1279
+ else:
1280
+ break # submit batch if possible
1281
+
1282
+ for future in completed_futures:
1283
+ original_size, item_key, image, caption = future.result()
1284
+ bucket_height, bucket_width = image.shape[:2]
1285
+ bucket_reso = (bucket_width, bucket_height)
1286
+
1287
+ item_info = ItemInfo(item_key, caption, original_size, bucket_reso, content=image)
1288
+ item_info.latent_cache_path = self.get_latent_cache_path(item_info)
1289
+
1290
+ if bucket_reso not in batches:
1291
+ batches[bucket_reso] = []
1292
+ batches[bucket_reso].append(item_info)
1293
+
1294
+ futures.remove(future)
1295
+
1296
+ # submit batch if some bucket has enough items
1297
+ def submit_batch(flush: bool = False):
1298
+ for key in batches:
1299
+ if len(batches[key]) >= self.batch_size or flush:
1300
+ batch = batches[key][0 : self.batch_size]
1301
+ if len(batches[key]) > self.batch_size:
1302
+ batches[key] = batches[key][self.batch_size :]
1303
+ else:
1304
+ del batches[key]
1305
+ return key, batch
1306
+ return None, None
1307
+
1308
+ for fetch_op in self.datasource:
1309
+
1310
+ # fetch and resize image in a separate thread
1311
+ def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, Image.Image, str]:
1312
+ image_key, image, caption = op()
1313
+ image: Image.Image
1314
+ image_size = image.size
1315
+
1316
+ bucket_reso = bucket_selector.get_bucket_resolution(image_size)
1317
+ image = resize_image_to_bucket(image, bucket_reso)
1318
+ return image_size, image_key, image, caption
1319
+
1320
+ future = executor.submit(fetch_and_resize, fetch_op)
1321
+ futures.append(future)
1322
+ aggregate_future()
1323
+ while True:
1324
+ key, batch = submit_batch()
1325
+ if key is None:
1326
+ break
1327
+ yield key, batch
1328
+
1329
+ aggregate_future(consume_all=True)
1330
+ while True:
1331
+ key, batch = submit_batch(flush=True)
1332
+ if key is None:
1333
+ break
1334
+ yield key, batch
1335
+
1336
+ executor.shutdown()
1337
+
1338
+ def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
1339
+ return self._default_retrieve_text_encoder_output_cache_batches(self.datasource, self.batch_size, num_workers)
1340
+
1341
+ def prepare_for_training(self):
1342
+ bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
1343
+
1344
+ # glob cache files
1345
+ latent_cache_files = glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))
1346
+
1347
+ # assign cache files to item info
1348
+ bucketed_item_info: dict[tuple[int, int], list[ItemInfo]] = {} # (width, height) -> [ItemInfo]
1349
+ for cache_file in latent_cache_files:
1350
+ tokens = os.path.basename(cache_file).split("_")
1351
+
1352
+ image_size = tokens[-2] # 0000x0000
1353
+ image_width, image_height = map(int, image_size.split("x"))
1354
+ image_size = (image_width, image_height)
1355
+
1356
+ item_key = "_".join(tokens[:-2])
1357
+ text_encoder_output_cache_file = os.path.join(self.cache_directory, f"{item_key}_{self.architecture}_te.safetensors")
1358
+ if not os.path.exists(text_encoder_output_cache_file):
1359
+ logger.warning(f"Text encoder output cache file not found: {text_encoder_output_cache_file}")
1360
+ continue
1361
+
1362
+ bucket_reso = bucket_selector.get_bucket_resolution(image_size)
1363
+ item_info = ItemInfo(item_key, "", image_size, bucket_reso, latent_cache_path=cache_file)
1364
+ item_info.text_encoder_output_cache_path = text_encoder_output_cache_file
1365
+
1366
+ bucket = bucketed_item_info.get(bucket_reso, [])
1367
+ for _ in range(self.num_repeats):
1368
+ bucket.append(item_info)
1369
+ bucketed_item_info[bucket_reso] = bucket
1370
+
1371
+ # prepare batch manager
1372
+ self.batch_manager = BucketBatchManager(bucketed_item_info, self.batch_size)
1373
+ self.batch_manager.show_bucket_info()
1374
+
1375
+ self.num_train_items = sum([len(bucket) for bucket in bucketed_item_info.values()])
1376
+
1377
+ def shuffle_buckets(self):
1378
+ # set random seed for this epoch
1379
+ random.seed(self.seed + self.current_epoch)
1380
+ self.batch_manager.shuffle()
1381
+
1382
+ def __len__(self):
1383
+ if self.batch_manager is None:
1384
+ return 100 # dummy value
1385
+ return len(self.batch_manager)
1386
+
1387
+ def __getitem__(self, idx):
1388
+ return self.batch_manager[idx]
1389
+
1390
+
1391
+ class VideoDataset(BaseDataset):
1392
+ TARGET_FPS_HUNYUAN = 24.0
1393
+ TARGET_FPS_WAN = 16.0
1394
+ TARGET_FPS_FRAMEPACK = 30.0
1395
+
1396
+ def __init__(
1397
+ self,
1398
+ resolution: Tuple[int, int],
1399
+ caption_extension: Optional[str],
1400
+ batch_size: int,
1401
+ num_repeats: int,
1402
+ enable_bucket: bool,
1403
+ bucket_no_upscale: bool,
1404
+ frame_extraction: Optional[str] = "head",
1405
+ frame_stride: Optional[int] = 1,
1406
+ frame_sample: Optional[int] = 1,
1407
+ target_frames: Optional[list[int]] = None,
1408
+ max_frames: Optional[int] = None,
1409
+ source_fps: Optional[float] = None,
1410
+ video_directory: Optional[str] = None,
1411
+ video_jsonl_file: Optional[str] = None,
1412
+ control_directory: Optional[str] = None,
1413
+ cache_directory: Optional[str] = None,
1414
+ debug_dataset: bool = False,
1415
+ architecture: str = "no_default",
1416
+ ):
1417
+ super(VideoDataset, self).__init__(
1418
+ resolution,
1419
+ caption_extension,
1420
+ batch_size,
1421
+ num_repeats,
1422
+ enable_bucket,
1423
+ bucket_no_upscale,
1424
+ cache_directory,
1425
+ debug_dataset,
1426
+ architecture,
1427
+ )
1428
+ self.video_directory = video_directory
1429
+ self.video_jsonl_file = video_jsonl_file
1430
+ self.control_directory = control_directory
1431
+ self.frame_extraction = frame_extraction
1432
+ self.frame_stride = frame_stride
1433
+ self.frame_sample = frame_sample
1434
+ self.max_frames = max_frames
1435
+ self.source_fps = source_fps
1436
+
1437
+ if self.architecture == ARCHITECTURE_HUNYUAN_VIDEO:
1438
+ self.target_fps = VideoDataset.TARGET_FPS_HUNYUAN
1439
+ elif self.architecture == ARCHITECTURE_WAN:
1440
+ self.target_fps = VideoDataset.TARGET_FPS_WAN
1441
+ elif self.architecture == ARCHITECTURE_FRAMEPACK:
1442
+ self.target_fps = VideoDataset.TARGET_FPS_FRAMEPACK
1443
+ else:
1444
+ raise ValueError(f"Unsupported architecture: {self.architecture}")
1445
+
1446
+ if target_frames is not None:
1447
+ target_frames = list(set(target_frames))
1448
+ target_frames.sort()
1449
+
1450
+ # round each value to N*4+1
1451
+ rounded_target_frames = [(f - 1) // 4 * 4 + 1 for f in target_frames]
1452
+ rounded_target_frames = list(set(rounded_target_frames))
1453
+ rounded_target_frames.sort()
1454
+
1455
+ # if value is changed, warn
1456
+ if target_frames != rounded_target_frames:
1457
+ logger.warning(f"target_frames are rounded to {rounded_target_frames}")
1458
+
1459
+ target_frames = tuple(rounded_target_frames)
1460
+
1461
+ self.target_frames = target_frames
1462
+
1463
+ if video_directory is not None:
1464
+ self.datasource = VideoDirectoryDatasource(video_directory, caption_extension, control_directory)
1465
+ elif video_jsonl_file is not None:
1466
+ self.datasource = VideoJsonlDatasource(video_jsonl_file)
+ else:
+ raise ValueError("video_directory or video_jsonl_file must be specified")
1467
+
1468
+ if self.frame_extraction == "uniform" and self.frame_sample == 1:
1469
+ self.frame_extraction = "head"
1470
+ logger.warning("frame_sample is set to 1 for frame_extraction=uniform. frame_extraction is changed to head.")
1471
+ if self.frame_extraction == "head":
1472
+ # head extraction. we can limit the number of frames to be extracted
1473
+ self.datasource.set_start_and_end_frame(0, max(self.target_frames))
1474
+
1475
+ if self.cache_directory is None:
1476
+ self.cache_directory = self.video_directory
1477
+
1478
+ self.batch_manager = None
1479
+ self.num_train_items = 0
1480
+ self.has_control = self.datasource.has_control
1481
+
1482
+ def get_metadata(self):
1483
+ metadata = super().get_metadata()
1484
+ if self.video_directory is not None:
1485
+ metadata["video_directory"] = os.path.basename(self.video_directory)
1486
+ if self.video_jsonl_file is not None:
1487
+ metadata["video_jsonl_file"] = os.path.basename(self.video_jsonl_file)
1488
+ if self.control_directory is not None:
1489
+ metadata["control_directory"] = os.path.basename(self.control_directory)
1490
+ metadata["frame_extraction"] = self.frame_extraction
1491
+ metadata["frame_stride"] = self.frame_stride
1492
+ metadata["frame_sample"] = self.frame_sample
1493
+ metadata["target_frames"] = self.target_frames
1494
+ metadata["max_frames"] = self.max_frames
1495
+ metadata["source_fps"] = self.source_fps
1496
+ metadata["has_control"] = self.has_control
1497
+ return metadata
1498
+
1499
+ def retrieve_latent_cache_batches(self, num_workers: int):
1500
+ bucket_selector = BucketSelector(self.resolution, architecture=self.architecture)
1501
+ self.datasource.set_bucket_selector(bucket_selector)
1502
+ if self.source_fps is not None:
1503
+ self.datasource.set_source_and_target_fps(self.source_fps, self.target_fps)
1504
+ else:
1505
+ self.datasource.set_source_and_target_fps(None, None) # no conversion
1506
+
1507
+ executor = ThreadPoolExecutor(max_workers=num_workers)
1508
+
1509
+ # key: (width, height, frame_count), value: [ItemInfo]
1510
+ batches: dict[tuple[int, int, int], list[ItemInfo]] = {}
1511
+ futures = []
1512
+
1513
+ def aggregate_future(consume_all: bool = False):
1514
+ while len(futures) >= num_workers or (consume_all and len(futures) > 0):
1515
+ completed_futures = [future for future in futures if future.done()]
1516
+ if len(completed_futures) == 0:
1517
+ if len(futures) >= num_workers or consume_all: # to avoid adding too many futures
1518
+ time.sleep(0.1)
1519
+ continue
1520
+ else:
1521
+ break # submit batch if possible
1522
+
1523
+ for future in completed_futures:
1524
+ original_frame_size, video_key, video, caption, control = future.result()
1525
+
1526
+ frame_count = len(video)
1527
+ video = np.stack(video, axis=0)
1528
+ height, width = video.shape[1:3]
1529
+ bucket_reso = (width, height) # already resized
1530
+
1531
+ # process control images if available
1532
+ control_video = None
1533
+ if control is not None:
1534
+ # set frame count to the same as video
1535
+ if len(control) > frame_count:
1536
+ control = control[:frame_count]
1537
+ elif len(control) < frame_count:
1538
+ # if control is shorter than video, repeat the last frame
1539
+ last_frame = control[-1]
1540
+ control.extend([last_frame] * (frame_count - len(control)))
1541
+ control_video = np.stack(control, axis=0)
1542
+
1543
+ crop_pos_and_frames = []
1544
+ if self.frame_extraction == "head":
1545
+ for target_frame in self.target_frames:
1546
+ if frame_count >= target_frame:
1547
+ crop_pos_and_frames.append((0, target_frame))
1548
+ elif self.frame_extraction == "chunk":
1549
+ # split by target_frames
1550
+ for target_frame in self.target_frames:
1551
+ for i in range(0, frame_count, target_frame):
1552
+ if i + target_frame <= frame_count:
1553
+ crop_pos_and_frames.append((i, target_frame))
1554
+ elif self.frame_extraction == "slide":
1555
+ # slide window
1556
+ for target_frame in self.target_frames:
1557
+ if frame_count >= target_frame:
1558
+ for i in range(0, frame_count - target_frame + 1, self.frame_stride):
1559
+ crop_pos_and_frames.append((i, target_frame))
1560
+ elif self.frame_extraction == "uniform":
1561
+ # select N frames uniformly
1562
+ for target_frame in self.target_frames:
1563
+ if frame_count >= target_frame:
1564
+ frame_indices = np.linspace(0, frame_count - target_frame, self.frame_sample, dtype=int)
1565
+ for i in frame_indices:
1566
+ crop_pos_and_frames.append((i, target_frame))
1567
+ elif self.frame_extraction == "full":
1568
+ # select all frames
1569
+ target_frame = min(frame_count, self.max_frames)
1570
+ target_frame = (target_frame - 1) // 4 * 4 + 1 # round to N*4+1
1571
+ crop_pos_and_frames.append((0, target_frame))
1572
+ else:
1573
+ raise ValueError(f"frame_extraction {self.frame_extraction} is not supported")
1574
+
1575
+ for crop_pos, target_frame in crop_pos_and_frames:
1576
+ cropped_video = video[crop_pos : crop_pos + target_frame]
1577
+ body, ext = os.path.splitext(video_key)
1578
+ item_key = f"{body}_{crop_pos:05d}-{target_frame:03d}{ext}"
1579
+ batch_key = (*bucket_reso, target_frame) # bucket_reso with frame_count
1580
+
1581
+ # crop control video if available
1582
+ cropped_control = None
1583
+ if control_video is not None:
1584
+ cropped_control = control_video[crop_pos : crop_pos + target_frame]
1585
+
1586
+ item_info = ItemInfo(
1587
+ item_key, caption, original_frame_size, batch_key, frame_count=target_frame, content=cropped_video
1588
+ )
1589
+ item_info.latent_cache_path = self.get_latent_cache_path(item_info)
1590
+ item_info.control_content = cropped_control # None is allowed
1591
+
1592
+ batch = batches.get(batch_key, [])
1593
+ batch.append(item_info)
1594
+ batches[batch_key] = batch
1595
+
1596
+ futures.remove(future)
1597
+
1598
+ def submit_batch(flush: bool = False):
1599
+ for key in batches:
1600
+ if len(batches[key]) >= self.batch_size or flush:
1601
+ batch = batches[key][0 : self.batch_size]
1602
+ if len(batches[key]) > self.batch_size:
1603
+ batches[key] = batches[key][self.batch_size :]
1604
+ else:
1605
+ del batches[key]
1606
+ return key, batch
1607
+ return None, None
1608
+
1609
+ for operator in self.datasource:
1610
+
1611
+ def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, list[np.ndarray], str, Optional[list[np.ndarray]]]:
1612
+ result = op()
1613
+
1614
+ if len(result) == 3: # for backward compatibility TODO remove this in the future
1615
+ video_key, video, caption = result
1616
+ control = None
1617
+ else:
1618
+ video_key, video, caption, control = result
1619
+
1620
+ video: list[np.ndarray]
1621
+ frame_size = (video[0].shape[1], video[0].shape[0])
1622
+
1623
+ # resize if necessary
1624
+ bucket_reso = bucket_selector.get_bucket_resolution(frame_size)
1625
+ video = [resize_image_to_bucket(frame, bucket_reso) for frame in video]
1626
+
1627
+ # resize control if necessary
1628
+ if control is not None:
1629
+ control = [resize_image_to_bucket(frame, bucket_reso) for frame in control]
1630
+
1631
+ return frame_size, video_key, video, caption, control
1632
+
1633
+ future = executor.submit(fetch_and_resize, operator)
1634
+ futures.append(future)
1635
+ aggregate_future()
1636
+ while True:
1637
+ key, batch = submit_batch()
1638
+ if key is None:
1639
+ break
1640
+ yield key, batch
1641
+
1642
+ aggregate_future(consume_all=True)
1643
+ while True:
1644
+ key, batch = submit_batch(flush=True)
1645
+ if key is None:
1646
+ break
1647
+ yield key, batch
1648
+
1649
+ executor.shutdown()
1650
+
1651
+ def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
1652
+ return self._default_retrieve_text_encoder_output_cache_batches(self.datasource, self.batch_size, num_workers)
1653
+
1654
+ def prepare_for_training(self):
1655
+ bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
1656
+
1657
+ # glob cache files
1658
+ latent_cache_files = glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))
1659
+
1660
+ # assign cache files to item info
1661
+ bucketed_item_info: dict[tuple[int, int, int], list[ItemInfo]] = {} # (width, height, frame_count) -> [ItemInfo]
1662
+ for cache_file in latent_cache_files:
1663
+ tokens = os.path.basename(cache_file).split("_")
1664
+
1665
+ image_size = tokens[-2] # 0000x0000
1666
+ image_width, image_height = map(int, image_size.split("x"))
1667
+ image_size = (image_width, image_height)
1668
+
1669
+ frame_pos, frame_count = tokens[-3].split("-")[:2] # "00000-000", or optional section index "00000-000-00"
1670
+ frame_pos, frame_count = int(frame_pos), int(frame_count)
1671
+
1672
+ item_key = "_".join(tokens[:-3])
1673
+ text_encoder_output_cache_file = os.path.join(self.cache_directory, f"{item_key}_{self.architecture}_te.safetensors")
1674
+ if not os.path.exists(text_encoder_output_cache_file):
1675
+ logger.warning(f"Text encoder output cache file not found: {text_encoder_output_cache_file}")
1676
+ continue
1677
+
1678
+ bucket_reso = bucket_selector.get_bucket_resolution(image_size)
1679
+ bucket_reso = (*bucket_reso, frame_count)
1680
+ item_info = ItemInfo(item_key, "", image_size, bucket_reso, frame_count=frame_count, latent_cache_path=cache_file)
1681
+ item_info.text_encoder_output_cache_path = text_encoder_output_cache_file
1682
+
1683
+ bucket = bucketed_item_info.get(bucket_reso, [])
1684
+ for _ in range(self.num_repeats):
1685
+ bucket.append(item_info)
1686
+ bucketed_item_info[bucket_reso] = bucket
1687
+
1688
+ # prepare batch manager
1689
+ self.batch_manager = BucketBatchManager(bucketed_item_info, self.batch_size)
1690
+ self.batch_manager.show_bucket_info()
1691
+
1692
+ self.num_train_items = sum([len(bucket) for bucket in bucketed_item_info.values()])
1693
+
1694
+ def shuffle_buckets(self):
1695
+ # set random seed for this epoch
1696
+ random.seed(self.seed + self.current_epoch)
1697
+ self.batch_manager.shuffle()
1698
+
1699
+ def __len__(self):
1700
+ if self.batch_manager is None:
1701
+ return 100 # dummy value
1702
+ return len(self.batch_manager)
1703
+
1704
+ def __getitem__(self, idx):
1705
+ return self.batch_manager[idx]
1706
+
1707
+
1708
+ class DatasetGroup(torch.utils.data.ConcatDataset):
1709
+ def __init__(self, datasets: Sequence[Union[ImageDataset, VideoDataset]]):
1710
+ super().__init__(datasets)
1711
+ self.datasets: list[Union[ImageDataset, VideoDataset]] = datasets
1712
+ self.num_train_items = 0
1713
+ for dataset in self.datasets:
1714
+ self.num_train_items += dataset.num_train_items
1715
+
1716
+ def set_current_epoch(self, epoch):
1717
+ for dataset in self.datasets:
1718
+ dataset.set_current_epoch(epoch)
1719
+
1720
+ def set_current_step(self, step):
1721
+ for dataset in self.datasets:
1722
+ dataset.set_current_step(step)
1723
+
1724
+ def set_max_train_steps(self, max_train_steps):
1725
+ for dataset in self.datasets:
1726
+ dataset.set_max_train_steps(max_train_steps)
docs/advanced_config.md ADDED
@@ -0,0 +1,316 @@
1
+ > 📝 Click on the language section to expand / 言語をクリックして展開
2
+
3
+ # Advanced configuration / 高度な設定
4
+
5
+ ## Table of contents / 目次
6
+
7
+ - [How to specify `network_args`](#how-to-specify-network_args--network_argsの指定方法)
8
+ - [LoRA+](#lora)
9
+ - [Select the target modules of LoRA](#select-the-target-modules-of-lora--loraの対象モジュールを選択する)
10
+ - [Save and view logs in TensorBoard format](#save-and-view-logs-in-tensorboard-format--tensorboard形式のログの保存と参照)
11
+ - [Save and view logs in wandb](#save-and-view-logs-in-wandb--wandbでログの保存と参照)
12
+ - [FP8 weight optimization for models](#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化)
13
+ - [PyTorch Dynamo optimization for model training](#pytorch-dynamo-optimization-for-model-training--モデルの学習におけるpytorch-dynamoの最適化)
14
+
15
+ ## How to specify `network_args` / `network_args`の指定方法
16
+
17
+ The `--network_args` option passes detailed arguments to LoRA. Specify the arguments in the form of `key=value` in `--network_args`.
18
+
19
+ <details>
20
+ <summary>日本語</summary>
21
+ `--network_args`オプションは、LoRAへの詳細な引数を指定するためのオプションです。`--network_args`には、`key=value`の形式で引数を指定します。
22
+ </details>
23
+
24
+ ### Example / 記述例
25
+
26
+ If you specify it on the command line, write as follows. / コマンドラインで指定する場合は以下のように記述します。
27
+
28
+ ```bash
29
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py --dit ...
30
+ --network_module networks.lora --network_dim 32
31
+ --network_args "key1=value1" "key2=value2" ...
32
+ ```
33
+
34
+ If you specify it in the configuration file, write as follows. / 設定ファイルで指定する場合は以下のように記述します。
35
+
36
+ ```toml
37
+ network_args = ["key1=value1", "key2=value2", ...]
38
+ ```
39
+
40
+ If you specify `"verbose=True"`, detailed information of LoRA will be displayed. / `"verbose=True"`を指定するとLoRAの詳細な情報が表示されます。
41
+
42
+ ```bash
43
+ --network_args "verbose=True" "key1=value1" "key2=value2" ...
44
+ ```
45
+
46
+ ## LoRA+
47
+
48
+ LoRA+ is a method that improves training speed by increasing the learning rate of the UP side (LoRA-B) of LoRA. Specify the multiplier for the learning rate. The original paper recommends 16, but adjust as needed; starting from around 4 seems to work well. For details, please refer to the [related PR of sd-scripts](https://github.com/kohya-ss/sd-scripts/pull/1233).
49
+
50
+ Specify `loraplus_lr_ratio` with `--network_args`.
51
+
52
+ <details>
53
+ <summary>日本語</summary>
54
+
55
+ LoRA+は、LoRAのUP側(LoRA-B)の学習率を上げることで学習速度を向上させる手法です。学習率に対する倍率を指定します。元論文では16を推奨していますが、必要に応じて調整してください。4程度から始めるとよいようです。詳細は[sd-scriptsの関連PR](https://github.com/kohya-ss/sd-scripts/pull/1233)を参照してください。
56
+
57
+ `--network_args`で`loraplus_lr_ratio`を指定します。
58
+ </details>
59
+
60
+ ### Example / 記述例
61
+
62
+ ```bash
63
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py --dit ...
64
+ --network_module networks.lora --network_dim 32 --network_args "loraplus_lr_ratio=4" ...
65
+ ```
66
+
67
+ ## Select the target modules of LoRA / LoRAの対象モジュールを選択する
68
+
69
+ *This feature is highly experimental and the specification may change. / この機能は特に実験的なもので、仕様は変更される可能性があります。*
70
+
71
+ By specifying `exclude_patterns` and `include_patterns` with `--network_args`, you can select the target modules of LoRA.
72
+
73
+ `exclude_patterns` excludes modules that match the specified pattern. `include_patterns` targets only modules that match the specified pattern.
74
+
75
+ Specify the values as a list. For example, `"exclude_patterns=[r'.*single_blocks.*', r'.*double_blocks\.[0-9]\..*']"`.
76
+
77
+ The pattern is a regular expression for the module name. The module name is in the form of `double_blocks.0.img_mod.linear` or `single_blocks.39.modulation.linear`. The regular expression is not a partial match but a complete match.
78
+
79
+ The patterns are applied in the order of `exclude_patterns`→`include_patterns`. By default, the Linear layers of `img_mod`, `txt_mod`, and `modulation` of double blocks and single blocks are excluded.
80
+
81
+ (`.*(img_mod|txt_mod|modulation).*` is specified.)
82
+
83
+ <details>
84
+ <summary>日本語</summary>
85
+
86
+ `--network_args`で`exclude_patterns`と`include_patterns`を指定することで、LoRAの対象モジュールを選択することができます。
87
+
88
+ `exclude_patterns`は、指定したパターンに一致するモジュールを除外します。`include_patterns`は、指定したパターンに一致するモジュールのみを対象とします。
89
+
90
+ 値は、リストで指定します。`"exclude_patterns=[r'.*single_blocks.*', r'.*double_blocks\.[0-9]\..*']"`のようになります。
91
+
92
+ パターンは、モジュール名に対する正規表現です。モジュール名は、たとえば`double_blocks.0.img_mod.linear`や`single_blocks.39.modulation.linear`のような形式です。正規表現は部分一致ではなく完全一致です。
93
+
94
+ パターンは、`exclude_patterns`→`include_patterns`の順で適用されます。デフォルトは、double blocksとsingle blocksのLinear層のうち、`img_mod`、`txt_mod`、`modulation`が除外されています。
95
+
96
+ (`.*(img_mod|txt_mod|modulation).*`が指定されています。)
97
+ </details>
98
+
99
+ ### Example / 記述例
100
+
101
+ Only the modules of double blocks / double blocksのモジュールのみを対象とする場合:
102
+
103
+ ```bash
104
+ --network_args "exclude_patterns=[r'.*single_blocks.*']"
105
+ ```
106
+
107
+ Only the Linear modules of single blocks from the 10th onward / single blocksの10番目以降のLinearモジュールのみを対象とする場合:
108
+
109
+ ```bash
110
+ --network_args "exclude_patterns=[r'.*']" "include_patterns=[r'.*single_blocks\.\d{2}\.linear.*']"
111
+ ```
112
+
113
+ ## Save and view logs in TensorBoard format / TensorBoard形式のログの保存と参照
114
+
115
+ Specify the folder to save the logs with the `--logging_dir` option. Logs in TensorBoard format will be saved.
116
+
117
+ For example, if you specify `--logging_dir=logs`, a `logs` folder will be created in the working folder, and logs will be saved in the date folder inside it.
118
+
119
+ Also, if you specify the `--log_prefix` option, the specified string will be added before the date. For example, use `--logging_dir=logs --log_prefix=lora_setting1_` for identification.
120
+
121
+ To view logs in TensorBoard, open another command prompt and activate the virtual environment. Then enter the following in the working folder.
122
+
123
+ ```powershell
124
+ tensorboard --logdir=logs
125
+ ```
126
+
127
+ (tensorboard installation is required.)
128
+
129
+ Then open a browser and access http://localhost:6006/ to display it.
130
+
131
+ <details>
132
+ <summary>日本語</summary>
133
+ `--logging_dir`オプションにログ保存先フォルダを指定してください。TensorBoard形式のログが保存されます。
134
+
135
+ たとえば`--logging_dir=logs`と指定すると、作業フォルダにlogsフォルダが作成され、その中の日時フォルダにログが保存されます。
136
+
137
+ また`--log_prefix`オプションを指定すると、日時の前に指定した文字列が追加されます。`--logging_dir=logs --log_prefix=lora_setting1_`などとして識別用にお使いください。
138
+
139
+ TensorBoardでログを確認するには、別のコマンドプロンプトを開き、仮想環境を有効にしてから、作業フォルダで以下のように入力します。
140
+
141
+ ```powershell
142
+ tensorboard --logdir=logs
143
+ ```
144
+
145
+ (tensorboardのインストールが必要です。)
146
+
147
+ その後ブラウザを開き、http://localhost:6006/ へアクセスすると表示されます。
148
+ </details>
149
+
150
+ ## Save and view logs in wandb / wandbでログの保存と参照
151
+
152
+ The `--log_with wandb` option is available to save logs in wandb format. `tensorboard` or `all` can also be specified. The default is `tensorboard`.
153
+
154
+ Specify the project name with `--log_tracker_name` when using wandb.
155
+
156
+ <details>
157
+ <summary>日本語</summary>
158
+ `--log_with wandb`オプションを指定するとwandb形式でログを保存することができます。`tensorboard`や`all`も指定可能です。デフォルトは`tensorboard`です。
159
+
160
+ wandbを使用する場合は、`--log_tracker_name`でプロジェクト名を指定してください。
161
+ </details>
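+
+ ### Example / 記述例
+
+ A minimal sketch, reusing the training command from the sections above; the project name `my-lora-project` is just a placeholder:
+
+ ```bash
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py --dit ...
+ --log_with wandb --log_tracker_name my-lora-project
+ ```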
162
+
163
+ ## FP8 weight optimization for models / モデルの重みのFP8への最適化
164
+
165
+ The `--fp8_scaled` option is available to quantize the weights of the model to FP8 (E4M3) format with appropriate scaling. This reduces the VRAM usage while maintaining precision. Important weights are kept in FP16/BF16/FP32 format.
166
+
167
+ The model weights must be in fp16 or bf16. Weights that have been pre-converted to float8_e4m3 cannot be used.
168
+
169
+ Wan2.1 inference and training are supported.
170
+
171
+ Specify the `--fp8_scaled` option in addition to the `--fp8` option during inference.
172
+
173
+ Specify the `--fp8_scaled` option in addition to the `--fp8_base` option during training.
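+
+ For example (a sketch only; it assumes the Wan2.1 scripts `wan_generate_video.py` and `wan_train_network.py` covered in `docs/wan.md`, with `...` standing for your usual options):
+
+ ```bash
+ # inference
+ python wan_generate_video.py --fp8 --fp8_scaled ...
+ # training
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 wan_train_network.py --fp8_base --fp8_scaled ...
+ ```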
174
+
175
+ Acknowledgments: This feature is based on the [implementation](https://github.com/Tencent/HunyuanVideo/blob/7df4a45c7e424a3f6cd7d653a7ff1f60cddc1eb1/hyvideo/modules/fp8_optimization.py) of [HunyuanVideo](https://github.com/Tencent/HunyuanVideo). The selection of high-precision modules is based on the [implementation](https://github.com/tdrussell/diffusion-pipe/blob/407c04fdae1c9ab5e67b54d33bef62c3e0a8dbc7/models/wan.py) of [diffusion-pipe](https://github.com/tdrussell/diffusion-pipe). I would like to thank these repositories.
176
+
177
+ <details>
178
+ <summary>日本語</summary>
179
+ 重みを単純にFP8へcastするのではなく、適切なスケーリングでFP8形式に量子化することで、精度を維持しつつVRAM使用量を削減します。また、重要な重みはFP16/BF16/FP32形式で保持します。
180
+
181
+ モデルの重みは、fp16またはbf16が必要です。あらかじめfloat8_e4m3に変換された重みは使用できません。
182
+
183
+ Wan2.1の推論、学習のみ対応しています。
184
+
185
+ 推論時は`--fp8`オプションに加えて `--fp8_scaled`オプションを指定してください。
186
+
187
+ 学習時は`--fp8_base`オプションに加えて `--fp8_scaled`オプションを指定してください。
188
+
189
+ 謝辞:この機能は、[HunyuanVideo](https://github.com/Tencent/HunyuanVideo)の[実装](https://github.com/Tencent/HunyuanVideo/blob/7df4a45c7e424a3f6cd7d653a7ff1f60cddc1eb1/hyvideo/modules/fp8_optimization.py)を参考にしました。また、高精度モジュールの選択においては[diffusion-pipe](https://github.com/tdrussell/diffusion-pipe)の[実装](https://github.com/tdrussell/diffusion-pipe/blob/407c04fdae1c9ab5e67b54d33bef62c3e0a8dbc7/models/wan.py)を参考にしました。これらのリポジトリに感謝します。
190
+
191
+ </details>
192
+
193
+ ### Key features and implementation details / 主な特徴と実装の詳細
194
+
195
+ - Implements FP8 (E4M3) weight quantization for Linear layers
196
+ - Reduces VRAM requirements by using 8-bit weights for storage (slightly increased compared to existing `--fp8` `--fp8_base` options)
197
+ - Quantizes weights to FP8 format with appropriate scaling instead of simple cast to FP8
198
+ - Maintains computational precision by dequantizing to original precision (FP16/BF16/FP32) during forward pass
199
+ - Preserves important weights in FP16/BF16/FP32 format
200
+
201
+ The implementation:
202
+
203
+ 1. Quantizes weights to FP8 format with appropriate scaling
204
+ 2. Replaces weights by FP8 quantized weights and stores scale factors in model state dict
205
+ 3. Applies monkey patching to Linear layers for transparent dequantization during computation
206
+
207
+ <details>
208
+ <summary>日本語</summary>
209
+
210
+ - Linear層のFP8(E4M3)重み量子化を実装
211
+ - 8ビットの重みを使用することでVRAM使用量を削減(既存の`--fp8` `--fp8_base` オプションに比べて微増)
212
+ - 単純なFP8へのcastではなく、適切な値でスケールして重みをFP8形式に量子化
213
+ - forward時に元の精度(FP16/BF16/FP32)に逆量子化して計算精度を維持
214
+ - 精度が重要な重みはFP16/BF16/FP32のまま保持
215
+
216
+ 実装:
217
+
218
+ 1. 精度を維持できる適切な倍率で重みをFP8形式に量子化
219
+ 2. 重みをFP8量子化重みに置き換え、倍率をモデルのstate dictに保存
220
+ 3. Linear層にmonkey patchingすることでモデルを変更せずに逆量子化
221
+ </details>
222
+
223
+ ## PyTorch Dynamo optimization for model training / モデルの学習におけるPyTorch Dynamoの最適化
224
+
225
+ The PyTorch Dynamo options are now available to optimize the training process. PyTorch Dynamo is a Python-level JIT compiler designed to make unmodified PyTorch programs faster by using TorchInductor, a deep learning compiler. This integration allows for potential speedups in training while maintaining model accuracy.
226
+
227
+ [PR #215](https://github.com/kohya-ss/musubi-tuner/pull/215) added this feature.
228
+
229
+ Specify the `--dynamo_backend` option to enable Dynamo optimization with one of the available backends from the `DynamoBackend` enum.
230
+
231
+ Additional options allow for fine-tuning the Dynamo behavior:
232
+ - `--dynamo_mode`: Controls the optimization strategy
233
+ - `--dynamo_fullgraph`: Enables fullgraph mode for potentially better optimization
234
+ - `--dynamo_dynamic`: Enables dynamic shape handling
235
+
236
+ The `--dynamo_dynamic` option has been reported to have many problems based on the validation in PR #215.
237
+
238
+ ### Available options:
239
+
240
+ ```
241
+ --dynamo_backend {NO, INDUCTOR, NVFUSER, CUDAGRAPHS, CUDAGRAPHS_FALLBACK, etc.}
242
+ Specifies the Dynamo backend to use (default is NO, which disables Dynamo)
243
+
244
+ --dynamo_mode {default, reduce-overhead, max-autotune}
245
+ Specifies the optimization mode (default is 'default')
246
+ - 'default': Standard optimization
247
+ - 'reduce-overhead': Focuses on reducing compilation overhead
248
+ - 'max-autotune': Performs extensive autotuning for potentially better performance
249
+
250
+ --dynamo_fullgraph
251
+ Flag to enable fullgraph mode, which attempts to capture and optimize the entire model graph
252
+
253
+ --dynamo_dynamic
254
+ Flag to enable dynamic shape handling for models with variable input shapes
255
+ ```
256
+
257
+ ### Usage example:
258
+
259
+ ```bash
260
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode default
261
+ ```
262
+
263
+ For more aggressive optimization:
264
+ ```bash
265
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode max-autotune --dynamo_fullgraph
266
+ ```
267
+
268
+ Note: The best combination of options may depend on your specific model and hardware. Experimentation may be necessary to find the optimal configuration.
269
+
270
+ <details>
271
+ <summary>日本語</summary>
272
+ PyTorch Dynamoオプションが学習プロセスを最適化するために追加されました。PyTorch Dynamoは、TorchInductor(ディープラーニングコンパイラ)を使用して、変更を加えることなくPyTorchプログラムを高速化するためのPythonレベルのJITコンパイラです。この統合により、モデルの精度を維持しながら学習の高速化が期待できます。
273
+
274
+ [PR #215](https://github.com/kohya-ss/musubi-tuner/pull/215) で追加されました。
275
+
276
+ `--dynamo_backend`オプションを指定して、`DynamoBackend`列挙型から利用可能なバックエンドの一つを選択することで、Dynamo最適化を有効にします。
277
+
278
+ 追加のオプションにより、Dynamoの動作を微調整できます:
279
+ - `--dynamo_mode`:最適化戦略を制御します
280
+ - `--dynamo_fullgraph`:より良い最適化の可能性のためにフルグラフモードを有効にします
281
+ - `--dynamo_dynamic`:動的形状処理を有効にします
282
+
283
+ PR #215での検証によると、`--dynamo_dynamic`には問題が多いことが報告されています。
284
+
285
+ __利用可能なオプション:__
286
+
287
+ ```
288
+ --dynamo_backend {NO, INDUCTOR, NVFUSER, CUDAGRAPHS, CUDAGRAPHS_FALLBACK, など}
289
+ 使用するDynamoバックエンドを指定します(デフォルトはNOで、Dynamoを無効にします)
290
+
291
+ --dynamo_mode {default, reduce-overhead, max-autotune}
292
+ 最適化モードを指定します(デフォルトは 'default')
293
+ - 'default':標準的な最適化
294
+ - 'reduce-overhead':コンパイルのオーバーヘッド削減に焦点を当てる
295
+ - 'max-autotune':より良いパフォーマンスのために広範な自動調整を実行
296
+
297
+ --dynamo_fullgraph
298
+ フルグラフモードを有効にするフラグ。モデルグラフ全体をキャプチャして最適化しようとします
299
+
300
+ --dynamo_dynamic
301
+ 可変入力形状を持つモデルのための動的形状処理を有効にするフラグ
302
+ ```
303
+
304
+ __使用例:__
305
+
306
+ ```bash
307
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode default
308
+ ```
309
+
310
+ より積極的な最適化の場合:
311
+ ```bash
312
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode max-autotune --dynamo_fullgraph
313
+ ```
314
+
315
+ 注意:最適なオプションの組み合わせは、特定のモデルとハードウェアに依存する場合があります。最適な構成を見つけるために実験が必要かもしれません。
316
+ </details>
docs/framepack.md ADDED
@@ -0,0 +1,331 @@
1
+ # FramePack
2
+
3
+ ## Overview / 概要
4
+
5
+ This document describes the usage of the [FramePack](https://github.com/lllyasviel/FramePack) architecture within the Musubi Tuner framework. FramePack is a novel video generation architecture developed by lllyasviel.
6
+
7
+ Key differences from HunyuanVideo:
8
+ - FramePack only supports Image-to-Video (I2V) generation. Text-to-Video (T2V) is not supported.
9
+ - It utilizes a different DiT model architecture and requires an additional Image Encoder. The VAE is the same as HunyuanVideo's. The Text Encoders appear to be the same as HunyuanVideo's, but we follow the original FramePack method for using them.
10
+ - Caching and training scripts are specific to FramePack (`fpack_*.py`).
11
+ - Due to its progressive generation nature, VRAM usage can be significantly lower, especially for longer videos, compared to other architectures.
12
+
13
+ This feature is experimental.
14
+
15
+ <details>
16
+ <summary>日本語</summary>
17
+ このドキュメントは、Musubi Tunerフレームワーク内での[FramePack](https://github.com/lllyasviel/FramePack) アーキテクチャの使用法について説明しています。FramePackは、lllyasviel氏によって開発された新しいビデオ生成アーキテクチャです。
18
+
19
+ HunyuanVideoとの主な違いは次のとおりです。
20
+ - FramePackは、画像からビデオ(I2V)生成のみをサポートしています。テキストからビデオ(T2V)はサポートされていません。
21
+ - 異なるDiTモデルアーキテクチャを使用し、追加の画像エンコーダーが必要です。VAEはHunyuanVideoと同じです。テキストエンコーダーはHunyuanVideoと同じと思われますが、FramePack公式と同じ方法で推論を行っています。
22
+ - キャッシングと学習スクリプトはFramePack専用(`fpack_*.py`)です。
23
+ - セクションずつ生成するため、他のアーキテクチャと比較して、特に長いビデオの場合、VRAM使用量が大幅に少なくなる可能性があります。
24
+
25
+ この機能は実験的なものです。
26
+ </details>
27
+
28
+ ## Download the model / モデルのダウンロード
29
+
30
+ You need to download the DiT, VAE, Text Encoder 1 (LLaMA), Text Encoder 2 (CLIP), and Image Encoder (SigLIP) models specifically for FramePack. Several download options are available for each component.
31
+
32
+ **Note:** The weights are publicly available on the following page: [maybleMyers/framepack_h1111](https://huggingface.co/maybleMyers/framepack_h1111). Thank you, maybleMyers!
33
+
34
+ ### DiT Model
35
+
36
+ Choose one of the following methods:
37
+
38
+ 1. **From lllyasviel's Hugging Face repo:** Download the three `.safetensors` files (starting with `diffusion_pytorch_model-00001-of-00003.safetensors`) from [lllyasviel/FramePackI2V_HY](https://huggingface.co/lllyasviel/FramePackI2V_HY). Specify the path to the first file (`...-00001-of-00003.safetensors`) as the `--dit` argument.
39
+ 2. **From local FramePack installation:** If you have cloned and run the official FramePack repository, the model might be downloaded locally. Specify the path to the snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--lllyasviel--FramePackI2V_HY/snapshots/<hex-uuid-folder>`.
40
+ 3. **From Kijai's Hugging Face repo:** Download the single file `FramePackI2V_HY_bf16.safetensors` from [Kijai/HunyuanVideo_comfy](https://huggingface.co/Kijai/HunyuanVideo_comfy/blob/main/FramePackI2V_HY_bf16.safetensors). Specify the path to this file as the `--dit` argument.
41
+
42
+ ### VAE Model
43
+
44
+ Choose one of the following methods:
45
+
46
+ 1. **Use official HunyuanVideo VAE:** Follow the instructions in the main [README.md](../README.md#model-download).
47
+ 2. **From hunyuanvideo-community Hugging Face repo:** Download `vae/diffusion_pytorch_model.safetensors` from [hunyuanvideo-community/HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo).
48
+ 3. **From local FramePack installation:** If you have cloned and run the official FramePack repository, the VAE might be downloaded locally within the HunyuanVideo community model snapshot. Specify the path to the snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--hunyuanvideo-community--HunyuanVideo/snapshots/<hex-uuid-folder>`.
49
+
50
+ ### Text Encoder 1 (LLaMA) Model
51
+
52
+ Choose one of the following methods:
53
+
54
+ 1. **From Comfy-Org Hugging Face repo:** Download `split_files/text_encoders/llava_llama3_fp16.safetensors` from [Comfy-Org/HunyuanVideo_repackaged](https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged).
55
+ 2. **From hunyuanvideo-community Hugging Face repo:** Download the four `.safetensors` files (starting with `text_encoder/model-00001-of-00004.safetensors`) from [hunyuanvideo-community/HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo). Specify the path to the first file (`...-00001-of-00004.safetensors`) as the `--text_encoder1` argument.
56
+ 3. **From local FramePack installation:** (Same as VAE) Specify the path to the HunyuanVideo community model snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--hunyuanvideo-community--HunyuanVideo/snapshots/<hex-uuid-folder>`.
57
+
58
+ ### Text Encoder 2 (CLIP) Model
59
+
60
+ Choose one of the following methods:
61
+
62
+ 1. **From Comfy-Org Hugging Face repo:** Download `split_files/text_encoders/clip_l.safetensors` from [Comfy-Org/HunyuanVideo_repackaged](https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged).
63
+ 2. **From hunyuanvideo-community Hugging Face repo:** Download `text_encoder_2/model.safetensors` from [hunyuanvideo-community/HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo).
64
+ 3. **From local FramePack installation:** (Same as VAE) Specify the path to the HunyuanVideo community model snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--hunyuanvideo-community--HunyuanVideo/snapshots/<hex-uuid-folder>`.
65
+
66
+ ### Image Encoder (SigLIP) Model
67
+
68
+ Choose one of the following methods:
69
+
70
+ 1. **From Comfy-Org Hugging Face repo:** Download `sigclip_vision_patch14_384.safetensors` from [Comfy-Org/sigclip_vision_384](https://huggingface.co/Comfy-Org/sigclip_vision_384).
71
+ 2. **From lllyasviel's Hugging Face repo:** Download `image_encoder/model.safetensors` from [lllyasviel/flux_redux_bfl](https://huggingface.co/lllyasviel/flux_redux_bfl).
72
+ 3. **From local FramePack installation:** If you have cloned and run the official FramePack repository, the model might be downloaded locally. Specify the path to the snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--lllyasviel--flux_redux_bfl/snapshots/<hex-uuid-folder>`.
73
+
74
+ <details>
75
+ <summary>日本語</summary>
76
+
77
+ ※以下のページに重みが一括で公開されています。maybleMyers 氏に感謝いたします。: https://huggingface.co/maybleMyers/framepack_h1111
78
+
79
+ DiT、VAE、テキストエンコーダー1(LLaMA)、テキストエンコーダー2(CLIP)、および画像エンコーダー(SigLIP)モデルは複数の方法でダウンロードできます。英語の説明を参考にして、ダウンロードしてください。
80
+
81
+ FramePack公式のリポジトリをクローンして実行した場合、モデルはローカルにダウンロードされている可能性があります。スナップショットディレクトリへのパスを指定してください。例:`path/to/FramePack/hf_download/hub/models--lllyasviel--flux_redux_bfl/snapshots/<hex-uuid-folder>`
82
+
83
+ HunyuanVideoの推論をComfyUIですでに行っている場合、いくつかのモデルはすでにダウンロードされている可能性があります。
84
+ </details>
85
+
86
+ ## Pre-caching / 事前キャッシング
87
+
88
+ The default resolution for FramePack is 640x640. See [the source code](../frame_pack/bucket_tools.py) for the default resolution of each bucket.
89
+
90
+ The dataset for training must be a video dataset. Image datasets are not supported. You can train on videos of any length. Specify `frame_extraction` as `full` and set `max_frames` to a sufficiently large value. However, if the video is too long, you may run out of VRAM during VAE encoding.
91
+
92
+ ### Latent Pre-caching / latentの事前キャッシング
93
+
94
+ Latent pre-caching uses a dedicated script for FramePack. You **must** provide the Image Encoder model.
95
+
96
+ ```bash
97
+ python fpack_cache_latents.py \
98
+ --dataset_config path/to/toml --vanilla_sampling \
99
+ --vae path/to/vae_model.safetensors \
100
+ --image_encoder path/to/image_encoder_model.safetensors \
101
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128
102
+ ```
103
+
104
+ Key differences from HunyuanVideo caching:
105
+ - Uses `fpack_cache_latents.py`.
106
+ - Requires the `--image_encoder` argument pointing to the downloaded SigLIP model.
107
+ - You can use the `--latent_window_size` argument (default 9) which defines the size of the latent sections FramePack processes (omitted in the example). This value should typically not be changed unless you understand the implications.
108
+ - The script generates multiple cache files per video, each corresponding to a different section, with the section index appended to the filename (e.g., `..._frame_pos-0000-count_...` becomes `..._frame_pos-0000-0000-count_...`, `..._frame_pos-0000-0001-count_...`, etc.).
109
+ - Image embeddings are calculated using the Image Encoder and stored in the cache files alongside the latents.
110
+
111
+ By default, the sampling method is Inverted anti-drifting from the paper (the same as during inference, using the latents and indices in reverse order). You can switch to the paper's Vanilla sampling (using the temporally ordered latents and indices) by specifying `--vanilla_sampling`. Preliminary tests suggest that Vanilla sampling may yield better quality. If you change this option, overwrite the existing cache without specifying `--skip_existing`.
112
+
113
+ For VRAM savings during VAE decoding, consider using `--vae_chunk_size` and `--vae_spatial_tile_sample_min_size`. If VRAM is overflowing and using shared memory, it is recommended to set `--vae_chunk_size` to 16 or 8, and `--vae_spatial_tile_sample_min_size` to 64 or 32.
114
+
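+ For example, on a VRAM-constrained machine the caching command above could use smaller values (illustrative only):
+
+ ```bash
+ --vae_chunk_size 16 --vae_spatial_tile_sample_min_size 64
+ ```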
115
+ <details>
116
+ <summary>日本語</summary>
117
+ FramePackのデフォルト解像度は640x640です。各バケットのデフォルト解像度については、[ソースコード](../frame_pack/bucket_tools.py)を参照してください。
118
+
119
+ 画像データセットでの学習は行えません。また動画の長さによらず学習可能です。 `frame_extraction` に `full` を指定して、`max_frames` に十分に大きな値を指定してください。ただし、あまりにも長いとVAEのencodeでVRAMが不足する可能性があります。
120
+
121
+ latentの事前キャッシングはFramePack専用のスクリプトを使用します。画像エンコーダーモデルを指定する必要があります。
122
+
123
+ HunyuanVideoのキャッシングとの主な違いは次のとおりです。
124
+ - `fpack_cache_latents.py`を使用します。
125
+ - ダウンロードしたSigLIPモデルを指す`--image_encoder`引数が必要です。
126
+ - `--latent_window_size`引数(デフォルト9)を指定できます(例では省略)。これは、FramePackが処理するlatentセクションのサイズを定義します。この値は、影響を理解していない限り、通常変更しないでください。
127
+ - スクリプトは、各ビデオに対して複数のキャッシュファイルを生成します。各ファイルは異なるセクションに対応し、セクションインデックスがファイル名に追加されます(例:`..._frame_pos-0000-count_...`は`..._frame_pos-0000-0000-count_...`、`..._frame_pos-0000-0001-count_...`などになります)。
128
+ - 画像埋め込みは画像エンコーダーを使用して計算され、latentとともにキャッシュファイルに保存されます。
129
+
130
+ デフォルトでは、論文のサンプリング方法 Inverted anti-drifting (推論時と同じ、逆順の latent と index を使用)を使用します。`--vanilla_sampling`を指定すると Vanilla sampling (時間順の latent と index を使用)に変更できます。簡単なテストの結果では、Vanilla sampling の方が品質が良いようです。このオプションの有無を変更する場合には `--skip_existing` を指定せずに既存のキャッシュを上書きしてください。
131
+
132
+ VAEのdecode時のVRAM節約のために、`--vae_chunk_size`と`--vae_spatial_tile_sample_min_size`を使用することを検討してください。VRAMがあふれて共有メモリを使用している場合には、`--vae_chunk_size`を16、8などに、`--vae_spatial_tile_sample_min_size`を64、32などに変更することをお勧めします。
133
+ </details>
134
+
135
+ ### Text Encoder Output Pre-caching / テキストエンコーダー出力の事前キャッシング
136
+
137
+ Text encoder output pre-caching also uses a dedicated script.
138
+
139
+ ```bash
140
+ python fpack_cache_text_encoder_outputs.py \
141
+ --dataset_config path/to/toml \
142
+ --text_encoder1 path/to/text_encoder1 \
143
+ --text_encoder2 path/to/text_encoder2 \
144
+ --batch_size 16
145
+ ```
146
+
147
+ Key differences from HunyuanVideo caching:
148
+ - Uses `fpack_cache_text_encoder_outputs.py`.
149
+ - Requires both `--text_encoder1` (LLaMA) and `--text_encoder2` (CLIP) arguments.
150
+ - Uses `--fp8_llm` option to run the LLaMA Text Encoder 1 in fp8 mode for VRAM savings (similar to `--fp8_t5` in Wan2.1).
151
+ - Saves LLaMA embeddings, attention mask, and CLIP pooler output to the cache file.
152
+
153
+ <details>
154
+ <summary>日本語</summary>
155
+ テキストエンコーダー出力の事前キャッシングも専用のスクリプトを使用します。
156
+
157
+ HunyuanVideoのキャッシングとの主な違いは次のとおりです。
158
+ - `fpack_cache_text_encoder_outputs.py`を使用します。
159
+ - LLaMAとCLIPの両方の引数が必要です。
160
+ - LLaMAテキストエンコーダー1をfp8モードで実行するための`--fp8_llm`オプションを使用します(Wan2.1の`--fp8_t5`に似ています)。
161
+ - LLaMAの埋め込み、アテンションマスク、CLIPのプーラー出力をキャッシュファイルに保存します。
162
+
163
+ </details>
164
+
165
+
166
+ ## Training / 学習
167
+
168
+ ### Training
169
+
170
+ Training uses a dedicated script `fpack_train_network.py`. Remember FramePack only supports I2V training.
171
+
172
+ ```bash
173
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 fpack_train_network.py \
174
+ --dit path/to/dit_model \
175
+ --vae path/to/vae_model.safetensors \
176
+ --text_encoder1 path/to/text_encoder1 \
177
+ --text_encoder2 path/to/text_encoder2 \
178
+ --image_encoder path/to/image_encoder_model.safetensors \
179
+ --dataset_config path/to/toml \
180
+ --sdpa --mixed_precision bf16 \
181
+ --optimizer_type adamw8bit --learning_rate 2e-4 --gradient_checkpointing \
182
+ --timestep_sampling shift --weighting_scheme none --discrete_flow_shift 3.0 \
183
+ --max_data_loader_n_workers 2 --persistent_data_loader_workers \
184
+ --network_module networks.lora_framepack --network_dim 32 \
185
+ --max_train_epochs 16 --save_every_n_epochs 1 --seed 42 \
186
+ --output_dir path/to/output_dir --output_name name-of-lora
187
+ ```
188
+
189
+ If you use the command prompt (Windows, not PowerShell), you may need to write them in a single line, or use `^` at the end of each line to continue the command.
190
+
191
+ The maximum value for `--blocks_to_swap` is 36. The default resolution for FramePack is 640x640, which requires around 17GB of VRAM. If you run out of VRAM, consider lowering the dataset resolution.
192
+
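+ If you still run out of VRAM, the memory-saving options listed below can be added to the command above, for example (a sketch; the values are illustrative):
+
+ ```bash
+ --fp8_base --fp8_scaled --blocks_to_swap 36 --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128
+ ```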
193
+ Key differences from HunyuanVideo training:
194
+ - Uses `fpack_train_network.py`.
195
+ - **Requires** specifying `--vae`, `--text_encoder1`, `--text_encoder2`, and `--image_encoder`.
196
+ - **Requires** specifying `--network_module networks.lora_framepack`.
197
+ - Optional `--latent_window_size` argument (default 9, should match caching).
198
+ - Memory saving options like `--fp8_base` (for DiT) and `--fp8_llm` (for Text Encoder 1) are available. `--fp8_scaled` is recommended when using `--fp8_base` for DiT.
199
+ - `--vae_chunk_size` and `--vae_spatial_tile_sample_min_size` options are available for the VAE to prevent out-of-memory during sampling (similar to caching).
200
+ - `--gradient_checkpointing` is available for memory savings.
201
+ <!-- - Use `convert_lora.py` for converting the LoRA weights after training, similar to HunyuanVideo. -->
202
+
203
+ Training settings (learning rate, optimizers, etc.) are experimental. Feedback is welcome.
204
+
205
+ <details>
206
+ <summary>日本語</summary>
207
+ FramePackの学習は専用のスクリプト`fpack_train_network.py`を使用します。FramePackはI2V学習のみをサポートしています。
208
+
209
+ コマンド記述例は英語版を参考にしてください。WindowsでPowerShellではなくコマンドプロンプトを使用している場合、コマンドを1行で記述するか、各行の末尾に`^`を付けてコマンドを続ける必要があります。
210
+
211
+ `--blocks_to_swap`の最大値は36です。FramePackのデフォルト解像度(640x640)では、17GB程度のVRAMが必要です。VRAM容量が不足する場合は、データセットの解像度を下げてください。
212
+
213
+ HunyuanVideoの学習との主な違いは次のとおりです。
214
+ - `fpack_train_network.py`を使用します。
215
+ - `--vae`、`--text_encoder1`、`--text_encoder2`、`--image_encoder`を指定する必要があります。
216
+ - `--network_module networks.lora_framepack`を指定する必要があります。
217
+ - 必要に応じて`--latent_window_size`引数(デフォルト9)を指定できます(キャッシング時と一致させる必要があります)。
218
+ - `--fp8_base`(DiT用)や`--fp8_llm`(テキストエンコーダー1用)などのメモリ節約オプションが利用可能です。`--fp8_base`指定時は、`--fp8_scaled`を使用することをお勧めします。
219
+ - サンプル生成時にメモリ不足を防ぐため、VAE用の`--vae_chunk_size`、`--vae_spatial_tile_sample_min_size`オプションが利用可能です(キャッシング時と同様)。
220
+ - メモリ節約のために`--gradient_checkpointing`が利用可能です。
221
+
222
+ </details>
223
+
224
+ ## Inference
225
+
226
+ Inference uses a dedicated script `fpack_generate_video.py`.
227
+
228
+ ```bash
229
+ python fpack_generate_video.py \
230
+ --dit path/to/dit_model \
231
+ --vae path/to/vae_model.safetensors \
232
+ --text_encoder1 path/to/text_encoder1 \
233
+ --text_encoder2 path/to/text_encoder2 \
234
+ --image_encoder path/to/image_encoder_model.safetensors \
235
+ --image_path path/to/start_image.jpg \
236
+ --prompt "A cat walks on the grass, realistic style." \
237
+ --video_size 512 768 --video_seconds 5 --fps 30 --infer_steps 25 \
238
+ --attn_mode sdpa --fp8_scaled \
239
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
240
+ --save_path path/to/save/dir --output_type both \
241
+ --seed 1234 --lora_multiplier 1.0 --lora_weight path/to/lora.safetensors
242
+ ```
243
+ <!-- --embedded_cfg_scale 10.0 --guidance_scale 1.0 \ -->
244
+
245
+ Key differences from HunyuanVideo inference:
246
+ - Uses `fpack_generate_video.py`.
247
+ - **Requires** specifying `--vae`, `--text_encoder1`, `--text_encoder2`, and `--image_encoder`.
248
+ - **Requires** specifying `--image_path` for the starting frame.
249
+ - **Requires** specifying `--video_seconds` (length of the video in seconds).
250
+ - `--video_size` is the size of the generated video, height and width are specified in that order.
251
+ - `--prompt`: Prompt for generation.
252
+ - Optional `--latent_window_size` argument (default 9, should match caching and training).
253
+ - `--fp8_scaled` option is available for DiT to reduce memory usage. Quality may be slightly lower. `--fp8_llm` option is available to reduce memory usage of Text Encoder 1. `--fp8` alone is also an option for DiT but `--fp8_scaled` potentially offers better quality.
254
+ - LoRA loading options (`--lora_weight`, `--lora_multiplier`, `--include_patterns`, `--exclude_patterns`) are available. `--lycoris` is also supported.
255
+ - `--embedded_cfg_scale` (default 10.0) controls the distilled guidance scale.
256
+ - `--guidance_scale` (default 1.0) controls the standard classifier-free guidance scale. **Changing this from 1.0 is generally not recommended for the base FramePack model.**
257
+ - `--guidance_rescale` (default 0.0) is available but typically not needed.
258
+ - `--bulk_decode` option can decode all frames at once, potentially faster but uses more VRAM during decoding. `--vae_chunk_size` and `--vae_spatial_tile_sample_min_size` options are recommended to prevent out-of-memory errors.
259
+ - `--sample_solver` (default `unipc`) is available but only `unipc` is implemented.
260
+ - `--save_merged_model` option is available to save the DiT model after merging LoRA weights. Inference is skipped if this is specified.
261
+ - Batch and interactive modes (`--from_file`, `--interactive`) are **not yet implemented** for FramePack generation.
262
+
263
+ **Section-specific Prompts**
264
+
265
+ You can now provide different prompts for different sections of the video using the `--prompt` argument. Use `;;;` to separate sections and specify the starting section index followed by a colon (e.g., `0:prompt A;;;3:prompt B`). Each definition should be in the format `INDEX:PROMPT_TEXT`.
266
+
267
+ * `INDEX` can be:
268
+ * A non-negative integer (e.g., `0`, `3`): The prompt applies to this section index.
269
+ * A negative integer (e.g., `-1`, `-2`): The prompt applies to the k-th section from the end (e.g., `-1` for the last section, `-2` for the second to last).
270
+ * A range (e.g., `0-2`, `3-5`): The prompt applies to all sections within this inclusive range.
271
+ * If some parts are not specified with an index, the prompt associated with index `0` will be used (e.g., `0:prompt A;;;-1:prompt B` means the last section is prompt B, and all others are prompt A).
272
+ * This can be used with the end image guidance feature to specify a different prompt for the last section.
273
+ * If no index is specified for a part (e.g., `prompt A;;;3:prompt B`), it defaults to index `0`.
274
+ * Example 1: `"0:A cat walks;;;3:The cat sits down;;;-1:The cat sleeps"`
275
+ * Example 2: `"0:A cat turns around;;;-1:A cat walks towards the camera"`
276
+
277
+ **End Image Guidance**
278
+
279
+ Specify an `--end_image_path` to guide the generation towards a specific final frame. This is highly experimental.
280
+
281
+ * `--end_image_path` : Path to an image to be used as a target for the final frame. The generation process for the last section will be conditioned on this image's VAE latent and image encoder embedding. This may affect the naturalness of the transition into the final frames.
282
+
283
+ Other options like `--video_size`, `--fps`, `--infer_steps`, `--save_path`, `--output_type`, `--seed`, `--attn_mode`, `--blocks_to_swap`, `--vae_chunk_size`, `--vae_spatial_tile_sample_min_size` function similarly to HunyuanVideo/Wan2.1 where applicable.
284
+
285
+ The maximum value for `--blocks_to_swap` is 38.
286
+ <details>
287
+ <summary>日本語</summary>
288
+
289
+ FramePackの推論は専用のスクリプト`fpack_generate_video.py`を使用します。コマンド記述例は英語版を参考にしてください。
290
+
291
+ HunyuanVideoの推論との主な違いは次のとおりです。
292
+ - `fpack_generate_video.py`を使用します。
293
+ - `--vae`、`--text_encoder1`、`--text_encoder2`、`--image_encoder`を指定する必要があります。
294
+ - `--image_path`を指定する必要があります(開始フレーム)。
295
+ - `--video_seconds`を指定する必要があります(秒単位でのビデオの長さを指定)。
296
+ - `--video_size`は生成するビデオのサイズで、高さと幅をその順番で指定します。
297
+ - `--prompt`: 生成用のプロンプトです。
298
+ - 必要に応じて`--latent_window_size`引数(デフォルト9)を指定できます(キャッシング時、学習時と一致させる必要があります)。
299
+ - DiTのメモリ使用量を削減するために、`--fp8_scaled`オプションを指定可能です。品質はやや低下する可能性があります。またText Encoder 1のメモリ使用量を削減するために、`--fp8_llm`オプションを指定可能です。DiT用に`--fp8`単独のオプションも用意されていますが、`--fp8_scaled`の方が品質が良い可能性があります。
300
+ - LoRAの読み込みオプション(`--lora_weight`、`--lora_multiplier`、`--include_patterns`、`--exclude_patterns`)が利用可能です。LyCORISもサポートされています。
301
+ - `--embedded_cfg_scale`(デフォルト10.0)は、蒸留されたガイダンススケールを制御します。通常は変更しないでください。
302
+ - `--guidance_scale`(デフォルト1.0)は、標準の分類器フリーガイダンススケールを制御します。**FramePackモデルのベースモデルでは、通常1.0から変更しないことをお勧めします。**
303
+ - `--guidance_rescale`(デフォルト0.0)も利用可能ですが、通常は必要ありません。
304
+ - `--bulk_decode`オプションは、すべてのフレームを一度にデコードできるオプションです。高速ですが、デコード中にVRAMを多く使用します。VRAM不足エラーを防ぐために、`--vae_chunk_size`と`--vae_spatial_tile_sample_min_size`オプションを指定することをお勧めします。
305
+ - `--sample_solver`(デフォルト`unipc`)は利用可能ですが、`unipc`のみが実装されています。
306
+ - `--save_merged_model`オプションは、LoRAの重みをマージした後にDiTモデルを保存するためのオプションです。これを指定すると推論はスキップされます。
307
+ - バッチモードとインタラクティブモード(`--from_file`、`--interactive`)はFramePack生成には**まだ実装されていません**。
308
+
309
+ **セクション別プロンプト:**
310
+
311
+ `--prompt`引数を使用して、ビデオの異なるセクションに異なるプロンプトを指定できるようになりました。セクションを区切るには`;;;`を使用し、開始セクションインデックスの後にコロンを付けて指定します(例:`0:プロンプトA;;;3:プロンプトB`)。各定義は`インデックス:プロンプトテキスト`の形式である必要があります。
312
+
313
+ * `インデックス`には以下を指定できます:
314
+ * 非負の整数(例:`0`, `3`):このセクションインデックスに対してプロンプトが適用されます。
315
+ * 負の整数(例:`-1`, `-2`):最後からk番目のセクションにプロンプトが適用されます(例:`-1`は最後のセクション、`-2`は最後から2番目のセクション)。
316
+ * 範囲(例:`0-2`, `3-5`):この範囲(両端を含む)内のすべてのセクションにプロンプトが適用されます。
317
+ * インデックスが指定されていない部分は、インデックス`0`のプロンプトが適用されます。(例:`0:プロンプトA;;;-1:プロンプトB`なら、一番最後がプロンプトB、それ以外はプロンプトAになります。)
318
+ * 終端画像ガイダンスを使用する場合、この形式をお勧めします。
319
+ * ある部分にインデックスが指定されていない場合(例:`プロンプトA;;;3:プロンプトB`)、インデックス`0`として扱われます。
320
+
321
+
322
+ **終端画像ガイダンス**
323
+
324
+ `--end_image_path`を指定して、生成を特定の最終フレームに誘導します。これは非常に実験的な機能です。
325
+
326
+ - `--end_image_path` : 最終フレームのターゲットとして使用する画像へのパス。最後のセクションの生成プロセスは、この画像を初期画像として生成されます。これは最終フレームへの遷移の自然さに影響を与える可能性があります。
327
+
328
+ `--video_size`、`--fps`、`--infer_steps`、`--save_path`、`--output_type`、`--seed`、`--attn_mode`、`--blocks_to_swap`、`--vae_chunk_size`、`--vae_spatial_tile_sample_min_size`などの他のオプションは、HunyuanVideo/Wan2.1と同様に機能します。
329
+
330
+ `--blocks_to_swap`の最大値は38です。
331
+ </details>
docs/sampling_during_training.md ADDED
@@ -0,0 +1,116 @@
1
+ > 📝 Click on the language section to expand / 言語をクリックして展開
2
+
3
+ # Sampling during training / 学習中のサンプル画像生成
4
+
5
+ By preparing a prompt file, you can generate sample images during training.
6
+
7
+ Please be aware that sample generation consumes a considerable amount of VRAM, so be careful when generating samples for videos with a large number of frames. Also, since generation takes time, adjust the frequency of sample generation as needed.
8
+
9
+ <details>
10
+ <summary>日本語</summary>
11
+
12
+ プロンプトファイルを用意することで、学習中にサンプル画像を生成することができます。
13
+
14
+ VRAMをそれなりに消費しますので、特にフレーム数が多い動画を生成する場合は注意してください。また生成には時間がかかりますので、サンプル画像生成の頻度は適宜調整してください。
15
+ </details>
16
+
17
+ ## How to use / 使い方
18
+
19
+ ### Command line options for training with sampling / サンプル画像生成に関連する学習時のコマンドラインオプション
20
+
21
+ Example of command line options for training with sampling / 記述例:
22
+
23
+ ```bash
24
+ --vae path/to/ckpts/hunyuan-video-t2v-720p/vae/pytorch_model.pt
25
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128
26
+ --text_encoder1 path/to/ckpts/text_encoder
27
+ --text_encoder2 path/to/ckpts/text_encoder_2
28
+ --sample_prompts /path/to/prompt_file.txt
29
+ --sample_every_n_epochs 1 --sample_every_n_steps 1000 --sample_at_first
30
+ ```
31
+
32
+ `--vae`, `--vae_chunk_size`, `--vae_spatial_tile_sample_min_size`, `--text_encoder1`, `--text_encoder2` are the same as when generating images, so please refer to [here](/README.md#inference) for details. `--fp8_llm` can also be specified.
33
+
34
+ `--sample_prompts` specifies the path to the prompt file used for sample image generation. Details are described below.
35
+
36
+ `--sample_every_n_epochs` specifies how often to generate sample images in epochs, and `--sample_every_n_steps` specifies how often to generate sample images in steps.
37
+
38
+ `--sample_at_first` is specified when generating sample images at the beginning of training.
39
+
40
+ Sample images and videos are saved in the `sample` directory in the directory specified by `--output_dir`. They are saved as `.png` for still images and `.mp4` for videos.
41
+
42
+ <details>
43
+ <summary>日本語</summary>
44
+
45
+ `--vae`、`--vae_chunk_size`、`--vae_spatial_tile_sample_min_size`、`--text_encoder1`、`--text_encoder2`は、画像生成時と同様ですので、詳細は[こちら](/README.ja.md#推論)を参照してください。`--fp8_llm`も指定可能です。
46
+
47
+ `--sample_prompts`は、サンプル画像生成に使用するプロンプトファイルのパスを指定します。詳細は後述します。
48
+
49
+ `--sample_every_n_epochs`は、何エポックごとにサンプル画像を生成するかを、`--sample_every_n_steps`は、何ステップごとにサンプル画像を生成するかを指定します。
50
+
51
+ `--sample_at_first`は、学習開始時にサンプル画像を生成する場合に指定します。
52
+
53
+ サンプル画像、動画は、`--output_dir`で指定したディレクトリ内の、`sample`ディレクトリに保存されます。静止画の場合は`.png`、動画の場合は`.mp4`で保存されます。
54
+ </details>
55
+
56
+ ### Prompt file / プロンプトファイル
57
+
58
+ The prompt file is a text file that contains the prompts for generating sample images. The example is as follows. / プロンプトファイルは、サンプル画像生成のためのプロンプトを記述したテキストファイルです。例は以下の通りです。
59
+
60
+ ```
61
+ # prompt 1: for generating a cat video
62
+ A cat walks on the grass, realistic style. --w 640 --h 480 --f 25 --d 1 --s 20
63
+
64
+ # prompt 2: for generating a dog image
65
+ A dog runs on the beach, realistic style. --w 960 --h 544 --f 1 --d 2 --s 20
66
+ ```
67
+
68
+ A line starting with `#` is a comment.
69
+
70
+ * `--w` specifies the width of the generated image or video. The default is 256.
71
+ * `--h` specifies the height. The default is 256.
72
+ * `--f` specifies the number of frames. The default is 1, which generates a still image.
73
+ * `--d` specifies the seed. The default is random.
74
+ * `--s` specifies the number of steps in generation. The default is 20.
75
+ * `--g` specifies the embedded guidance scale (not the CFG scale). The default is 6.0 for HunyuanVideo and 10.0 for FramePack, matching the default value used during inference for each architecture. Specify 1.0 for SkyReels V1 models. This option is ignored for Wan2.1 models.
76
+ * `--fs` specifies the discrete flow shift. The default is 14.5, which corresponds to 20 steps. The HunyuanVideo paper recommends 7.0 for 50 steps and 17.0 for fewer than 20 steps (e.g. 10). This option is ignored for FramePack models (10.0 is always used).
77
+
78
+ If you train I2V models, you must add the following option.
79
+
80
+ * `--i path/to/image.png`: the image path for image2video inference.
81
+
82
+ If you train Wan2.1-Fun-Control models, you must add the following option.
83
+
84
+ * `--cn path/to/control_video_or_dir_of_images`: the path to the video or directory containing multiple images for control.
85
+
86
+ If you train the model with classifier free guidance (such as Wan2.1), you can use the additional options below.
87
+
88
+ * `--n negative prompt...`: the negative prompt for the classifier free guidance. The default prompt for each model is used if omitted.
89
+ * `--l 6.0`: the classifier free guidance scale. Should be set to 6.0 for SkyReels V1 models. 5.0 is the default value for Wan2.1 (if omitted).
90
+
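+ For example, a prompt line for a Wan2.1 I2V model might look like this (a sketch; the image path and negative prompt are placeholders):
+
+ ```
+ A cat walks on the grass, realistic style. --w 832 --h 480 --f 81 --d 1 --s 20 --i path/to/start_image.png --n low quality, blurry --l 5.0
+ ```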
91
+ <details>
92
+ <summary>日本語</summary>
93
+
94
+ `#` で始まる行はコメントです。
95
+
96
+ * `--w` 生成画像、動画の幅を指定します。省略時は256です。
97
+ * `--h` 高さを指定します。省略時は256です。
98
+ * `--f` フレーム数を指定します。省略時は1で、静止画を生成します。
99
+ * `--d` シードを指定します。省略時はランダムです。
100
+ * `--s` 生成におけるステップ数を指定します。省略時は20です。
101
+ * `--g` embedded guidance scaleを指定します(CFG scaleではありません)。省略時はHunyuanVideoは6.0、FramePackは10.0で、各アーキテクチャの推論時のデフォルト値です。SkyReels V1モデルの場合は1.0を指定してください。Wan2.1モデルの場合はこのオプションは無視されます。
102
+ * `--fs` discrete flow shiftを指定します。省略時は14.5で、ステップ数20の場合に対応した値です。HunyuanVideoの論文では、ステップ数50の場合は7.0、ステップ数20未満(10など)で17.0が推奨されています。FramePackモデルはこのオプションは無視され、10.0が使用されます。
103
+
104
+ I2Vモデルを学習する場合、以下のオプションを追加してください。
105
+
106
+ * `--i path/to/image.png`: image2video推論用の画像パス。
107
+
108
+ Wan2.1-Fun-Controlモデルを学習する場合、以下のオプションを追加してください。
109
+
110
+ * `--cn path/to/control_video_or_dir_of_images`: control用の動画または複数枚の画像を含むディレクトリのパス。
111
+
112
+ classifier free guidance(ネガティブプロンプト)を必要とするモデル(Wan2.1など)を学習する場合、以下の追加オプションを使用できます。
113
+
114
+ *`--n negative prompt...`: classifier free guidance用のネガティブプロンプト。省略時はモデルごとのデフォルトプロンプトが使用されます。
115
+ *`--l 6.0`: classifier free guidance scale。SkyReels V1モデルの場合は6.0に設定してください。Wan2.1の場合はデフォルト値が5.0です(省略時)。
116
+ </details>
docs/wan.md ADDED
@@ -0,0 +1,531 @@
1
+ > 📝 Click on the language section to expand / 言語をクリックして展開
2
+
3
+ # Wan 2.1
4
+
5
+ ## Overview / 概要
6
+
7
+ This is an unofficial training and inference script for [Wan2.1](https://github.com/Wan-Video/Wan2.1). The features are as follows.
8
+
9
+ - fp8 support and memory reduction by block swap: inference of 720x1280x81-frame videos with 24GB VRAM, and training with 720x1280 images with 24GB VRAM
10
+ - Inference without installing Flash attention (using PyTorch's scaled dot product attention)
11
+ - Supports xformers and Sage attention
12
+
13
+ This feature is experimental.
14
+
15
+ <details>
16
+ <summary>日本語</summary>
17
+ [Wan2.1](https://github.com/Wan-Video/Wan2.1) の非公式の学習および推論スクリプトです。
18
+
19
+ 以下の特徴があります。
20
+
21
+ - fp8対応およびblock swapによる省メモリ化:720x1280x81framesの動画を24GB VRAMで推論可能、720x1280の画像での学習が24GB VRAMで可能
22
+ - Flash attentionのインストールなしでの実行(PyTorchのscaled dot product attentionを使用)
23
+ - xformersおよびSage attention対応
24
+
25
+ この機能は実験的なものです。
26
+ </details>
27
+
28
+ ## Download the model / モデルのダウンロード
29
+
30
+ Download the T5 `models_t5_umt5-xxl-enc-bf16.pth` and CLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` from the following page: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P/tree/main
31
+
32
+ Download the VAE from the above page `Wan2.1_VAE.pth` or download `split_files/vae/wan_2.1_vae.safetensors` from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
33
+
34
+ Download the DiT weights from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
35
+
36
+ Wan2.1 Fun Control model weights can be downloaded from [here](https://huggingface.co/alibaba-pai/Wan2.1-Fun-14B-Control). Navigate to each weight page and download. The Fun Control model seems to support not only T2V but also I2V tasks.
37
+
38
+ Please select the appropriate weights according to T2V, I2V, resolution, model size, etc.
39
+
40
+ `fp16` and `bf16` models can be used, and `fp8_e4m3fn` models can be used if `--fp8` (or `--fp8_base`) is specified without specifying `--fp8_scaled`. **Please note that `fp8_scaled` models are not supported even with `--fp8_scaled`.**
41
+
42
+ (Thanks to Comfy-Org for providing the repackaged weights.)
43
+
44
+ ### Model support matrix / モデルサポートマトリックス
45
+
46
+ * columns: training dtype (列:学習時のデータ型)
47
+ * rows: model dtype (行:モデルのデータ型)
48
+
49
+ | model \ training |bf16|fp16|--fp8_base|--fp8_base & --fp8_scaled|
50
+ |--|--|--|--|--|
51
+ |bf16|✓|--|✓|✓|
52
+ |fp16|--|✓|✓|✓|
53
+ |fp8_e4m3fn|--|--|✓|--|
54
+ |fp8_scaled|--|--|--|--|
55
+
56
+ <details>
57
+ <summary>日本語</summary>
58
+ T5 `models_t5_umt5-xxl-enc-bf16.pth` およびCLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を、次のページからダウンロードしてください:https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P/tree/main
59
+
60
+ VAEは上のページから `Wan2.1_VAE.pth` をダウンロードするか、次のページから `split_files/vae/wan_2.1_vae.safetensors` をダウンロードしてください:https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
61
+
62
+ DiTの重みを次のページからダウンロードしてください:https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
63
+
64
+ Wan2.1 Fun Controlモデルの重みは、[こちら](https://huggingface.co/alibaba-pai/Wan2.1-Fun-14B-Control)から、それぞれの重みのページに遷移し、ダウンロードしてください。Fun ControlモデルはT2VだけでなくI2Vタスクにも対応しているようです。
65
+
66
+ T2VやI2V、解像度、モデルサイズなどにより適切な重みを選択してください。
67
+
68
+ `fp16` および `bf16` モデルを使用できます。また、`--fp8` (または`--fp8_base`)を指定し`--fp8_scaled`を指定をしないときには `fp8_e4m3fn` モデルを使用できます。**`fp8_scaled` モデルはいずれの場合もサポートされていませんのでご注意ください。**
69
+
70
+ (repackaged版の重みを提供してくださっているComfy-Orgに感謝いたします。)
71
+ </details>
72
+
73
+ ## Pre-caching / 事前キャッシュ
74
+
75
+ ### Latent Pre-caching
76
+
77
+ Latent pre-caching is almost the same as in HunyuanVideo. Create the cache using the following command:
78
+
79
+ ```bash
80
+ python wan_cache_latents.py --dataset_config path/to/toml --vae path/to/wan_2.1_vae.safetensors
81
+ ```
82
+
83
+ If you train I2V models, add `--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` to specify the CLIP model. If not specified, the training will raise an error.
84
+
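+ For I2V training, the caching command might look like this (a sketch combining the options above):
+
+ ```bash
+ python wan_cache_latents.py --dataset_config path/to/toml --vae path/to/wan_2.1_vae.safetensors --clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
+ ```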
85
+ If you're running low on VRAM, specify `--vae_cache_cpu` to use the CPU for the VAE internal cache, which will reduce VRAM usage somewhat.
86
+
87
+ The control video settings are required for training the Fun-Control model. Please refer to [Dataset Settings](/dataset/dataset_config.md#sample-for-video-dataset-with-control-images) for details.
88
+
89
+ <details>
90
+ <summary>日本語</summary>
91
+ latentの事前キャッシングはHunyuanVideoとほぼ同じです。上のコマンド例を使用してキャッシュを作成してください。
92
+
93
+ I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。指定しないと学習時にエラーが発生します。
94
+
95
+ VRAMが不足している場合は、`--vae_cache_cpu` を指定するとVAEの内部キャッシュにCPUを使うことで、使用VRAMを多少削減できます。
96
+
97
+ Fun-Controlモデルを学習する場合は、制御用動画の設定が必要です。[データセット設定](/dataset/dataset_config.md#sample-for-video-dataset-with-control-images)を参照してください。
98
+ </details>
99
+
100
+ ### Text Encoder Output Pre-caching
101
+
102
+ Text encoder output pre-caching is also almost the same as in HunyuanVideo. Create the cache using the following command:
103
+
104
+ ```bash
105
+ python wan_cache_text_encoder_outputs.py --dataset_config path/to/toml --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth --batch_size 16
106
+ ```
107
+
108
+ Adjust `--batch_size` according to your available VRAM.
109
+
110
+ For systems with limited VRAM (less than ~16GB), use `--fp8_t5` to run the T5 in fp8 mode.
111
+
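+ For example, on a low-VRAM system (a sketch; adjust the batch size as needed):
+
+ ```bash
+ python wan_cache_text_encoder_outputs.py --dataset_config path/to/toml --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth --batch_size 4 --fp8_t5
+ ```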
112
+ <details>
113
+ <summary>日本語</summary>
114
+ テキストエンコーダ出力の事前キャッシングもHunyuanVideoとほぼ同じです。上のコマンド例を使用してキャッシュを作成してください。
115
+
116
+ 使用可能なVRAMに合わせて `--batch_size` を調整してください。
117
+
118
+ VRAMが限られているシステム(約16GB未満)の場合は、T5をfp8モードで実行するために `--fp8_t5` を使用してください。
119
+ </details>
120
+
121
+ ## Training / 学習
122
+
123
+ ### Training
124
+
125
+ Start training using the following command (input as a single line):
126
+
127
+ ```bash
128
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 wan_train_network.py
129
+ --task t2v-1.3B
130
+ --dit path/to/wan2.1_xxx_bf16.safetensors
131
+ --dataset_config path/to/toml --sdpa --mixed_precision bf16 --fp8_base
132
+ --optimizer_type adamw8bit --learning_rate 2e-4 --gradient_checkpointing
133
+ --max_data_loader_n_workers 2 --persistent_data_loader_workers
134
+ --network_module networks.lora_wan --network_dim 32
135
+ --timestep_sampling shift --discrete_flow_shift 3.0
136
+ --max_train_epochs 16 --save_every_n_epochs 1 --seed 42
137
+ --output_dir path/to/output_dir --output_name name-of-lora
138
+ ```
139
+ The above is an example. The appropriate values for `timestep_sampling` and `discrete_flow_shift` need to be determined by experimentation.
140
+
141
+ For additional options, use `python wan_train_network.py --help` (note that many options are unverified).
142
+
143
+ `--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (for Wan2.1 official models), `t2v-1.3B-FC`, `t2v-14B-FC`, and `i2v-14B-FC` (for Wan2.1 Fun Control model). Specify the DiT weights for the task with `--dit`.
144
+
145
+ Don't forget to specify `--network_module networks.lora_wan`.
146
+
147
+ Other options are mostly the same as `hv_train_network.py`.
148
+
149
+ Use `convert_lora.py` for converting the LoRA weights after training, as in HunyuanVideo.
150
+
151
+ <details>
152
+ <summary>日本語</summary>
153
+ `timestep_sampling`や`discrete_flow_shift`は一例です。どのような値が適切かは実験が必要です。
154
+
155
+ その他のオプションについては `python wan_train_network.py --help` を使用してください(多くのオプションは未検証です)。
156
+
157
+ `--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (これらはWan2.1公式モデル)、`t2v-1.3B-FC`, `t2v-14B-FC`, `i2v-14B-FC`(Wan2.1-Fun Controlモデル)を指定します。`--dit`に、taskに応じたDiTの重みを指定してください。
158
+
159
+ `--network_module` に `networks.lora_wan` を指定することを忘れないでください。
160
+
161
+ その他のオプションは、ほぼ`hv_train_network.py`と同様です。
162
+
163
+ 学習後のLoRAの重みの変換は、HunyuanVideoと同様に`convert_lora.py`を使用してください。
164
+ </details>
165
+
166
+ ### Command line options for training with sampling / サンプル画像生成に関連する学習時のコマンドラインオプション
167
+
168
+ Example of command line options for training with sampling / 記述例:
169
+
170
+ ```bash
171
+ --vae path/to/wan_2.1_vae.safetensors
172
+ --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth
173
+ --sample_prompts /path/to/prompt_file.txt
174
+ --sample_every_n_epochs 1 --sample_every_n_steps 1000 --sample_at_first
175
+ ```
176
+ Each option is the same as for inference and as for HunyuanVideo. Please refer to [here](/docs/sampling_during_training.md) for details.
177
+
178
+ If you train I2V models, add `--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` to specify the CLIP model.
179
+
180
+ You can specify the initial image, the negative prompt and the control video (for Wan2.1-Fun-Control) in the prompt file. Please refer to [here](/docs/sampling_during_training.md#prompt-file--プロンプトファイル).
181
+
182
+ <details>
183
+ <summary>日本語</summary>
184
+ 各オプションは推論時、およびHunyuanVideoの場合と同様です。[こちら](/docs/sampling_during_training.md)を参照してください。
185
+
186
+ I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。
187
+
188
+ プロンプトファイルで、初期画像やネガティブプロンプト、制御動画(Wan2.1-Fun-Control用)等を指定できます。[こちら](/docs/sampling_during_training.md#prompt-file--プロンプトファイル)を参照してください。
189
+ </details>
190
+
191
+
192
+ ## Inference / 推論
193
+
194
+ ### Inference Options Comparison / 推論オプション比較
195
+
196
+ #### Speed Comparison (Faster → Slower) / 速度比較(速い→遅い)
197
+ *Note: Results may vary depending on GPU type*
198
+
199
+ fp8_fast > bf16/fp16 (no block swap) > fp8 > fp8_scaled > bf16/fp16 (block swap)
200
+
201
+ #### Quality Comparison (Higher → Lower) / 品質比較(高→低)
202
+
203
+ bf16/fp16 > fp8_scaled > fp8 >> fp8_fast
204
+
205
+ ### T2V Inference / T2V推論
206
+
207
+ The following is an example of T2V inference (input as a single line):
208
+
209
+ ```bash
210
+ python wan_generate_video.py --fp8 --task t2v-1.3B --video_size 832 480 --video_length 81 --infer_steps 20
211
+ --prompt "prompt for the video" --save_path path/to/save.mp4 --output_type both
212
+ --dit path/to/wan2.1_t2v_1.3B_bf16_etc.safetensors --vae path/to/wan_2.1_vae.safetensors
213
+ --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth
214
+ --attn_mode torch
215
+ ```
216
+
217
+ `--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (these are Wan2.1 official models), `t2v-1.3B-FC`, `t2v-14B-FC` and `i2v-14B-FC` (for Wan2.1-Fun Control model).
218
+
219
+ `--attn_mode` is `torch`, `sdpa` (same as `torch`), `xformers`, `sageattn`, `flash2`, `flash` (same as `flash2`) or `flash3`. `torch` is the default. Other options require the corresponding library to be installed. `flash3` (Flash attention 3) is not tested.
220
+
221
+ Specifying `--fp8` runs DiT in fp8 mode. fp8 can significantly reduce memory consumption but may impact output quality.
222
+
223
+ `--fp8_scaled` can be specified in addition to `--fp8` to apply fp8 weight optimization to the model. This slightly increases memory consumption and slows down inference, but improves output quality. See [here](advanced_config.md#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化) for details.
224
+
225
+ The `--fp8_fast` option is also available for faster inference on RTX 40x0 GPUs. It requires the `--fp8_scaled` option. **This option seems to degrade the output quality.**
226
+
227
+ `--fp8_t5` can be used to specify the T5 model in fp8 format. This option reduces memory usage for the T5 model.
228
+
229
+ `--negative_prompt` can be used to specify a negative prompt. If omitted, the default negative prompt is used.
230
+
231
+ `--flow_shift` can be used to specify the flow shift (default 3.0 for I2V with 480p, 5.0 for others).
232
+
233
+ `--guidance_scale` can be used to specify the guidance scale for classifier free guidance (default 5.0).
234
+
235
+ `--blocks_to_swap` is the number of blocks to swap during inference. The default value is None (no block swap). The maximum value is 39 for 14B model and 29 for 1.3B model.
236
+
237
+ `--vae_cache_cpu` enables VAE cache in main memory. This reduces VRAM usage slightly but processing is slower.
238
+
239
+ `--compile` enables torch.compile. See [here](/README.md#inference) for details.
240
+
241
+ `--trim_tail_frames` can be used to trim the tail frames when saving. The default is 0.
242
+
243
+ `--cfg_skip_mode` specifies the mode for skipping CFG in different steps. The default is `none` (all steps). `--cfg_apply_ratio` specifies the ratio of steps where CFG is applied. See below for details.
244
+
245
+ `--include_patterns` and `--exclude_patterns` can be used to specify which LoRA modules to apply or exclude during inference. If not specified, all modules are applied by default. These options accept regular expressions.
246
+
247
+ `--include_patterns` specifies the modules to be applied, and `--exclude_patterns` specifies the modules to be excluded. The regular expression is matched against the LoRA key name, and include takes precedence.
248
+
249
+ The key name to be searched is in sd-scripts format (`lora_unet_<module_name with dot replaced by _>`). For example, `lora_unet_blocks_9_cross_attn_k`.
250
+
251
+ For example, if you specify `--exclude_patterns "blocks_[23]\d_"`, it will exclude modules containing `blocks_20` to `blocks_39`. If you specify `--include_patterns "cross_attn" --exclude_patterns "blocks_(0|1|2|3|4)_"`, it will apply LoRA to modules containing `cross_attn` and not containing `blocks_0` to `blocks_4`.
252
+
253
+ If you specify multiple LoRA weights, please specify them with multiple arguments. For example: `--include_patterns "cross_attn" ".*" --exclude_patterns "dummy_do_not_exclude" "blocks_(0|1|2|3|4)"`. `".*"` is a regex that matches everything. `dummy_do_not_exclude` is a dummy regex that does not match anything.
254
+
255
+ `--cpu_noise` generates the initial noise on the CPU. This may produce the same results as ComfyUI for the same seed (depending on other settings).
256
+
257
+ If you are using the Fun Control model, specify the control video with `--control_path`. You can specify a video file or a folder containing multiple image files. The number of frames in the video file (or the number of images) should be at least the number specified in `--video_length` (plus 1 frame if you specify `--end_image_path`).
258
+
259
+ Please try to match the aspect ratio of the control video with the aspect ratio specified in `--video_size` (there may be some deviation from the initial image of I2V due to the use of bucketing processing).
260
+
261
+ Other options are same as `hv_generate_video.py` (some options are not supported, please check the help).
262
+
263
+ <details>
264
+ <summary>日本語</summary>
265
+ `--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (これらはWan2.1公式モデル)、`t2v-1.3B-FC`, `t2v-14B-FC`, `i2v-14B-FC`(Wan2.1-Fun Controlモデル)を指定します。
266
+
267
+ `--attn_mode` には `torch`, `sdpa`(`torch`と同じ)、`xformers`, `sageattn`, `flash2`, `flash`(`flash2`と同じ), `flash3` のいずれかを指定します。デフォルトは `torch` です。その他のオプションを使用する場合は、対応するライブラリをインストールする必要があります。`flash3`(Flash attention 3)は未テストです。
268
+
269
+ `--fp8` を指定するとDiTモデルをfp8形式で実行します。fp8はメモリ消費を大幅に削減できますが、出力品質に影響を与える可能性があります。
270
+
271
+ `--fp8_scaled` を `--fp8` と併用すると、fp8への重み量子化を行います。メモリ消費と速度はわずかに悪化しますが、出力品質が向上します。詳しくは[こちら](advanced_config.md#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化)を参照してください。
272
+
273
+ `--fp8_fast` オプションはRTX 40x0 GPUでの高速推論に使用されるオプションです。このオプションは `--fp8_scaled` オプションが必要です。**出力品質が劣化するようです。**
274
+
275
+ `--fp8_t5` を指定するとT5モデルをfp8形式で実行します。T5モデル呼び出し時のメモリ使用量を削減します。
276
+
277
+ `--negative_prompt` でネガティブプロンプトを指定できます。省略した場合はデフォルトのネガティブプロンプトが使用されます。
278
+
279
+ `--flow_shift` でflow shiftを指定できます(480pのI2Vの場合はデフォルト3.0、それ以外は5.0)。
280
+
281
+ `--guidance_scale` でclassifier free guianceのガイダンススケールを指定できます(デフォルト5.0)。
282
+
283
+ `--blocks_to_swap` は推論時のblock swapの数です。デフォルト値はNone(block swapなし)です。最大値は14Bモデルの場合39、1.3Bモデルの場合29です。
284
+
285
+ `--vae_cache_cpu` を有効にすると、VAEのキャッシュをメインメモリに保持します。VRAM使用量が多少減りますが、処理は遅くなります。
286
+
287
+ `--compile`でtorch.compileを有効にします。詳細については[こちら](/README.md#inference)を参照してください。
288
+
289
+ `--trim_tail_frames` で保存時に末尾のフレームをトリミングできます。デフォルトは0です。
290
+
291
+ `--cfg_skip_mode` は異なるステップでCFGをスキップするモードを指定します。デフォルトは `none`(全ステップ)。`--cfg_apply_ratio` はCFGが適用されるステップの割合を指定します。詳細は後述します。
292
+
293
+ LoRAのどのモジュールを適用するかを、`--include_patterns`と`--exclude_patterns`で指定できます(未指定時・デフォルトは全モジュールが適用されます)。
294
+ これらのオプションには、正規表現を指定します。`--include_patterns`は適用するモジュール、`--exclude_patterns`は適用しないモジュールを指定します。正規表現がLoRAのキー名に含まれるかどうかで判断され、includeが優先されます。
295
+
296
+ 検索対象となるキー名は sd-scripts 形式(`lora_unet_<モジュール名のドットを_に置換したもの>`)です。例:`lora_unet_blocks_9_cross_attn_k`
297
+
298
+ たとえば `--exclude_patterns "blocks_[23]\d_"`のみを指定すると、`blocks_20`から`blocks_39`を含むモジュールが除外されます。`--include_patterns "cross_attn" --exclude_patterns "blocks_(0|1|2|3|4)_"`のようにincludeとexcludeを指定すると、`cross_attn`を含むモジュールで、かつ`blocks_0`から`blocks_4`を含まないモジュールにLoRAが適用されます。
299
+
300
+ 複数のLoRAの重みを指定する場合は、複数個の引数で指定してください。例:`--include_patterns "cross_attn" ".*" --exclude_patterns "dummy_do_not_exclude" "blocks_(0|1|2|3|4)"` `".*"`は全てにマッチする正規表現です。`dummy_do_not_exclude`は何にもマッチしないダミーの正規表現です。
301
+
302
+ `--cpu_noise`を指定すると初期ノイズをCPUで生成します。これにより同一seed時の結果がComfyUIと同じになる可能性があります(他の設定にもよります)。
303
+
304
+ Fun Controlモデルを使用する場合は、`--control_path`で制御用の映像を指定します。動画ファイル、または複数枚の画像ファイルを含んだフォルダを指定できます。動画ファイルのフレーム数(または画像の枚数)は、`--video_length`で指定したフレーム数以上にしてください(後述の`--end_image_path`を指定した場合は、さらに+1フレーム)。
305
+
306
+ 制御用の映像のアスペクト比は、`--video_size`で指定したアスペクト比とできるかぎり合わせてください(bucketingの処理を流用しているためI2Vの初期画像とズレる場合があります)。
307
+
308
+ その他のオプションは `hv_generate_video.py` と同じです(一部のオプションはサポートされていないため、ヘルプを確認してください)。
309
+ </details>
310
+
311
+ #### CFG Skip Mode / CFGスキップモード
312
+
313
+ These options allow you to balance generation speed against prompt accuracy. Skipping more steps results in faster generation with potential quality degradation.
314
+
315
+ Setting `--cfg_apply_ratio` to 0.5 speeds up the denoising loop by up to 25%.
316
+
317
+ `--cfg_skip_mode` specifies one of the following modes:
318
+
319
+ - `early`: Skips CFG in early steps for faster generation, applying guidance mainly in later refinement steps
320
+ - `late`: Skips CFG in later steps, applying guidance during initial structure formation
321
+ - `middle`: Skips CFG in middle steps, applying guidance in both early and later steps
322
+ - `early_late`: Skips CFG in both early and late steps, applying only in middle steps
323
+ - `alternate`: Applies CFG in alternate steps based on the specified ratio
324
+ - `none`: Applies CFG at all steps (default)
325
+
326
+ `--cfg_apply_ratio` specifies a value from 0.0 to 1.0 controlling the proportion of steps where CFG is applied. For example, setting 0.5 means CFG will be applied in only 50% of the steps.
327
+
328
+ If num_steps is 10, the following table shows the steps where CFG is applied based on the `--cfg_skip_mode` option (A means CFG is applied, S means it is skipped, `--cfg_apply_ratio` is 0.6):
329
+
330
+ | skip mode | CFG apply pattern |
331
+ |---|---|
332
+ | early | SSSSAAAAAA |
333
+ | late | AAAAAASSSS |
334
+ | middle | AAASSSSAAA |
335
+ | early_late | SSAAAAAASS |
336
+ | alternate | SASASAASAS |
337
+
338
+ The appropriate settings are unknown, but you may want to try `late` or `early_late` mode with a ratio of around 0.3 to 0.5.
339
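+ For example, the following combination applies CFG in roughly half of the steps and skips it in the later steps (a sketch, not a tuned recommendation):
+
+ ```bash
+ --cfg_skip_mode late --cfg_apply_ratio 0.5
+ ```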
+ <details>
340
+ <summary>日本語</summary>
341
+ これらのオプションは、生成速度とプロンプトの精度のバランスを取ることができます。スキップされるステップが多いほど、生成速度が速くなりますが、品質が低下する可能性があります。
342
+
343
+ ratioに0.5を指定することで、デノイジングのループが最大25%程度、高速化されます。
344
+
345
+ `--cfg_skip_mode` は次のモードのいずれかを指定します:
346
+
347
+ - `early`:初期のステップでCFGをスキップして、主に終盤の精細化のステップで適用します
348
+ - `late`:終盤のステップでCFGをスキップし、初期の構造が決まる段階で適用します
349
+ - `middle`:中間のステップでCFGをスキップし、初期と終盤のステップの両方で適用します
350
+ - `early_late`:初期と終盤のステップの両方でCFGをスキップし、中間のステップのみ適用します
351
+ - `alternate`:指定された割合に基づいてCFGを適用します
352
+
353
+ `--cfg_apply_ratio` は、CFGが適用されるステップの割合を0.0から1.0の値で指定します。たとえば、0.5に設定すると、CFGはステップの50%のみで適用されます。
354
+
355
+ 具体的なパターンは上のテーブルを参照してください。
356
+
357
+ 適切な設定は不明ですが、モードは`late`または`early_late`、ratioは0.3~0.5程度から試してみると良いかもしれません。
358
+ </details>
359
+
360
+ #### Skip Layer Guidance
361
+
362
+ Skip Layer Guidance is a feature that uses the output of a model with some blocks skipped as the unconditional output of classifier free guidance. It was originally proposed in [SD 3.5](https://github.com/comfyanonymous/ComfyUI/pull/5404) and first applied in Wan2GP in [this PR](https://github.com/deepbeepmeep/Wan2GP/pull/61). It may improve the quality of generated videos.
363
+
364
+ The implementation of SD 3.5 is [here](https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py), and the implementation of Wan2GP (the PR mentioned above) has some different specifications. This inference script allows you to choose between the two methods.
365
+
366
+ *The SD3.5 method applies the SLG output in addition to cond and uncond (which slows down inference). The Wan2GP method uses only the cond and SLG outputs.*
367
+
368
+ The following arguments are available:
369
+
370
+ - `--slg_mode`: Specifies the SLG mode. `original` for SD 3.5 method, `uncond` for Wan2GP method. Default is None (no SLG).
371
+ - `--slg_layers`: Specifies the indices of the blocks (layers) to skip in SLG, separated by commas. Example: `--slg_layers 4,5,6`. Default is empty (no skip). If this option is not specified, `--slg_mode` is ignored.
372
+ - `--slg_scale`: Specifies the scale of SLG when `original`. Default is 3.0.
373
+ - `--slg_start`: Specifies the start step of SLG application in inference steps from 0.0 to 1.0. Default is 0.0 (applied from the beginning).
374
+ - `--slg_end`: Specifies the end step of SLG application in inference steps from 0.0 to 1.0. Default is 0.3 (applied up to 30% from the beginning).
375
+
376
+ Appropriate settings are unknown, but you may want to start with `original` mode, a scale of around 3.0, a start ratio of 0.0, an end ratio of 0.5, and layers 4, 5, and 6 skipped.
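+ 
+ As a concrete starting point (a sketch with placeholder paths; enter as a single line), that suggestion corresponds to:
+ 
+ ```bash
+ python wan_generate_video.py --task t2v-14B --prompt "prompt for the video" --save_path path/to/save.mp4
+  --dit path/to/model.safetensors --vae path/to/vae.safetensors --t5 path/to/t5_model.pth
+  --slg_mode original --slg_layers 4,5,6 --slg_scale 3.0 --slg_start 0.0 --slg_end 0.5
+ ```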
377
+
378
+ <details>
379
+ <summary>日本語</summary>
380
+ Skip Layer Guidanceは、一部のblockをスキップしたモデル出力をclassifier free guidanceのunconditional出力に使用する機能です。元々は[SD 3.5](https://github.com/comfyanonymous/ComfyUI/pull/5404)で提案されたもので、Wan2.1には[Wan2GPのこちらのPR](https://github.com/deepbeepmeep/Wan2GP/pull/61)で初めて適用されました。生成動画の品質が向上する可能性があります。
381
+
382
+ SD 3.5の実装は[こちら](https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py)で、Wan2GPの実装(前述のPR)は一部仕様が異なります。この推論スクリプトでは両者の方式を選択できるようになっています。
383
+
384
+ ※SD3.5方式はcondとuncondに加えてslg outputを適用します(速度が低下します)。Wan2GP方式はcondとslg outputのみを使用します。
385
+
386
+ 以下の引数があります。
387
+
388
+ - `--slg_mode`:SLGのモードを指定します。`original`でSD 3.5の方式、`uncond`でWan2GPの方式です。デフォルトはNoneで、SLGを使用しません。
389
+ - `--slg_layers`:SLGでスキップするblock (layer)のインデクスをカンマ区切りで指定します。例:`--slg_layers 4,5,6`。デフォルトは空(スキップしない)です。このオプションを指定しないと`--slg_mode`は無視されます。
390
+ - `--slg_scale`:`original`のときのSLGのスケールを指定します。デフォルトは3.0です。
391
+ - `--slg_start`:推論ステップのSLG適用開始ステップを0.0から1.0の割合で指定します。デフォルトは0.0です(最初から適用)。
392
+ - `--slg_end`:推論ステップのSLG適用終了ステップを0.0から1.0の割合で指定します。デフォルトは0.3です(最初から30%まで適用)。
393
+
394
+ 適切な設定は不明ですが、`original`モードでスケールを3.0程度、開始割合を0.0、終了割合を0.5程度に設定し、4, 5, 6のlayerをスキップする設定から始めると良いかもしれません。
395
+ </details>
396
+
397
+ ### I2V Inference / I2V推論
398
+
399
+ The following is an example of I2V inference (input as a single line):
400
+
401
+ ```bash
402
+ python wan_generate_video.py --fp8 --task i2v-14B --video_size 832 480 --video_length 81 --infer_steps 20
403
+ --prompt "prompt for the video" --save_path path/to/save.mp4 --output_type both
404
+ --dit path/to/wan2.1_i2v_480p_14B_bf16_etc.safetensors --vae path/to/wan_2.1_vae.safetensors
405
+ --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth --clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
406
+ --attn_mode torch --image_path path/to/image.jpg
407
+ ```
408
+
409
+ Add `--clip` to specify the CLIP model. `--image_path` is the path to the image to be used as the initial frame.
410
+
411
+ `--end_image_path` can be used to specify the end image. This option is experimental. When this option is specified, the saved video will be slightly longer than the specified number of frames and will have noise, so it is recommended to specify `--trim_tail_frames 3` to trim the tail frames.
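+ 
+ For example, the following options (paths are placeholders) can be appended to the I2V command above:
+ 
+ ```bash
+  --end_image_path path/to/end_image.jpg --trim_tail_frames 3
+ ```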
412
+
413
+ You can also use the Fun Control model for I2V inference. Specify the control video with `--control_path`.
414
+
415
+ Other options are the same as for T2V inference.
416
+
417
+ <details>
418
+ <summary>日本語</summary>
419
+ `--clip` を追加してCLIPモデルを指定します。`--image_path` は初期フレームとして使用する画像のパスです。
420
+
421
+ `--end_image_path` で終了画像を指定できます。このオプションは実験的なものです。このオプションを指定すると、保存される動画が指定フレーム数よりもやや多くなり、かつノイズが乗るため、`--trim_tail_frames 3` などを指定して末尾のフレームをトリミングすることをお勧めします。
422
+
423
+ I2V推論でもFun Controlモデルが使用できます。`--control_path` で制御用の映像を指定します。
424
+
425
+ その他のオプションはT2V推論と同じです。
426
+ </details>
427
+
428
+ ### New Batch and Interactive Modes / 新しいバッチモードとインタラクティブモード
429
+
430
+ In addition to single video generation, Wan 2.1 now supports batch generation from a file and interactive prompt input:
431
+
432
+ #### Batch Mode from File / ファイルからのバッチモード
433
+
434
+ Generate multiple videos from prompts stored in a text file:
435
+
436
+ ```bash
437
+ python wan_generate_video.py --from_file prompts.txt --task t2v-14B
438
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors
439
+ --t5 path/to/t5_model.pth --save_path output_directory
440
+ ```
441
+
442
+ The prompts file format:
443
+ - One prompt per line
444
+ - Empty lines and lines starting with # are ignored (comments)
445
+ - Each line can include prompt-specific parameters using command-line style format:
446
+
447
+ ```
448
+ A beautiful sunset over mountains --w 832 --h 480 --f 81 --d 42 --s 20
449
+ A busy city street at night --w 480 --h 832 --g 7.5 --n low quality, blurry
450
+ ```
451
+
452
+ Supported inline parameters (if omitted, the default values from the command line are used; see the example after this list):
453
+ - `--w`: Width
454
+ - `--h`: Height
455
+ - `--f`: Frame count
456
+ - `--d`: Seed
457
+ - `--s`: Inference steps
458
+ - `--g` or `--l`: Guidance scale
459
+ - `--fs`: Flow shift
460
+ - `--i`: Image path (for I2V)
461
+ - `--cn`: Control path (for Fun Control)
462
+ - `--n`: Negative prompt
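+ 
+ For example, an I2V line in the prompts file (the image path is a placeholder) could look like this:
+ 
+ ```
+ A person dancing in a studio --i path/to/start_image.jpg --w 832 --h 480 --f 81 --d 42
+ ```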
463
+
464
+ In batch mode, models are loaded once and reused for all prompts, significantly improving overall generation time compared to multiple single runs.
465
+
466
+ #### Interactive Mode / インタラクティブモード
467
+
468
+ Interactive command-line interface for entering prompts:
469
+
470
+ ```bash
471
+ python wan_generate_video.py --interactive --task t2v-14B
472
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors
473
+ --t5 path/to/t5_model.pth --save_path output_directory
474
+ ```
475
+
476
+ In interactive mode:
477
+ - Enter prompts directly at the command line
478
+ - Use the same inline parameter format as batch mode
479
+ - Use Ctrl+D (or Ctrl+Z on Windows) to exit
480
+ - Models remain loaded between generations for efficiency
481
+
482
+ <details>
483
+ <summary>日本語</summary>
484
+ 単一動画の生成に加えて、Wan 2.1は現在、ファイルからのバッチ生成とインタラクティブなプロンプト入力をサポートしています。
485
+
486
+ #### ファイルからのバッチモード
487
+
488
+ テキストファイルに保存されたプロンプトから複数の動画を生成します:
489
+
490
+ ```bash
491
+ python wan_generate_video.py --from_file prompts.txt --task t2v-14B
492
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors
493
+ --t5 path/to/t5_model.pth --save_path output_directory
494
+ ```
495
+
496
+ プロンプトファイルの形式:
497
+ - 1行に1つのプロンプト
498
+ - 空行や#で始まる行は無視されます(コメント)
499
+ - 各行にはコマンドライン形式でプロンプト固有のパラメータを含めることができます:
500
+
501
+ サポートされているインラインパラメータ(省略した場合、コマンドラインのデフォルト値が使用されます)
502
+ - `--w`: 幅
503
+ - `--h`: 高さ
504
+ - `--f`: フレーム数
505
+ - `--d`: シード
506
+ - `--s`: 推論ステップ
507
+ - `--g` または `--l`: ガイダンススケール
508
+ - `--fs`: フローシフト
509
+ - `--i`: 画像パス(I2V用)
510
+ - `--cn`: コントロールパス(Fun Control用)
511
+ - `--n`: ネガティブプロンプト
512
+
513
+ バッチモードでは、モデルは一度だけロードされ、すべてのプロンプトで再利用されるため、複数回の単一実行と比較して全体的な生成時間が大幅に改善されます。
514
+
515
+ #### インタラクティブモード
516
+
517
+ プロンプトを入力するためのインタラクティブなコマンドラインインターフェース:
518
+
519
+ ```bash
520
+ python wan_generate_video.py --interactive --task t2v-14B
521
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors
522
+ --t5 path/to/t5_model.pth --save_path output_directory
523
+ ```
524
+
525
+ インタラクティブモードでは:
526
+ - コマンドラインで直接プロンプトを入力
527
+ - バッチモードと同じインラインパラメータ形式を使用
528
+ - 終了するには Ctrl+D (Windowsでは Ctrl+Z) を使用
529
+ - 効率のため、モデルは生成間で読み込まれたままになります
530
+ </details>
531
+
fpack_cache_latents.py ADDED
@@ -0,0 +1,381 @@
1
+ import argparse
2
+ import logging
3
+ import math
4
+ import os
5
+ from typing import List
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from tqdm import tqdm
11
+ from transformers import SiglipImageProcessor, SiglipVisionModel
12
+
13
+ from dataset import config_utils
14
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
15
+ from dataset.image_video_dataset import BaseDataset, ItemInfo, save_latent_cache_framepack, ARCHITECTURE_FRAMEPACK
16
+ from frame_pack import hunyuan
17
+ from frame_pack.framepack_utils import load_image_encoders, load_vae
18
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
19
+ from frame_pack.clip_vision import hf_clip_vision_encode
20
+ import cache_latents
21
+
22
+ logger = logging.getLogger(__name__)
23
+ logging.basicConfig(level=logging.INFO)
24
+
25
+
26
+ def encode_and_save_batch(
27
+ vae: AutoencoderKLCausal3D,
28
+ feature_extractor: SiglipImageProcessor,
29
+ image_encoder: SiglipVisionModel,
30
+ batch: List[ItemInfo],
31
+ latent_window_size: int,
32
+ vanilla_sampling: bool = False,
33
+ ):
34
+ """Encode a batch of original RGB videos and save FramePack section caches."""
35
+
36
+ # Stack batch into tensor (B,C,F,H,W) in RGB order
37
+ contents = torch.stack([torch.from_numpy(item.content) for item in batch])
38
+ if len(contents.shape) == 4:
39
+ contents = contents.unsqueeze(1) # B, H, W, C -> B, F, H, W, C
40
+
41
+ contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
42
+ contents = contents.to(vae.device, dtype=vae.dtype)
43
+ contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
44
+
45
+ height, width = contents.shape[3], contents.shape[4]
46
+ if height < 8 or width < 8:
47
+ item = batch[0] # other items should have the same size
48
+ raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
49
+
50
+ # calculate latent frame count from original frame count (4n+1)
51
+ latent_f = (batch[0].frame_count - 1) // 4 + 1
52
+
53
+ # calculate the total number of sections (excluding the first frame, divided by window size)
54
+ total_latent_sections = math.floor((latent_f - 1) / latent_window_size)
55
+ if total_latent_sections < 1:
56
+ min_frames_needed = latent_window_size * 4 + 1
57
+ raise ValueError(
58
+ f"Not enough frames for FramePack: {batch[0].frame_count} frames ({latent_f} latent frames), minimum required: {min_frames_needed} frames ({latent_window_size+1} latent frames)"
59
+ )
60
+
61
+ # 実際に処理する潜在変数のフレーム数 (セクション境界に合わせる)
62
+ latent_f_aligned = total_latent_sections * latent_window_size + 1
63
+ # 実際に処理する元のフレーム数
64
+ frame_count_aligned = (latent_f_aligned - 1) * 4 + 1
65
+ if frame_count_aligned != batch[0].frame_count:
66
+ logger.info(
67
+ f"Frame count mismatch: required={frame_count_aligned} != actual={batch[0].frame_count}, trimming to {frame_count_aligned}"
68
+ )
69
+ contents = contents[:, :, :frame_count_aligned, :, :]
70
+
71
+ latent_f = latent_f_aligned # Update to the aligned value
72
+
73
+ # VAE encode (list of tensor -> stack)
74
+ latents = hunyuan.vae_encode(contents, vae) # include scaling factor
75
+ latents = latents.to("cpu") # (B, C, latent_f, H/8, W/8)
76
+
77
+ # Vision encoding per‑item (once)
78
+ images = np.stack([item.content[0] for item in batch], axis=0) # B, H, W, C
79
+
80
+ # encode image with image encoder
81
+ image_embeddings = []
82
+ with torch.no_grad():
83
+ for image in images:
84
+ image_encoder_output = hf_clip_vision_encode(image, feature_extractor, image_encoder)
85
+ image_embeddings.append(image_encoder_output.last_hidden_state)
86
+ image_embeddings = torch.cat(image_embeddings, dim=0) # B, LEN, 1152
87
+ image_embeddings = image_embeddings.to("cpu") # Save memory
88
+
89
+ if not vanilla_sampling:
90
+ # padding is reversed for inference (future to past)
91
+ latent_paddings = list(reversed(range(total_latent_sections)))
92
+ # Note: The padding trick for inference. See the paper for details.
93
+ if total_latent_sections > 4:
94
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
95
+
96
+ for b, item in enumerate(batch):
97
+ original_latent_cache_path = item.latent_cache_path
98
+ video_lat = latents[b : b + 1] # keep batch dim, 1, C, F, H, W
99
+
100
+ # emulate inference step (history latents)
101
+ # Note: In inference, history_latents stores *generated* future latents.
102
+ # Here, for caching, we just need its shape and type for clean_* tensors.
103
+ # The actual content doesn't matter much as clean_* will be overwritten.
104
+ history_latents = torch.zeros(
105
+ (1, video_lat.shape[1], 1 + 2 + 16, video_lat.shape[3], video_lat.shape[4]), dtype=video_lat.dtype
106
+ ) # C=16 for HY
107
+
108
+ latent_f_index = latent_f - latent_window_size # Start from the last section
109
+ section_index = total_latent_sections - 1
110
+
111
+ for latent_padding in latent_paddings:
112
+ is_last_section = section_index == 0 # the last section in inference order == the first section in time
113
+ latent_padding_size = latent_padding * latent_window_size
114
+ if is_last_section:
115
+ assert latent_f_index == 1, "Last section should be starting from frame 1"
116
+
117
+ # indices generation (same as inference)
118
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
119
+ (
120
+ clean_latent_indices_pre, # Index for start_latent
121
+ blank_indices, # Indices for padding (future context in inference)
122
+ latent_indices, # Indices for the target latents to predict
123
+ clean_latent_indices_post, # Index for the most recent history frame
124
+ clean_latent_2x_indices, # Indices for the next 2 history frames
125
+ clean_latent_4x_indices, # Indices for the next 16 history frames
126
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
127
+
128
+ # Indices for clean_latents (start + recent history)
129
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
130
+
131
+ # clean latents preparation (emulating inference)
132
+ clean_latents_pre = video_lat[:, :, 0:1, :, :] # Always the first frame (start_latent)
133
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
134
+ [1, 2, 16], dim=2
135
+ )
136
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) # Combine start frame + placeholder
137
+
138
+ # Target latents for this section (ground truth)
139
+ target_latents = video_lat[:, :, latent_f_index : latent_f_index + latent_window_size, :, :]
140
+
141
+ # save cache (file path is inside item.latent_cache_path pattern), remove batch dim
142
+ item.latent_cache_path = append_section_idx_to_latent_cache_path(original_latent_cache_path, section_index)
143
+ save_latent_cache_framepack(
144
+ item_info=item,
145
+ latent=target_latents.squeeze(0), # Ground truth for this section
146
+ latent_indices=latent_indices.squeeze(0), # Indices for the ground truth section
147
+ clean_latents=clean_latents.squeeze(0), # Start frame + history placeholder
148
+ clean_latent_indices=clean_latent_indices.squeeze(0), # Indices for start frame + history placeholder
149
+ clean_latents_2x=clean_latents_2x.squeeze(0), # History placeholder
150
+ clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0), # Indices for history placeholder
151
+ clean_latents_4x=clean_latents_4x.squeeze(0), # History placeholder
152
+ clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0), # Indices for history placeholder
153
+ image_embeddings=image_embeddings[b],
154
+ )
155
+
156
+ if is_last_section: # If this was the first section generated in inference (time=0)
157
+ # History gets the start frame + the generated first section
158
+ generated_latents_for_history = video_lat[:, :, : latent_window_size + 1, :, :]
159
+ else:
160
+ # History gets the generated current section
161
+ generated_latents_for_history = target_latents # Use true latents as stand-in for generated
162
+
163
+ history_latents = torch.cat([generated_latents_for_history, history_latents], dim=2)
164
+
165
+ section_index -= 1
166
+ latent_f_index -= latent_window_size
167
+
168
+ else:
169
+ # Vanilla Sampling Logic
170
+ for b, item in enumerate(batch):
171
+ original_latent_cache_path = item.latent_cache_path
172
+ video_lat = latents[b : b + 1] # Keep batch dim: 1, C, F_aligned, H, W
173
+ img_emb = image_embeddings[b] # LEN, 1152
174
+
175
+ for section_index in range(total_latent_sections):
176
+ target_start_f = section_index * latent_window_size + 1
177
+ target_end_f = target_start_f + latent_window_size
178
+ target_latents = video_lat[:, :, target_start_f:target_end_f, :, :]
179
+
180
+ # Clean latents preparation (Vanilla)
181
+
182
+ # Get clean_latents_pre (Always frame 0)
183
+ clean_latents_pre = video_lat[:, :, 0:1, :, :]
184
+
185
+ # Frame indices for past context (relative to anchor)
186
+ idx_post_frame = target_start_f - 1 # Frame index of the last frame of section i-1
187
+ idx_2x_frame_1 = idx_post_frame - 1
188
+ idx_2x_frame_2 = idx_post_frame - 2
189
+ idx_4x_start_frame = idx_post_frame - idx_2x_frame_2 - 16
190
+
191
+ # Helper function to get frame or zeros if index is out of bounds
192
+ def get_frame_or_zeros(frame_idx):
193
+ if frame_idx >= 0:
194
+ # Ensure frame_idx doesn't exceed the actual length
195
+ if frame_idx < video_lat.shape[2]:
196
+ return video_lat[:, :, frame_idx : frame_idx + 1, :, :]
197
+ else:
198
+ # This case should ideally not happen if indexing is correct
199
+ logger.warning(
200
+ f"Attempted to access frame {frame_idx} beyond latent length {video_lat.shape[2]}. Returning zeros."
201
+ )
202
+ return torch.zeros_like(clean_latents_pre)
203
+ else:
204
+ return torch.zeros_like(clean_latents_pre)
205
+
206
+ # Get clean_latents_post (frame at idx_post_frame)
207
+ clean_latents_post = get_frame_or_zeros(idx_post_frame)
208
+
209
+ # Get clean_latents_2x (frames at idx_2x_frame_1, idx_2x_frame_2)
210
+ frame_2x_1 = get_frame_or_zeros(idx_2x_frame_1)
211
+ frame_2x_2 = get_frame_or_zeros(idx_2x_frame_2)
212
+ clean_latents_2x = torch.cat(
213
+ [frame_2x_2, frame_2x_1], dim=2
214
+ ) # Order might matter (older first?) - assuming order [..., t-2, t-1]
215
+
216
+ # Get clean_latents_4x (16 frames ending at idx_4x_start_frame)
217
+ clean_latents_4x_list = []
218
+ for i in range(16):
219
+ frame_idx = idx_4x_start_frame + i
220
+ clean_latents_4x_list.append(get_frame_or_zeros(frame_idx))
221
+ clean_latents_4x = torch.cat(clean_latents_4x_list, dim=2) # Ensure correct temporal order [..., t-18, ..., t-3]
222
+
223
+ # Combine pre and post for the main clean_latents input
224
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) # (1, C, 2, H, W)
225
+
226
+ # Indices generation (Vanilla with Offset)
227
+ vanilla_offset_size = section_index * latent_window_size # Offset based on section index
228
+ # print(f"Vanilla offset size: {vanilla_offset_size}")
229
+
230
+ # Calculate total length including the offset
231
+ total_length = sum([1, vanilla_offset_size, latent_window_size, 1, 2, 16])
232
+ indices = torch.arange(0, total_length).unsqueeze(0)
233
+
234
+ # Split indices including the offset part
235
+ (
236
+ clean_latent_indices_pre, # Index for frame 0
237
+ past_offset_indices, # Indices representing the time offset *before* section i
238
+ latent_indices, # Indices for the target latents (section i)
239
+ clean_latent_indices_post, # Index for frame from end of section i-1
240
+ clean_latent_2x_indices, # Indices for frames from end of section i-2, i-3
241
+ clean_latent_4x_indices, # Indices for the 16 past frames
242
+ ) = indices.split([1, vanilla_offset_size, latent_window_size, 1, 2, 16], dim=1)
243
+
244
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
245
+
246
+ # Save cache
247
+ item.latent_cache_path = append_section_idx_to_latent_cache_path(original_latent_cache_path, section_index)
248
+ save_latent_cache_framepack(
249
+ item_info=item,
250
+ latent=target_latents.squeeze(0),
251
+ latent_indices=latent_indices.squeeze(0), # Indices for target section i
252
+ clean_latents=clean_latents.squeeze(0), # Past clean frames
253
+ clean_latent_indices=clean_latent_indices.squeeze(0), # Indices for clean_latents_pre/post
254
+ clean_latents_2x=clean_latents_2x.squeeze(0), # Past clean frames (2x)
255
+ clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0), # Indices for clean_latents_2x
256
+ clean_latents_4x=clean_latents_4x.squeeze(0), # Past clean frames (4x)
257
+ clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0), # Indices for clean_latents_4x
258
+ image_embeddings=img_emb,
259
+ # Note: We don't explicitly save past_offset_indices,
260
+ # but its size influences the absolute values in other indices.
261
+ )
262
+
263
+
264
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
265
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
266
+ parser.add_argument("--latent_window_size", type=int, default=9, help="FramePack latent window size (default 9)")
267
+ parser.add_argument(
268
+ "--vanilla_sampling",
269
+ action="store_true",
270
+ help="Generate cache for vanilla (autoregressive) sampling instead of inference emulation",
271
+ )
272
+ return parser
273
+
274
+
275
+ def main(args: argparse.Namespace):
276
+ device = args.device if hasattr(args, "device") and args.device else ("cuda" if torch.cuda.is_available() else "cpu")
277
+ device = torch.device(device)
278
+
279
+ # Load dataset config
280
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
281
+ logger.info(f"Load dataset config from {args.dataset_config}")
282
+ user_config = config_utils.load_user_config(args.dataset_config)
283
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_FRAMEPACK)
284
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
285
+
286
+ datasets = train_dataset_group.datasets
287
+
288
+ if args.debug_mode is not None:
289
+ cache_latents.show_datasets(
290
+ datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images, fps=16
291
+ )
292
+ return
293
+
294
+ assert args.vae is not None, "vae checkpoint is required"
295
+
296
+ logger.info(f"Loading VAE model from {args.vae}")
297
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device=device)
298
+ vae.to(device)
299
+
300
+ logger.info(f"Loading image encoder from {args.image_encoder}")
301
+ feature_extractor, image_encoder = load_image_encoders(args)
302
+ image_encoder.eval()
303
+ image_encoder.to(device)
304
+
305
+ logger.info(f"Cache generation mode: {'Vanilla Sampling' if args.vanilla_sampling else 'Inference Emulation'}")
306
+
307
+ # encoding closure
308
+ def encode(batch: List[ItemInfo]):
309
+ encode_and_save_batch(vae, feature_extractor, image_encoder, batch, args.latent_window_size, args.vanilla_sampling)
310
+
311
+ # reuse core loop from cache_latents with no change
312
+ encode_datasets_framepack(datasets, encode, args)
313
+
314
+
315
+ def append_section_idx_to_latent_cache_path(latent_cache_path: str, section_idx: int) -> str:
316
+ tokens = latent_cache_path.split("_")
317
+ tokens[-3] = f"{tokens[-3]}-{section_idx:04d}" # append section index to "frame_pos-count"
318
+ return "_".join(tokens)
319
+
320
+
321
+ def encode_datasets_framepack(datasets: list[BaseDataset], encode: callable, args: argparse.Namespace):
322
+ num_workers = args.num_workers if args.num_workers is not None else max(1, os.cpu_count() - 1)
323
+ for i, dataset in enumerate(datasets):
324
+ logger.info(f"Encoding dataset [{i}]")
325
+ all_latent_cache_paths = []
326
+ for _, batch in tqdm(dataset.retrieve_latent_cache_batches(num_workers)):
327
+ batch: list[ItemInfo] = batch # type: ignore
328
+
329
+ # latent_cache_path is "{basename}_{w:04d}x{h:04d}_{self.architecture}.safetensors"
330
+ # we expand it to "{basename}_{section_idx:04d}_{w:04d}x{h:04d}_{self.architecture}.safetensors"
331
+ filtered_batch = []
332
+ for item in batch:
333
+ latent_f = (item.frame_count - 1) // 4 + 1
334
+ num_sections = math.floor((latent_f - 1) / args.latent_window_size)
335
+ all_existing = True
336
+ for sec in range(num_sections):
337
+ p = append_section_idx_to_latent_cache_path(item.latent_cache_path, sec)
338
+ all_latent_cache_paths.append(p)
339
+ all_existing = all_existing and os.path.exists(p)
340
+
341
+ if all_existing:
342
+ filtered_batch.append(item)
343
+
344
+ if args.skip_existing:
345
+ if len(filtered_batch) == 0:
346
+ continue
347
+ batch = filtered_batch
348
+
349
+ bs = args.batch_size if args.batch_size is not None else len(batch)
350
+ for i in range(0, len(batch), bs):
351
+ encode(batch[i : i + bs])
352
+
353
+ # normalize paths
354
+ all_latent_cache_paths = [os.path.normpath(p) for p in all_latent_cache_paths]
355
+ all_latent_cache_paths = set(all_latent_cache_paths)
356
+
357
+ # remove old cache files not in the dataset
358
+ all_cache_files = dataset.get_all_latent_cache_files()
359
+ for cache_file in all_cache_files:
360
+ if os.path.normpath(cache_file) not in all_latent_cache_paths:
361
+ if args.keep_cache:
362
+ logger.info(f"Keep cache file not in the dataset: {cache_file}")
363
+ else:
364
+ os.remove(cache_file)
365
+ logger.info(f"Removed old cache file: {cache_file}")
366
+
367
+
368
+ if __name__ == "__main__":
369
+ parser = cache_latents.setup_parser_common()
370
+ parser = cache_latents.hv_setup_parser(parser) # VAE
371
+ parser = framepack_setup_parser(parser)
372
+
373
+ args = parser.parse_args()
374
+
375
+ if args.vae_dtype is not None:
376
+ raise ValueError("VAE dtype is not supported in FramePack")
377
+ # if args.batch_size != 1:
378
+ # args.batch_size = 1
379
+ # logger.info("Batch size is set to 1 for FramePack.")
380
+
381
+ main(args)
fpack_cache_text_encoder_outputs.py ADDED
@@ -0,0 +1,110 @@
1
+ import argparse
2
+ import os
3
+ from typing import Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from tqdm import tqdm
8
+ from transformers import LlamaTokenizerFast, LlamaModel, CLIPTokenizer, CLIPTextModel
9
+ from dataset import config_utils
10
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
11
+ from dataset.image_video_dataset import ARCHITECTURE_FRAMEPACK, ItemInfo, save_text_encoder_output_cache_framepack
12
+ import cache_text_encoder_outputs
13
+ from frame_pack import hunyuan
14
+ from frame_pack.framepack_utils import load_text_encoder1, load_text_encoder2
15
+
16
+ import logging
17
+
18
+ from frame_pack.utils import crop_or_pad_yield_mask
19
+
20
+ logger = logging.getLogger(__name__)
21
+ logging.basicConfig(level=logging.INFO)
22
+
23
+
24
+ def encode_and_save_batch(
25
+ tokenizer1: LlamaTokenizerFast,
26
+ text_encoder1: LlamaModel,
27
+ tokenizer2: CLIPTokenizer,
28
+ text_encoder2: CLIPTextModel,
29
+ batch: list[ItemInfo],
30
+ device: torch.device,
31
+ ):
32
+ prompts = [item.caption for item in batch]
33
+
34
+ # encode prompt
35
+ # FramePack's encode_prompt_conds only supports single prompt, so we need to encode each prompt separately
36
+ list_of_llama_vec = []
37
+ list_of_llama_attention_mask = []
38
+ list_of_clip_l_pooler = []
39
+ for prompt in prompts:
40
+ with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
41
+ # llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(prompts, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
42
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
43
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
44
+
45
+ list_of_llama_vec.append(llama_vec.squeeze(0))
46
+ list_of_llama_attention_mask.append(llama_attention_mask.squeeze(0))
47
+ list_of_clip_l_pooler.append(clip_l_pooler.squeeze(0))
48
+
49
+ # save prompt cache
50
+ for item, llama_vec, llama_attention_mask, clip_l_pooler in zip(
51
+ batch, list_of_llama_vec, list_of_llama_attention_mask, list_of_clip_l_pooler
52
+ ):
53
+ # save llama_vec and clip_l_pooler to cache
54
+ save_text_encoder_output_cache_framepack(item, llama_vec, llama_attention_mask, clip_l_pooler)
55
+
56
+
57
+ def main(args):
58
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
59
+ device = torch.device(device)
60
+
61
+ # Load dataset config
62
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
63
+ logger.info(f"Load dataset config from {args.dataset_config}")
64
+ user_config = config_utils.load_user_config(args.dataset_config)
65
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_FRAMEPACK)
66
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
67
+
68
+ datasets = train_dataset_group.datasets
69
+
70
+ # prepare cache files and paths: all_cache_files_for_dataset = existing cache files, all_cache_paths_for_dataset = all cache paths in the dataset
71
+ all_cache_files_for_dataset, all_cache_paths_for_dataset = cache_text_encoder_outputs.prepare_cache_files_and_paths(datasets)
72
+
73
+ # load text encoder
74
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
75
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
76
+ text_encoder2.to(device)
77
+
78
+ # Encode with Text Encoders
79
+ logger.info("Encoding with Text Encoders")
80
+
81
+ def encode_for_text_encoder(batch: list[ItemInfo]):
82
+ encode_and_save_batch(tokenizer1, text_encoder1, tokenizer2, text_encoder2, batch, device)
83
+
84
+ cache_text_encoder_outputs.process_text_encoder_batches(
85
+ args.num_workers,
86
+ args.skip_existing,
87
+ args.batch_size,
88
+ datasets,
89
+ all_cache_files_for_dataset,
90
+ all_cache_paths_for_dataset,
91
+ encode_for_text_encoder,
92
+ )
93
+
94
+ # remove cache files not in dataset
95
+ cache_text_encoder_outputs.post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset)
96
+
97
+
98
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
99
+ parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory")
100
+ parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory")
101
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
102
+ return parser
103
+
104
+
105
+ if __name__ == "__main__":
106
+ parser = cache_text_encoder_outputs.setup_parser_common()
107
+ parser = framepack_setup_parser(parser)
108
+
109
+ args = parser.parse_args()
110
+ main(args)
fpack_generate_video.py ADDED
@@ -0,0 +1,1149 @@
1
+ import argparse
2
+ from datetime import datetime
3
+ import gc
4
+ import json
5
+ import random
6
+ import os
7
+ import re
8
+ import time
9
+ import math
10
+ import copy
11
+ from typing import Tuple, Optional, List, Union, Any, Dict
12
+
13
+ import torch
14
+ from safetensors.torch import load_file, save_file
15
+ from safetensors import safe_open
16
+ from PIL import Image
17
+ import cv2
18
+ import numpy as np
19
+ import torchvision.transforms.functional as TF
20
+ from transformers import LlamaModel
21
+ from tqdm import tqdm
22
+
23
+ from networks import lora_framepack
24
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
25
+ from frame_pack import hunyuan
26
+ from frame_pack.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked, load_packed_model
27
+ from frame_pack.utils import crop_or_pad_yield_mask, resize_and_center_crop, soft_append_bcthw
28
+ from frame_pack.bucket_tools import find_nearest_bucket
29
+ from frame_pack.clip_vision import hf_clip_vision_encode
30
+ from frame_pack.k_diffusion_hunyuan import sample_hunyuan
31
+ from dataset import image_video_dataset
32
+
33
+ try:
34
+ from lycoris.kohya import create_network_from_weights
35
+ except:
36
+ pass
37
+
38
+ from utils.device_utils import clean_memory_on_device
39
+ from hv_generate_video import save_images_grid, save_videos_grid, synchronize_device
40
+ from wan_generate_video import merge_lora_weights
41
+ from frame_pack.framepack_utils import load_vae, load_text_encoder1, load_text_encoder2, load_image_encoders
42
+ from dataset.image_video_dataset import load_video
43
+
44
+ import logging
45
+
46
+ logger = logging.getLogger(__name__)
47
+ logging.basicConfig(level=logging.INFO)
48
+
49
+
50
+ class GenerationSettings:
51
+ def __init__(self, device: torch.device, dit_weight_dtype: Optional[torch.dtype] = None):
52
+ self.device = device
53
+ self.dit_weight_dtype = dit_weight_dtype
54
+
55
+
56
+ def parse_args() -> argparse.Namespace:
57
+ """parse command line arguments"""
58
+ parser = argparse.ArgumentParser(description="FramePack inference script")
59
+
60
+ # WAN arguments
61
+ # parser.add_argument("--ckpt_dir", type=str, default=None, help="The path to the checkpoint directory (Wan 2.1 official).")
62
+ parser.add_argument(
63
+ "--sample_solver", type=str, default="unipc", choices=["unipc", "dpm++", "vanilla"], help="The solver used to sample."
64
+ )
65
+
66
+ parser.add_argument("--dit", type=str, default=None, help="DiT directory or path")
67
+ parser.add_argument("--vae", type=str, default=None, help="VAE directory or path")
68
+ parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory or path")
69
+ parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory or path")
70
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image Encoder directory or path")
71
+ # LoRA
72
+ parser.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
73
+ parser.add_argument("--lora_multiplier", type=float, nargs="*", default=1.0, help="LoRA multiplier")
74
+ parser.add_argument("--include_patterns", type=str, nargs="*", default=None, help="LoRA module include patterns")
75
+ parser.add_argument("--exclude_patterns", type=str, nargs="*", default=None, help="LoRA module exclude patterns")
76
+ parser.add_argument(
77
+ "--save_merged_model",
78
+ type=str,
79
+ default=None,
80
+ help="Save merged model to path. If specified, no inference will be performed.",
81
+ )
82
+
83
+ # inference
84
+ parser.add_argument(
85
+ "--prompt",
86
+ type=str,
87
+ default=None,
88
+ help="prompt for generation. If `;;;` is used, it will be split into sections. Example: `section_index:prompt` or "
89
+ "`section_index:prompt;;;section_index:prompt;;;...`, section_index can be `0` or `-1` or `0-2`, `-1` means last section, `0-2` means from 0 to 2 (inclusive).",
90
+ )
91
+ parser.add_argument(
92
+ "--negative_prompt",
93
+ type=str,
94
+ default=None,
95
+ help="negative prompt for generation, default is empty string. should not change.",
96
+ )
97
+ parser.add_argument("--video_size", type=int, nargs=2, default=[256, 256], help="video size, height and width")
98
+ parser.add_argument("--video_seconds", type=float, default=5.0, help="video length, Default is 5.0 seconds")
99
+ parser.add_argument("--fps", type=int, default=30, help="video fps, Default is 30")
100
+ parser.add_argument("--infer_steps", type=int, default=25, help="number of inference steps, Default is 25")
101
+ parser.add_argument("--save_path", type=str, required=True, help="path to save generated video")
102
+ parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
103
+ # parser.add_argument(
104
+ # "--cpu_noise", action="store_true", help="Use CPU to generate noise (compatible with ComfyUI). Default is False."
105
+ # )
106
+ parser.add_argument("--latent_window_size", type=int, default=9, help="latent window size, default is 9. should not change.")
107
+ parser.add_argument(
108
+ "--embedded_cfg_scale", type=float, default=10.0, help="Embeded CFG scale (distilled CFG Scale), default is 10.0"
109
+ )
110
+ parser.add_argument(
111
+ "--guidance_scale",
112
+ type=float,
113
+ default=1.0,
114
+ help="Guidance scale for classifier free guidance. Default is 1.0, should not change.",
115
+ )
116
+ parser.add_argument("--guidance_rescale", type=float, default=0.0, help="CFG Re-scale, default is 0.0. Should not change.")
117
+ # parser.add_argument("--video_path", type=str, default=None, help="path to video for video2video inference")
118
+ parser.add_argument("--image_path", type=str, default=None, help="path to image for image2video inference")
119
+ parser.add_argument("--end_image_path", type=str, default=None, help="path to end image for image2video inference")
120
+ # parser.add_argument(
121
+ # "--control_path",
122
+ # type=str,
123
+ # default=None,
124
+ # help="path to control video for inference with controlnet. video file or directory with images",
125
+ # )
126
+ # parser.add_argument("--trim_tail_frames", type=int, default=0, help="trim tail N frames from the video before saving")
127
+
128
+ # # Flow Matching
129
+ # parser.add_argument(
130
+ # "--flow_shift",
131
+ # type=float,
132
+ # default=None,
133
+ # help="Shift factor for flow matching schedulers. Default depends on task.",
134
+ # )
135
+
136
+ parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
137
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8")
138
+ # parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arithmetic (RTX 4XXX+), only for fp8_scaled")
139
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
140
+ parser.add_argument(
141
+ "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
142
+ )
143
+ parser.add_argument(
144
+ "--attn_mode",
145
+ type=str,
146
+ default="torch",
147
+ choices=["flash", "torch", "sageattn", "xformers", "sdpa"], # "flash2", "flash3",
148
+ help="attention mode",
149
+ )
150
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
151
+ parser.add_argument(
152
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
153
+ )
154
+ parser.add_argument("--bulk_decode", action="store_true", help="decode all frames at once")
155
+ parser.add_argument("--blocks_to_swap", type=int, default=0, help="number of blocks to swap in the model")
156
+ parser.add_argument(
157
+ "--output_type", type=str, default="video", choices=["video", "images", "latent", "both"], help="output type"
158
+ )
159
+ parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
160
+ parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
161
+ parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
162
+ # parser.add_argument("--compile", action="store_true", help="Enable torch.compile")
163
+ # parser.add_argument(
164
+ # "--compile_args",
165
+ # nargs=4,
166
+ # metavar=("BACKEND", "MODE", "DYNAMIC", "FULLGRAPH"),
167
+ # default=["inductor", "max-autotune-no-cudagraphs", "False", "False"],
168
+ # help="Torch.compile settings",
169
+ # )
170
+
171
+ # New arguments for batch and interactive modes
172
+ parser.add_argument("--from_file", type=str, default=None, help="Read prompts from a file")
173
+ parser.add_argument("--interactive", action="store_true", help="Interactive mode: read prompts from console")
174
+
175
+ args = parser.parse_args()
176
+
177
+ # Validate arguments
178
+ if args.from_file and args.interactive:
179
+ raise ValueError("Cannot use both --from_file and --interactive at the same time")
180
+
181
+ if args.prompt is None and not args.from_file and not args.interactive:
182
+ raise ValueError("Either --prompt, --from_file or --interactive must be specified")
183
+
184
+ return args
185
+
186
+
187
+ def parse_prompt_line(line: str) -> Dict[str, Any]:
188
+ """Parse a prompt line into a dictionary of argument overrides
189
+
190
+ Args:
191
+ line: Prompt line with options
192
+
193
+ Returns:
194
+ Dict[str, Any]: Dictionary of argument overrides
195
+ """
196
+ # TODO common function with hv_train_network.line_to_prompt_dict
197
+ parts = line.split(" --")
198
+ prompt = parts[0].strip()
199
+
200
+ # Create dictionary of overrides
201
+ overrides = {"prompt": prompt}
202
+
203
+ for part in parts[1:]:
204
+ if not part.strip():
205
+ continue
206
+ option_parts = part.split(" ", 1)
207
+ option = option_parts[0].strip()
208
+ value = option_parts[1].strip() if len(option_parts) > 1 else ""
209
+
210
+ # Map options to argument names
211
+ if option == "w":
212
+ overrides["video_size_width"] = int(value)
213
+ elif option == "h":
214
+ overrides["video_size_height"] = int(value)
215
+ elif option == "f":
216
+ overrides["video_seconds"] = float(value)
217
+ elif option == "d":
218
+ overrides["seed"] = int(value)
219
+ elif option == "s":
220
+ overrides["infer_steps"] = int(value)
221
+ elif option == "g" or option == "l":
222
+ overrides["guidance_scale"] = float(value)
223
+ # elif option == "fs":
224
+ # overrides["flow_shift"] = float(value)
225
+ elif option == "i":
226
+ overrides["image_path"] = value
227
+ elif option == "cn":
228
+ overrides["control_path"] = value
229
+ elif option == "n":
230
+ overrides["negative_prompt"] = value
231
+
232
+ return overrides
233
+
234
+
235
+ def apply_overrides(args: argparse.Namespace, overrides: Dict[str, Any]) -> argparse.Namespace:
236
+ """Apply overrides to args
237
+
238
+ Args:
239
+ args: Original arguments
240
+ overrides: Dictionary of overrides
241
+
242
+ Returns:
243
+ argparse.Namespace: New arguments with overrides applied
244
+ """
245
+ args_copy = copy.deepcopy(args)
246
+
247
+ for key, value in overrides.items():
248
+ if key == "video_size_width":
249
+ args_copy.video_size[1] = value
250
+ elif key == "video_size_height":
251
+ args_copy.video_size[0] = value
252
+ else:
253
+ setattr(args_copy, key, value)
254
+
255
+ return args_copy
256
+
257
+
258
+ def check_inputs(args: argparse.Namespace) -> Tuple[int, int, float]:
259
+ """Validate video size and length
260
+
261
+ Args:
262
+ args: command line arguments
263
+
264
+ Returns:
265
+ Tuple[int, int, float]: (height, width, video_seconds)
266
+ """
267
+ height = args.video_size[0]
268
+ width = args.video_size[1]
269
+
270
+ video_seconds = args.video_seconds
271
+
272
+ if height % 8 != 0 or width % 8 != 0:
273
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
274
+
275
+ return height, width, video_seconds
276
+
277
+
278
+ # region DiT model
279
+
280
+
281
+ def load_dit_model(args: argparse.Namespace, device: torch.device) -> HunyuanVideoTransformer3DModelPacked:
282
+ """load DiT model
283
+
284
+ Args:
285
+ args: command line arguments
286
+ device: device to use
287
+ dit_dtype: data type for the model
288
+ dit_weight_dtype: data type for the model weights. None for as-is
289
+
290
+ Returns:
291
+ HunyuanVideoTransformer3DModelPacked: DiT model
292
+ """
293
+ loading_device = "cpu"
294
+ if args.blocks_to_swap == 0 and not args.fp8_scaled and args.lora_weight is None:
295
+ loading_device = device
296
+
297
+ # do not fp8 optimize because we will merge LoRA weights
298
+ model = load_packed_model(device, args.dit, args.attn_mode, loading_device)
299
+ return model
300
+
301
+
302
+ def optimize_model(model: HunyuanVideoTransformer3DModelPacked, args: argparse.Namespace, device: torch.device) -> None:
303
+ """optimize the model (FP8 conversion, device move etc.)
304
+
305
+ Args:
306
+ model: dit model
307
+ args: command line arguments
308
+ device: device to use
309
+ """
310
+ if args.fp8_scaled:
311
+ # load state dict as-is and optimize to fp8
312
+ state_dict = model.state_dict()
313
+
314
+ # if no blocks to swap, we can move the weights to GPU after optimization on GPU (omit redundant CPU->GPU copy)
315
+ move_to_device = args.blocks_to_swap == 0 # if blocks_to_swap > 0, we will keep the model on CPU
316
+ state_dict = model.fp8_optimization(state_dict, device, move_to_device, use_scaled_mm=False) # args.fp8_fast)
317
+
318
+ info = model.load_state_dict(state_dict, strict=True, assign=True)
319
+ logger.info(f"Loaded FP8 optimized weights: {info}")
320
+
321
+ if args.blocks_to_swap == 0:
322
+ model.to(device) # make sure all parameters are on the right device (e.g. RoPE etc.)
323
+ else:
324
+ # simple cast to dit_dtype
325
+ target_dtype = None # load as-is (dit_weight_dtype == dtype of the weights in state_dict)
326
+ target_device = None
327
+
328
+ if args.fp8:
329
+ target_dtype = torch.float8_e4m3fn
330
+
331
+ if args.blocks_to_swap == 0:
332
+ logger.info(f"Move model to device: {device}")
333
+ target_device = device
334
+
335
+ if target_device is not None and target_dtype is not None:
336
+ model.to(target_device, target_dtype) # move and cast at the same time. this reduces redundant copy operations
337
+
338
+ # if args.compile:
339
+ # compile_backend, compile_mode, compile_dynamic, compile_fullgraph = args.compile_args
340
+ # logger.info(
341
+ # f"Torch Compiling[Backend: {compile_backend}; Mode: {compile_mode}; Dynamic: {compile_dynamic}; Fullgraph: {compile_fullgraph}]"
342
+ # )
343
+ # torch._dynamo.config.cache_size_limit = 32
344
+ # for i in range(len(model.blocks)):
345
+ # model.blocks[i] = torch.compile(
346
+ # model.blocks[i],
347
+ # backend=compile_backend,
348
+ # mode=compile_mode,
349
+ # dynamic=compile_dynamic.lower() in "true",
350
+ # fullgraph=compile_fullgraph.lower() in "true",
351
+ # )
352
+
353
+ if args.blocks_to_swap > 0:
354
+ logger.info(f"Enable swap {args.blocks_to_swap} blocks to CPU from device: {device}")
355
+ model.enable_block_swap(args.blocks_to_swap, device, supports_backward=False)
356
+ model.move_to_device_except_swap_blocks(device)
357
+ model.prepare_block_swap_before_forward()
358
+ else:
359
+ # make sure the model is on the right device
360
+ model.to(device)
361
+
362
+ model.eval().requires_grad_(False)
363
+ clean_memory_on_device(device)
364
+
365
+
366
+ # endregion
367
+
368
+
369
+ def decode_latent(
370
+ latent_window_size: int,
371
+ total_latent_sections: int,
372
+ bulk_decode: bool,
373
+ vae: AutoencoderKLCausal3D,
374
+ latent: torch.Tensor,
375
+ device: torch.device,
376
+ ) -> torch.Tensor:
377
+ logger.info(f"Decoding video...")
378
+ if latent.ndim == 4:
379
+ latent = latent.unsqueeze(0) # add batch dimension
380
+
381
+ vae.to(device)
382
+ if not bulk_decode:
383
+ latent_window_size = latent_window_size # default is 9
384
+ # total_latent_sections = (args.video_seconds * 30) / (latent_window_size * 4)
385
+ # total_latent_sections = int(max(round(total_latent_sections), 1))
386
+ num_frames = latent_window_size * 4 - 3
387
+
388
+ latents_to_decode = []
389
+ latent_frame_index = 0
390
+ for i in range(total_latent_sections - 1, -1, -1):
391
+ is_last_section = i == total_latent_sections - 1
392
+ generated_latent_frames = (num_frames + 3) // 4 + (1 if is_last_section else 0)
393
+ section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
394
+
395
+ section_latent = latent[:, :, latent_frame_index : latent_frame_index + section_latent_frames, :, :]
396
+ latents_to_decode.append(section_latent)
397
+
398
+ latent_frame_index += generated_latent_frames
399
+
400
+ latents_to_decode = latents_to_decode[::-1] # reverse the order of latents to decode
401
+
402
+ history_pixels = None
403
+ for latent in tqdm(latents_to_decode):
404
+ if history_pixels is None:
405
+ history_pixels = hunyuan.vae_decode(latent, vae).cpu()
406
+ else:
407
+ overlapped_frames = latent_window_size * 4 - 3
408
+ current_pixels = hunyuan.vae_decode(latent, vae).cpu()
409
+ history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
410
+ clean_memory_on_device(device)
411
+ else:
412
+ # bulk decode
413
+ logger.info(f"Bulk decoding")
414
+ history_pixels = hunyuan.vae_decode(latent, vae).cpu()
415
+ vae.to("cpu")
416
+
417
+ print(f"Decoded. Pixel shape {history_pixels.shape}")
418
+ return history_pixels[0] # remove batch dimension
419
+
420
+
421
+ def prepare_i2v_inputs(
422
+ args: argparse.Namespace,
423
+ device: torch.device,
424
+ vae: AutoencoderKLCausal3D,
425
+ encoded_context: Optional[Dict] = None,
426
+ encoded_context_n: Optional[Dict] = None,
427
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
428
+ """Prepare inputs for I2V
429
+
430
+ Args:
431
+ args: command line arguments
432
+ config: model configuration
433
+ device: device to use
434
+ vae: VAE model, used for image encoding
435
+ encoded_context: Pre-encoded text context
436
+
437
+ Returns:
438
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
439
+ (noise, context, context_null, y, (arg_c, arg_null))
440
+ """
441
+
442
+ height, width, video_seconds = check_inputs(args)
443
+
444
+ # prepare image
445
+ def preprocess_image(image_path: str):
446
+ image = Image.open(image_path).convert("RGB")
447
+
448
+ image_np = np.array(image) # PIL to numpy, HWC
449
+
450
+ image_np = image_video_dataset.resize_image_to_bucket(image_np, (width, height))
451
+ image_tensor = torch.from_numpy(image_np).float() / 127.5 - 1.0 # -1 to 1.0, HWC
452
+ image_tensor = image_tensor.permute(2, 0, 1)[None, :, None] # HWC -> CHW -> NCFHW, N=1, C=3, F=1
453
+ return image_tensor, image_np
454
+
455
+ img_tensor, img_np = preprocess_image(args.image_path)
456
+ if args.end_image_path is not None:
457
+ end_img_tensor, end_img_np = preprocess_image(args.end_image_path)
458
+ else:
459
+ end_img_tensor, end_img_np = None, None
460
+
461
+ # configure negative prompt
462
+ n_prompt = args.negative_prompt if args.negative_prompt else ""
463
+
464
+ if encoded_context is None:
465
+ # load text encoder
466
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
467
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
468
+ text_encoder2.to(device)
469
+
470
+ # parse section prompts
471
+ section_prompts = {}
472
+ if ";;;" in args.prompt:
473
+ section_prompt_strs = args.prompt.split(";;;")
474
+ for section_prompt_str in section_prompt_strs:
475
+ if ":" not in section_prompt_str:
476
+ start = end = 0
477
+ prompt_str = section_prompt_str.strip()
478
+ else:
479
+ index_str, prompt_str = section_prompt_str.split(":", 1)
480
+ index_str = index_str.strip()
481
+ prompt_str = prompt_str.strip()
482
+
483
+ m = re.match(r"^(-?\d+)(-\d+)?$", index_str)
484
+ if m:
485
+ start = int(m.group(1))
486
+ end = int(m.group(2)[1:]) if m.group(2) is not None else start
487
+ else:
488
+ start = end = 0
489
+ prompt_str = section_prompt_str.strip()
490
+ for i in range(start, end + 1):
491
+ section_prompts[i] = prompt_str
492
+ else:
493
+ section_prompts[0] = args.prompt
494
+
495
+ # assert 0 in section_prompts, "Section prompts must contain section 0"
496
+ if 0 not in section_prompts:
497
+ # use smallest section index. prefer positive index over negative index
498
+ # if all section indices are negative, use the smallest negative index
499
+ indices = list(section_prompts.keys())
500
+ if all(i < 0 for i in indices):
501
+ section_index = min(indices)
502
+ else:
503
+ section_index = min(i for i in indices if i >= 0)
504
+ section_prompts[0] = section_prompts[section_index]
505
+ print(section_prompts)
506
+
507
+ logger.info(f"Encoding prompt")
508
+ llama_vecs = {}
509
+ llama_attention_masks = {}
510
+ clip_l_poolers = {}
511
+ with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
512
+ for index, prompt in section_prompts.items():
513
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
514
+ llama_vec = llama_vec.cpu()
515
+ clip_l_pooler = clip_l_pooler.cpu()
516
+
517
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
518
+
519
+ llama_vecs[index] = llama_vec
520
+ llama_attention_masks[index] = llama_attention_mask
521
+ clip_l_poolers[index] = clip_l_pooler
522
+
523
+ if args.guidance_scale == 1.0:
524
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vecs[0]), torch.zeros_like(clip_l_poolers[0])
525
+ else:
526
+ with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
527
+ llama_vec_n, clip_l_pooler_n = hunyuan.encode_prompt_conds(
528
+ n_prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2
529
+ )
530
+ llama_vec_n = llama_vec_n.cpu()
531
+ clip_l_pooler_n = clip_l_pooler_n.cpu()
532
+
533
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
534
+
535
+ # free text encoder and clean memory
536
+ del text_encoder1, text_encoder2, tokenizer1, tokenizer2
537
+ clean_memory_on_device(device)
538
+
539
+ # load image encoder
540
+ feature_extractor, image_encoder = load_image_encoders(args)
541
+ image_encoder.to(device)
542
+
543
+ # encode image with image encoder
544
+ with torch.no_grad():
545
+ image_encoder_output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
546
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state.cpu()
547
+
548
+ if end_img_np is not None:
549
+ with torch.no_grad():
550
+ end_image_encoder_output = hf_clip_vision_encode(end_img_np, feature_extractor, image_encoder)
551
+ end_image_encoder_last_hidden_state = end_image_encoder_output.last_hidden_state.cpu()
552
+ else:
553
+ end_image_encoder_last_hidden_state = None
554
+
555
+ # free image encoder and clean memory
556
+ del image_encoder, feature_extractor
557
+ clean_memory_on_device(device)
558
+ else:
559
+ # Use pre-encoded context
560
+ llama_vecs = encoded_context["llama_vecs"]
561
+ llama_attention_masks = encoded_context["llama_attention_masks"]
562
+ clip_l_poolers = encoded_context["clip_l_poolers"]
563
+ llama_vec_n = encoded_context_n["llama_vec"]
564
+ llama_attention_mask_n = encoded_context_n["llama_attention_mask"]
565
+ clip_l_pooler_n = encoded_context_n["clip_l_pooler"]
566
+ image_encoder_last_hidden_state = encoded_context["image_encoder_last_hidden_state"]
567
+
568
+ # # end frame image
569
+ # if args.end_image_path is not None:
570
+ # end_img = Image.open(args.end_image_path).convert("RGB")
571
+ # end_img_cv2 = np.array(end_img) # PIL to numpy
572
+ # else:
573
+ # end_img = None
574
+ # end_img_cv2 = None
575
+ # has_end_image = end_img is not None
576
+
577
+ # VAE encoding
578
+ logger.info(f"Encoding image to latent space")
579
+ vae.to(device)
580
+ start_latent = hunyuan.vae_encode(img_tensor, vae).cpu()
581
+ if end_img_tensor is not None:
582
+ end_latent = hunyuan.vae_encode(end_img_tensor, vae).cpu()
583
+ else:
584
+ end_latent = None
585
+ vae.to("cpu") # move VAE to CPU to save memory
586
+ clean_memory_on_device(device)
587
+
588
+ # prepare model input arguments
589
+ arg_c = {}
590
+ for index in llama_vecs.keys():
591
+ llama_vec = llama_vecs[index]
592
+ llama_attention_mask = llama_attention_masks[index]
593
+ clip_l_pooler = clip_l_poolers[index]
594
+ arg_c_i = {
595
+ "llama_vec": llama_vec,
596
+ "llama_attention_mask": llama_attention_mask,
597
+ "clip_l_pooler": clip_l_pooler,
598
+ "image_encoder_last_hidden_state": image_encoder_last_hidden_state,
599
+ "end_image_encoder_last_hidden_state": end_image_encoder_last_hidden_state,
600
+ "prompt": section_prompts[index], # for debugging
601
+ }
602
+ arg_c[index] = arg_c_i
603
+
604
+ arg_null = {
605
+ "llama_vec": llama_vec_n,
606
+ "llama_attention_mask": llama_attention_mask_n,
607
+ "clip_l_pooler": clip_l_pooler_n,
608
+ "image_encoder_last_hidden_state": image_encoder_last_hidden_state,
609
+ "end_image_encoder_last_hidden_state": end_image_encoder_last_hidden_state,
610
+ }
611
+
612
+ return height, width, video_seconds, start_latent, end_latent, arg_c, arg_null
613
+
614
+
615
+ # def setup_scheduler(args: argparse.Namespace, config, device: torch.device) -> Tuple[Any, torch.Tensor]:
616
+ # """setup scheduler for sampling
617
+
618
+ # Args:
619
+ # args: command line arguments
620
+ # config: model configuration
621
+ # device: device to use
622
+
623
+ # Returns:
624
+ # Tuple[Any, torch.Tensor]: (scheduler, timesteps)
625
+ # """
626
+ # if args.sample_solver == "unipc":
627
+ # scheduler = FlowUniPCMultistepScheduler(num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False)
628
+ # scheduler.set_timesteps(args.infer_steps, device=device, shift=args.flow_shift)
629
+ # timesteps = scheduler.timesteps
630
+ # elif args.sample_solver == "dpm++":
631
+ # scheduler = FlowDPMSolverMultistepScheduler(
632
+ # num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False
633
+ # )
634
+ # sampling_sigmas = get_sampling_sigmas(args.infer_steps, args.flow_shift)
635
+ # timesteps, _ = retrieve_timesteps(scheduler, device=device, sigmas=sampling_sigmas)
636
+ # elif args.sample_solver == "vanilla":
637
+ # scheduler = FlowMatchDiscreteScheduler(num_train_timesteps=config.num_train_timesteps, shift=args.flow_shift)
638
+ # scheduler.set_timesteps(args.infer_steps, device=device)
639
+ # timesteps = scheduler.timesteps
640
+
641
+ # # FlowMatchDiscreteScheduler does not support generator argument in step method
642
+ # org_step = scheduler.step
643
+
644
+ # def step_wrapper(
645
+ # model_output: torch.Tensor,
646
+ # timestep: Union[int, torch.Tensor],
647
+ # sample: torch.Tensor,
648
+ # return_dict: bool = True,
649
+ # generator=None,
650
+ # ):
651
+ # return org_step(model_output, timestep, sample, return_dict=return_dict)
652
+
653
+ # scheduler.step = step_wrapper
654
+ # else:
655
+ # raise NotImplementedError("Unsupported solver.")
656
+
657
+ # return scheduler, timesteps
658
+
659
+
660
+ def generate(args: argparse.Namespace, gen_settings: GenerationSettings, shared_models: Optional[Dict] = None) -> torch.Tensor:
661
+ """main function for generation
662
+
663
+ Args:
664
+ args: command line arguments
665
+ shared_models: dictionary containing pre-loaded models and encoded data
666
+
667
+ Returns:
668
+ tuple: (VAE model, generated latent history)
669
+ """
670
+ device, dit_weight_dtype = (gen_settings.device, gen_settings.dit_weight_dtype)
671
+
672
+ # prepare seed
673
+ seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
674
+ args.seed = seed # set seed to args for saving
675
+
676
+ # Check if we have shared models
677
+ if shared_models is not None:
678
+ # Use shared models and encoded data
679
+ vae = shared_models.get("vae")
680
+ model = shared_models.get("model")
681
+ encoded_context = shared_models.get("encoded_contexts", {}).get(args.prompt)
682
+ n_prompt = args.negative_prompt if args.negative_prompt else ""
683
+ encoded_context_n = shared_models.get("encoded_contexts", {}).get(n_prompt)
684
+
685
+ height, width, video_seconds, start_latent, end_latent, context, context_null = prepare_i2v_inputs(
686
+ args, device, vae, encoded_context, encoded_context_n
687
+ )
688
+ else:
689
+ # prepare inputs without shared models
690
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
691
+ height, width, video_seconds, start_latent, end_latent, context, context_null = prepare_i2v_inputs(args, device, vae)
692
+
693
+ # load DiT model
694
+ model = load_dit_model(args, device)
695
+
696
+ # merge LoRA weights
697
+ if args.lora_weight is not None and len(args.lora_weight) > 0:
698
+ merge_lora_weights(lora_framepack, model, args, device) # ugly hack to common merge_lora_weights function
699
+ # if we only want to save the model, we can skip the rest
700
+ if args.save_merged_model:
701
+ return None
702
+
703
+ # optimize model: fp8 conversion, block swap etc.
704
+ optimize_model(model, args, device)
705
+
706
+ # sampling
707
+ latent_window_size = args.latent_window_size # default is 9
708
+ # ex: (5s * 30fps) / (9 * 4) = 4.16 -> 4 sections, 60s -> 1800 / 36 = 50 sections
709
+ total_latent_sections = (video_seconds * 30) / (latent_window_size * 4)
710
+ total_latent_sections = int(max(round(total_latent_sections), 1))
711
+
712
+ # set random generator
713
+ seed_g = torch.Generator(device="cpu")
714
+ seed_g.manual_seed(seed)
715
+ num_frames = latent_window_size * 4 - 3
716
+
717
+ logger.info(
718
+ f"Video size: {height}x{width}@{video_seconds} (HxW@seconds), fps: {args.fps}, "
719
+ f"infer_steps: {args.infer_steps}, frames per generation: {num_frames}"
720
+ )
721
+
722
+ history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
723
+ # history_pixels = None
724
+ total_generated_latent_frames = 0
725
+
726
+ latent_paddings = reversed(range(total_latent_sections))
727
+
728
+ if total_latent_sections > 4:
729
+ # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
730
+ # items looks better than expanding it when total_latent_sections > 4
731
+ # One can try to remove below trick and just
732
+ # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
733
+ # 4 sections: 3, 2, 1, 0. 50 sections: 3, 2, 2, ... 2, 1, 0
734
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
735
+
736
+ for section_index_reverse, latent_padding in enumerate(latent_paddings):
737
+ section_index = total_latent_sections - 1 - section_index_reverse
738
+
739
+ is_last_section = latent_padding == 0
740
+ is_first_section = section_index_reverse == 0
741
+ latent_padding_size = latent_padding * latent_window_size
742
+
743
+ logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
744
+
745
+ reference_start_latent = start_latent
746
+ apply_end_image = args.end_image_path is not None and is_first_section
747
+ if apply_end_image:
748
+ latent_padding_size = 0
749
+ reference_start_latent = end_latent
750
+ logger.info(f"Apply experimental end image, latent_padding_size = {latent_padding_size}")
751
+
752
+ # sum([1, 3, 9, 1, 2, 16]) = 32
753
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
754
+ (
755
+ clean_latent_indices_pre,
756
+ blank_indices,
757
+ latent_indices,
758
+ clean_latent_indices_post,
759
+ clean_latent_2x_indices,
760
+ clean_latent_4x_indices,
761
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
762
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
763
+
764
+ clean_latents_pre = reference_start_latent.to(history_latents)
765
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
766
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
767
+
768
+ # if use_teacache:
769
+ # transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
770
+ # else:
771
+ # transformer.initialize_teacache(enable_teacache=False)
772
+
773
+ section_index_from_last = -(section_index_reverse + 1) # -1, -2 ...
774
+ if section_index_from_last in context:
775
+ prompt_index = section_index_from_last
776
+ elif section_index in context:
777
+ prompt_index = section_index
778
+ else:
779
+ prompt_index = 0
780
+ context_for_index = context[prompt_index]
781
+ # if args.section_prompts is not None:
782
+ logger.info(f"Section {section_index}: {context_for_index['prompt']}")
783
+
784
+ llama_vec = context_for_index["llama_vec"].to(device, dtype=torch.bfloat16)
785
+ llama_attention_mask = context_for_index["llama_attention_mask"].to(device)
786
+ clip_l_pooler = context_for_index["clip_l_pooler"].to(device, dtype=torch.bfloat16)
787
+
788
+ if not apply_end_image:
789
+ image_encoder_last_hidden_state = context_for_index["image_encoder_last_hidden_state"].to(device, dtype=torch.bfloat16)
790
+ else:
791
+ image_encoder_last_hidden_state = context_for_index["end_image_encoder_last_hidden_state"].to(
792
+ device, dtype=torch.bfloat16
793
+ )
794
+
795
+ llama_vec_n = context_null["llama_vec"].to(device, dtype=torch.bfloat16)
796
+ llama_attention_mask_n = context_null["llama_attention_mask"].to(device)
797
+ clip_l_pooler_n = context_null["clip_l_pooler"].to(device, dtype=torch.bfloat16)
798
+
799
+ generated_latents = sample_hunyuan(
800
+ transformer=model,
801
+ sampler=args.sample_solver,
802
+ width=width,
803
+ height=height,
804
+ frames=num_frames,
805
+ real_guidance_scale=args.guidance_scale,
806
+ distilled_guidance_scale=args.embedded_cfg_scale,
807
+ guidance_rescale=args.guidance_rescale,
808
+ # shift=3.0,
809
+ num_inference_steps=args.infer_steps,
810
+ generator=seed_g,
811
+ prompt_embeds=llama_vec,
812
+ prompt_embeds_mask=llama_attention_mask,
813
+ prompt_poolers=clip_l_pooler,
814
+ negative_prompt_embeds=llama_vec_n,
815
+ negative_prompt_embeds_mask=llama_attention_mask_n,
816
+ negative_prompt_poolers=clip_l_pooler_n,
817
+ device=device,
818
+ dtype=torch.bfloat16,
819
+ image_embeddings=image_encoder_last_hidden_state,
820
+ latent_indices=latent_indices,
821
+ clean_latents=clean_latents,
822
+ clean_latent_indices=clean_latent_indices,
823
+ clean_latents_2x=clean_latents_2x,
824
+ clean_latent_2x_indices=clean_latent_2x_indices,
825
+ clean_latents_4x=clean_latents_4x,
826
+ clean_latent_4x_indices=clean_latent_4x_indices,
827
+ )
828
+
829
+ if is_last_section:
830
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
831
+
832
+ total_generated_latent_frames += int(generated_latents.shape[2])
833
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
834
+
835
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
836
+
837
+ logger.info(f"Generated. Latent shape {real_history_latents.shape}")
838
+
839
+ # # TODO support saving intermediate video
840
+ # clean_memory_on_device(device)
841
+ # vae.to(device)
842
+ # if history_pixels is None:
843
+ # history_pixels = hunyuan.vae_decode(real_history_latents, vae).cpu()
844
+ # else:
845
+ # section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
846
+ # overlapped_frames = latent_window_size * 4 - 3
847
+ # current_pixels = hunyuan.vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
848
+ # history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
849
+ # vae.to("cpu")
850
+ # # if not is_last_section:
851
+ # # # save intermediate video
852
+ # # save_video(history_pixels[0], args, total_generated_latent_frames)
853
+ # print(f"Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}")
854
+
855
+ # Only clean up shared models if they were created within this function
856
+ if shared_models is None:
857
+ # free memory
858
+ del model
859
+ # del scheduler
860
+ synchronize_device(device)
861
+
862
+ # wait for 5 seconds until block swap is done
863
+ logger.info("Waiting for 5 seconds to finish block swap")
864
+ time.sleep(5)
865
+
866
+ gc.collect()
867
+ clean_memory_on_device(device)
868
+
869
+ return vae, real_history_latents
870
+
871
+
872
+ def save_latent(latent: torch.Tensor, args: argparse.Namespace, height: int, width: int) -> str:
873
+ """Save latent to file
874
+
875
+ Args:
876
+ latent: Latent tensor
877
+ args: command line arguments
878
+ height: height of frame
879
+ width: width of frame
880
+
881
+ Returns:
882
+ str: Path to saved latent file
883
+ """
884
+ save_path = args.save_path
885
+ os.makedirs(save_path, exist_ok=True)
886
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
887
+
888
+ seed = args.seed
889
+ video_seconds = args.video_seconds
890
+ latent_path = f"{save_path}/{time_flag}_{seed}_latent.safetensors"
891
+
892
+ if args.no_metadata:
893
+ metadata = None
894
+ else:
895
+ metadata = {
896
+ "seeds": f"{seed}",
897
+ "prompt": f"{args.prompt}",
898
+ "height": f"{height}",
899
+ "width": f"{width}",
900
+ "video_seconds": f"{video_seconds}",
901
+ "infer_steps": f"{args.infer_steps}",
902
+ "guidance_scale": f"{args.guidance_scale}",
903
+ "latent_window_size": f"{args.latent_window_size}",
904
+ "embedded_cfg_scale": f"{args.embedded_cfg_scale}",
905
+ "guidance_rescale": f"{args.guidance_rescale}",
906
+ "sample_solver": f"{args.sample_solver}",
908
+ "fps": f"{args.fps}",
909
+ }
910
+ if args.negative_prompt is not None:
911
+ metadata["negative_prompt"] = f"{args.negative_prompt}"
912
+
913
+ sd = {"latent": latent.contiguous()}
914
+ save_file(sd, latent_path, metadata=metadata)
915
+ logger.info(f"Latent saved to: {latent_path}")
916
+
917
+ return latent_path
918
+
919
+
920
+ def save_video(
921
+ video: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None, latent_frames: Optional[int] = None
922
+ ) -> str:
923
+ """Save video to file
924
+
925
+ Args:
926
+ video: Video tensor
927
+ args: command line arguments
928
+ original_base_name: Original base name (if latents are loaded from files)
929
+
930
+ Returns:
931
+ str: Path to saved video file
932
+ """
933
+ save_path = args.save_path
934
+ os.makedirs(save_path, exist_ok=True)
935
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
936
+
937
+ seed = args.seed
938
+ original_name = "" if original_base_name is None else f"_{original_base_name}"
939
+ latent_frames = "" if latent_frames is None else f"_{latent_frames}"
940
+ video_path = f"{save_path}/{time_flag}_{seed}{original_name}{latent_frames}.mp4"
941
+
942
+ video = video.unsqueeze(0)
943
+ save_videos_grid(video, video_path, fps=args.fps, rescale=True)
944
+ logger.info(f"Video saved to: {video_path}")
945
+
946
+ return video_path
947
+
948
+
949
+ def save_images(sample: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None) -> str:
950
+ """Save images to directory
951
+
952
+ Args:
953
+ sample: Video tensor
954
+ args: command line arguments
955
+ original_base_name: Original base name (if latents are loaded from files)
956
+
957
+ Returns:
958
+ str: Path to saved images directory
959
+ """
960
+ save_path = args.save_path
961
+ os.makedirs(save_path, exist_ok=True)
962
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
963
+
964
+ seed = args.seed
965
+ original_name = "" if original_base_name is None else f"_{original_base_name}"
966
+ image_name = f"{time_flag}_{seed}{original_name}"
967
+ sample = sample.unsqueeze(0)
968
+ save_images_grid(sample, save_path, image_name, rescale=True)
969
+ logger.info(f"Sample images saved to: {save_path}/{image_name}")
970
+
971
+ return f"{save_path}/{image_name}"
972
+
973
+
974
+ def save_output(
975
+ args: argparse.Namespace,
976
+ vae: AutoencoderKLCausal3D,
977
+ latent: torch.Tensor,
978
+ device: torch.device,
979
+ original_base_names: Optional[List[str]] = None,
980
+ ) -> None:
981
+ """save output
982
+
983
+ Args:
984
+ args: command line arguments
985
+ vae: VAE model
986
+ latent: latent tensor
987
+ device: device to use
988
+ original_base_names: original base names (if latents are loaded from files)
989
+ """
990
+ height, width = latent.shape[-2], latent.shape[-1] # BCTHW
991
+ height *= 8
992
+ width *= 8
993
+ # print(f"Saving output. Latent shape {latent.shape}; pixel shape {height}x{width}")
994
+ if args.output_type == "latent" or args.output_type == "both":
995
+ # save latent
996
+ save_latent(latent, args, height, width)
997
+ if args.output_type == "latent":
998
+ return
999
+
1000
+ total_latent_sections = (args.video_seconds * 30) / (args.latent_window_size * 4)
1001
+ total_latent_sections = int(max(round(total_latent_sections), 1))
1002
+ video = decode_latent(args.latent_window_size, total_latent_sections, args.bulk_decode, vae, latent, device)
1003
+
1004
+ if args.output_type == "video" or args.output_type == "both":
1005
+ # save video
1006
+ original_name = "" if original_base_names is None else f"_{original_base_names[0]}"
1007
+ save_video(video, args, original_name)
1008
+
1009
+ elif args.output_type == "images":
1010
+ # save images
1011
+ original_name = "" if original_base_names is None else f"_{original_base_names[0]}"
1012
+ save_images(video, args, original_name)
1013
+
1014
+
1015
+ def preprocess_prompts_for_batch(prompt_lines: List[str], base_args: argparse.Namespace) -> List[Dict]:
1016
+ """Process multiple prompts for batch mode
1017
+
1018
+ Args:
1019
+ prompt_lines: List of prompt lines
1020
+ base_args: Base command line arguments
1021
+
1022
+ Returns:
1023
+ List[Dict]: List of prompt data dictionaries
1024
+ """
1025
+ prompts_data = []
1026
+
1027
+ for line in prompt_lines:
1028
+ line = line.strip()
1029
+ if not line or line.startswith("#"): # Skip empty lines and comments
1030
+ continue
1031
+
1032
+ # Parse prompt line and create override dictionary
1033
+ prompt_data = parse_prompt_line(line)
1034
+ logger.info(f"Parsed prompt data: {prompt_data}")
1035
+ prompts_data.append(prompt_data)
1036
+
1037
+ return prompts_data
1038
+
1039
+
1040
+ def get_generation_settings(args: argparse.Namespace) -> GenerationSettings:
1041
+ device = torch.device(args.device)
1042
+
1043
+ dit_weight_dtype = None # default
1044
+ if args.fp8_scaled:
1045
+ dit_weight_dtype = None # various precision weights, so don't cast to specific dtype
1046
+ elif args.fp8:
1047
+ dit_weight_dtype = torch.float8_e4m3fn
1048
+
1049
+ logger.info(f"Using device: {device}, DiT weight weight precision: {dit_weight_dtype}")
1050
+
1051
+ gen_settings = GenerationSettings(device=device, dit_weight_dtype=dit_weight_dtype)
1052
+ return gen_settings
1053
+
1054
+
1055
+ def main():
1056
+ # Parse arguments
1057
+ args = parse_args()
1058
+
1059
+ # Check if latents are provided
1060
+ latents_mode = args.latent_path is not None and len(args.latent_path) > 0
1061
+
1062
+ # Set device
1063
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
1064
+ device = torch.device(device)
1065
+ logger.info(f"Using device: {device}")
1066
+ args.device = device
1067
+
1068
+ if latents_mode:
1069
+ # Original latent decode mode
1070
+ original_base_names = []
1071
+ latents_list = []
1072
+ seeds = []
1073
+
1074
+ assert len(args.latent_path) == 1, "Only one latent path is supported for now"
1075
+
1076
+ for latent_path in args.latent_path:
1077
+ original_base_names.append(os.path.splitext(os.path.basename(latent_path))[0])
1078
+ seed = 0
1079
+
1080
+ if os.path.splitext(latent_path)[1] != ".safetensors":
1081
+ latents = torch.load(latent_path, map_location="cpu")
1082
+ else:
1083
+ latents = load_file(latent_path)["latent"]
1084
+ with safe_open(latent_path, framework="pt") as f:
1085
+ metadata = f.metadata()
1086
+ if metadata is None:
1087
+ metadata = {}
1088
+ logger.info(f"Loaded metadata: {metadata}")
1089
+
1090
+ if "seeds" in metadata:
1091
+ seed = int(metadata["seeds"])
1092
+ if "height" in metadata and "width" in metadata:
1093
+ height = int(metadata["height"])
1094
+ width = int(metadata["width"])
1095
+ args.video_size = [height, width]
1096
+ if "video_seconds" in metadata:
1097
+ args.video_seconds = float(metadata["video_seconds"])
1098
+
1099
+ seeds.append(seed)
1100
+ logger.info(f"Loaded latent from {latent_path}. Shape: {latents.shape}")
1101
+
1102
+ if latents.ndim == 5: # [BCTHW]
1103
+ latents = latents.squeeze(0) # [CTHW]
1104
+
1105
+ latents_list.append(latents)
1106
+
1107
+ latent = torch.stack(latents_list, dim=0) # [N, ...], must be same shape
1108
+
1109
+ args.seed = seeds[0]
1110
+
1111
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
1112
+ save_output(args, vae, latent, device, original_base_names)
1113
+
1114
+ elif args.from_file:
1115
+ # Batch mode from file
1116
+
1117
+ # Read prompts from file
1118
+ with open(args.from_file, "r", encoding="utf-8") as f:
1119
+ prompt_lines = f.readlines()
1120
+
1121
+ # Process prompts
1122
+ prompts_data = preprocess_prompts_for_batch(prompt_lines, args)
1123
+ # process_batch_prompts(prompts_data, args)
1124
+ raise NotImplementedError("Batch mode is not implemented yet.")
1125
+
1126
+ elif args.interactive:
1127
+ # Interactive mode
1128
+ # process_interactive(args)
1129
+ raise NotImplementedError("Interactive mode is not implemented yet.")
1130
+
1131
+ else:
1132
+ # Single prompt mode (original behavior)
1133
+
1134
+ # Generate latent
1135
+ gen_settings = get_generation_settings(args)
1136
+ vae, latent = generate(args, gen_settings)
1137
+ # print(f"Generated latent shape: {latent.shape}")
1138
+
1139
+ # # Save latent and video
1140
+ # if args.save_merged_model:
1141
+ # return
1142
+
1143
+ save_output(args, vae, latent[0], device)
1144
+
1145
+ logger.info("Done!")
1146
+
1147
+
1148
+ if __name__ == "__main__":
1149
+ main()
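
The section-prompt handling near the top of `fpack_generate_video.py` accepts entries of the form `index:prompt` or `start-end:prompt`, with negative indices counted back from the last section. A minimal sketch of how the index part resolves (the splitting of the full prompt string into individual entries happens earlier in the script and is assumed here):

```python
import re

# Sketch of the section-index parsing used in fpack_generate_video.py above.
def parse_section_index(index_str: str) -> tuple:
    m = re.match(r"^(-?\d+)(-\d+)?$", index_str)
    if m:
        start = int(m.group(1))
        end = int(m.group(2)[1:]) if m.group(2) is not None else start
    else:
        start = end = 0  # unparsable index falls back to section 0
    return start, end

print(parse_section_index("2"))    # (2, 2)   -> section 2 only
print(parse_section_index("1-3"))  # (1, 3)   -> sections 1, 2 and 3
print(parse_section_index("-1"))   # (-1, -1) -> the last section of the video
```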
fpack_train_network.py ADDED
@@ -0,0 +1,410 @@
1
+ import argparse
2
+ import gc
3
+ import math
4
+ import time
5
+ from typing import Optional
6
+ from PIL import Image
7
+
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torchvision.transforms.functional as TF
12
+ from tqdm import tqdm
13
+ from accelerate import Accelerator, init_empty_weights
14
+
15
+ from dataset import image_video_dataset
16
+ from dataset.image_video_dataset import ARCHITECTURE_FRAMEPACK, ARCHITECTURE_FRAMEPACK_FULL, load_video
17
+ from fpack_generate_video import decode_latent
18
+ from frame_pack import hunyuan
19
+ from frame_pack.clip_vision import hf_clip_vision_encode
20
+ from frame_pack.framepack_utils import load_image_encoders, load_text_encoder1, load_text_encoder2
21
+ from frame_pack.framepack_utils import load_vae as load_framepack_vae
22
+ from frame_pack.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked, load_packed_model
23
+ from frame_pack.k_diffusion_hunyuan import sample_hunyuan
24
+ from frame_pack.utils import crop_or_pad_yield_mask
25
+ from dataset.image_video_dataset import resize_image_to_bucket
26
+ from hv_train_network import NetworkTrainer, load_prompts, clean_memory_on_device, setup_parser_common, read_config_from_file
27
+
28
+ import logging
29
+
30
+ logger = logging.getLogger(__name__)
31
+ logging.basicConfig(level=logging.INFO)
32
+
33
+ from utils import model_utils
34
+ from utils.safetensors_utils import load_safetensors, MemoryEfficientSafeOpen
35
+
36
+
37
+ class FramePackNetworkTrainer(NetworkTrainer):
38
+ def __init__(self):
39
+ super().__init__()
40
+
41
+ # region model specific
42
+
43
+ @property
44
+ def architecture(self) -> str:
45
+ return ARCHITECTURE_FRAMEPACK
46
+
47
+ @property
48
+ def architecture_full_name(self) -> str:
49
+ return ARCHITECTURE_FRAMEPACK_FULL
50
+
51
+ def handle_model_specific_args(self, args):
52
+ self._i2v_training = True
53
+ self._control_training = False
54
+ self.default_guidance_scale = 10.0 # embedded guidance scale
55
+
56
+ def process_sample_prompts(
57
+ self,
58
+ args: argparse.Namespace,
59
+ accelerator: Accelerator,
60
+ sample_prompts: str,
61
+ ):
62
+ device = accelerator.device
63
+
64
+ logger.info(f"cache Text Encoder outputs for sample prompt: {sample_prompts}")
65
+ prompts = load_prompts(sample_prompts)
66
+
67
+ # load text encoder
68
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
69
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
70
+ text_encoder2.to(device)
71
+
72
+ sample_prompts_te_outputs = {} # (prompt) -> (t1 embeds, t1 mask, t2 embeds)
73
+ for prompt_dict in prompts:
74
+ for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
75
+ if p is None or p in sample_prompts_te_outputs:
76
+ continue
77
+ logger.info(f"cache Text Encoder outputs for prompt: {p}")
78
+ with torch.amp.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
79
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(p, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
80
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
81
+
82
+ llama_vec = llama_vec.to("cpu")
83
+ llama_attention_mask = llama_attention_mask.to("cpu")
84
+ clip_l_pooler = clip_l_pooler.to("cpu")
85
+ sample_prompts_te_outputs[p] = (llama_vec, llama_attention_mask, clip_l_pooler)
86
+ del text_encoder1, text_encoder2
87
+ clean_memory_on_device(device)
88
+
89
+ # image embedding for I2V training
90
+ feature_extractor, image_encoder = load_image_encoders(args)
91
+ image_encoder.to(device)
92
+
93
+ # encode image with image encoder
94
+ sample_prompts_image_embs = {}
95
+ for prompt_dict in prompts:
96
+ image_path = prompt_dict.get("image_path", None)
97
+ assert image_path is not None, "image_path should be set for I2V training"
98
+ if image_path in sample_prompts_image_embs:
99
+ continue
100
+
101
+ logger.info(f"Encoding image to image encoder context: {image_path}")
102
+
103
+ height = prompt_dict.get("height", 256)
104
+ width = prompt_dict.get("width", 256)
105
+
106
+ img = Image.open(image_path).convert("RGB")
107
+ img_np = np.array(img) # PIL to numpy, HWC
108
+ img_np = image_video_dataset.resize_image_to_bucket(img_np, (width, height)) # returns a numpy array
109
+
110
+ with torch.no_grad():
111
+ image_encoder_output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
112
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
113
+
114
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to("cpu")
115
+ sample_prompts_image_embs[image_path] = image_encoder_last_hidden_state
116
+
117
+ del image_encoder
118
+ clean_memory_on_device(device)
119
+
120
+ # prepare sample parameters
121
+ sample_parameters = []
122
+ for prompt_dict in prompts:
123
+ prompt_dict_copy = prompt_dict.copy()
124
+
125
+ p = prompt_dict.get("prompt", "")
126
+ llama_vec, llama_attention_mask, clip_l_pooler = sample_prompts_te_outputs[p]
127
+ prompt_dict_copy["llama_vec"] = llama_vec
128
+ prompt_dict_copy["llama_attention_mask"] = llama_attention_mask
129
+ prompt_dict_copy["clip_l_pooler"] = clip_l_pooler
130
+
131
+ p = prompt_dict.get("negative_prompt", "")
132
+ llama_vec, llama_attention_mask, clip_l_pooler = sample_prompts_te_outputs[p]
133
+ prompt_dict_copy["negative_llama_vec"] = llama_vec
134
+ prompt_dict_copy["negative_llama_attention_mask"] = llama_attention_mask
135
+ prompt_dict_copy["negative_clip_l_pooler"] = clip_l_pooler
136
+
137
+ p = prompt_dict.get("image_path", None)
138
+ prompt_dict_copy["image_encoder_last_hidden_state"] = sample_prompts_image_embs[p]
139
+
140
+ sample_parameters.append(prompt_dict_copy)
141
+
142
+ clean_memory_on_device(accelerator.device)
143
+ return sample_parameters
144
+
145
+ def do_inference(
146
+ self,
147
+ accelerator,
148
+ args,
149
+ sample_parameter,
150
+ vae,
151
+ dit_dtype,
152
+ transformer,
153
+ discrete_flow_shift,
154
+ sample_steps,
155
+ width,
156
+ height,
157
+ frame_count,
158
+ generator,
159
+ do_classifier_free_guidance,
160
+ guidance_scale,
161
+ cfg_scale,
162
+ image_path=None,
163
+ control_video_path=None,
164
+ ):
165
+ """architecture dependent inference"""
166
+ model: HunyuanVideoTransformer3DModelPacked = transformer
167
+ device = accelerator.device
168
+ if cfg_scale is None:
169
+ cfg_scale = 1.0
170
+ do_classifier_free_guidance = do_classifier_free_guidance and cfg_scale != 1.0
171
+
172
+ # prepare parameters
173
+ latent_window_size = args.latent_window_size # default is 9
174
+ latent_f = (frame_count - 1) // 4 + 1
175
+ total_latent_sections = math.floor((latent_f - 1) / latent_window_size)
176
+ if total_latent_sections < 1:
177
+ logger.warning(f"Not enough frames for FramePack: {latent_f}, minimum: {latent_window_size*4+1}")
178
+ return None
179
+
180
+ latent_f = total_latent_sections * latent_window_size + 1
181
+ actual_frame_count = (latent_f - 1) * 4 + 1
182
+ if actual_frame_count != frame_count:
183
+ logger.info(f"Frame count mismatch: {actual_frame_count} != {frame_count}, trimming to {actual_frame_count}")
184
+ frame_count = actual_frame_count
185
+ num_frames = latent_window_size * 4 - 3
186
+
187
+ # prepare start latent
188
+ image = Image.open(image_path).convert("RGB")
189
+ image = resize_image_to_bucket(image, (width, height)) # returns a numpy array
190
+ image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(1).unsqueeze(0).float() # 1, C, 1, H, W
191
+ image = image / 127.5 - 1 # -1 to 1
192
+
193
+ # VAE encoding
194
+ logger.info(f"Encoding image to latent space")
195
+ vae.to(device)
196
+ start_latent = hunyuan.vae_encode(image, vae)
197
+ vae.to("cpu") # move VAE to CPU to save memory
198
+ clean_memory_on_device(device)
199
+
200
+ # sampling
201
+ history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
202
+ total_generated_latent_frames = 0
203
+
204
+ latent_paddings = reversed(range(total_latent_sections))
205
+
206
+ if total_latent_sections > 4:
207
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
208
+
209
+ for latent_padding in latent_paddings:
210
+ is_last_section = latent_padding == 0
211
+ latent_padding_size = latent_padding * latent_window_size
212
+
213
+ logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
214
+
215
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
216
+ (
217
+ clean_latent_indices_pre,
218
+ blank_indices,
219
+ latent_indices,
220
+ clean_latent_indices_post,
221
+ clean_latent_2x_indices,
222
+ clean_latent_4x_indices,
223
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
224
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
225
+
226
+ clean_latents_pre = start_latent.to(history_latents)
227
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
228
+ [1, 2, 16], dim=2
229
+ )
230
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
231
+
232
+ # if use_teacache:
233
+ # transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
234
+ # else:
235
+ # transformer.initialize_teacache(enable_teacache=False)
236
+
237
+ llama_vec = sample_parameter["llama_vec"].to(device, dtype=torch.bfloat16)
238
+ llama_attention_mask = sample_parameter["llama_attention_mask"].to(device)
239
+ clip_l_pooler = sample_parameter["clip_l_pooler"].to(device, dtype=torch.bfloat16)
240
+ if cfg_scale == 1.0:
241
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
242
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
243
+ else:
244
+ llama_vec_n = sample_parameter["negative_llama_vec"].to(device, dtype=torch.bfloat16)
245
+ llama_attention_mask_n = sample_parameter["negative_llama_attention_mask"].to(device)
246
+ clip_l_pooler_n = sample_parameter["negative_clip_l_pooler"].to(device, dtype=torch.bfloat16)
247
+ image_encoder_last_hidden_state = sample_parameter["image_encoder_last_hidden_state"].to(device, dtype=torch.bfloat16)
248
+
249
+ generated_latents = sample_hunyuan(
250
+ transformer=model,
251
+ sampler=args.sample_solver,
252
+ width=width,
253
+ height=height,
254
+ frames=num_frames,
255
+ real_guidance_scale=cfg_scale,
256
+ distilled_guidance_scale=guidance_scale,
257
+ guidance_rescale=0.0,
258
+ # shift=3.0,
259
+ num_inference_steps=sample_steps,
260
+ generator=generator,
261
+ prompt_embeds=llama_vec,
262
+ prompt_embeds_mask=llama_attention_mask,
263
+ prompt_poolers=clip_l_pooler,
264
+ negative_prompt_embeds=llama_vec_n,
265
+ negative_prompt_embeds_mask=llama_attention_mask_n,
266
+ negative_prompt_poolers=clip_l_pooler_n,
267
+ device=device,
268
+ dtype=torch.bfloat16,
269
+ image_embeddings=image_encoder_last_hidden_state,
270
+ latent_indices=latent_indices,
271
+ clean_latents=clean_latents,
272
+ clean_latent_indices=clean_latent_indices,
273
+ clean_latents_2x=clean_latents_2x,
274
+ clean_latent_2x_indices=clean_latent_2x_indices,
275
+ clean_latents_4x=clean_latents_4x,
276
+ clean_latent_4x_indices=clean_latent_4x_indices,
277
+ )
278
+
279
+ if is_last_section:
280
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
281
+
282
+ total_generated_latent_frames += int(generated_latents.shape[2])
283
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
284
+
285
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
286
+
287
+ logger.info(f"Generated. Latent shape {real_history_latents.shape}")
288
+
289
+ # wait for 5 seconds until block swap is done
290
+ logger.info("Waiting for 5 seconds to finish block swap")
291
+ time.sleep(5)
292
+
293
+ gc.collect()
294
+ clean_memory_on_device(device)
295
+
296
+ video = decode_latent(latent_window_size, total_latent_sections, args.bulk_decode, vae, real_history_latents, device)
297
+ video = video.to("cpu", dtype=torch.float32).unsqueeze(0) # add batch dimension
298
+ video = (video / 2 + 0.5).clamp(0, 1) # -1 to 1 -> 0 to 1
299
+ clean_memory_on_device(device)
300
+
301
+ return video
302
+
303
+ def load_vae(self, args: argparse.Namespace, vae_dtype: torch.dtype, vae_path: str):
304
+ vae_path = args.vae
305
+ logger.info(f"Loading VAE model from {vae_path}")
306
+ vae = load_framepack_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, "cpu")
307
+ return vae
308
+
309
+ def load_transformer(
310
+ self,
311
+ accelerator: Accelerator,
312
+ args: argparse.Namespace,
313
+ dit_path: str,
314
+ attn_mode: str,
315
+ split_attn: bool,
316
+ loading_device: str,
317
+ dit_weight_dtype: Optional[torch.dtype],
318
+ ):
319
+ logger.info(f"Loading DiT model from {dit_path}")
320
+ device = accelerator.device
321
+ model = load_packed_model(device, dit_path, attn_mode, loading_device, args.fp8_scaled, split_attn)
322
+ return model
323
+
324
+ def scale_shift_latents(self, latents):
325
+ # FramePack VAE includes scaling
326
+ return latents
327
+
328
+ def call_dit(
329
+ self,
330
+ args: argparse.Namespace,
331
+ accelerator: Accelerator,
332
+ transformer,
333
+ latents: torch.Tensor,
334
+ batch: dict[str, torch.Tensor],
335
+ noise: torch.Tensor,
336
+ noisy_model_input: torch.Tensor,
337
+ timesteps: torch.Tensor,
338
+ network_dtype: torch.dtype,
339
+ ):
340
+ model: HunyuanVideoTransformer3DModelPacked = transformer
341
+ device = accelerator.device
342
+ batch_size = latents.shape[0]
343
+
344
+ # maybe model.dtype is better than network_dtype...
345
+ distilled_guidance = torch.tensor([args.guidance_scale * 1000.0] * batch_size).to(device=device, dtype=network_dtype)
346
+ latents = latents.to(device=accelerator.device, dtype=network_dtype)
347
+ noisy_model_input = noisy_model_input.to(device=accelerator.device, dtype=network_dtype)
348
+ # for k, v in batch.items():
349
+ # if isinstance(v, torch.Tensor):
350
+ # print(f"{k}: {v.shape} {v.dtype} {v.device}")
351
+ with accelerator.autocast():
352
+ model_pred = model(
353
+ hidden_states=noisy_model_input,
354
+ timestep=timesteps,
355
+ encoder_hidden_states=batch["llama_vec"],
356
+ encoder_attention_mask=batch["llama_attention_mask"],
357
+ pooled_projections=batch["clip_l_pooler"],
358
+ guidance=distilled_guidance,
359
+ latent_indices=batch["latent_indices"],
360
+ clean_latents=batch["latents_clean"],
361
+ clean_latent_indices=batch["clean_latent_indices"],
362
+ clean_latents_2x=batch["latents_clean_2x"],
363
+ clean_latent_2x_indices=batch["clean_latent_2x_indices"],
364
+ clean_latents_4x=batch["latents_clean_4x"],
365
+ clean_latent_4x_indices=batch["clean_latent_4x_indices"],
366
+ image_embeddings=batch["image_embeddings"],
367
+ return_dict=False,
368
+ )
369
+ model_pred = model_pred[0] # returns tuple (model_pred, )
370
+
371
+ # flow matching loss
372
+ target = noise - latents
373
+
374
+ return model_pred, target
375
+
376
+ # endregion model specific
377
+
378
+
379
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
380
+ """FramePack specific parser setup"""
381
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT / DiTにスケーリングされたfp8を使う")
382
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for LLM / LLMにfp8を使う")
383
+ parser.add_argument("--text_encoder1", type=str, help="Text Encoder 1 directory / テキストエンコーダ1のディレクトリ")
384
+ parser.add_argument("--text_encoder2", type=str, help="Text Encoder 2 directory / テキストエンコーダ2のディレクトリ")
385
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
386
+ parser.add_argument(
387
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
388
+ )
389
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
390
+ parser.add_argument("--latent_window_size", type=int, default=9, help="FramePack latent window size (default 9)")
391
+ parser.add_argument("--bulk_decode", action="store_true", help="decode all frames at once in sample generation")
392
+ return parser
393
+
394
+
395
+ if __name__ == "__main__":
396
+ parser = setup_parser_common()
397
+ parser = framepack_setup_parser(parser)
398
+
399
+ args = parser.parse_args()
400
+ args = read_config_from_file(args, parser)
401
+
402
+ assert (
403
+ args.vae_dtype is None or args.vae_dtype == "float16"
404
+ ), "VAE dtype must be float16 / VAEのdtypeはfloat16でなければなりません"
405
+ args.vae_dtype = "float16" # fixed
406
+ args.dit_dtype = "bfloat16" # fixed
407
+ args.sample_solver = "unipc" # for sample generation, fixed to unipc
408
+
409
+ trainer = FramePackNetworkTrainer()
410
+ trainer.train(args)
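
The frame and section arithmetic in `do_inference` above determines how many latent sections are sampled and how many pixel frames each section contributes. A worked example restating those formulas (no new behavior; values assume the default `latent_window_size` of 9):

```python
import math

frame_count = 145         # requested pixel frames
latent_window_size = 9    # default --latent_window_size

latent_f = (frame_count - 1) // 4 + 1                                     # 37 latent frames (4x temporal compression)
total_latent_sections = math.floor((latent_f - 1) / latent_window_size)   # 4 sections
latent_f = total_latent_sections * latent_window_size + 1                 # 37, already aligned in this example
actual_frame_count = (latent_f - 1) * 4 + 1                               # 145 pixel frames after trimming
num_frames_per_section = latent_window_size * 4 - 3                       # 33 pixel frames generated per section

print(total_latent_sections, actual_frame_count, num_frames_per_section)  # 4 145 33
```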
frame_pack/__init__.py ADDED
File without changes
frame_pack/bucket_tools.py ADDED
@@ -0,0 +1,30 @@
1
+ bucket_options = {
2
+ 640: [
3
+ (416, 960),
4
+ (448, 864),
5
+ (480, 832),
6
+ (512, 768),
7
+ (544, 704),
8
+ (576, 672),
9
+ (608, 640),
10
+ (640, 608),
11
+ (672, 576),
12
+ (704, 544),
13
+ (768, 512),
14
+ (832, 480),
15
+ (864, 448),
16
+ (960, 416),
17
+ ],
18
+ }
19
+
20
+
21
+ def find_nearest_bucket(h, w, resolution=640):
22
+ min_metric = float('inf')
23
+ best_bucket = None
24
+ for (bucket_h, bucket_w) in bucket_options[resolution]:
25
+ metric = abs(h * bucket_w - w * bucket_h)
26
+ if metric <= min_metric:
27
+ min_metric = metric
28
+ best_bucket = (bucket_h, bucket_w)
29
+ return best_bucket
30
+
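
A small usage sketch of `find_nearest_bucket`: the bucket minimizing `|h * bucket_w - w * bucket_h|` (i.e. the closest aspect ratio) is returned, and because the comparison uses `<=`, ties resolve to the later entry in the bucket list.

```python
from frame_pack.bucket_tools import find_nearest_bucket

print(find_nearest_bucket(720, 1280))   # (480, 832): closest h:w ratio to 720:1280
print(find_nearest_bucket(1024, 1024))  # (640, 608): square input ties between (608, 640)
                                        # and (640, 608); the later bucket wins
```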
frame_pack/clip_vision.py ADDED
@@ -0,0 +1,14 @@
1
+ import numpy as np
2
+
3
+
4
+ def hf_clip_vision_encode(image, feature_extractor, image_encoder):
5
+ assert isinstance(image, np.ndarray)
6
+ assert image.ndim == 3 and image.shape[2] == 3
7
+ assert image.dtype == np.uint8
8
+
9
+ preprocessed = feature_extractor.preprocess(images=image, return_tensors="pt").to(
10
+ device=image_encoder.device, dtype=image_encoder.dtype
11
+ )
12
+ image_encoder_output = image_encoder(**preprocessed)
13
+
14
+ return image_encoder_output
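
A minimal sketch of calling `hf_clip_vision_encode` together with `load_image_encoders` from `frame_pack/framepack_utils.py` (shown below); the checkpoint path is a placeholder and a CUDA device is assumed.

```python
import numpy as np
import torch
from types import SimpleNamespace

from frame_pack.clip_vision import hf_clip_vision_encode
from frame_pack.framepack_utils import load_image_encoders

# Hypothetical checkpoint path for the SigLIP vision encoder.
args = SimpleNamespace(image_encoder="path/to/sigclip_vision_384.safetensors")
feature_extractor, image_encoder = load_image_encoders(args)
image_encoder.to("cuda")

# hf_clip_vision_encode expects an HWC uint8 RGB array.
img_np = np.zeros((480, 832, 3), dtype=np.uint8)
with torch.no_grad():
    output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
image_embeddings = output.last_hidden_state  # per-patch embeddings fed to the DiT
```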
frame_pack/framepack_utils.py ADDED
@@ -0,0 +1,273 @@
1
+ import os
2
+ import logging
3
+ from types import SimpleNamespace
4
+ from typing import Optional, Union
5
+
6
+ import accelerate
7
+ from accelerate import Accelerator, init_empty_weights
8
+ import torch
9
+ from safetensors.torch import load_file
10
+ from transformers import (
11
+ LlamaTokenizerFast,
12
+ LlamaConfig,
13
+ LlamaModel,
14
+ CLIPTokenizer,
15
+ CLIPTextModel,
16
+ CLIPConfig,
17
+ SiglipImageProcessor,
18
+ SiglipVisionModel,
19
+ SiglipVisionConfig,
20
+ )
21
+
22
+ from utils.safetensors_utils import load_split_weights
23
+ from hunyuan_model.vae import load_vae as hunyuan_load_vae
24
+
25
+ import logging
26
+
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(level=logging.INFO)
29
+
30
+
31
+ def load_vae(
32
+ vae_path: str, vae_chunk_size: Optional[int], vae_spatial_tile_sample_min_size: Optional[int], device: Union[str, torch.device]
33
+ ):
34
+ # single file and directory (contains 'vae') support
35
+ if os.path.isdir(vae_path):
36
+ vae_path = os.path.join(vae_path, "vae", "diffusion_pytorch_model.safetensors")
37
+ else:
38
+ vae_path = vae_path
39
+
40
+ vae_dtype = torch.float16 # if vae_dtype is None else str_to_dtype(vae_dtype)
41
+ vae, _, s_ratio, t_ratio = hunyuan_load_vae(vae_dtype=vae_dtype, device=device, vae_path=vae_path)
42
+ vae.eval()
43
+ # vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}
44
+
45
+ # set chunk_size to CausalConv3d recursively
46
+ chunk_size = vae_chunk_size
47
+ if chunk_size is not None:
48
+ vae.set_chunk_size_for_causal_conv_3d(chunk_size)
49
+ logger.info(f"Set chunk_size to {chunk_size} for CausalConv3d")
50
+
51
+ if vae_spatial_tile_sample_min_size is not None:
52
+ vae.enable_spatial_tiling(True)
53
+ vae.tile_sample_min_size = vae_spatial_tile_sample_min_size
54
+ vae.tile_latent_min_size = vae_spatial_tile_sample_min_size // 8
55
+ logger.info(f"Enabled spatial tiling with min size {vae_spatial_tile_sample_min_size}")
56
+ # elif vae_tiling:
57
+ else:
58
+ vae.enable_spatial_tiling(True)
59
+
60
+ return vae
61
+
62
+
63
+ # region Text Encoders
64
+
65
+ # Text Encoder configs are copied from HunyuanVideo repo
66
+
67
+ LLAMA_CONFIG = {
68
+ "architectures": ["LlamaModel"],
69
+ "attention_bias": False,
70
+ "attention_dropout": 0.0,
71
+ "bos_token_id": 128000,
72
+ "eos_token_id": 128001,
73
+ "head_dim": 128,
74
+ "hidden_act": "silu",
75
+ "hidden_size": 4096,
76
+ "initializer_range": 0.02,
77
+ "intermediate_size": 14336,
78
+ "max_position_embeddings": 8192,
79
+ "mlp_bias": False,
80
+ "model_type": "llama",
81
+ "num_attention_heads": 32,
82
+ "num_hidden_layers": 32,
83
+ "num_key_value_heads": 8,
84
+ "pretraining_tp": 1,
85
+ "rms_norm_eps": 1e-05,
86
+ "rope_scaling": None,
87
+ "rope_theta": 500000.0,
88
+ "tie_word_embeddings": False,
89
+ "torch_dtype": "float16",
90
+ "transformers_version": "4.46.3",
91
+ "use_cache": True,
92
+ "vocab_size": 128320,
93
+ }
94
+
95
+ CLIP_CONFIG = {
96
+ # "_name_or_path": "/raid/aryan/llava-llama-3-8b-v1_1-extracted/text_encoder_2",
97
+ "architectures": ["CLIPTextModel"],
98
+ "attention_dropout": 0.0,
99
+ "bos_token_id": 0,
100
+ "dropout": 0.0,
101
+ "eos_token_id": 2,
102
+ "hidden_act": "quick_gelu",
103
+ "hidden_size": 768,
104
+ "initializer_factor": 1.0,
105
+ "initializer_range": 0.02,
106
+ "intermediate_size": 3072,
107
+ "layer_norm_eps": 1e-05,
108
+ "max_position_embeddings": 77,
109
+ "model_type": "clip_text_model",
110
+ "num_attention_heads": 12,
111
+ "num_hidden_layers": 12,
112
+ "pad_token_id": 1,
113
+ "projection_dim": 768,
114
+ "torch_dtype": "float16",
115
+ "transformers_version": "4.48.0.dev0",
116
+ "vocab_size": 49408,
117
+ }
118
+
119
+
120
+ def load_text_encoder1(
121
+ args, fp8_llm: Optional[bool] = False, device: Optional[Union[str, torch.device]] = None
122
+ ) -> tuple[LlamaTokenizerFast, LlamaModel]:
123
+ # single file, split file and directory (contains 'text_encoder') support
124
+ logger.info(f"Loading text encoder 1 tokenizer")
125
+ tokenizer1 = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer")
126
+
127
+ logger.info(f"Loading text encoder 1 from {args.text_encoder1}")
128
+ if os.path.isdir(args.text_encoder1):
129
+ # load from directory, configs are in the directory
130
+ text_encoder1 = LlamaModel.from_pretrained(args.text_encoder1, subfolder="text_encoder", torch_dtype=torch.float16)
131
+ else:
132
+ # load from file, we create the model with the appropriate config
133
+ config = LlamaConfig(**LLAMA_CONFIG)
134
+ with init_empty_weights():
135
+ text_encoder1 = LlamaModel._from_config(config, torch_dtype=torch.float16)
136
+
137
+ state_dict = load_split_weights(args.text_encoder1)
138
+
139
+ # support weights from ComfyUI
140
+ if "model.embed_tokens.weight" in state_dict:
141
+ for key in list(state_dict.keys()):
142
+ if key.startswith("model."):
143
+ new_key = key.replace("model.", "")
144
+ state_dict[new_key] = state_dict[key]
145
+ del state_dict[key]
146
+ if "tokenizer" in state_dict:
147
+ state_dict.pop("tokenizer")
148
+ if "lm_head.weight" in state_dict:
149
+ state_dict.pop("lm_head.weight")
150
+
151
+ # # support weights from ComfyUI
152
+ # if "tokenizer" in state_dict:
153
+ # state_dict.pop("tokenizer")
154
+
155
+ text_encoder1.load_state_dict(state_dict, strict=True, assign=True)
156
+
157
+ if fp8_llm:
158
+ org_dtype = text_encoder1.dtype
159
+ logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
160
+ text_encoder1.to(device=device, dtype=torch.float8_e4m3fn)
161
+
162
+ # prepare LLM for fp8
163
+ def prepare_fp8(llama_model: LlamaModel, target_dtype):
164
+ def forward_hook(module):
165
+ def forward(hidden_states):
166
+ input_dtype = hidden_states.dtype
167
+ hidden_states = hidden_states.to(torch.float32)
168
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
169
+ hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
170
+ return module.weight.to(input_dtype) * hidden_states.to(input_dtype)
171
+
172
+ return forward
173
+
174
+ for module in llama_model.modules():
175
+ if module.__class__.__name__ in ["Embedding"]:
176
+ # print("set", module.__class__.__name__, "to", target_dtype)
177
+ module.to(target_dtype)
178
+ if module.__class__.__name__ in ["LlamaRMSNorm"]:
179
+ # print("set", module.__class__.__name__, "hooks")
180
+ module.forward = forward_hook(module)
181
+
182
+ prepare_fp8(text_encoder1, org_dtype)
183
+ else:
184
+ text_encoder1.to(device)
185
+
186
+ text_encoder1.eval()
187
+ return tokenizer1, text_encoder1
188
+
189
+
190
+ def load_text_encoder2(args) -> tuple[CLIPTokenizer, CLIPTextModel]:
191
+ # single file and directory (contains 'text_encoder_2') support
192
+ logger.info(f"Loading text encoder 2 tokenizer")
193
+ tokenizer2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer_2")
194
+
195
+ logger.info(f"Loading text encoder 2 from {args.text_encoder2}")
196
+ if os.path.isdir(args.text_encoder2):
197
+ # load from directory, configs are in the directory
198
+ text_encoder2 = CLIPTextModel.from_pretrained(args.text_encoder2, subfolder="text_encoder_2", torch_dtype=torch.float16)
199
+ else:
200
+ # we only have one file, so we can load it directly
201
+ config = CLIPConfig(**CLIP_CONFIG)
202
+ with init_empty_weights():
203
+ text_encoder2 = CLIPTextModel._from_config(config, torch_dtype=torch.float16)
204
+
205
+ state_dict = load_file(args.text_encoder2)
206
+
207
+ text_encoder2.load_state_dict(state_dict, strict=True, assign=True)
208
+
209
+ text_encoder2.eval()
210
+ return tokenizer2, text_encoder2
211
+
212
+
213
+ # endregion
214
+
215
+ # region image encoder
216
+
217
+ # Siglip configs are copied from FramePack repo
218
+ FEATURE_EXTRACTOR_CONFIG = {
219
+ "do_convert_rgb": None,
220
+ "do_normalize": True,
221
+ "do_rescale": True,
222
+ "do_resize": True,
223
+ "image_mean": [0.5, 0.5, 0.5],
224
+ "image_processor_type": "SiglipImageProcessor",
225
+ "image_std": [0.5, 0.5, 0.5],
226
+ "processor_class": "SiglipProcessor",
227
+ "resample": 3,
228
+ "rescale_factor": 0.00392156862745098,
229
+ "size": {"height": 384, "width": 384},
230
+ }
231
+ IMAGE_ENCODER_CONFIG = {
232
+ "_name_or_path": "/home/lvmin/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-Redux-dev/snapshots/1282f955f706b5240161278f2ef261d2a29ad649/image_encoder",
233
+ "architectures": ["SiglipVisionModel"],
234
+ "attention_dropout": 0.0,
235
+ "hidden_act": "gelu_pytorch_tanh",
236
+ "hidden_size": 1152,
237
+ "image_size": 384,
238
+ "intermediate_size": 4304,
239
+ "layer_norm_eps": 1e-06,
240
+ "model_type": "siglip_vision_model",
241
+ "num_attention_heads": 16,
242
+ "num_channels": 3,
243
+ "num_hidden_layers": 27,
244
+ "patch_size": 14,
245
+ "torch_dtype": "bfloat16",
246
+ "transformers_version": "4.46.2",
247
+ }
248
+
249
+
250
+ def load_image_encoders(args):
251
+ logger.info(f"Loading image encoder feature extractor")
252
+ feature_extractor = SiglipImageProcessor(**FEATURE_EXTRACTOR_CONFIG)
253
+
254
+ # single file, split file and directory (contains 'image_encoder') support
255
+ logger.info(f"Loading image encoder from {args.image_encoder}")
256
+ if os.path.isdir(args.image_encoder):
257
+ # load from directory, configs are in the directory
258
+ image_encoder = SiglipVisionModel.from_pretrained(args.image_encoder, subfolder="image_encoder", torch_dtype=torch.float16)
259
+ else:
260
+ # load from file, we create the model with the appropriate config
261
+ config = SiglipVisionConfig(**IMAGE_ENCODER_CONFIG)
262
+ with init_empty_weights():
263
+ image_encoder = SiglipVisionModel._from_config(config, torch_dtype=torch.float16)
264
+
265
+ state_dict = load_file(args.image_encoder)
266
+
267
+ image_encoder.load_state_dict(state_dict, strict=True, assign=True)
268
+
269
+ image_encoder.eval()
270
+ return feature_extractor, image_encoder
271
+
272
+
273
+ # endregion
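
A minimal sketch of loading both text encoders above and producing the `llama_vec` / `clip_l_pooler` pair used throughout this commit; the checkpoint paths are placeholders, a CUDA device is assumed, and `encode_prompt_conds` / `crop_or_pad_yield_mask` come from `frame_pack/hunyuan.py` (next file) and `frame_pack/utils.py`.

```python
import torch
from types import SimpleNamespace

from frame_pack import hunyuan
from frame_pack.framepack_utils import load_text_encoder1, load_text_encoder2
from frame_pack.utils import crop_or_pad_yield_mask

# Hypothetical checkpoint paths.
args = SimpleNamespace(
    text_encoder1="path/to/llava_llama3_fp16.safetensors",
    text_encoder2="path/to/clip_l.safetensors",
)
device = torch.device("cuda")

tokenizer1, text_encoder1 = load_text_encoder1(args, fp8_llm=False, device=device)
tokenizer2, text_encoder2 = load_text_encoder2(args)
text_encoder2.to(device)

with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
    llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(
        "A character dancing on a stage", text_encoder1, text_encoder2, tokenizer1, tokenizer2
    )
# Pad or crop the LLaMA embeddings to a fixed length of 512 and obtain the attention mask.
llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
```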
frame_pack/hunyuan.py ADDED
@@ -0,0 +1,116 @@
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import torch
5
+
6
+ # from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import DEFAULT_PROMPT_TEMPLATE
7
+ # from diffusers_helper.utils import crop_or_pad_yield_mask
8
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
9
+ from hunyuan_model.text_encoder import PROMPT_TEMPLATE
10
+
11
+
12
+ @torch.no_grad()
13
+ def encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2, max_length=256):
14
+ assert isinstance(prompt, str)
15
+
16
+ prompt = [prompt]
17
+
18
+ # LLAMA
19
+
20
+ prompt_llama = [PROMPT_TEMPLATE["dit-llm-encode-video"]["template"].format(p) for p in prompt]
21
+ crop_start = PROMPT_TEMPLATE["dit-llm-encode-video"]["crop_start"]
22
+
23
+ llama_inputs = tokenizer(
24
+ prompt_llama,
25
+ padding="max_length",
26
+ max_length=max_length + crop_start,
27
+ truncation=True,
28
+ return_tensors="pt",
29
+ return_length=False,
30
+ return_overflowing_tokens=False,
31
+ return_attention_mask=True,
32
+ )
33
+
34
+ llama_input_ids = llama_inputs.input_ids.to(text_encoder.device)
35
+ llama_attention_mask = llama_inputs.attention_mask.to(text_encoder.device)
36
+ llama_attention_length = int(llama_attention_mask.sum())
37
+
38
+ llama_outputs = text_encoder(
39
+ input_ids=llama_input_ids,
40
+ attention_mask=llama_attention_mask,
41
+ output_hidden_states=True,
42
+ )
43
+
44
+ llama_vec = llama_outputs.hidden_states[-3][:, crop_start:llama_attention_length]
45
+ # llama_vec_remaining = llama_outputs.hidden_states[-3][:, llama_attention_length:]
46
+ llama_attention_mask = llama_attention_mask[:, crop_start:llama_attention_length]
47
+
48
+ assert torch.all(llama_attention_mask.bool())
49
+
50
+ # CLIP
51
+
52
+ clip_l_input_ids = tokenizer_2(
53
+ prompt,
54
+ padding="max_length",
55
+ max_length=77,
56
+ truncation=True,
57
+ return_overflowing_tokens=False,
58
+ return_length=False,
59
+ return_tensors="pt",
60
+ ).input_ids
61
+ clip_l_pooler = text_encoder_2(clip_l_input_ids.to(text_encoder_2.device), output_hidden_states=False).pooler_output
62
+
63
+ return llama_vec, clip_l_pooler
64
+
65
+
66
+ @torch.no_grad()
67
+ def vae_decode_fake(latents):
68
+ latent_rgb_factors = [
69
+ [-0.0395, -0.0331, 0.0445],
70
+ [0.0696, 0.0795, 0.0518],
71
+ [0.0135, -0.0945, -0.0282],
72
+ [0.0108, -0.0250, -0.0765],
73
+ [-0.0209, 0.0032, 0.0224],
74
+ [-0.0804, -0.0254, -0.0639],
75
+ [-0.0991, 0.0271, -0.0669],
76
+ [-0.0646, -0.0422, -0.0400],
77
+ [-0.0696, -0.0595, -0.0894],
78
+ [-0.0799, -0.0208, -0.0375],
79
+ [0.1166, 0.1627, 0.0962],
80
+ [0.1165, 0.0432, 0.0407],
81
+ [-0.2315, -0.1920, -0.1355],
82
+ [-0.0270, 0.0401, -0.0821],
83
+ [-0.0616, -0.0997, -0.0727],
84
+ [0.0249, -0.0469, -0.1703],
85
+ ] # From comfyui
86
+
87
+ latent_rgb_factors_bias = [0.0259, -0.0192, -0.0761]
88
+
89
+ weight = torch.tensor(latent_rgb_factors, device=latents.device, dtype=latents.dtype).transpose(0, 1)[:, :, None, None, None]
90
+ bias = torch.tensor(latent_rgb_factors_bias, device=latents.device, dtype=latents.dtype)
91
+
92
+ images = torch.nn.functional.conv3d(latents, weight, bias=bias, stride=1, padding=0, dilation=1, groups=1)
93
+ images = images.clamp(0.0, 1.0)
94
+
95
+ return images
96
+
97
+
98
+ @torch.no_grad()
99
+ def vae_decode(latents, vae, image_mode=False) -> torch.Tensor:
100
+ latents = latents / vae.config.scaling_factor
101
+
102
+ if not image_mode:
103
+ image = vae.decode(latents.to(device=vae.device, dtype=vae.dtype)).sample
104
+ else:
105
+ latents = latents.to(device=vae.device, dtype=vae.dtype).unbind(2)
106
+ image = [vae.decode(l.unsqueeze(2)).sample for l in latents]
107
+ image = torch.cat(image, dim=2)
108
+
109
+ return image
110
+
111
+
112
+ @torch.no_grad()
113
+ def vae_encode(image, vae: AutoencoderKLCausal3D) -> torch.Tensor:
114
+ latents = vae.encode(image.to(device=vae.device, dtype=vae.dtype)).latent_dist.sample()
115
+ latents = latents * vae.config.scaling_factor
116
+ return latents
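# Round-trip sketch (assumes `vae` is a loaded AutoencoderKLCausal3D and `video` is a pixel tensor
# of shape (B, C, T, H, W) in the VAE's expected value range); scaling_factor is applied
# symmetrically by vae_encode / vae_decode, so no manual scaling is needed:
#
#   latents = vae_encode(video, vae)    # (B, 16, T', H/8, W/8) scaled latents
#   recon = vae_decode(latents, vae)    # back to pixel space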
frame_pack/hunyuan_video_packed.py ADDED
@@ -0,0 +1,2015 @@
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import glob
5
+ import math
6
+ import numbers
7
+ import os
8
+ from types import SimpleNamespace
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+
11
+ import torch
12
+ import einops
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ from modules.custom_offloading_utils import ModelOffloader
18
+ from utils.safetensors_utils import load_split_weights
19
+ from modules.fp8_optimization_utils import apply_fp8_monkey_patch, optimize_state_dict_with_fp8
20
+ from accelerate import init_empty_weights
21
+
22
+ try:
23
+ # raise NotImplementedError
24
+ from xformers.ops import memory_efficient_attention as xformers_attn_func
25
+
26
+ print("Xformers is installed!")
27
+ except:
28
+ print("Xformers is not installed!")
29
+ xformers_attn_func = None
30
+
31
+ try:
32
+ # raise NotImplementedError
33
+ from flash_attn import flash_attn_varlen_func, flash_attn_func
34
+
35
+ print("Flash Attn is installed!")
36
+ except:
37
+ print("Flash Attn is not installed!")
38
+ flash_attn_varlen_func = None
39
+ flash_attn_func = None
40
+
41
+ try:
42
+ # raise NotImplementedError
43
+ from sageattention import sageattn_varlen, sageattn
44
+
45
+ print("Sage Attn is installed!")
46
+ except:
47
+ print("Sage Attn is not installed!")
48
+ sageattn_varlen = None
49
+ sageattn = None
50
+
51
+
52
+ import logging
53
+
54
+ logger = logging.getLogger(__name__)
55
+ logging.basicConfig(level=logging.INFO)
56
+
57
+ # region diffusers
58
+
59
+ # copied from diffusers with some modifications to minimize dependencies
60
+ # original code: https://github.com/huggingface/diffusers/
61
+ # original license: Apache-2.0
62
+
63
+ ACT2CLS = {
64
+ "swish": nn.SiLU,
65
+ "silu": nn.SiLU,
66
+ "mish": nn.Mish,
67
+ "gelu": nn.GELU,
68
+ "relu": nn.ReLU,
69
+ }
70
+
71
+
72
+ def get_activation(act_fn: str) -> nn.Module:
73
+ """Helper function to get activation function from string.
74
+
75
+ Args:
76
+ act_fn (str): Name of activation function.
77
+
78
+ Returns:
79
+ nn.Module: Activation function.
80
+ """
81
+
82
+ act_fn = act_fn.lower()
83
+ if act_fn in ACT2CLS:
84
+ return ACT2CLS[act_fn]()
85
+ else:
86
+ raise ValueError(f"activation function {act_fn} not found in ACT2CLS mapping {list(ACT2CLS.keys())}")
87
+
88
+
89
+ def get_timestep_embedding(
90
+ timesteps: torch.Tensor,
91
+ embedding_dim: int,
92
+ flip_sin_to_cos: bool = False,
93
+ downscale_freq_shift: float = 1,
94
+ scale: float = 1,
95
+ max_period: int = 10000,
96
+ ):
97
+ """
98
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
99
+
100
+ Args
101
+ timesteps (torch.Tensor):
102
+ a 1-D Tensor of N indices, one per batch element. These may be fractional.
103
+ embedding_dim (int):
104
+ the dimension of the output.
105
+ flip_sin_to_cos (bool):
106
+ Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
107
+ downscale_freq_shift (float):
108
+ Controls the delta between frequencies between dimensions
109
+ scale (float):
110
+ Scaling factor applied to the embeddings.
111
+ max_period (int):
112
+ Controls the maximum frequency of the embeddings
113
+ Returns
114
+ torch.Tensor: an [N x dim] Tensor of positional embeddings.
115
+ """
116
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
117
+
118
+ half_dim = embedding_dim // 2
119
+ exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
120
+ exponent = exponent / (half_dim - downscale_freq_shift)
121
+
122
+ emb = torch.exp(exponent)
123
+ emb = timesteps[:, None].float() * emb[None, :]
124
+
125
+ # scale embeddings
126
+ emb = scale * emb
127
+
128
+ # concat sine and cosine embeddings
129
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
130
+
131
+ # flip sine and cosine embeddings
132
+ if flip_sin_to_cos:
133
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
134
+
135
+ # zero pad
136
+ if embedding_dim % 2 == 1:
137
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
138
+ return emb
139
+
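# Shape sketch for the sinusoidal embedding above (illustrative only): for N timesteps and an even
# embedding_dim the result is (N, embedding_dim), with the two halves holding the sin and cos terms
# (swapped when flip_sin_to_cos=True, as the Timesteps module below configures it).
_t_emb = get_timestep_embedding(torch.tensor([0.0, 500.0]), 256, flip_sin_to_cos=True, downscale_freq_shift=0)
assert _t_emb.shape == (2, 256)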
140
+
141
+ class TimestepEmbedding(nn.Module):
142
+ def __init__(
143
+ self,
144
+ in_channels: int,
145
+ time_embed_dim: int,
146
+ act_fn: str = "silu",
147
+ out_dim: int = None,
148
+ post_act_fn: Optional[str] = None,
149
+ cond_proj_dim=None,
150
+ sample_proj_bias=True,
151
+ ):
152
+ super().__init__()
153
+
154
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
155
+
156
+ if cond_proj_dim is not None:
157
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
158
+ else:
159
+ self.cond_proj = None
160
+
161
+ self.act = get_activation(act_fn)
162
+
163
+ if out_dim is not None:
164
+ time_embed_dim_out = out_dim
165
+ else:
166
+ time_embed_dim_out = time_embed_dim
167
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
168
+
169
+ if post_act_fn is None:
170
+ self.post_act = None
171
+ else:
172
+ self.post_act = get_activation(post_act_fn)
173
+
174
+ def forward(self, sample, condition=None):
175
+ if condition is not None:
176
+ sample = sample + self.cond_proj(condition)
177
+ sample = self.linear_1(sample)
178
+
179
+ if self.act is not None:
180
+ sample = self.act(sample)
181
+
182
+ sample = self.linear_2(sample)
183
+
184
+ if self.post_act is not None:
185
+ sample = self.post_act(sample)
186
+ return sample
187
+
188
+
189
+ class Timesteps(nn.Module):
190
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
191
+ super().__init__()
192
+ self.num_channels = num_channels
193
+ self.flip_sin_to_cos = flip_sin_to_cos
194
+ self.downscale_freq_shift = downscale_freq_shift
195
+ self.scale = scale
196
+
197
+ def forward(self, timesteps):
198
+ t_emb = get_timestep_embedding(
199
+ timesteps,
200
+ self.num_channels,
201
+ flip_sin_to_cos=self.flip_sin_to_cos,
202
+ downscale_freq_shift=self.downscale_freq_shift,
203
+ scale=self.scale,
204
+ )
205
+ return t_emb
206
+
207
+
208
+ class FP32SiLU(nn.Module):
209
+ r"""
210
+ SiLU activation function with input upcasted to torch.float32.
211
+ """
212
+
213
+ def __init__(self):
214
+ super().__init__()
215
+
216
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
217
+ return F.silu(inputs.float(), inplace=False).to(inputs.dtype)
218
+
219
+
220
+ class GELU(nn.Module):
221
+ r"""
222
+ GELU activation function with tanh approximation support with `approximate="tanh"`.
223
+
224
+ Parameters:
225
+ dim_in (`int`): The number of channels in the input.
226
+ dim_out (`int`): The number of channels in the output.
227
+ approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
228
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
229
+ """
230
+
231
+ def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True):
232
+ super().__init__()
233
+ self.proj = nn.Linear(dim_in, dim_out, bias=bias)
234
+ self.approximate = approximate
235
+
236
+ def gelu(self, gate: torch.Tensor) -> torch.Tensor:
237
+ # if gate.device.type == "mps" and is_torch_version("<", "2.0.0"):
238
+ # # fp16 gelu not supported on mps before torch 2.0
239
+ # return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
240
+ return F.gelu(gate, approximate=self.approximate)
241
+
242
+ def forward(self, hidden_states):
243
+ hidden_states = self.proj(hidden_states)
244
+ hidden_states = self.gelu(hidden_states)
245
+ return hidden_states
246
+
247
+
248
+ class PixArtAlphaTextProjection(nn.Module):
249
+ """
250
+ Projects caption embeddings. Also handles dropout for classifier-free guidance.
251
+
252
+ Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
253
+ """
254
+
255
+ def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"):
256
+ super().__init__()
257
+ if out_features is None:
258
+ out_features = hidden_size
259
+ self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
260
+ if act_fn == "gelu_tanh":
261
+ self.act_1 = nn.GELU(approximate="tanh")
262
+ elif act_fn == "silu":
263
+ self.act_1 = nn.SiLU()
264
+ elif act_fn == "silu_fp32":
265
+ self.act_1 = FP32SiLU()
266
+ else:
267
+ raise ValueError(f"Unknown activation function: {act_fn}")
268
+ self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True)
269
+
270
+ def forward(self, caption):
271
+ hidden_states = self.linear_1(caption)
272
+ hidden_states = self.act_1(hidden_states)
273
+ hidden_states = self.linear_2(hidden_states)
274
+ return hidden_states
275
+
276
+
277
+ class LayerNormFramePack(nn.LayerNorm):
278
+ # casting to dtype of input tensor is added
279
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
280
+ return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps).to(x)
281
+
282
+
283
+ class FP32LayerNormFramePack(nn.LayerNorm):
284
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
285
+ origin_dtype = x.dtype
286
+ return torch.nn.functional.layer_norm(
287
+ x.float(),
288
+ self.normalized_shape,
289
+ self.weight.float() if self.weight is not None else None,
290
+ self.bias.float() if self.bias is not None else None,
291
+ self.eps,
292
+ ).to(origin_dtype)
293
+
294
+
295
+ class RMSNormFramePack(nn.Module):
296
+ r"""
297
+ RMS Norm as introduced in https://arxiv.org/abs/1910.07467 by Zhang et al.
298
+
299
+ Args:
300
+ dim (`int`): Number of dimensions to use for `weights`. Only effective when `elementwise_affine` is True.
301
+ eps (`float`): Small value to use when calculating the reciprocal of the square-root.
302
+ elementwise_affine (`bool`, defaults to `True`):
303
+ Boolean flag to denote if affine transformation should be applied.
304
+ bias (`bool`, defaults to `False`): Whether to also learn a `bias` parameter.
305
+ """
306
+
307
+ def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False):
308
+ super().__init__()
309
+
310
+ self.eps = eps
311
+ self.elementwise_affine = elementwise_affine
312
+
313
+ if isinstance(dim, numbers.Integral):
314
+ dim = (dim,)
315
+
316
+ self.dim = torch.Size(dim)
317
+
318
+ self.weight = None
319
+ self.bias = None
320
+
321
+ if elementwise_affine:
322
+ self.weight = nn.Parameter(torch.ones(dim))
323
+ if bias:
324
+ self.bias = nn.Parameter(torch.zeros(dim))
325
+
326
+ def forward(self, hidden_states):
327
+ input_dtype = hidden_states.dtype
328
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
329
+ hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
330
+
331
+ if self.weight is None:
332
+ return hidden_states.to(input_dtype)
333
+
334
+ return hidden_states.to(input_dtype) * self.weight.to(input_dtype)
335
+
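# Numerical sketch of the RMSNorm above (illustrative only): y = x * rsqrt(mean(x^2) + eps) * weight,
# with the statistics computed in float32 and the result cast back to the input dtype.
_rms = RMSNormFramePack(8, eps=1e-6)
_x = torch.randn(2, 4, 8)
_ref = _x * torch.rsqrt(_x.pow(2).mean(-1, keepdim=True) + 1e-6) * _rms.weight
assert torch.allclose(_rms(_x), _ref, atol=1e-6)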
336
+
337
+ class AdaLayerNormContinuousFramePack(nn.Module):
338
+ r"""
339
+ Adaptive normalization layer with a norm layer (layer_norm or rms_norm).
340
+
341
+ Args:
342
+ embedding_dim (`int`): Embedding dimension to use during projection.
343
+ conditioning_embedding_dim (`int`): Dimension of the input condition.
344
+ elementwise_affine (`bool`, defaults to `True`):
345
+ Boolean flag to denote if affine transformation should be applied.
346
+ eps (`float`, defaults to 1e-5): Epsilon factor.
347
+ bias (`bool`, defaults to `True`): Boolean flag to denote if bias should be used.
348
+ norm_type (`str`, defaults to `"layer_norm"`):
349
+ Normalization layer to use. Values supported: "layer_norm", "rms_norm".
350
+ """
351
+
352
+ def __init__(
353
+ self,
354
+ embedding_dim: int,
355
+ conditioning_embedding_dim: int,
356
+ # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
357
+ # because the output is immediately scaled and shifted by the projected conditioning embeddings.
358
+ # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
359
+ # However, this is how it was implemented in the original code, and it's rather likely you should
360
+ # set `elementwise_affine` to False.
361
+ elementwise_affine=True,
362
+ eps=1e-5,
363
+ bias=True,
364
+ norm_type="layer_norm",
365
+ ):
366
+ super().__init__()
367
+ self.silu = nn.SiLU()
368
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
369
+ if norm_type == "layer_norm":
370
+ self.norm = LayerNormFramePack(embedding_dim, eps, elementwise_affine, bias)
371
+ elif norm_type == "rms_norm":
372
+ self.norm = RMSNormFramePack(embedding_dim, eps, elementwise_affine)
373
+ else:
374
+ raise ValueError(f"unknown norm_type {norm_type}")
375
+
376
+ def forward(self, x, conditioning_embedding):
377
+ emb = self.linear(self.silu(conditioning_embedding))
378
+ scale, shift = emb.chunk(2, dim=1)
379
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
380
+ return x
381
+
382
+
383
+ class LinearActivation(nn.Module):
384
+ def __init__(self, dim_in: int, dim_out: int, bias: bool = True, activation: str = "silu"):
385
+ super().__init__()
386
+
387
+ self.proj = nn.Linear(dim_in, dim_out, bias=bias)
388
+ self.activation = get_activation(activation)
389
+
390
+ def forward(self, hidden_states):
391
+ hidden_states = self.proj(hidden_states)
392
+ return self.activation(hidden_states)
393
+
394
+
395
+ class FeedForward(nn.Module):
396
+ r"""
397
+ A feed-forward layer.
398
+
399
+ Parameters:
400
+ dim (`int`): The number of channels in the input.
401
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
402
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
403
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
404
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
405
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
406
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
407
+ """
408
+
409
+ def __init__(
410
+ self,
411
+ dim: int,
412
+ dim_out: Optional[int] = None,
413
+ mult: int = 4,
414
+ dropout: float = 0.0,
415
+ activation_fn: str = "geglu",
416
+ final_dropout: bool = False,
417
+ inner_dim=None,
418
+ bias: bool = True,
419
+ ):
420
+ super().__init__()
421
+ if inner_dim is None:
422
+ inner_dim = int(dim * mult)
423
+ dim_out = dim_out if dim_out is not None else dim
424
+
425
+ # if activation_fn == "gelu":
426
+ # act_fn = GELU(dim, inner_dim, bias=bias)
427
+ if activation_fn == "gelu-approximate":
428
+ act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
429
+ # elif activation_fn == "geglu":
430
+ # act_fn = GEGLU(dim, inner_dim, bias=bias)
431
+ # elif activation_fn == "geglu-approximate":
432
+ # act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
433
+ # elif activation_fn == "swiglu":
434
+ # act_fn = SwiGLU(dim, inner_dim, bias=bias)
435
+ elif activation_fn == "linear-silu":
436
+ act_fn = LinearActivation(dim, inner_dim, bias=bias, activation="silu")
437
+ else:
438
+ raise ValueError(f"Unknown activation function: {activation_fn}")
439
+
440
+ self.net = nn.ModuleList([])
441
+ # project in
442
+ self.net.append(act_fn)
443
+ # project dropout
444
+ self.net.append(nn.Dropout(dropout))
445
+ # project out
446
+ self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
447
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
448
+ if final_dropout:
449
+ self.net.append(nn.Dropout(dropout))
450
+
451
+ def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
452
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
453
+ # deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
454
+ # deprecate("scale", "1.0.0", deprecation_message)
455
+ raise ValueError("scale is not supported in this version. Please remove it.")
456
+ for module in self.net:
457
+ hidden_states = module(hidden_states)
458
+ return hidden_states
459
+
460
+
461
+ # @maybe_allow_in_graph
462
+ class Attention(nn.Module):
463
+ r"""
464
+ Minimal copy of Attention class from diffusers.
465
+ """
466
+
467
+ def __init__(
468
+ self,
469
+ query_dim: int,
470
+ cross_attention_dim: Optional[int] = None,
471
+ heads: int = 8,
472
+ dim_head: int = 64,
473
+ bias: bool = False,
474
+ qk_norm: Optional[str] = None,
475
+ added_kv_proj_dim: Optional[int] = None,
476
+ eps: float = 1e-5,
477
+ processor: Optional[any] = None,
478
+ out_dim: int = None,
479
+ context_pre_only=None,
480
+ pre_only=False,
481
+ ):
482
+ super().__init__()
483
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
484
+ self.inner_kv_dim = self.inner_dim # if kv_heads is None else dim_head * kv_heads
485
+ self.query_dim = query_dim
486
+ self.use_bias = bias
487
+ self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
488
+ self.out_dim = out_dim if out_dim is not None else query_dim
489
+ self.out_context_dim = query_dim
490
+ self.context_pre_only = context_pre_only
491
+ self.pre_only = pre_only
492
+
493
+ self.scale = dim_head**-0.5
494
+ self.heads = out_dim // dim_head if out_dim is not None else heads
495
+
496
+ self.added_kv_proj_dim = added_kv_proj_dim
497
+
498
+ if qk_norm is None:
499
+ self.norm_q = None
500
+ self.norm_k = None
501
+ elif qk_norm == "rms_norm":
502
+ self.norm_q = RMSNormFramePack(dim_head, eps=eps)
503
+ self.norm_k = RMSNormFramePack(dim_head, eps=eps)
504
+ else:
505
+ raise ValueError(
506
+ f"unknown qk_norm: {qk_norm}. Should be one of None, 'layer_norm', 'fp32_layer_norm', 'layer_norm_across_heads', 'rms_norm', 'rms_norm_across_heads', 'l2'."
507
+ )
508
+
509
+ self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
510
+ self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
511
+ self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
512
+
513
+ self.added_proj_bias = True # added_proj_bias
514
+ if self.added_kv_proj_dim is not None:
515
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=True)
516
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=True)
517
+ if self.context_pre_only is not None:
518
+ self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
519
+ else:
520
+ self.add_q_proj = None
521
+ self.add_k_proj = None
522
+ self.add_v_proj = None
523
+
524
+ if not self.pre_only:
525
+ self.to_out = nn.ModuleList([])
526
+ self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=True))
527
+ # self.to_out.append(nn.Dropout(dropout))
528
+ self.to_out.append(nn.Identity()) # dropout=0.0
529
+ else:
530
+ self.to_out = None
531
+
532
+ if self.context_pre_only is not None and not self.context_pre_only:
533
+ self.to_add_out = nn.Linear(self.inner_dim, self.out_context_dim, bias=True)
534
+ else:
535
+ self.to_add_out = None
536
+
537
+ if qk_norm is not None and added_kv_proj_dim is not None:
538
+ if qk_norm == "rms_norm":
539
+ self.norm_added_q = RMSNormFramePack(dim_head, eps=eps)
540
+ self.norm_added_k = RMSNormFramePack(dim_head, eps=eps)
541
+ else:
542
+ raise ValueError(f"unknown qk_norm: {qk_norm}. Should be one of `None,'layer_norm','fp32_layer_norm','rms_norm'`")
543
+ else:
544
+ self.norm_added_q = None
545
+ self.norm_added_k = None
546
+
547
+ # set attention processor
548
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
549
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
550
+ if processor is None:
551
+ processor = AttnProcessor2_0()
552
+ self.set_processor(processor)
553
+
554
+ def set_processor(self, processor: any) -> None:
555
+ self.processor = processor
556
+
557
+ def get_processor(self) -> any:
558
+ return self.processor
559
+
560
+ def forward(
561
+ self,
562
+ hidden_states: torch.Tensor,
563
+ encoder_hidden_states: Optional[torch.Tensor] = None,
564
+ attention_mask: Optional[torch.Tensor] = None,
565
+ **cross_attention_kwargs,
566
+ ) -> torch.Tensor:
567
+ return self.processor(
568
+ self,
569
+ hidden_states,
570
+ encoder_hidden_states=encoder_hidden_states,
571
+ attention_mask=attention_mask,
572
+ **cross_attention_kwargs,
573
+ )
574
+
575
+ def prepare_attention_mask(
576
+ self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
577
+ ) -> torch.Tensor:
578
+ r"""
579
+ Prepare the attention mask for the attention computation.
580
+
581
+ Args:
582
+ attention_mask (`torch.Tensor`):
583
+ The attention mask to prepare.
584
+ target_length (`int`):
585
+ The target length of the attention mask. This is the length of the attention mask after padding.
586
+ batch_size (`int`):
587
+ The batch size, which is used to repeat the attention mask.
588
+ out_dim (`int`, *optional*, defaults to `3`):
589
+ The output dimension of the attention mask. Can be either `3` or `4`.
590
+
591
+ Returns:
592
+ `torch.Tensor`: The prepared attention mask.
593
+ """
594
+ head_size = self.heads
595
+ if attention_mask is None:
596
+ return attention_mask
597
+
598
+ current_length: int = attention_mask.shape[-1]
599
+ if current_length != target_length:
600
+ if attention_mask.device.type == "mps":
601
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
602
+ # Instead, we can manually construct the padding tensor.
603
+ padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
604
+ padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
605
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
606
+ else:
607
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
608
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
609
+ # remaining_length: int = target_length - current_length
610
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
611
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
612
+
613
+ if out_dim == 3:
614
+ if attention_mask.shape[0] < batch_size * head_size:
615
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0, output_size=attention_mask.shape[0] * head_size)
616
+ elif out_dim == 4:
617
+ attention_mask = attention_mask.unsqueeze(1)
618
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1, output_size=attention_mask.shape[1] * head_size)
619
+
620
+ return attention_mask
621
+
622
+
623
+ class AttnProcessor2_0:
624
+ r"""
625
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
626
+ """
627
+
628
+ def __init__(self):
629
+ if not hasattr(F, "scaled_dot_product_attention"):
630
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
631
+
632
+ def __call__(
633
+ self,
634
+ attn: Attention,
635
+ hidden_states: torch.Tensor,
636
+ encoder_hidden_states: Optional[torch.Tensor] = None,
637
+ attention_mask: Optional[torch.Tensor] = None,
638
+ temb: Optional[torch.Tensor] = None,
639
+ *args,
640
+ **kwargs,
641
+ ) -> torch.Tensor:
642
+ input_ndim = hidden_states.ndim
643
+
644
+ if input_ndim == 4:
645
+ batch_size, channel, height, width = hidden_states.shape
646
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
647
+
648
+ batch_size, sequence_length, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
649
+
650
+ if attention_mask is not None:
651
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
652
+ # scaled_dot_product_attention expects attention_mask shape to be
653
+ # (batch, heads, source_length, target_length)
654
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
655
+
656
+ query = attn.to_q(hidden_states)
657
+ query_dtype = query.dtype # store dtype before potentially deleting query
658
+
659
+ if encoder_hidden_states is None:
660
+ encoder_hidden_states = hidden_states
661
+
662
+ key = attn.to_k(encoder_hidden_states)
663
+ value = attn.to_v(encoder_hidden_states)
664
+
665
+ inner_dim = key.shape[-1]
666
+ head_dim = inner_dim // attn.heads
667
+
668
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
669
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
670
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
671
+
672
+ if attn.norm_q is not None:
673
+ query = attn.norm_q(query)
674
+ if attn.norm_k is not None:
675
+ key = attn.norm_k(key)
676
+
677
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
678
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
679
+ del query, key, value, attention_mask # free memory
680
+
681
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
682
+ hidden_states = hidden_states.to(query_dtype) # use stored dtype
683
+
684
+ # linear proj
685
+ hidden_states = attn.to_out[0](hidden_states)
686
+ # dropout
687
+ hidden_states = attn.to_out[1](hidden_states)
688
+
689
+ if input_ndim == 4:
690
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
691
+
692
+ return hidden_states
693
+
694
+
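# Minimal smoke test for the trimmed-down Attention / AttnProcessor2_0 pair above (illustrative
# only; the token refiner below relies on this default SDPA processor, while the video transformer
# blocks swap in the FramePack-specific processors defined later in this file).
_attn = Attention(query_dim=128, heads=8, dim_head=16)
_tokens = torch.randn(2, 10, 128)
assert _attn(_tokens).shape == (2, 10, 128)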
695
+ # endregion diffusers
696
+
697
+
698
+ def pad_for_3d_conv(x, kernel_size):
699
+ b, c, t, h, w = x.shape
700
+ pt, ph, pw = kernel_size
701
+ pad_t = (pt - (t % pt)) % pt
702
+ pad_h = (ph - (h % ph)) % ph
703
+ pad_w = (pw - (w % pw)) % pw
704
+ return torch.nn.functional.pad(x, (0, pad_w, 0, pad_h, 0, pad_t), mode="replicate")
705
+
706
+
707
+ def center_down_sample_3d(x, kernel_size):
708
+ # pt, ph, pw = kernel_size
709
+ # cp = (pt * ph * pw) // 2
710
+ # xp = einops.rearrange(x, 'b c (t pt) (h ph) (w pw) -> (pt ph pw) b c t h w', pt=pt, ph=ph, pw=pw)
711
+ # xc = xp[cp]
712
+ # return xc
713
+ return torch.nn.functional.avg_pool3d(x, kernel_size, stride=kernel_size)
714
+
715
+
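# Padding sketch (illustrative only): pad_for_3d_conv replicate-pads T/H/W up to the next multiple
# of the kernel size, so center_down_sample_3d always pools over complete windows.
_x3d = torch.randn(1, 4, 5, 9, 9)
_p3d = pad_for_3d_conv(_x3d, (2, 4, 4))
assert _p3d.shape == (1, 4, 6, 12, 12)
assert center_down_sample_3d(_p3d, (2, 4, 4)).shape == (1, 4, 3, 3, 3)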
716
+ def get_cu_seqlens(text_mask, img_len):
717
+ batch_size = text_mask.shape[0]
718
+ text_len = text_mask.sum(dim=1)
719
+ max_len = text_mask.shape[1] + img_len
720
+
721
+ cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=text_mask.device) # ensure device match
722
+
723
+ for i in range(batch_size):
724
+ s = text_len[i] + img_len
725
+ s1 = i * max_len + s
726
+ s2 = (i + 1) * max_len
727
+ cu_seqlens[2 * i + 1] = s1
728
+ cu_seqlens[2 * i + 2] = s2
729
+
730
+ return cu_seqlens
731
+
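# Worked example (illustrative only): a batch of 2 prompts padded to 50 text tokens with 20 and 30
# valid tokens, plus 100 image tokens each. Every sample contributes two entries to the packed
# cu_seqlens layout: its valid (image + text) tokens and its padding tail.
_mask = torch.zeros(2, 50, dtype=torch.bool)
_mask[0, :20] = True
_mask[1, :30] = True
assert get_cu_seqlens(_mask, 100).tolist() == [0, 120, 150, 280, 300]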
732
+
733
+ def apply_rotary_emb_transposed(x, freqs_cis):
734
+ cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1)
735
+ del freqs_cis
736
+ x_real, x_imag = x.unflatten(-1, (-1, 2)).unbind(-1)
737
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
738
+ del x_real, x_imag
739
+ return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
740
+
741
+
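# Shape sketch for the transposed RoPE helper above (illustrative only): x is
# (batch, seq, heads, head_dim) and freqs_cis packs cos||sin along its last axis,
# i.e. (batch, seq, 2 * head_dim), broadcast over the head axis.
_xq = torch.randn(1, 8, 4, 6)
_freqs = torch.randn(1, 8, 12)
assert apply_rotary_emb_transposed(_xq, _freqs).shape == (1, 8, 4, 6)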
742
+ def attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=None, split_attn=False):
743
+ if cu_seqlens_q is None and cu_seqlens_kv is None and max_seqlen_q is None and max_seqlen_kv is None:
744
+ if attn_mode == "sageattn" or attn_mode is None and sageattn is not None:
745
+ x = sageattn(q, k, v, tensor_layout="NHD")
746
+ return x
747
+
748
+ if attn_mode == "flash" or attn_mode is None and flash_attn_func is not None:
749
+ x = flash_attn_func(q, k, v)
750
+ return x
751
+
752
+ if attn_mode == "xformers" or attn_mode is None and xformers_attn_func is not None:
753
+ x = xformers_attn_func(q, k, v)
754
+ return x
755
+
756
+ x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(
757
+ 1, 2
758
+ )
759
+ return x
760
+ if split_attn:
761
+ if attn_mode == "sageattn" or attn_mode is None and sageattn is not None:
762
+ x = torch.empty_like(q)
763
+ for i in range(q.size(0)):
764
+ x[i : i + 1] = sageattn(q[i : i + 1], k[i : i + 1], v[i : i + 1], tensor_layout="NHD")
765
+ return x
766
+
767
+ if attn_mode == "flash" or attn_mode is None and flash_attn_func is not None:
768
+ x = torch.empty_like(q)
769
+ for i in range(q.size(0)):
770
+ x[i : i + 1] = flash_attn_func(q[i : i + 1], k[i : i + 1], v[i : i + 1])
771
+ return x
772
+
773
+ if attn_mode == "xformers" or attn_mode is None and xformers_attn_func is not None:
774
+ x = torch.empty_like(q)
775
+ for i in range(q.size(0)):
776
+ x[i : i + 1] = xformers_attn_func(q[i : i + 1], k[i : i + 1], v[i : i + 1])
777
+ return x
778
+
779
+ q = q.transpose(1, 2)
780
+ k = k.transpose(1, 2)
781
+ v = v.transpose(1, 2)
782
+ x = torch.empty_like(q)
783
+ for i in range(q.size(0)):
784
+ x[i : i + 1] = torch.nn.functional.scaled_dot_product_attention(q[i : i + 1], k[i : i + 1], v[i : i + 1])
785
+ x = x.transpose(1, 2)
786
+ return x
787
+
788
+ batch_size = q.shape[0]
789
+ q = q.view(q.shape[0] * q.shape[1], *q.shape[2:])
790
+ k = k.view(k.shape[0] * k.shape[1], *k.shape[2:])
791
+ v = v.view(v.shape[0] * v.shape[1], *v.shape[2:])
792
+ if attn_mode == "sageattn" or attn_mode is None and sageattn_varlen is not None:
793
+ x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
794
+ del q, k, v # free memory
795
+ elif attn_mode == "flash" or attn_mode is None and flash_attn_varlen_func is not None:
796
+ x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
797
+ del q, k, v # free memory
798
+ else:
799
+ raise NotImplementedError("No Attn Installed!")
800
+ x = x.view(batch_size, max_seqlen_q, *x.shape[2:])
801
+ return x
802
+
803
+
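# Backend-selection sketch (illustrative only): with no cu_seqlens, attn_varlen_func tries
# SageAttention, then FlashAttention, then xformers, and finally falls back to PyTorch SDPA.
# Any attn_mode string other than the named backends (the hypothetical "torch" below) also
# lands on the SDPA fallback, regardless of which optional libraries are installed.
_q = torch.randn(1, 16, 8, 64)  # (batch, seq, heads, head_dim)
assert attn_varlen_func(_q, _q, _q, None, None, None, None, attn_mode="torch").shape == _q.shape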
804
+ class HunyuanAttnProcessorFlashAttnDouble:
805
+ def __call__(
806
+ self,
807
+ attn: Attention,
808
+ hidden_states,
809
+ encoder_hidden_states,
810
+ attention_mask,
811
+ image_rotary_emb,
812
+ attn_mode: Optional[str] = None,
813
+ split_attn: Optional[bool] = False,
814
+ ):
815
+ cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
816
+
817
+ # Project image latents
818
+ query = attn.to_q(hidden_states)
819
+ key = attn.to_k(hidden_states)
820
+ value = attn.to_v(hidden_states)
821
+ del hidden_states # free memory
822
+
823
+ query = query.unflatten(2, (attn.heads, -1))
824
+ key = key.unflatten(2, (attn.heads, -1))
825
+ value = value.unflatten(2, (attn.heads, -1))
826
+
827
+ query = attn.norm_q(query)
828
+ key = attn.norm_k(key)
829
+
830
+ query = apply_rotary_emb_transposed(query, image_rotary_emb)
831
+ key = apply_rotary_emb_transposed(key, image_rotary_emb)
832
+ del image_rotary_emb # free memory
833
+
834
+ # Project context (text/encoder) embeddings
835
+ encoder_query = attn.add_q_proj(encoder_hidden_states)
836
+ encoder_key = attn.add_k_proj(encoder_hidden_states)
837
+ encoder_value = attn.add_v_proj(encoder_hidden_states)
838
+ txt_length = encoder_hidden_states.shape[1] # store length before deleting
839
+ del encoder_hidden_states # free memory
840
+
841
+ encoder_query = encoder_query.unflatten(2, (attn.heads, -1))
842
+ encoder_key = encoder_key.unflatten(2, (attn.heads, -1))
843
+ encoder_value = encoder_value.unflatten(2, (attn.heads, -1))
844
+
845
+ encoder_query = attn.norm_added_q(encoder_query)
846
+ encoder_key = attn.norm_added_k(encoder_key)
847
+
848
+ # Concatenate image and context q, k, v
849
+ query = torch.cat([query, encoder_query], dim=1)
850
+ key = torch.cat([key, encoder_key], dim=1)
851
+ value = torch.cat([value, encoder_value], dim=1)
852
+ del encoder_query, encoder_key, encoder_value # free memory
853
+
854
+ hidden_states_attn = attn_varlen_func(
855
+ query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=attn_mode, split_attn=split_attn
856
+ )
857
+ del query, key, value # free memory
858
+ hidden_states_attn = hidden_states_attn.flatten(-2)
859
+
860
+ hidden_states, encoder_hidden_states = hidden_states_attn[:, :-txt_length], hidden_states_attn[:, -txt_length:]
861
+ del hidden_states_attn # free memory
862
+
863
+ # Apply output projections
864
+ hidden_states = attn.to_out[0](hidden_states)
865
+ hidden_states = attn.to_out[1](hidden_states) # Dropout/Identity
866
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
867
+
868
+ return hidden_states, encoder_hidden_states
869
+
870
+
871
+ class HunyuanAttnProcessorFlashAttnSingle:
872
+ def __call__(
873
+ self,
874
+ attn: Attention,
875
+ hidden_states,
876
+ encoder_hidden_states,
877
+ attention_mask,
878
+ image_rotary_emb,
879
+ attn_mode: Optional[str] = None,
880
+ split_attn: Optional[bool] = False,
881
+ ):
882
+ cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
883
+ txt_length = encoder_hidden_states.shape[1] # Store text length
884
+
885
+ # Concatenate image and context inputs
886
+ hidden_states_cat = torch.cat([hidden_states, encoder_hidden_states], dim=1)
887
+ del hidden_states, encoder_hidden_states # free memory
888
+
889
+ # Project concatenated inputs
890
+ query = attn.to_q(hidden_states_cat)
891
+ key = attn.to_k(hidden_states_cat)
892
+ value = attn.to_v(hidden_states_cat)
893
+ del hidden_states_cat # free memory
894
+
895
+ query = query.unflatten(2, (attn.heads, -1))
896
+ key = key.unflatten(2, (attn.heads, -1))
897
+ value = value.unflatten(2, (attn.heads, -1))
898
+
899
+ query = attn.norm_q(query)
900
+ key = attn.norm_k(key)
901
+
902
+ query = torch.cat([apply_rotary_emb_transposed(query[:, :-txt_length], image_rotary_emb), query[:, -txt_length:]], dim=1)
903
+ key = torch.cat([apply_rotary_emb_transposed(key[:, :-txt_length], image_rotary_emb), key[:, -txt_length:]], dim=1)
904
+ del image_rotary_emb # free memory
905
+
906
+ hidden_states = attn_varlen_func(
907
+ query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=attn_mode, split_attn=split_attn
908
+ )
909
+ del query, key, value # free memory
910
+ hidden_states = hidden_states.flatten(-2)
911
+
912
+ hidden_states, encoder_hidden_states = hidden_states[:, :-txt_length], hidden_states[:, -txt_length:]
913
+
914
+ return hidden_states, encoder_hidden_states
915
+
916
+
917
+ class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
918
+ def __init__(self, embedding_dim, pooled_projection_dim):
919
+ super().__init__()
920
+
921
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
922
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
923
+ self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
924
+ self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
925
+
926
+ def forward(self, timestep, guidance, pooled_projection):
927
+ timesteps_proj = self.time_proj(timestep)
928
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
929
+
930
+ guidance_proj = self.time_proj(guidance)
931
+ guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))
932
+
933
+ time_guidance_emb = timesteps_emb + guidance_emb
934
+
935
+ pooled_projections = self.text_embedder(pooled_projection)
936
+ conditioning = time_guidance_emb + pooled_projections
937
+
938
+ return conditioning
939
+
940
+
941
+ class CombinedTimestepTextProjEmbeddings(nn.Module):
942
+ def __init__(self, embedding_dim, pooled_projection_dim):
943
+ super().__init__()
944
+
945
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
946
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
947
+ self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
948
+
949
+ def forward(self, timestep, pooled_projection):
950
+ timesteps_proj = self.time_proj(timestep)
951
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
952
+
953
+ pooled_projections = self.text_embedder(pooled_projection)
954
+
955
+ conditioning = timesteps_emb + pooled_projections
956
+
957
+ return conditioning
958
+
959
+
960
+ class HunyuanVideoAdaNorm(nn.Module):
961
+ def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
962
+ super().__init__()
963
+
964
+ out_features = out_features or 2 * in_features
965
+ self.linear = nn.Linear(in_features, out_features)
966
+ self.nonlinearity = nn.SiLU()
967
+
968
+ def forward(self, temb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
969
+ temb = self.linear(self.nonlinearity(temb))
970
+ gate_msa, gate_mlp = temb.chunk(2, dim=-1)
971
+ gate_msa, gate_mlp = gate_msa.unsqueeze(1), gate_mlp.unsqueeze(1)
972
+ return gate_msa, gate_mlp
973
+
974
+
975
+ class HunyuanVideoIndividualTokenRefinerBlock(nn.Module):
976
+ def __init__(
977
+ self,
978
+ num_attention_heads: int,
979
+ attention_head_dim: int,
980
+ mlp_width_ratio: float = 4.0,
981
+ mlp_drop_rate: float = 0.0,
982
+ attention_bias: bool = True,
983
+ ) -> None:
984
+ super().__init__()
985
+
986
+ hidden_size = num_attention_heads * attention_head_dim
987
+
988
+ self.norm1 = LayerNormFramePack(hidden_size, elementwise_affine=True, eps=1e-6)
989
+ self.attn = Attention(
990
+ query_dim=hidden_size,
991
+ cross_attention_dim=None,
992
+ heads=num_attention_heads,
993
+ dim_head=attention_head_dim,
994
+ bias=attention_bias,
995
+ )
996
+
997
+ self.norm2 = LayerNormFramePack(hidden_size, elementwise_affine=True, eps=1e-6)
998
+ self.ff = FeedForward(hidden_size, mult=mlp_width_ratio, activation_fn="linear-silu", dropout=mlp_drop_rate)
999
+
1000
+ self.norm_out = HunyuanVideoAdaNorm(hidden_size, 2 * hidden_size)
1001
+
1002
+ def forward(
1003
+ self,
1004
+ hidden_states: torch.Tensor,
1005
+ temb: torch.Tensor,
1006
+ attention_mask: Optional[torch.Tensor] = None,
1007
+ ) -> torch.Tensor:
1008
+ norm_hidden_states = self.norm1(hidden_states)
1009
+
1010
+ # Self-attention
1011
+ attn_output = self.attn(
1012
+ hidden_states=norm_hidden_states,
1013
+ encoder_hidden_states=None,
1014
+ attention_mask=attention_mask,
1015
+ )
1016
+ del norm_hidden_states # free memory
1017
+
1018
+ gate_msa, gate_mlp = self.norm_out(temb)
1019
+ hidden_states = hidden_states + attn_output * gate_msa
1020
+ del attn_output, gate_msa # free memory
1021
+
1022
+ ff_output = self.ff(self.norm2(hidden_states))
1023
+ hidden_states = hidden_states + ff_output * gate_mlp
1024
+ del ff_output, gate_mlp # free memory
1025
+
1026
+ return hidden_states
1027
+
1028
+
1029
+ class HunyuanVideoIndividualTokenRefiner(nn.Module):
1030
+ def __init__(
1031
+ self,
1032
+ num_attention_heads: int,
1033
+ attention_head_dim: int,
1034
+ num_layers: int,
1035
+ mlp_width_ratio: float = 4.0,
1036
+ mlp_drop_rate: float = 0.0,
1037
+ attention_bias: bool = True,
1038
+ ) -> None:
1039
+ super().__init__()
1040
+
1041
+ self.refiner_blocks = nn.ModuleList(
1042
+ [
1043
+ HunyuanVideoIndividualTokenRefinerBlock(
1044
+ num_attention_heads=num_attention_heads,
1045
+ attention_head_dim=attention_head_dim,
1046
+ mlp_width_ratio=mlp_width_ratio,
1047
+ mlp_drop_rate=mlp_drop_rate,
1048
+ attention_bias=attention_bias,
1049
+ )
1050
+ for _ in range(num_layers)
1051
+ ]
1052
+ )
1053
+
1054
+ def forward(
1055
+ self,
1056
+ hidden_states: torch.Tensor,
1057
+ temb: torch.Tensor,
1058
+ attention_mask: Optional[torch.Tensor] = None,
1059
+ ) -> torch.Tensor:
1060
+ self_attn_mask = None
1061
+ if attention_mask is not None:
1062
+ batch_size = attention_mask.shape[0]
1063
+ seq_len = attention_mask.shape[1]
1064
+ attention_mask = attention_mask.to(hidden_states.device).bool()
1065
+ self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
1066
+ self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
1067
+ self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
1068
+ self_attn_mask[:, :, :, 0] = True
1069
+
1070
+ for block in self.refiner_blocks:
1071
+ hidden_states = block(hidden_states, temb, self_attn_mask)
1072
+
1073
+ return hidden_states
1074
+
1075
+
1076
+ class HunyuanVideoTokenRefiner(nn.Module):
1077
+ def __init__(
1078
+ self,
1079
+ in_channels: int,
1080
+ num_attention_heads: int,
1081
+ attention_head_dim: int,
1082
+ num_layers: int,
1083
+ mlp_ratio: float = 4.0,
1084
+ mlp_drop_rate: float = 0.0,
1085
+ attention_bias: bool = True,
1086
+ ) -> None:
1087
+ super().__init__()
1088
+
1089
+ hidden_size = num_attention_heads * attention_head_dim
1090
+
1091
+ self.time_text_embed = CombinedTimestepTextProjEmbeddings(embedding_dim=hidden_size, pooled_projection_dim=in_channels)
1092
+ self.proj_in = nn.Linear(in_channels, hidden_size, bias=True)
1093
+ self.token_refiner = HunyuanVideoIndividualTokenRefiner(
1094
+ num_attention_heads=num_attention_heads,
1095
+ attention_head_dim=attention_head_dim,
1096
+ num_layers=num_layers,
1097
+ mlp_width_ratio=mlp_ratio,
1098
+ mlp_drop_rate=mlp_drop_rate,
1099
+ attention_bias=attention_bias,
1100
+ )
1101
+
1102
+ def forward(
1103
+ self,
1104
+ hidden_states: torch.Tensor,
1105
+ timestep: torch.LongTensor,
1106
+ attention_mask: Optional[torch.LongTensor] = None,
1107
+ ) -> torch.Tensor:
1108
+ if attention_mask is None:
1109
+ pooled_projections = hidden_states.mean(dim=1)
1110
+ else:
1111
+ original_dtype = hidden_states.dtype
1112
+ mask_float = attention_mask.float().unsqueeze(-1)
1113
+ pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
1114
+ pooled_projections = pooled_projections.to(original_dtype)
1115
+
1116
+ temb = self.time_text_embed(timestep, pooled_projections)
1117
+ del pooled_projections # free memory
1118
+
1119
+ hidden_states = self.proj_in(hidden_states)
1120
+ hidden_states = self.token_refiner(hidden_states, temb, attention_mask)
1121
+ del temb, attention_mask # free memory
1122
+
1123
+ return hidden_states
1124
+
1125
+
1126
+ class HunyuanVideoRotaryPosEmbed(nn.Module):
1127
+ def __init__(self, rope_dim, theta):
1128
+ super().__init__()
1129
+ self.DT, self.DY, self.DX = rope_dim
1130
+ self.theta = theta
1131
+
1132
+ @torch.no_grad()
1133
+ def get_frequency(self, dim, pos):
1134
+ T, H, W = pos.shape
1135
+ freqs = 1.0 / (self.theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device)[: (dim // 2)] / dim))
1136
+ freqs = torch.outer(freqs, pos.reshape(-1)).unflatten(-1, (T, H, W)).repeat_interleave(2, dim=0)
1137
+ return freqs.cos(), freqs.sin()
1138
+
1139
+ @torch.no_grad()
1140
+ def forward_inner(self, frame_indices, height, width, device):
1141
+ GT, GY, GX = torch.meshgrid(
1142
+ frame_indices.to(device=device, dtype=torch.float32),
1143
+ torch.arange(0, height, device=device, dtype=torch.float32),
1144
+ torch.arange(0, width, device=device, dtype=torch.float32),
1145
+ indexing="ij",
1146
+ )
1147
+
1148
+ FCT, FST = self.get_frequency(self.DT, GT)
1149
+ del GT # free memory
1150
+ FCY, FSY = self.get_frequency(self.DY, GY)
1151
+ del GY # free memory
1152
+ FCX, FSX = self.get_frequency(self.DX, GX)
1153
+ del GX # free memory
1154
+
1155
+ result = torch.cat([FCT, FCY, FCX, FST, FSY, FSX], dim=0)
1156
+ del FCT, FCY, FCX, FST, FSY, FSX # free memory
1157
+
1158
+ # Return result already on the correct device
1159
+ return result # Shape (2 * total_dim / 2, T, H, W) -> (total_dim, T, H, W)
1160
+
1161
+ @torch.no_grad()
1162
+ def forward(self, frame_indices, height, width, device):
1163
+ frame_indices = frame_indices.unbind(0)
1164
+ results = [self.forward_inner(f, height, width, device) for f in frame_indices]
1165
+ results = torch.stack(results, dim=0)
1166
+ return results
1167
+
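# Shape sketch for the rotary position embedding above (illustrative rope_dim, not the model's
# real configuration): rope_dim = (DT, DY, DX) splits the head dimension across time/height/width,
# and the output stacks cos and sin parts, giving (batch, 2 * (DT + DY + DX), T, H, W).
_rope = HunyuanVideoRotaryPosEmbed((4, 8, 8), theta=256.0)
_freqs3d = _rope(torch.arange(3).unsqueeze(0), height=2, width=2, device="cpu")
assert _freqs3d.shape == (1, 40, 3, 2, 2)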
1168
+
1169
+ class AdaLayerNormZero(nn.Module):
1170
+ def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
1171
+ super().__init__()
1172
+ self.silu = nn.SiLU()
1173
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=bias)
1174
+ if norm_type == "layer_norm":
1175
+ self.norm = LayerNormFramePack(embedding_dim, elementwise_affine=False, eps=1e-6)
1176
+ else:
1177
+ raise ValueError(f"unknown norm_type {norm_type}")
1178
+
1179
+ def forward(
1180
+ self, x: torch.Tensor, emb: Optional[torch.Tensor] = None
1181
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
1182
+ emb = emb.unsqueeze(-2)
1183
+ emb = self.linear(self.silu(emb))
1184
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=-1)
1185
+ x = self.norm(x) * (1 + scale_msa) + shift_msa
1186
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
1187
+
1188
+
1189
+ class AdaLayerNormZeroSingle(nn.Module):
1190
+ def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
1191
+ super().__init__()
1192
+
1193
+ self.silu = nn.SiLU()
1194
+ self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)
1195
+ if norm_type == "layer_norm":
1196
+ self.norm = LayerNormFramePack(embedding_dim, elementwise_affine=False, eps=1e-6)
1197
+ else:
1198
+ raise ValueError(f"unknown norm_type {norm_type}")
1199
+
1200
+ def forward(
1201
+ self,
1202
+ x: torch.Tensor,
1203
+ emb: Optional[torch.Tensor] = None,
1204
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1205
+ emb = emb.unsqueeze(-2)
1206
+ emb = self.linear(self.silu(emb))
1207
+ shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=-1)
1208
+ x = self.norm(x) * (1 + scale_msa) + shift_msa
1209
+ return x, gate_msa
1210
+
1211
+
1212
+ class AdaLayerNormContinuous(nn.Module):
1213
+ def __init__(
1214
+ self,
1215
+ embedding_dim: int,
1216
+ conditioning_embedding_dim: int,
1217
+ elementwise_affine=True,
1218
+ eps=1e-5,
1219
+ bias=True,
1220
+ norm_type="layer_norm",
1221
+ ):
1222
+ super().__init__()
1223
+ self.silu = nn.SiLU()
1224
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
1225
+ if norm_type == "layer_norm":
1226
+ self.norm = LayerNormFramePack(embedding_dim, eps, elementwise_affine, bias)
1227
+ else:
1228
+ raise ValueError(f"unknown norm_type {norm_type}")
1229
+
1230
+ def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
1231
+ emb = emb.unsqueeze(-2)
1232
+ emb = self.linear(self.silu(emb))
1233
+ scale, shift = emb.chunk(2, dim=-1)
1234
+ del emb # free memory
1235
+ x = self.norm(x) * (1 + scale) + shift
1236
+ return x
1237
+
1238
+
1239
+ class HunyuanVideoSingleTransformerBlock(nn.Module):
1240
+ def __init__(
1241
+ self,
1242
+ num_attention_heads: int,
1243
+ attention_head_dim: int,
1244
+ mlp_ratio: float = 4.0,
1245
+ qk_norm: str = "rms_norm",
1246
+ attn_mode: Optional[str] = None,
1247
+ split_attn: Optional[bool] = False,
1248
+ ) -> None:
1249
+ super().__init__()
1250
+
1251
+ hidden_size = num_attention_heads * attention_head_dim
1252
+ mlp_dim = int(hidden_size * mlp_ratio)
1253
+ self.attn_mode = attn_mode
1254
+ self.split_attn = split_attn
1255
+
1256
+ # Attention layer (pre_only=True means no output projection in Attention module itself)
1257
+ self.attn = Attention(
1258
+ query_dim=hidden_size,
1259
+ cross_attention_dim=None,
1260
+ dim_head=attention_head_dim,
1261
+ heads=num_attention_heads,
1262
+ out_dim=hidden_size,
1263
+ bias=True,
1264
+ processor=HunyuanAttnProcessorFlashAttnSingle(),
1265
+ qk_norm=qk_norm,
1266
+ eps=1e-6,
1267
+ pre_only=True, # Crucial: Attn processor will return raw attention output
1268
+ )
1269
+
1270
+ self.norm = AdaLayerNormZeroSingle(hidden_size, norm_type="layer_norm")
1271
+ self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
1272
+ self.act_mlp = nn.GELU(approximate="tanh")
1273
+ self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)
1274
+
1275
+ def forward(
1276
+ self,
1277
+ hidden_states: torch.Tensor,
1278
+ encoder_hidden_states: torch.Tensor,
1279
+ temb: torch.Tensor,
1280
+ attention_mask: Optional[torch.Tensor] = None,
1281
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1282
+ ) -> torch.Tensor:
1283
+ text_seq_length = encoder_hidden_states.shape[1]
1284
+ hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
1285
+ del encoder_hidden_states # free memory
1286
+
1287
+ residual = hidden_states
1288
+
1289
+ # 1. Input normalization
1290
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
1291
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
1292
+
1293
+ norm_hidden_states, norm_encoder_hidden_states = (
1294
+ norm_hidden_states[:, :-text_seq_length, :],
1295
+ norm_hidden_states[:, -text_seq_length:, :],
1296
+ )
1297
+
1298
+ # 2. Attention
1299
+ attn_output, context_attn_output = self.attn(
1300
+ hidden_states=norm_hidden_states,
1301
+ encoder_hidden_states=norm_encoder_hidden_states,
1302
+ attention_mask=attention_mask,
1303
+ image_rotary_emb=image_rotary_emb,
1304
+ attn_mode=self.attn_mode,
1305
+ split_attn=self.split_attn,
1306
+ )
1307
+ attn_output = torch.cat([attn_output, context_attn_output], dim=1)
1308
+ del norm_hidden_states, norm_encoder_hidden_states, context_attn_output # free memory
1309
+ del image_rotary_emb
1310
+
1311
+ # 3. Modulation and residual connection
1312
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
1313
+ del attn_output, mlp_hidden_states # free memory
1314
+ hidden_states = gate * self.proj_out(hidden_states)
1315
+ hidden_states = hidden_states + residual
1316
+
1317
+ hidden_states, encoder_hidden_states = (
1318
+ hidden_states[:, :-text_seq_length, :],
1319
+ hidden_states[:, -text_seq_length:, :],
1320
+ )
1321
+ return hidden_states, encoder_hidden_states
1322
+
1323
+
1324
+ class HunyuanVideoTransformerBlock(nn.Module):
1325
+ def __init__(
1326
+ self,
1327
+ num_attention_heads: int,
1328
+ attention_head_dim: int,
1329
+ mlp_ratio: float,
1330
+ qk_norm: str = "rms_norm",
1331
+ attn_mode: Optional[str] = None,
1332
+ split_attn: Optional[bool] = False,
1333
+ ) -> None:
1334
+ super().__init__()
1335
+
1336
+ hidden_size = num_attention_heads * attention_head_dim
1337
+ self.attn_mode = attn_mode
1338
+ self.split_attn = split_attn
1339
+
1340
+ self.norm1 = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
1341
+ self.norm1_context = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
1342
+
1343
+ self.attn = Attention(
1344
+ query_dim=hidden_size,
1345
+ cross_attention_dim=None,
1346
+ added_kv_proj_dim=hidden_size,
1347
+ dim_head=attention_head_dim,
1348
+ heads=num_attention_heads,
1349
+ out_dim=hidden_size,
1350
+ context_pre_only=False,
1351
+ bias=True,
1352
+ processor=HunyuanAttnProcessorFlashAttnDouble(),
1353
+ qk_norm=qk_norm,
1354
+ eps=1e-6,
1355
+ )
1356
+
1357
+ self.norm2 = LayerNormFramePack(hidden_size, elementwise_affine=False, eps=1e-6)
1358
+ self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
1359
+
1360
+ self.norm2_context = LayerNormFramePack(hidden_size, elementwise_affine=False, eps=1e-6)
1361
+ self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
1362
+
1363
+ def forward(
1364
+ self,
1365
+ hidden_states: torch.Tensor,
1366
+ encoder_hidden_states: torch.Tensor,
1367
+ temb: torch.Tensor,
1368
+ attention_mask: Optional[torch.Tensor] = None,
1369
+ freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1370
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1371
+ # 1. Input normalization
1372
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
1373
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
1374
+ encoder_hidden_states, emb=temb
1375
+ )
1376
+
1377
+ # 2. Joint attention
1378
+ attn_output, context_attn_output = self.attn(
1379
+ hidden_states=norm_hidden_states,
1380
+ encoder_hidden_states=norm_encoder_hidden_states,
1381
+ attention_mask=attention_mask,
1382
+ image_rotary_emb=freqs_cis,
1383
+ attn_mode=self.attn_mode,
1384
+ split_attn=self.split_attn,
1385
+ )
1386
+ del norm_hidden_states, norm_encoder_hidden_states, freqs_cis # free memory
1387
+
1388
+ # 3. Modulation and residual connection
1389
+ hidden_states = hidden_states + attn_output * gate_msa
1390
+ del attn_output, gate_msa # free memory
1391
+ encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa
1392
+ del context_attn_output, c_gate_msa # free memory
1393
+
1394
+ norm_hidden_states = self.norm2(hidden_states)
1395
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
1396
+
1397
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
1398
+ del shift_mlp, scale_mlp # free memory
1399
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp
1400
+ del c_shift_mlp, c_scale_mlp # free memory
1401
+
1402
+ # 4. Feed-forward
1403
+ ff_output = self.ff(norm_hidden_states)
1404
+ del norm_hidden_states # free memory
1405
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
1406
+ del norm_encoder_hidden_states # free memory
1407
+
1408
+ hidden_states = hidden_states + gate_mlp * ff_output
1409
+ del ff_output, gate_mlp # free memory
1410
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
1411
+ del context_ff_output, c_gate_mlp # free memory
1412
+
1413
+ return hidden_states, encoder_hidden_states
1414
+
1415
+
1416
+ class ClipVisionProjection(nn.Module):
1417
+ def __init__(self, in_channels, out_channels):
1418
+ super().__init__()
1419
+ self.up = nn.Linear(in_channels, out_channels * 3)
1420
+ self.down = nn.Linear(out_channels * 3, out_channels)
1421
+
1422
+ def forward(self, x):
1423
+ projected_x = self.down(nn.functional.silu(self.up(x)))
1424
+ return projected_x
1425
+
1426
+
1427
+ class HunyuanVideoPatchEmbed(nn.Module):
1428
+ def __init__(self, patch_size, in_chans, embed_dim):
1429
+ super().__init__()
1430
+ self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
1431
+
1432
+
1433
+ class HunyuanVideoPatchEmbedForCleanLatents(nn.Module):
1434
+ def __init__(self, inner_dim):
1435
+ super().__init__()
1436
+ self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
1437
+ self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
1438
+ self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))
1439
+
1440
+ @torch.no_grad()
1441
+ def initialize_weight_from_another_conv3d(self, another_layer):
1442
+ weight = another_layer.weight.detach().clone()
1443
+ bias = another_layer.bias.detach().clone()
1444
+
1445
+ sd = {
1446
+ "proj.weight": weight.clone(),
1447
+ "proj.bias": bias.clone(),
1448
+ "proj_2x.weight": einops.repeat(weight, "b c t h w -> b c (t tk) (h hk) (w wk)", tk=2, hk=2, wk=2) / 8.0,
1449
+ "proj_2x.bias": bias.clone(),
1450
+ "proj_4x.weight": einops.repeat(weight, "b c t h w -> b c (t tk) (h hk) (w wk)", tk=4, hk=4, wk=4) / 64.0,
1451
+ "proj_4x.bias": bias.clone(),
1452
+ }
1453
+
1454
+ sd = {k: v.clone() for k, v in sd.items()}
1455
+
1456
+ self.load_state_dict(sd)
1457
+ return
1458
+
1459
+
1460
+ class HunyuanVideoTransformer3DModelPacked(nn.Module): # (PreTrainedModelMixin, GenerationMixin,
1461
+ # ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
1462
+ # @register_to_config
1463
+ def __init__(
1464
+ self,
1465
+ in_channels: int = 16,
1466
+ out_channels: int = 16,
1467
+ num_attention_heads: int = 24,
1468
+ attention_head_dim: int = 128,
1469
+ num_layers: int = 20,
1470
+ num_single_layers: int = 40,
1471
+ num_refiner_layers: int = 2,
1472
+ mlp_ratio: float = 4.0,
1473
+ patch_size: int = 2,
1474
+ patch_size_t: int = 1,
1475
+ qk_norm: str = "rms_norm",
1476
+ guidance_embeds: bool = True,
1477
+ text_embed_dim: int = 4096,
1478
+ pooled_projection_dim: int = 768,
1479
+ rope_theta: float = 256.0,
1480
+ rope_axes_dim: Tuple[int] = (16, 56, 56),
1481
+ has_image_proj=False,
1482
+ image_proj_dim=1152,
1483
+ has_clean_x_embedder=False,
1484
+ attn_mode: Optional[str] = None,
1485
+ split_attn: Optional[bool] = False,
1486
+ ) -> None:
1487
+ super().__init__()
1488
+
1489
+ inner_dim = num_attention_heads * attention_head_dim
1490
+ out_channels = out_channels or in_channels
1491
+ self.config_patch_size = patch_size
1492
+ self.config_patch_size_t = patch_size_t
1493
+
1494
+ # 1. Latent and condition embedders
1495
+ self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
1496
+ self.context_embedder = HunyuanVideoTokenRefiner(
1497
+ text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
1498
+ )
1499
+ self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, pooled_projection_dim)
1500
+
1501
+ self.clean_x_embedder = None
1502
+ self.image_projection = None
1503
+
1504
+ # 2. RoPE
1505
+ self.rope = HunyuanVideoRotaryPosEmbed(rope_axes_dim, rope_theta)
1506
+
1507
+ # 3. Dual stream transformer blocks
1508
+ self.transformer_blocks = nn.ModuleList(
1509
+ [
1510
+ HunyuanVideoTransformerBlock(
1511
+ num_attention_heads,
1512
+ attention_head_dim,
1513
+ mlp_ratio=mlp_ratio,
1514
+ qk_norm=qk_norm,
1515
+ attn_mode=attn_mode,
1516
+ split_attn=split_attn,
1517
+ )
1518
+ for _ in range(num_layers)
1519
+ ]
1520
+ )
1521
+
1522
+ # 4. Single stream transformer blocks
1523
+ self.single_transformer_blocks = nn.ModuleList(
1524
+ [
1525
+ HunyuanVideoSingleTransformerBlock(
1526
+ num_attention_heads,
1527
+ attention_head_dim,
1528
+ mlp_ratio=mlp_ratio,
1529
+ qk_norm=qk_norm,
1530
+ attn_mode=attn_mode,
1531
+ split_attn=split_attn,
1532
+ )
1533
+ for _ in range(num_single_layers)
1534
+ ]
1535
+ )
1536
+
1537
+ # 5. Output projection
1538
+ self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
1539
+ self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
1540
+
1541
+ self.inner_dim = inner_dim
1542
+ self.use_gradient_checkpointing = False
1543
+ self.enable_teacache = False
1544
+
1545
+ # if has_image_proj:
1546
+ # self.install_image_projection(image_proj_dim)
1547
+ self.image_projection = ClipVisionProjection(in_channels=image_proj_dim, out_channels=self.inner_dim)
1548
+ # self.config["has_image_proj"] = True
1549
+ # self.config["image_proj_dim"] = in_channels
1550
+
1551
+ # if has_clean_x_embedder:
1552
+ # self.install_clean_x_embedder()
1553
+ self.clean_x_embedder = HunyuanVideoPatchEmbedForCleanLatents(self.inner_dim)
1554
+ # self.config["has_clean_x_embedder"] = True
1555
+
1556
+ self.high_quality_fp32_output_for_inference = True  # default changed from False to True
1557
+
1558
+ # Block swapping attributes (initialized to None)
1559
+ self.blocks_to_swap = None
1560
+ self.offloader_double = None
1561
+ self.offloader_single = None
1562
+
1563
+ @property
1564
+ def device(self):
1565
+ return next(self.parameters()).device
1566
+
1567
+ @property
1568
+ def dtype(self):
1569
+ return next(self.parameters()).dtype
1570
+
1571
+ def enable_gradient_checkpointing(self):
1572
+ self.use_gradient_checkpointing = True
1573
+ print("Gradient checkpointing enabled for HunyuanVideoTransformer3DModelPacked.") # Logging
1574
+
1575
+ def disable_gradient_checkpointing(self):
1576
+ self.use_gradient_checkpointing = False
1577
+ print("Gradient checkpointing disabled for HunyuanVideoTransformer3DModelPacked.") # Logging
1578
+
1579
+ def initialize_teacache(self, enable_teacache=True, num_steps=25, rel_l1_thresh=0.15):
1580
+ self.enable_teacache = enable_teacache
1581
+ self.cnt = 0
1582
+ self.num_steps = num_steps
1583
+ self.rel_l1_thresh = rel_l1_thresh # 0.1 for 1.6x speedup, 0.15 for 2.1x speedup
1584
+ self.accumulated_rel_l1_distance = 0
1585
+ self.previous_modulated_input = None
1586
+ self.previous_residual = None
1587
+ self.teacache_rescale_func = np.poly1d([7.33226126e02, -4.01131952e02, 6.75869174e01, -3.14987800e00, 9.61237896e-02])
1588
+ if enable_teacache:
1589
+ print(f"TeaCache enabled: num_steps={num_steps}, rel_l1_thresh={rel_l1_thresh}")
1590
+ else:
1591
+ print("TeaCache disabled.")
1592
+
1593
+ def gradient_checkpointing_method(self, block, *args):
1594
+ if self.use_gradient_checkpointing:
1595
+ result = torch.utils.checkpoint.checkpoint(block, *args, use_reentrant=False)
1596
+ else:
1597
+ result = block(*args)
1598
+ return result
1599
+
1600
+ def enable_block_swap(self, num_blocks: int, device: torch.device, supports_backward: bool):
1601
+ self.blocks_to_swap = num_blocks
1602
+ self.num_double_blocks = len(self.transformer_blocks)
1603
+ self.num_single_blocks = len(self.single_transformer_blocks)
1604
+ double_blocks_to_swap = num_blocks // 2
1605
+ single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + 1
1606
+
1607
+ assert double_blocks_to_swap <= self.num_double_blocks - 1 and single_blocks_to_swap <= self.num_single_blocks - 1, (
1608
+ f"Cannot swap more than {self.num_double_blocks - 1} double blocks and {self.num_single_blocks - 1} single blocks. "
1609
+ f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks."
1610
+ )
1611
+
1612
+ self.offloader_double = ModelOffloader(
1613
+ "double",
1614
+ self.transformer_blocks,
1615
+ self.num_double_blocks,
1616
+ double_blocks_to_swap,
1617
+ supports_backward,
1618
+ device,
1619
+ # debug=True # Optional debugging
1620
+ )
1621
+ self.offloader_single = ModelOffloader(
1622
+ "single",
1623
+ self.single_transformer_blocks,
1624
+ self.num_single_blocks,
1625
+ single_blocks_to_swap,
1626
+ supports_backward,
1627
+ device, # , debug=True
1628
+ )
1629
+ print(
1630
+ f"HunyuanVideoTransformer3DModelPacked: Block swap enabled. Swapping {num_blocks} blocks, "
1631
+ + f"double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}, supports_backward: {supports_backward}."
1632
+ )
1633
+
1634
+ def switch_block_swap_for_inference(self):
1635
+ if self.blocks_to_swap and self.blocks_to_swap > 0:
1636
+ self.offloader_double.set_forward_only(True)
1637
+ self.offloader_single.set_forward_only(True)
1638
+ self.prepare_block_swap_before_forward()
1639
+ print(f"HunyuanVideoTransformer3DModelPacked: Block swap set to forward only.")
1640
+
1641
+ def switch_block_swap_for_training(self):
1642
+ if self.blocks_to_swap and self.blocks_to_swap > 0:
1643
+ self.offloader_double.set_forward_only(False)
1644
+ self.offloader_single.set_forward_only(False)
1645
+ self.prepare_block_swap_before_forward()
1646
+ print(f"HunyuanVideoTransformer3DModelPacked: Block swap set to forward and backward.")
1647
+
1648
+ def move_to_device_except_swap_blocks(self, device: torch.device):
1649
+ # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
1650
+ if self.blocks_to_swap:
1651
+ saved_double_blocks = self.transformer_blocks
1652
+ saved_single_blocks = self.single_transformer_blocks
1653
+ self.transformer_blocks = None
1654
+ self.single_transformer_blocks = None
1655
+
1656
+ self.to(device)
1657
+
1658
+ if self.blocks_to_swap:
1659
+ self.transformer_blocks = saved_double_blocks
1660
+ self.single_transformer_blocks = saved_single_blocks
1661
+
1662
+ def prepare_block_swap_before_forward(self):
1663
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
1664
+ return
1665
+ self.offloader_double.prepare_block_devices_before_forward(self.transformer_blocks)
1666
+ self.offloader_single.prepare_block_devices_before_forward(self.single_transformer_blocks)
1667
+
1668
+ def process_input_hidden_states(
1669
+ self,
1670
+ latents,
1671
+ latent_indices=None,
1672
+ clean_latents=None,
1673
+ clean_latent_indices=None,
1674
+ clean_latents_2x=None,
1675
+ clean_latent_2x_indices=None,
1676
+ clean_latents_4x=None,
1677
+ clean_latent_4x_indices=None,
1678
+ ):
1679
+ hidden_states = self.gradient_checkpointing_method(self.x_embedder.proj, latents)
1680
+ B, C, T, H, W = hidden_states.shape
1681
+
1682
+ if latent_indices is None:
1683
+ latent_indices = torch.arange(0, T).unsqueeze(0).expand(B, -1)
1684
+
1685
+ hidden_states = hidden_states.flatten(2).transpose(1, 2)
1686
+
1687
+ rope_freqs = self.rope(frame_indices=latent_indices, height=H, width=W, device=hidden_states.device)
1688
+ rope_freqs = rope_freqs.flatten(2).transpose(1, 2)
1689
+
1690
+ if clean_latents is not None and clean_latent_indices is not None:
1691
+ clean_latents = clean_latents.to(hidden_states)
1692
+ clean_latents = self.gradient_checkpointing_method(self.clean_x_embedder.proj, clean_latents)
1693
+ clean_latents = clean_latents.flatten(2).transpose(1, 2)
1694
+
1695
+ clean_latent_rope_freqs = self.rope(frame_indices=clean_latent_indices, height=H, width=W, device=clean_latents.device)
1696
+ clean_latent_rope_freqs = clean_latent_rope_freqs.flatten(2).transpose(1, 2)
1697
+
1698
+ hidden_states = torch.cat([clean_latents, hidden_states], dim=1)
1699
+ rope_freqs = torch.cat([clean_latent_rope_freqs, rope_freqs], dim=1)
1700
+
1701
+ if clean_latents_2x is not None and clean_latent_2x_indices is not None:
1702
+ clean_latents_2x = clean_latents_2x.to(hidden_states)
1703
+ clean_latents_2x = pad_for_3d_conv(clean_latents_2x, (2, 4, 4))
1704
+ clean_latents_2x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_2x, clean_latents_2x)
1705
+ clean_latents_2x = clean_latents_2x.flatten(2).transpose(1, 2)
1706
+
1707
+ clean_latent_2x_rope_freqs = self.rope(
1708
+ frame_indices=clean_latent_2x_indices, height=H, width=W, device=clean_latents_2x.device
1709
+ )
1710
+ clean_latent_2x_rope_freqs = pad_for_3d_conv(clean_latent_2x_rope_freqs, (2, 2, 2))
1711
+ clean_latent_2x_rope_freqs = center_down_sample_3d(clean_latent_2x_rope_freqs, (2, 2, 2))
1712
+ clean_latent_2x_rope_freqs = clean_latent_2x_rope_freqs.flatten(2).transpose(1, 2)
1713
+
1714
+ hidden_states = torch.cat([clean_latents_2x, hidden_states], dim=1)
1715
+ rope_freqs = torch.cat([clean_latent_2x_rope_freqs, rope_freqs], dim=1)
1716
+
1717
+ if clean_latents_4x is not None and clean_latent_4x_indices is not None:
1718
+ clean_latents_4x = clean_latents_4x.to(hidden_states)
1719
+ clean_latents_4x = pad_for_3d_conv(clean_latents_4x, (4, 8, 8))
1720
+ clean_latents_4x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_4x, clean_latents_4x)
1721
+ clean_latents_4x = clean_latents_4x.flatten(2).transpose(1, 2)
1722
+
1723
+ clean_latent_4x_rope_freqs = self.rope(
1724
+ frame_indices=clean_latent_4x_indices, height=H, width=W, device=clean_latents_4x.device
1725
+ )
1726
+ clean_latent_4x_rope_freqs = pad_for_3d_conv(clean_latent_4x_rope_freqs, (4, 4, 4))
1727
+ clean_latent_4x_rope_freqs = center_down_sample_3d(clean_latent_4x_rope_freqs, (4, 4, 4))
1728
+ clean_latent_4x_rope_freqs = clean_latent_4x_rope_freqs.flatten(2).transpose(1, 2)
1729
+
1730
+ hidden_states = torch.cat([clean_latents_4x, hidden_states], dim=1)
1731
+ rope_freqs = torch.cat([clean_latent_4x_rope_freqs, rope_freqs], dim=1)
1732
+
1733
+ return hidden_states, rope_freqs
1734
+
1735
+ def forward(
1736
+ self,
1737
+ hidden_states,
1738
+ timestep,
1739
+ encoder_hidden_states,
1740
+ encoder_attention_mask,
1741
+ pooled_projections,
1742
+ guidance,
1743
+ latent_indices=None,
1744
+ clean_latents=None,
1745
+ clean_latent_indices=None,
1746
+ clean_latents_2x=None,
1747
+ clean_latent_2x_indices=None,
1748
+ clean_latents_4x=None,
1749
+ clean_latent_4x_indices=None,
1750
+ image_embeddings=None,
1751
+ attention_kwargs=None,
1752
+ return_dict=True,
1753
+ ):
1754
+
1755
+ if attention_kwargs is None:
1756
+ attention_kwargs = {}
1757
+
1758
+ batch_size, num_channels, num_frames, height, width = hidden_states.shape
1759
+ p, p_t = self.config_patch_size, self.config_patch_size_t
1760
+ post_patch_num_frames = num_frames // p_t
1761
+ post_patch_height = height // p
1762
+ post_patch_width = width // p
1763
+ original_context_length = post_patch_num_frames * post_patch_height * post_patch_width
1764
+
1765
+ hidden_states, rope_freqs = self.process_input_hidden_states(
1766
+ hidden_states,
1767
+ latent_indices,
1768
+ clean_latents,
1769
+ clean_latent_indices,
1770
+ clean_latents_2x,
1771
+ clean_latent_2x_indices,
1772
+ clean_latents_4x,
1773
+ clean_latent_4x_indices,
1774
+ )
1775
+ del (
1776
+ latent_indices,
1777
+ clean_latents,
1778
+ clean_latent_indices,
1779
+ clean_latents_2x,
1780
+ clean_latent_2x_indices,
1781
+ clean_latents_4x,
1782
+ clean_latent_4x_indices,
1783
+ ) # free memory
1784
+
1785
+ temb = self.gradient_checkpointing_method(self.time_text_embed, timestep, guidance, pooled_projections)
1786
+ encoder_hidden_states = self.gradient_checkpointing_method(
1787
+ self.context_embedder, encoder_hidden_states, timestep, encoder_attention_mask
1788
+ )
1789
+
1790
+ if self.image_projection is not None:
1791
+ assert image_embeddings is not None, "You must use image embeddings!"
1792
+ extra_encoder_hidden_states = self.gradient_checkpointing_method(self.image_projection, image_embeddings)
1793
+ extra_attention_mask = torch.ones(
1794
+ (batch_size, extra_encoder_hidden_states.shape[1]),
1795
+ dtype=encoder_attention_mask.dtype,
1796
+ device=encoder_attention_mask.device,
1797
+ )
1798
+
1799
+ # must cat before (not after) encoder_hidden_states, due to attn masking
1800
+ encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
1801
+ encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
1802
+ del extra_encoder_hidden_states, extra_attention_mask # free memory
1803
+
1804
+ with torch.no_grad():
1805
+ if batch_size == 1:
1806
+ # When batch size is 1, we do not need any masks or var-len funcs, since cropping is mathematically the same as what we want
1807
+ # If they are not the same, then their implementations are wrong; ours is the correct one.
1808
+ text_len = encoder_attention_mask.sum().item()
1809
+ encoder_hidden_states = encoder_hidden_states[:, :text_len]
1810
+ attention_mask = None, None, None, None  # placeholder for (cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv); not needed when batch_size == 1
1811
+ else:
1812
+ img_seq_len = hidden_states.shape[1]
1813
+ txt_seq_len = encoder_hidden_states.shape[1]
1814
+
1815
+ cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
1816
+ cu_seqlens_kv = cu_seqlens_q
1817
+ max_seqlen_q = img_seq_len + txt_seq_len
1818
+ max_seqlen_kv = max_seqlen_q
1819
+
1820
+ attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
1821
+ del cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv # free memory
1822
+ del encoder_attention_mask # free memory
1823
+
1824
+ if self.enable_teacache:
1825
+ modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
1826
+
1827
+ if self.cnt == 0 or self.cnt == self.num_steps - 1:
1828
+ should_calc = True
1829
+ self.accumulated_rel_l1_distance = 0
1830
+ else:
1831
+ curr_rel_l1 = (
1832
+ ((modulated_inp - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean())
1833
+ .cpu()
1834
+ .item()
1835
+ )
1836
+ self.accumulated_rel_l1_distance += self.teacache_rescale_func(curr_rel_l1)
1837
+ should_calc = self.accumulated_rel_l1_distance >= self.rel_l1_thresh
1838
+
1839
+ if should_calc:
1840
+ self.accumulated_rel_l1_distance = 0
1841
+
1842
+ self.previous_modulated_input = modulated_inp
1843
+ self.cnt += 1
1844
+
1845
+ if self.cnt == self.num_steps:
1846
+ self.cnt = 0
1847
+
1848
+ if not should_calc:
1849
+ hidden_states = hidden_states + self.previous_residual
1850
+ else:
1851
+ ori_hidden_states = hidden_states.clone()
1852
+
1853
+ for block_id, block in enumerate(self.transformer_blocks):
1854
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1855
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1856
+ )
1857
+
1858
+ for block_id, block in enumerate(self.single_transformer_blocks):
1859
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1860
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1861
+ )
1862
+
1863
+ self.previous_residual = hidden_states - ori_hidden_states
1864
+ del ori_hidden_states # free memory
1865
+ else:
1866
+ for block_id, block in enumerate(self.transformer_blocks):
1867
+ if self.blocks_to_swap:
1868
+ self.offloader_double.wait_for_block(block_id)
1869
+
1870
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1871
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1872
+ )
1873
+
1874
+ if self.blocks_to_swap:
1875
+ self.offloader_double.submit_move_blocks_forward(self.transformer_blocks, block_id)
1876
+
1877
+ for block_id, block in enumerate(self.single_transformer_blocks):
1878
+ if self.blocks_to_swap:
1879
+ self.offloader_single.wait_for_block(block_id)
1880
+
1881
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1882
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1883
+ )
1884
+
1885
+ if self.blocks_to_swap:
1886
+ self.offloader_single.submit_move_blocks_forward(self.single_transformer_blocks, block_id)
1887
+
1888
+ del attention_mask, rope_freqs # free memory
1889
+ del encoder_hidden_states # free memory
1890
+
1891
+ hidden_states = self.gradient_checkpointing_method(self.norm_out, hidden_states, temb)
1892
+
1893
+ hidden_states = hidden_states[:, -original_context_length:, :]
1894
+
1895
+ if self.high_quality_fp32_output_for_inference:
1896
+ hidden_states = hidden_states.to(dtype=torch.float32)
1897
+ if self.proj_out.weight.dtype != torch.float32:
1898
+ self.proj_out.to(dtype=torch.float32)
1899
+
1900
+ hidden_states = self.gradient_checkpointing_method(self.proj_out, hidden_states)
1901
+
1902
+ hidden_states = einops.rearrange(
1903
+ hidden_states,
1904
+ "b (t h w) (c pt ph pw) -> b c (t pt) (h ph) (w pw)",
1905
+ t=post_patch_num_frames,
1906
+ h=post_patch_height,
1907
+ w=post_patch_width,
1908
+ pt=p_t,
1909
+ ph=p,
1910
+ pw=p,
1911
+ )
1912
+
1913
+ if return_dict:
1914
+ # return Transformer2DModelOutput(sample=hidden_states)
1915
+ return SimpleNamespace(sample=hidden_states)
1916
+
1917
+ return (hidden_states,)
1918
+
1919
+ def fp8_optimization(
1920
+ self, state_dict: dict[str, torch.Tensor], device: torch.device, move_to_device: bool, use_scaled_mm: bool = False
1921
+ ) -> dict[str, torch.Tensor]: # Return type hint added
1922
+ """
1923
+ Optimize the model state_dict with fp8.
1924
+
1925
+ Args:
1926
+ state_dict (dict[str, torch.Tensor]):
1927
+ The state_dict of the model.
1928
+ device (torch.device):
1929
+ The device to calculate the weight.
1930
+ move_to_device (bool):
1931
+ Whether to move the weight to the device after optimization.
1932
+ use_scaled_mm (bool):
1933
+ Whether to use scaled matrix multiplication for FP8.
1934
+ """
1935
+ TARGET_KEYS = ["transformer_blocks", "single_transformer_blocks"]
1936
+ EXCLUDE_KEYS = ["norm"] # Exclude norm layers (e.g., LayerNorm, RMSNorm) from FP8
1937
+
1938
+ # inplace optimization
1939
+ state_dict = optimize_state_dict_with_fp8(state_dict, device, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=move_to_device)
1940
+
1941
+ # apply monkey patching
1942
+ apply_fp8_monkey_patch(self, state_dict, use_scaled_mm=use_scaled_mm)
1943
+
1944
+ return state_dict
1945
+
1946
+
1947
+ def load_packed_model(
1948
+ device: Union[str, torch.device],
1949
+ dit_path: str,
1950
+ attn_mode: str,
1951
+ loading_device: Union[str, torch.device],
1952
+ fp8_scaled: bool = False,
1953
+ split_attn: bool = False,
1954
+ ) -> HunyuanVideoTransformer3DModelPacked:
1955
+ # TODO support split_attn
1956
+ device = torch.device(device)
1957
+ loading_device = torch.device(loading_device)
1958
+
1959
+ if os.path.isdir(dit_path):
1960
+ # we don't support from_pretrained for now, so loading safetensors directly
1961
+ safetensor_files = glob.glob(os.path.join(dit_path, "*.safetensors"))
1962
+ if len(safetensor_files) == 0:
1963
+ raise ValueError(f"Cannot find safetensors file in {dit_path}")
1964
+ # sort by name and take the first one
1965
+ safetensor_files.sort()
1966
+ dit_path = safetensor_files[0]
1967
+
1968
+ with init_empty_weights():
1969
+ logger.info(f"Creating HunyuanVideoTransformer3DModelPacked")
1970
+ model = HunyuanVideoTransformer3DModelPacked(
1971
+ attention_head_dim=128,
1972
+ guidance_embeds=True,
1973
+ has_clean_x_embedder=True,
1974
+ has_image_proj=True,
1975
+ image_proj_dim=1152,
1976
+ in_channels=16,
1977
+ mlp_ratio=4.0,
1978
+ num_attention_heads=24,
1979
+ num_layers=20,
1980
+ num_refiner_layers=2,
1981
+ num_single_layers=40,
1982
+ out_channels=16,
1983
+ patch_size=2,
1984
+ patch_size_t=1,
1985
+ pooled_projection_dim=768,
1986
+ qk_norm="rms_norm",
1987
+ rope_axes_dim=(16, 56, 56),
1988
+ rope_theta=256.0,
1989
+ text_embed_dim=4096,
1990
+ attn_mode=attn_mode,
1991
+ split_attn=split_attn,
1992
+ )
1993
+
1994
+ # if fp8_scaled, load model weights to CPU to reduce VRAM usage. Otherwise, load to the specified device (CPU for block swap or CUDA for others)
1995
+ dit_loading_device = torch.device("cpu") if fp8_scaled else loading_device
1996
+ logger.info(f"Loading DiT model from {dit_path}, device={dit_loading_device}")
1997
+
1998
+ # load model weights with the specified dtype or as is
1999
+ sd = load_split_weights(dit_path, device=dit_loading_device, disable_mmap=True)
2000
+
2001
+ if fp8_scaled:
2002
+ # fp8 optimization: calculate on CUDA, move back to CPU if loading_device is CPU (block swap)
2003
+ logger.info(f"Optimizing model weights to fp8. This may take a while.")
2004
+ sd = model.fp8_optimization(sd, device, move_to_device=loading_device.type == "cpu")
2005
+
2006
+ if loading_device.type != "cpu":
2007
+ # make sure all the model weights are on the loading_device
2008
+ logger.info(f"Moving weights to {loading_device}")
2009
+ for key in sd.keys():
2010
+ sd[key] = sd[key].to(loading_device)
2011
+
2012
+ info = model.load_state_dict(sd, strict=True, assign=True)
2013
+ logger.info(f"Loaded DiT model from {dit_path}, info={info}")
2014
+
2015
+ return model
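
For context on the loader above: `load_packed_model` builds the packed HunyuanVideo DiT on the meta device, loads the safetensors weights, and optionally applies fp8 scaling before returning the model. A minimal usage sketch follows; the checkpoint path and the `attn_mode` string are placeholder assumptions, not values shipped with this repo.

```python
import torch
from frame_pack.hunyuan_video_packed import load_packed_model

# "ckpts/framepack_dit.safetensors" is a hypothetical path; point it at your local DiT weights.
model = load_packed_model(
    device=torch.device("cuda"),                # device used for fp8 weight optimization
    dit_path="ckpts/framepack_dit.safetensors",
    attn_mode="torch",                          # assumed backend name; check the Attention implementation for accepted modes
    loading_device="cuda",                      # load weights directly onto the GPU
    fp8_scaled=False,
)
model.eval()
model.prepare_block_swap_before_forward()       # no-op unless enable_block_swap() was called
```
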
frame_pack/k_diffusion_hunyuan.py ADDED
@@ -0,0 +1,128 @@
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import torch
5
+ import math
6
+
7
+ # from diffusers_helper.k_diffusion.uni_pc_fm import sample_unipc
8
+ # from diffusers_helper.k_diffusion.wrapper import fm_wrapper
9
+ # from diffusers_helper.utils import repeat_to_batch_size
10
+ from frame_pack.uni_pc_fm import sample_unipc
11
+ from frame_pack.wrapper import fm_wrapper
12
+ from frame_pack.utils import repeat_to_batch_size
13
+
14
+
15
+ def flux_time_shift(t, mu=1.15, sigma=1.0):
16
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
17
+
18
+
19
+ def calculate_flux_mu(context_length, x1=256, y1=0.5, x2=4096, y2=1.15, exp_max=7.0):
20
+ k = (y2 - y1) / (x2 - x1)
21
+ b = y1 - k * x1
22
+ mu = k * context_length + b
23
+ mu = min(mu, math.log(exp_max))
24
+ return mu
25
+
26
+
27
+ def get_flux_sigmas_from_mu(n, mu):
28
+ sigmas = torch.linspace(1, 0, steps=n + 1)
29
+ sigmas = flux_time_shift(sigmas, mu=mu)
30
+ return sigmas
31
+
32
+
33
+ # @torch.inference_mode()
34
+ def sample_hunyuan(
35
+ transformer,
36
+ sampler="unipc",
37
+ initial_latent=None,
38
+ concat_latent=None,
39
+ strength=1.0,
40
+ width=512,
41
+ height=512,
42
+ frames=16,
43
+ real_guidance_scale=1.0,
44
+ distilled_guidance_scale=6.0,
45
+ guidance_rescale=0.0,
46
+ shift=None,
47
+ num_inference_steps=25,
48
+ batch_size=None,
49
+ generator=None,
50
+ prompt_embeds=None,
51
+ prompt_embeds_mask=None,
52
+ prompt_poolers=None,
53
+ negative_prompt_embeds=None,
54
+ negative_prompt_embeds_mask=None,
55
+ negative_prompt_poolers=None,
56
+ dtype=torch.bfloat16,
57
+ device=None,
58
+ negative_kwargs=None,
59
+ callback=None,
60
+ **kwargs,
61
+ ):
62
+ device = device or transformer.device
63
+
64
+ if batch_size is None:
65
+ batch_size = int(prompt_embeds.shape[0])
66
+
67
+ latents = torch.randn(
68
+ (batch_size, 16, (frames + 3) // 4, height // 8, width // 8), generator=generator, device=generator.device
69
+ ).to(device=device, dtype=torch.float32)
70
+
71
+ B, C, T, H, W = latents.shape
72
+ seq_length = T * H * W // 4 # 9*80*80//4 = 14400
73
+
74
+ if shift is None:
75
+ mu = calculate_flux_mu(seq_length, exp_max=7.0)  # clipped to log(7.0) ~= 1.9459 when seq_len is large
76
+ else:
77
+ mu = math.log(shift)
78
+
79
+ sigmas = get_flux_sigmas_from_mu(num_inference_steps, mu).to(device)
80
+
81
+ k_model = fm_wrapper(transformer)
82
+
83
+ if initial_latent is not None:
84
+ sigmas = sigmas * strength
85
+ first_sigma = sigmas[0].to(device=device, dtype=torch.float32)
86
+ initial_latent = initial_latent.to(device=device, dtype=torch.float32)
87
+ latents = initial_latent.float() * (1.0 - first_sigma) + latents.float() * first_sigma
88
+
89
+ if concat_latent is not None:
90
+ concat_latent = concat_latent.to(latents)
91
+
92
+ distilled_guidance = torch.tensor([distilled_guidance_scale * 1000.0] * batch_size).to(device=device, dtype=dtype)
93
+
94
+ prompt_embeds = repeat_to_batch_size(prompt_embeds, batch_size)
95
+ prompt_embeds_mask = repeat_to_batch_size(prompt_embeds_mask, batch_size)
96
+ prompt_poolers = repeat_to_batch_size(prompt_poolers, batch_size)
97
+ negative_prompt_embeds = repeat_to_batch_size(negative_prompt_embeds, batch_size)
98
+ negative_prompt_embeds_mask = repeat_to_batch_size(negative_prompt_embeds_mask, batch_size)
99
+ negative_prompt_poolers = repeat_to_batch_size(negative_prompt_poolers, batch_size)
100
+ concat_latent = repeat_to_batch_size(concat_latent, batch_size)
101
+
102
+ sampler_kwargs = dict(
103
+ dtype=dtype,
104
+ cfg_scale=real_guidance_scale,
105
+ cfg_rescale=guidance_rescale,
106
+ concat_latent=concat_latent,
107
+ positive=dict(
108
+ pooled_projections=prompt_poolers,
109
+ encoder_hidden_states=prompt_embeds,
110
+ encoder_attention_mask=prompt_embeds_mask,
111
+ guidance=distilled_guidance,
112
+ **kwargs,
113
+ ),
114
+ negative=dict(
115
+ pooled_projections=negative_prompt_poolers,
116
+ encoder_hidden_states=negative_prompt_embeds,
117
+ encoder_attention_mask=negative_prompt_embeds_mask,
118
+ guidance=distilled_guidance,
119
+ **(kwargs if negative_kwargs is None else {**kwargs, **negative_kwargs}),
120
+ ),
121
+ )
122
+
123
+ if sampler == "unipc":
124
+ results = sample_unipc(k_model, latents, sigmas, extra_args=sampler_kwargs, disable=False, callback=callback)
125
+ else:
126
+ raise NotImplementedError(f"Sampler {sampler} is not supported.")
127
+
128
+ return results
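
The scheduling math above is compact, so a worked example helps: `calculate_flux_mu` interpolates mu linearly between the anchor points (256 tokens, 0.5) and (4096 tokens, 1.15) and clips it at log(exp_max), and `get_flux_sigmas_from_mu` warps a uniform 1→0 grid with the Flux time shift so that more steps are spent at high noise. A small standalone check, assuming only that the repo root is on PYTHONPATH:

```python
import math
from frame_pack.k_diffusion_hunyuan import calculate_flux_mu, get_flux_sigmas_from_mu

# 9 latent frames at 80x80 latent resolution, packed 2x2 spatially -> 9*80*80//4 = 14400 tokens
seq_length = 9 * 80 * 80 // 4
mu = calculate_flux_mu(seq_length, exp_max=7.0)
print(mu, math.log(7.0))                    # the linear value (~2.89) is clipped to log(7.0) ~= 1.9459

sigmas = get_flux_sigmas_from_mu(25, mu)    # 26 values for 25 inference steps
print(sigmas[0].item(), sigmas[-1].item())  # 1.0 ... 0.0
print(sigmas[13].item())                    # the midpoint sits well above 0.5 because of the shift
```
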
frame_pack/uni_pc_fm.py ADDED
@@ -0,0 +1,142 @@
1
+ # Better Flow Matching UniPC by Lvmin Zhang
2
+ # (c) 2025
3
+ # CC BY-SA 4.0
4
+ # Attribution-ShareAlike 4.0 International Licence
5
+
6
+
7
+ import torch
8
+
9
+ from tqdm.auto import trange
10
+
11
+
12
+ def expand_dims(v, dims):
13
+ return v[(...,) + (None,) * (dims - 1)]
14
+
15
+
16
+ class FlowMatchUniPC:
17
+ def __init__(self, model, extra_args, variant='bh1'):
18
+ self.model = model
19
+ self.variant = variant
20
+ self.extra_args = extra_args
21
+
22
+ def model_fn(self, x, t):
23
+ return self.model(x, t, **self.extra_args)
24
+
25
+ def update_fn(self, x, model_prev_list, t_prev_list, t, order):
26
+ assert order <= len(model_prev_list)
27
+ dims = x.dim()
28
+
29
+ t_prev_0 = t_prev_list[-1]
30
+ lambda_prev_0 = - torch.log(t_prev_0)
31
+ lambda_t = - torch.log(t)
32
+ model_prev_0 = model_prev_list[-1]
33
+
34
+ h = lambda_t - lambda_prev_0
35
+
36
+ rks = []
37
+ D1s = []
38
+ for i in range(1, order):
39
+ t_prev_i = t_prev_list[-(i + 1)]
40
+ model_prev_i = model_prev_list[-(i + 1)]
41
+ lambda_prev_i = - torch.log(t_prev_i)
42
+ rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
43
+ rks.append(rk)
44
+ D1s.append((model_prev_i - model_prev_0) / rk)
45
+
46
+ rks.append(1.)
47
+ rks = torch.tensor(rks, device=x.device)
48
+
49
+ R = []
50
+ b = []
51
+
52
+ hh = -h[0]
53
+ h_phi_1 = torch.expm1(hh)
54
+ h_phi_k = h_phi_1 / hh - 1
55
+
56
+ factorial_i = 1
57
+
58
+ if self.variant == 'bh1':
59
+ B_h = hh
60
+ elif self.variant == 'bh2':
61
+ B_h = torch.expm1(hh)
62
+ else:
63
+ raise NotImplementedError('Bad variant!')
64
+
65
+ for i in range(1, order + 1):
66
+ R.append(torch.pow(rks, i - 1))
67
+ b.append(h_phi_k * factorial_i / B_h)
68
+ factorial_i *= (i + 1)
69
+ h_phi_k = h_phi_k / hh - 1 / factorial_i
70
+
71
+ R = torch.stack(R)
72
+ b = torch.tensor(b, device=x.device)
73
+
74
+ use_predictor = len(D1s) > 0
75
+
76
+ if use_predictor:
77
+ D1s = torch.stack(D1s, dim=1)
78
+ if order == 2:
79
+ rhos_p = torch.tensor([0.5], device=b.device)
80
+ else:
81
+ rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
82
+ else:
83
+ D1s = None
84
+ rhos_p = None
85
+
86
+ if order == 1:
87
+ rhos_c = torch.tensor([0.5], device=b.device)
88
+ else:
89
+ rhos_c = torch.linalg.solve(R, b)
90
+
91
+ x_t_ = expand_dims(t / t_prev_0, dims) * x - expand_dims(h_phi_1, dims) * model_prev_0
92
+
93
+ if use_predictor:
94
+ pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))
95
+ else:
96
+ pred_res = 0
97
+
98
+ x_t = x_t_ - expand_dims(B_h, dims) * pred_res
99
+ model_t = self.model_fn(x_t, t)
100
+
101
+ if D1s is not None:
102
+ corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))
103
+ else:
104
+ corr_res = 0
105
+
106
+ D1_t = (model_t - model_prev_0)
107
+ x_t = x_t_ - expand_dims(B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
108
+
109
+ return x_t, model_t
110
+
111
+ def sample(self, x, sigmas, callback=None, disable_pbar=False):
112
+ order = min(3, len(sigmas) - 2)
113
+ model_prev_list, t_prev_list = [], []
114
+ for i in trange(len(sigmas) - 1, disable=disable_pbar):
115
+ vec_t = sigmas[i].expand(x.shape[0])
116
+
117
+ with torch.no_grad():
118
+ if i == 0:
119
+ model_prev_list = [self.model_fn(x, vec_t)]
120
+ t_prev_list = [vec_t]
121
+ elif i < order:
122
+ init_order = i
123
+ x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, init_order)
124
+ model_prev_list.append(model_x)
125
+ t_prev_list.append(vec_t)
126
+ else:
127
+ x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, order)
128
+ model_prev_list.append(model_x)
129
+ t_prev_list.append(vec_t)
130
+
131
+ model_prev_list = model_prev_list[-order:]
132
+ t_prev_list = t_prev_list[-order:]
133
+
134
+ if callback is not None:
135
+ callback({'x': x, 'i': i, 'denoised': model_prev_list[-1]})
136
+
137
+ return model_prev_list[-1]
138
+
139
+
140
+ def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'):
141
+ assert variant in ['bh1', 'bh2']
142
+ return FlowMatchUniPC(model, extra_args=extra_args, variant=variant).sample(noise, sigmas=sigmas, callback=callback, disable_pbar=disable)
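
Because `sample_unipc` only requires the model callable to map `(x, t, **extra_args)` to a denoised prediction, the solver can be exercised in isolation. The sketch below uses a dummy denoiser that always predicts zeros, purely to illustrate the calling convention; it is not a real model.

```python
import torch
from frame_pack.uni_pc_fm import sample_unipc

def dummy_model(x, t, **kwargs):
    # Stand-in denoiser: always predicts an all-zero clean sample.
    return torch.zeros_like(x)

noise = torch.randn(1, 16, 2, 8, 8)     # toy latent; the shape is arbitrary here
sigmas = torch.linspace(1.0, 0.0, 11)   # 10 solver steps from pure noise to clean
out = sample_unipc(dummy_model, noise, sigmas, extra_args={}, disable=True)
print(out.shape)                        # same shape as the input latent
print(out.abs().max().item())           # 0.0 -- the output is the last denoised prediction
```
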
frame_pack/utils.py ADDED
@@ -0,0 +1,617 @@
1
+ import os
2
+ import cv2
3
+ import json
4
+ import random
5
+ import glob
6
+ import torch
7
+ import einops
8
+ import numpy as np
9
+ import datetime
10
+ import torchvision
11
+
12
+ import safetensors.torch as sf
13
+ from PIL import Image
14
+
15
+
16
+ def min_resize(x, m):
17
+ if x.shape[0] < x.shape[1]:
18
+ s0 = m
19
+ s1 = int(float(m) / float(x.shape[0]) * float(x.shape[1]))
20
+ else:
21
+ s0 = int(float(m) / float(x.shape[1]) * float(x.shape[0]))
22
+ s1 = m
23
+ new_max = max(s1, s0)
24
+ raw_max = max(x.shape[0], x.shape[1])
25
+ if new_max < raw_max:
26
+ interpolation = cv2.INTER_AREA
27
+ else:
28
+ interpolation = cv2.INTER_LANCZOS4
29
+ y = cv2.resize(x, (s1, s0), interpolation=interpolation)
30
+ return y
31
+
32
+
33
+ def d_resize(x, y):
34
+ H, W, C = y.shape
35
+ new_min = min(H, W)
36
+ raw_min = min(x.shape[0], x.shape[1])
37
+ if new_min < raw_min:
38
+ interpolation = cv2.INTER_AREA
39
+ else:
40
+ interpolation = cv2.INTER_LANCZOS4
41
+ y = cv2.resize(x, (W, H), interpolation=interpolation)
42
+ return y
43
+
44
+
45
+ def resize_and_center_crop(image, target_width, target_height):
46
+ if target_height == image.shape[0] and target_width == image.shape[1]:
47
+ return image
48
+
49
+ pil_image = Image.fromarray(image)
50
+ original_width, original_height = pil_image.size
51
+ scale_factor = max(target_width / original_width, target_height / original_height)
52
+ resized_width = int(round(original_width * scale_factor))
53
+ resized_height = int(round(original_height * scale_factor))
54
+ resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS)
55
+ left = (resized_width - target_width) / 2
56
+ top = (resized_height - target_height) / 2
57
+ right = (resized_width + target_width) / 2
58
+ bottom = (resized_height + target_height) / 2
59
+ cropped_image = resized_image.crop((left, top, right, bottom))
60
+ return np.array(cropped_image)
61
+
62
+
63
+ def resize_and_center_crop_pytorch(image, target_width, target_height):
64
+ B, C, H, W = image.shape
65
+
66
+ if H == target_height and W == target_width:
67
+ return image
68
+
69
+ scale_factor = max(target_width / W, target_height / H)
70
+ resized_width = int(round(W * scale_factor))
71
+ resized_height = int(round(H * scale_factor))
72
+
73
+ resized = torch.nn.functional.interpolate(image, size=(resized_height, resized_width), mode="bilinear", align_corners=False)
74
+
75
+ top = (resized_height - target_height) // 2
76
+ left = (resized_width - target_width) // 2
77
+ cropped = resized[:, :, top : top + target_height, left : left + target_width]
78
+
79
+ return cropped
80
+
81
+
82
+ def resize_without_crop(image, target_width, target_height):
83
+ if target_height == image.shape[0] and target_width == image.shape[1]:
84
+ return image
85
+
86
+ pil_image = Image.fromarray(image)
87
+ resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
88
+ return np.array(resized_image)
89
+
90
+
91
+ def just_crop(image, w, h):
92
+ if h == image.shape[0] and w == image.shape[1]:
93
+ return image
94
+
95
+ original_height, original_width = image.shape[:2]
96
+ k = min(original_height / h, original_width / w)
97
+ new_width = int(round(w * k))
98
+ new_height = int(round(h * k))
99
+ x_start = (original_width - new_width) // 2
100
+ y_start = (original_height - new_height) // 2
101
+ cropped_image = image[y_start : y_start + new_height, x_start : x_start + new_width]
102
+ return cropped_image
103
+
104
+
105
+ def write_to_json(data, file_path):
106
+ temp_file_path = file_path + ".tmp"
107
+ with open(temp_file_path, "wt", encoding="utf-8") as temp_file:
108
+ json.dump(data, temp_file, indent=4)
109
+ os.replace(temp_file_path, file_path)
110
+ return
111
+
112
+
113
+ def read_from_json(file_path):
114
+ with open(file_path, "rt", encoding="utf-8") as file:
115
+ data = json.load(file)
116
+ return data
117
+
118
+
119
+ def get_active_parameters(m):
120
+ return {k: v for k, v in m.named_parameters() if v.requires_grad}
121
+
122
+
123
+ def cast_training_params(m, dtype=torch.float32):
124
+ result = {}
125
+ for n, param in m.named_parameters():
126
+ if param.requires_grad:
127
+ param.data = param.to(dtype)
128
+ result[n] = param
129
+ return result
130
+
131
+
132
+ def separate_lora_AB(parameters, B_patterns=None):
133
+ parameters_normal = {}
134
+ parameters_B = {}
135
+
136
+ if B_patterns is None:
137
+ B_patterns = [".lora_B.", "__zero__"]
138
+
139
+ for k, v in parameters.items():
140
+ if any(B_pattern in k for B_pattern in B_patterns):
141
+ parameters_B[k] = v
142
+ else:
143
+ parameters_normal[k] = v
144
+
145
+ return parameters_normal, parameters_B
146
+
147
+
148
+ def set_attr_recursive(obj, attr, value):
149
+ attrs = attr.split(".")
150
+ for name in attrs[:-1]:
151
+ obj = getattr(obj, name)
152
+ setattr(obj, attrs[-1], value)
153
+ return
154
+
155
+
156
+ def print_tensor_list_size(tensors):
157
+ total_size = 0
158
+ total_elements = 0
159
+
160
+ if isinstance(tensors, dict):
161
+ tensors = tensors.values()
162
+
163
+ for tensor in tensors:
164
+ total_size += tensor.nelement() * tensor.element_size()
165
+ total_elements += tensor.nelement()
166
+
167
+ total_size_MB = total_size / (1024**2)
168
+ total_elements_B = total_elements / 1e9
169
+
170
+ print(f"Total number of tensors: {len(tensors)}")
171
+ print(f"Total size of tensors: {total_size_MB:.2f} MB")
172
+ print(f"Total number of parameters: {total_elements_B:.3f} billion")
173
+ return
174
+
175
+
176
+ @torch.no_grad()
177
+ def batch_mixture(a, b=None, probability_a=0.5, mask_a=None):
178
+ batch_size = a.size(0)
179
+
180
+ if b is None:
181
+ b = torch.zeros_like(a)
182
+
183
+ if mask_a is None:
184
+ mask_a = torch.rand(batch_size) < probability_a
185
+
186
+ mask_a = mask_a.to(a.device)
187
+ mask_a = mask_a.reshape((batch_size,) + (1,) * (a.dim() - 1))
188
+ result = torch.where(mask_a, a, b)
189
+ return result
190
+
191
+
192
+ @torch.no_grad()
193
+ def zero_module(module):
194
+ for p in module.parameters():
195
+ p.detach().zero_()
196
+ return module
197
+
198
+
199
+ @torch.no_grad()
200
+ def supress_lower_channels(m, k, alpha=0.01):
201
+ data = m.weight.data.clone()
202
+
203
+ assert int(data.shape[1]) >= k
204
+
205
+ data[:, :k] = data[:, :k] * alpha
206
+ m.weight.data = data.contiguous().clone()
207
+ return m
208
+
209
+
210
+ def freeze_module(m):
211
+ if not hasattr(m, "_forward_inside_frozen_module"):
212
+ m._forward_inside_frozen_module = m.forward
213
+ m.requires_grad_(False)
214
+ m.forward = torch.no_grad()(m.forward)
215
+ return m
216
+
217
+
218
+ def get_latest_safetensors(folder_path):
219
+ safetensors_files = glob.glob(os.path.join(folder_path, "*.safetensors"))
220
+
221
+ if not safetensors_files:
222
+ raise ValueError("No file to resume!")
223
+
224
+ latest_file = max(safetensors_files, key=os.path.getmtime)
225
+ latest_file = os.path.abspath(os.path.realpath(latest_file))
226
+ return latest_file
227
+
228
+
229
+ def generate_random_prompt_from_tags(tags_str, min_length=3, max_length=32):
230
+ tags = tags_str.split(", ")
231
+ tags = random.sample(tags, k=min(random.randint(min_length, max_length), len(tags)))
232
+ prompt = ", ".join(tags)
233
+ return prompt
234
+
235
+
236
+ def interpolate_numbers(a, b, n, round_to_int=False, gamma=1.0):
237
+ numbers = a + (b - a) * (np.linspace(0, 1, n) ** gamma)
238
+ if round_to_int:
239
+ numbers = np.round(numbers).astype(int)
240
+ return numbers.tolist()
241
+
242
+
243
+ def uniform_random_by_intervals(inclusive, exclusive, n, round_to_int=False):
244
+ edges = np.linspace(0, 1, n + 1)
245
+ points = np.random.uniform(edges[:-1], edges[1:])
246
+ numbers = inclusive + (exclusive - inclusive) * points
247
+ if round_to_int:
248
+ numbers = np.round(numbers).astype(int)
249
+ return numbers.tolist()
250
+
251
+
252
+ def soft_append_bcthw(history, current, overlap=0):
253
+ if overlap <= 0:
254
+ return torch.cat([history, current], dim=2)
255
+
256
+ assert history.shape[2] >= overlap, f"History length ({history.shape[2]}) must be >= overlap ({overlap})"
257
+ assert current.shape[2] >= overlap, f"Current length ({current.shape[2]}) must be >= overlap ({overlap})"
258
+
259
+ weights = torch.linspace(1, 0, overlap, dtype=history.dtype, device=history.device).view(1, 1, -1, 1, 1)
260
+ blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
261
+ output = torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)
262
+
263
+ return output.to(history)
264
+
265
+
266
+ def save_bcthw_as_mp4(x, output_filename, fps=10):
267
+ b, c, t, h, w = x.shape
268
+
269
+ per_row = b
270
+ for p in [6, 5, 4, 3, 2]:
271
+ if b % p == 0:
272
+ per_row = p
273
+ break
274
+
275
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
276
+ x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
277
+ x = x.detach().cpu().to(torch.uint8)
278
+ x = einops.rearrange(x, "(m n) c t h w -> t (m h) (n w) c", n=per_row)
279
+ torchvision.io.write_video(output_filename, x, fps=fps, video_codec="libx264", options={"crf": "0"})
280
+
281
+ # write tensor as .pt file
282
+ torch.save(x, output_filename.replace(".mp4", ".pt"))
283
+
284
+ return x
285
+
286
+
287
+ def save_bcthw_as_png(x, output_filename):
288
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
289
+ x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
290
+ x = x.detach().cpu().to(torch.uint8)
291
+ x = einops.rearrange(x, "b c t h w -> c (b h) (t w)")
292
+ torchvision.io.write_png(x, output_filename)
293
+ return output_filename
294
+
295
+
296
+ def save_bchw_as_png(x, output_filename):
297
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
298
+ x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
299
+ x = x.detach().cpu().to(torch.uint8)
300
+ x = einops.rearrange(x, "b c h w -> c h (b w)")
301
+ torchvision.io.write_png(x, output_filename)
302
+ return output_filename
303
+
304
+
305
+ def add_tensors_with_padding(tensor1, tensor2):
306
+ if tensor1.shape == tensor2.shape:
307
+ return tensor1 + tensor2
308
+
309
+ shape1 = tensor1.shape
310
+ shape2 = tensor2.shape
311
+
312
+ new_shape = tuple(max(s1, s2) for s1, s2 in zip(shape1, shape2))
313
+
314
+ padded_tensor1 = torch.zeros(new_shape)
315
+ padded_tensor2 = torch.zeros(new_shape)
316
+
317
+ padded_tensor1[tuple(slice(0, s) for s in shape1)] = tensor1
318
+ padded_tensor2[tuple(slice(0, s) for s in shape2)] = tensor2
319
+
320
+ result = padded_tensor1 + padded_tensor2
321
+ return result
322
+
323
+
324
+ def print_free_mem():
325
+ torch.cuda.empty_cache()
326
+ free_mem, total_mem = torch.cuda.mem_get_info(0)
327
+ free_mem_mb = free_mem / (1024**2)
328
+ total_mem_mb = total_mem / (1024**2)
329
+ print(f"Free memory: {free_mem_mb:.2f} MB")
330
+ print(f"Total memory: {total_mem_mb:.2f} MB")
331
+ return
332
+
333
+
334
+ def print_gpu_parameters(device, state_dict, log_count=1):
335
+ summary = {"device": device, "keys_count": len(state_dict)}
336
+
337
+ logged_params = {}
338
+ for i, (key, tensor) in enumerate(state_dict.items()):
339
+ if i >= log_count:
340
+ break
341
+ logged_params[key] = tensor.flatten()[:3].tolist()
342
+
343
+ summary["params"] = logged_params
344
+
345
+ print(str(summary))
346
+ return
347
+
348
+
349
+ def visualize_txt_as_img(width, height, text, font_path="font/DejaVuSans.ttf", size=18):
350
+ from PIL import Image, ImageDraw, ImageFont
351
+
352
+ txt = Image.new("RGB", (width, height), color="white")
353
+ draw = ImageDraw.Draw(txt)
354
+ font = ImageFont.truetype(font_path, size=size)
355
+
356
+ if text == "":
357
+ return np.array(txt)
358
+
359
+ # Split text into lines that fit within the image width
360
+ lines = []
361
+ words = text.split()
362
+ current_line = words[0]
363
+
364
+ for word in words[1:]:
365
+ line_with_word = f"{current_line} {word}"
366
+ if draw.textbbox((0, 0), line_with_word, font=font)[2] <= width:
367
+ current_line = line_with_word
368
+ else:
369
+ lines.append(current_line)
370
+ current_line = word
371
+
372
+ lines.append(current_line)
373
+
374
+ # Draw the text line by line
375
+ y = 0
376
+ line_height = draw.textbbox((0, 0), "A", font=font)[3]
377
+
378
+ for line in lines:
379
+ if y + line_height > height:
380
+ break # stop drawing if the next line will be outside the image
381
+ draw.text((0, y), line, fill="black", font=font)
382
+ y += line_height
383
+
384
+ return np.array(txt)
385
+
386
+
387
+ def blue_mark(x):
388
+ x = x.copy()
389
+ c = x[:, :, 2]
390
+ b = cv2.blur(c, (9, 9))
391
+ x[:, :, 2] = ((c - b) * 16.0 + b).clip(-1, 1)
392
+ return x
393
+
394
+
395
+ def green_mark(x):
396
+ x = x.copy()
397
+ x[:, :, 2] = -1
398
+ x[:, :, 0] = -1
399
+ return x
400
+
401
+
402
+ def frame_mark(x):
403
+ x = x.copy()
404
+ x[:64] = -1
405
+ x[-64:] = -1
406
+ x[:, :8] = 1
407
+ x[:, -8:] = 1
408
+ return x
409
+
410
+
411
+ @torch.inference_mode()
412
+ def pytorch2numpy(imgs):
413
+ results = []
414
+ for x in imgs:
415
+ y = x.movedim(0, -1)
416
+ y = y * 127.5 + 127.5
417
+ y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
418
+ results.append(y)
419
+ return results
420
+
421
+
422
+ @torch.inference_mode()
423
+ def numpy2pytorch(imgs):
424
+ h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
425
+ h = h.movedim(-1, 1)
426
+ return h
427
+
428
+
429
+ @torch.no_grad()
430
+ def duplicate_prefix_to_suffix(x, count, zero_out=False):
431
+ if zero_out:
432
+ return torch.cat([x, torch.zeros_like(x[:count])], dim=0)
433
+ else:
434
+ return torch.cat([x, x[:count]], dim=0)
435
+
436
+
437
+ def weighted_mse(a, b, weight):
438
+ return torch.mean(weight.float() * (a.float() - b.float()) ** 2)
439
+
440
+
441
+ def clamped_linear_interpolation(x, x_min, y_min, x_max, y_max, sigma=1.0):
442
+ x = (x - x_min) / (x_max - x_min)
443
+ x = max(0.0, min(x, 1.0))
444
+ x = x**sigma
445
+ return y_min + x * (y_max - y_min)
446
+
447
+
448
+ def expand_to_dims(x, target_dims):
449
+ return x.view(*x.shape, *([1] * max(0, target_dims - x.dim())))
450
+
451
+
452
+ def repeat_to_batch_size(tensor: torch.Tensor, batch_size: int):
453
+ if tensor is None:
454
+ return None
455
+
456
+ first_dim = tensor.shape[0]
457
+
458
+ if first_dim == batch_size:
459
+ return tensor
460
+
461
+ if batch_size % first_dim != 0:
462
+ raise ValueError(f"Cannot evenly repeat first dim {first_dim} to match batch_size {batch_size}.")
463
+
464
+ repeat_times = batch_size // first_dim
465
+
466
+ return tensor.repeat(repeat_times, *[1] * (tensor.dim() - 1))
467
+
468
+
469
+ def dim5(x):
470
+ return expand_to_dims(x, 5)
471
+
472
+
473
+ def dim4(x):
474
+ return expand_to_dims(x, 4)
475
+
476
+
477
+ def dim3(x):
478
+ return expand_to_dims(x, 3)
479
+
480
+
481
+ def crop_or_pad_yield_mask(x, length):
482
+ B, F, C = x.shape
483
+ device = x.device
484
+ dtype = x.dtype
485
+
486
+ if F < length:
487
+ y = torch.zeros((B, length, C), dtype=dtype, device=device)
488
+ mask = torch.zeros((B, length), dtype=torch.bool, device=device)
489
+ y[:, :F, :] = x
490
+ mask[:, :F] = True
491
+ return y, mask
492
+
493
+ return x[:, :length, :], torch.ones((B, length), dtype=torch.bool, device=device)
494
+
495
+
496
+ def extend_dim(x, dim, minimal_length, zero_pad=False):
497
+ original_length = int(x.shape[dim])
498
+
499
+ if original_length >= minimal_length:
500
+ return x
501
+
502
+ if zero_pad:
503
+ padding_shape = list(x.shape)
504
+ padding_shape[dim] = minimal_length - original_length
505
+ padding = torch.zeros(padding_shape, dtype=x.dtype, device=x.device)
506
+ else:
507
+ idx = (slice(None),) * dim + (slice(-1, None),) + (slice(None),) * (len(x.shape) - dim - 1)
508
+ last_element = x[idx]
509
+ padding = last_element.repeat_interleave(minimal_length - original_length, dim=dim)
510
+
511
+ return torch.cat([x, padding], dim=dim)
512
+
513
+
514
+ def lazy_positional_encoding(t, repeats=None):
515
+ if not isinstance(t, list):
516
+ t = [t]
517
+
518
+ from diffusers.models.embeddings import get_timestep_embedding
519
+
520
+ te = torch.tensor(t)
521
+ te = get_timestep_embedding(timesteps=te, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=1.0)
522
+
523
+ if repeats is None:
524
+ return te
525
+
526
+ te = te[:, None, :].expand(-1, repeats, -1)
527
+
528
+ return te
529
+
530
+
531
+ def state_dict_offset_merge(A, B, C=None):
532
+ result = {}
533
+ keys = A.keys()
534
+
535
+ for key in keys:
536
+ A_value = A[key]
537
+ B_value = B[key].to(A_value)
538
+
539
+ if C is None:
540
+ result[key] = A_value + B_value
541
+ else:
542
+ C_value = C[key].to(A_value)
543
+ result[key] = A_value + B_value - C_value
544
+
545
+ return result
546
+
547
+
548
+ def state_dict_weighted_merge(state_dicts, weights):
549
+ if len(state_dicts) != len(weights):
550
+ raise ValueError("Number of state dictionaries must match number of weights")
551
+
552
+ if not state_dicts:
553
+ return {}
554
+
555
+ total_weight = sum(weights)
556
+
557
+ if total_weight == 0:
558
+ raise ValueError("Sum of weights cannot be zero")
559
+
560
+ normalized_weights = [w / total_weight for w in weights]
561
+
562
+ keys = state_dicts[0].keys()
563
+ result = {}
564
+
565
+ for key in keys:
566
+ result[key] = state_dicts[0][key] * normalized_weights[0]
567
+
568
+ for i in range(1, len(state_dicts)):
569
+ state_dict_value = state_dicts[i][key].to(result[key])
570
+ result[key] += state_dict_value * normalized_weights[i]
571
+
572
+ return result
573
+
574
+
575
+ def group_files_by_folder(all_files):
576
+ grouped_files = {}
577
+
578
+ for file in all_files:
579
+ folder_name = os.path.basename(os.path.dirname(file))
580
+ if folder_name not in grouped_files:
581
+ grouped_files[folder_name] = []
582
+ grouped_files[folder_name].append(file)
583
+
584
+ list_of_lists = list(grouped_files.values())
585
+ return list_of_lists
586
+
587
+
588
+ def generate_timestamp():
589
+ now = datetime.datetime.now()
590
+ timestamp = now.strftime("%y%m%d_%H%M%S")
591
+ milliseconds = f"{int(now.microsecond / 1000):03d}"
592
+ random_number = random.randint(0, 9999)
593
+ return f"{timestamp}_{milliseconds}_{random_number}"
594
+
595
+
596
+ def write_PIL_image_with_png_info(image, metadata, path):
597
+ from PIL.PngImagePlugin import PngInfo
598
+
599
+ png_info = PngInfo()
600
+ for key, value in metadata.items():
601
+ png_info.add_text(key, value)
602
+
603
+ image.save(path, "PNG", pnginfo=png_info)
604
+ return image
605
+
606
+
607
+ def torch_safe_save(content, path):
608
+ torch.save(content, path + "_tmp")
609
+ os.replace(path + "_tmp", path)
610
+ return path
611
+
612
+
613
+ def move_optimizer_to_device(optimizer, device):
614
+ for state in optimizer.state.values():
615
+ for k, v in state.items():
616
+ if isinstance(v, torch.Tensor):
617
+ state[k] = v.to(device)
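
Of the helpers above, `soft_append_bcthw` is the one that stitches consecutively generated chunks along the time axis: the last `overlap` frames of the history are cross-faded into the first `overlap` frames of the new chunk with linearly decaying weights. A tiny numeric sketch:

```python
import torch
from frame_pack.utils import soft_append_bcthw

history = torch.zeros(1, 1, 4, 1, 1)   # 4 frames of value 0
current = torch.ones(1, 1, 4, 1, 1)    # 4 frames of value 1
out = soft_append_bcthw(history, current, overlap=3)

print(out.shape)              # torch.Size([1, 1, 5, 1, 1]) -> 4 + 4 - 3 frames
print(out[0, 0, :, 0, 0])     # tensor([0.0, 0.0, 0.5, 1.0, 1.0]) -- a linear ramp across the overlap
```
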
frame_pack/wrapper.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+
3
+
4
+ def append_dims(x, target_dims):
5
+ return x[(...,) + (None,) * (target_dims - x.ndim)]
6
+
7
+
8
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0):
9
+ if guidance_rescale == 0:
10
+ return noise_cfg
11
+
12
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
13
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
14
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
15
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1.0 - guidance_rescale) * noise_cfg
16
+ return noise_cfg
17
+
18
+
19
+ def fm_wrapper(transformer, t_scale=1000.0):
20
+ def k_model(x, sigma, **extra_args):
21
+ dtype = extra_args['dtype']
22
+ cfg_scale = extra_args['cfg_scale']
23
+ cfg_rescale = extra_args['cfg_rescale']
24
+ concat_latent = extra_args['concat_latent']
25
+
26
+ original_dtype = x.dtype
27
+ sigma = sigma.float()
28
+
29
+ x = x.to(dtype)
30
+ timestep = (sigma * t_scale).to(dtype)
31
+
32
+ if concat_latent is None:
33
+ hidden_states = x
34
+ else:
35
+ hidden_states = torch.cat([x, concat_latent.to(x)], dim=1)
36
+
37
+ pred_positive = transformer(hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['positive'])[0].float()
38
+
39
+ if cfg_scale == 1.0:
40
+ pred_negative = torch.zeros_like(pred_positive)
41
+ else:
42
+ pred_negative = transformer(hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['negative'])[0].float()
43
+
44
+ pred_cfg = pred_negative + cfg_scale * (pred_positive - pred_negative)
45
+ pred = rescale_noise_cfg(pred_cfg, pred_positive, guidance_rescale=cfg_rescale)
46
+
47
+ x0 = x.float() - pred.float() * append_dims(sigma, x.ndim)
48
+
49
+ return x0.to(dtype=original_dtype)
50
+
51
+ return k_model
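
The two small helpers above can be sanity-checked in isolation; `fm_wrapper` itself needs a loaded FramePack transformer, so this sketch (assuming `frame_pack.wrapper` is importable) only exercises `append_dims` and `rescale_noise_cfg`:

```python
# Sketch: append_dims right-pads singleton dims; rescale_noise_cfg matches the CFG
# prediction's per-sample std to the positive prediction's std when guidance_rescale=1.0.
import torch
from frame_pack.wrapper import append_dims, rescale_noise_cfg

sigma = torch.tensor([0.5, 0.25])
x = torch.randn(2, 4, 8, 8)
print(append_dims(sigma, x.ndim).shape)  # torch.Size([2, 1, 1, 1])

noise_cfg = torch.randn(2, 4, 8, 8) * 2.0
noise_text = torch.randn(2, 4, 8, 8)
rescaled = rescale_noise_cfg(noise_cfg, noise_text, guidance_rescale=1.0)
print(rescaled.std(dim=[1, 2, 3]))  # close to noise_text's per-sample std; 0.0 would return noise_cfg unchanged
```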
framepack_yichen_output/framepack-yichen-lora-000001.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:326b6106f35da477b51af1d7a0064745e839906b68cc6de7a181af8f24102969
3
+ size 275426472
framepack_yichen_output/framepack-yichen-lora-000002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6152c8741c1830db7238cd468f702925b1e32e9aaad47683d6f4a3a71b5f8a80
3
+ size 275426472
framepack_yichen_output/framepack-yichen-lora-000003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73b38a3bfcb16e58aab100f8737937ac498e905180ce7d21262821d624fc38e5
3
+ size 275426480
framepack_yichen_output/framepack-yichen-lora-000004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:000d4dbc25ae1af98e578d014593d777d6ea79cf298baea76a16acb19cfe1d48
3
+ size 275426472
framepack_yichen_output/framepack-yichen-lora-000005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bf8f8dd7a2a7568d2ddd7dcfffb7ef041b32981f4f8400ec6b0edd5c308449c
3
+ size 275426472
framepack_yichen_output/framepack-yichen-lora-000006.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f8672ad47082125de5898c45c9040a36549a75b419c1ba9a0f6efdd1775f79d
3
+ size 275426480
hunyuan_model/__init__.py ADDED
File without changes
hunyuan_model/activation_layers.py ADDED
@@ -0,0 +1,23 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def get_activation_layer(act_type):
5
+ """get activation layer
6
+
7
+ Args:
8
+ act_type (str): the activation type
9
+
10
+ Returns:
11
+ torch.nn.functional: the activation layer
12
+ """
13
+ if act_type == "gelu":
14
+ return lambda: nn.GELU()
15
+ elif act_type == "gelu_tanh":
16
+ # Approximate `tanh` requires torch >= 1.13
17
+ return lambda: nn.GELU(approximate="tanh")
18
+ elif act_type == "relu":
19
+ return nn.ReLU
20
+ elif act_type == "silu":
21
+ return nn.SiLU
22
+ else:
23
+ raise ValueError(f"Unknown activation type: {act_type}")
hunyuan_model/attention.py ADDED
@@ -0,0 +1,295 @@
1
+ import importlib.metadata
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ try:
9
+ import flash_attn
10
+ from flash_attn.flash_attn_interface import _flash_attn_forward
11
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
12
+ from flash_attn.flash_attn_interface import flash_attn_func
13
+ except ImportError:
14
+ flash_attn = None
15
+ flash_attn_varlen_func = None
16
+ _flash_attn_forward = None
17
+ flash_attn_func = None
18
+
19
+ try:
20
+ print(f"Trying to import sageattention")
21
+ from sageattention import sageattn_varlen, sageattn
22
+
23
+ print("Successfully imported sageattention")
24
+ except ImportError:
25
+ print(f"Failed to import sageattention")
26
+ sageattn_varlen = None
27
+ sageattn = None
28
+
29
+ try:
30
+ import xformers.ops as xops
31
+ except ImportError:
32
+ xops = None
33
+
34
+ MEMORY_LAYOUT = {
35
+ "flash": (
36
+ lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
37
+ lambda x: x,
38
+ ),
39
+ "flash_fixlen": (
40
+ lambda x: x,
41
+ lambda x: x,
42
+ ),
43
+ "sageattn": (
44
+ lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
45
+ lambda x: x,
46
+ ),
47
+ "sageattn_fixlen": (
48
+ lambda x: x.transpose(1, 2),
49
+ lambda x: x.transpose(1, 2),
50
+ ),
51
+ "torch": (
52
+ lambda x: x.transpose(1, 2),
53
+ lambda x: x.transpose(1, 2),
54
+ ),
55
+ "xformers": (
56
+ lambda x: x,
57
+ lambda x: x,
58
+ ),
59
+ "vanilla": (
60
+ lambda x: x.transpose(1, 2),
61
+ lambda x: x.transpose(1, 2),
62
+ ),
63
+ }
64
+
65
+
66
+ def get_cu_seqlens(text_mask, img_len):
67
+ """Calculate cu_seqlens_q, cu_seqlens_kv using text_mask and img_len
68
+
69
+ Args:
70
+ text_mask (torch.Tensor): the mask of text
71
+ img_len (int): the length of image
72
+
73
+ Returns:
74
+ torch.Tensor: the calculated cu_seqlens for flash attention
75
+ """
76
+ batch_size = text_mask.shape[0]
77
+ text_len = text_mask.sum(dim=1)
78
+ max_len = text_mask.shape[1] + img_len
79
+
80
+ cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda")
81
+
82
+ for i in range(batch_size):
83
+ s = text_len[i] + img_len
84
+ s1 = i * max_len + s
85
+ s2 = (i + 1) * max_len
86
+ cu_seqlens[2 * i + 1] = s1
87
+ cu_seqlens[2 * i + 2] = s2
88
+
89
+ return cu_seqlens
90
+
91
+
92
+ def attention(
93
+ q_or_qkv_list,
94
+ k=None,
95
+ v=None,
96
+ mode="flash",
97
+ drop_rate=0,
98
+ attn_mask=None,
99
+ total_len=None,
100
+ causal=False,
101
+ cu_seqlens_q=None,
102
+ cu_seqlens_kv=None,
103
+ max_seqlen_q=None,
104
+ max_seqlen_kv=None,
105
+ batch_size=1,
106
+ ):
107
+ """
108
+ Perform QKV self attention.
109
+
110
+ Args:
111
+ q_or_qkv_list (torch.Tensor or list): Query tensor with shape [b, s, a, d] (or a [q, k, v] list), where a is the number of heads.
112
+ k (torch.Tensor): Key tensor with shape [b, s1, a, d]
113
+ v (torch.Tensor): Value tensor with shape [b, s1, a, d]
114
+ mode (str): Attention mode. Choose from 'flash', 'torch', 'xformers', 'sageattn', and 'vanilla'.
115
+ drop_rate (float): Dropout rate in attention map. (default: 0)
116
+ attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
117
+ (default: None)
118
+ causal (bool): Whether to use causal attention. (default: False)
119
+ cu_seqlens_q (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
120
+ used to index into q.
121
+ cu_seqlens_kv (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
122
+ used to index into kv.
123
+ max_seqlen_q (int): The maximum sequence length in the batch of q.
124
+ max_seqlen_kv (int): The maximum sequence length in the batch of k and v.
125
+
126
+ Returns:
127
+ torch.Tensor: Output tensor after self attention with shape [b, s, ad]
128
+ """
129
+ q, k, v = q_or_qkv_list if type(q_or_qkv_list) == list else (q_or_qkv_list, k, v)
130
+ if type(q_or_qkv_list) == list:
131
+ q_or_qkv_list.clear()
132
+ split_attn = total_len is not None
133
+ if split_attn and mode == "sageattn":
134
+ mode = "sageattn_fixlen"
135
+ elif split_attn and mode == "flash":
136
+ mode = "flash_fixlen"
137
+ # print(f"Attention mode: {mode}, split_attn: {split_attn}")
138
+ pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
139
+
140
+ # trim the sequence length to the actual length instead of attn_mask
141
+ if split_attn:
142
+ trimmed_len = q.shape[1] - total_len
143
+ q = [q[i : i + 1, : total_len[i]] for i in range(len(q))]
144
+ k = [k[i : i + 1, : total_len[i]] for i in range(len(k))]
145
+ v = [v[i : i + 1, : total_len[i]] for i in range(len(v))]
146
+ q = [pre_attn_layout(q_i) for q_i in q]
147
+ k = [pre_attn_layout(k_i) for k_i in k]
148
+ v = [pre_attn_layout(v_i) for v_i in v]
149
+ # print(
150
+ # f"Trimming the sequence length to {total_len},trimmed_len: {trimmed_len}, q.shape: {[q_i.shape for q_i in q]}, mode: {mode}"
151
+ # )
152
+ else:
153
+ q = pre_attn_layout(q)
154
+ k = pre_attn_layout(k)
155
+ v = pre_attn_layout(v)
156
+
157
+ if mode == "torch":
158
+ if split_attn:
159
+ x = []
160
+ for i in range(len(q)):
161
+ x_i = F.scaled_dot_product_attention(q[i], k[i], v[i], dropout_p=drop_rate, is_causal=causal)
162
+ q[i], k[i], v[i] = None, None, None
163
+ x.append(x_i)
164
+ del q, k, v
165
+ else:
166
+ if attn_mask is not None and attn_mask.dtype != torch.bool:
167
+ attn_mask = attn_mask.to(q.dtype)
168
+ x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal)
169
+ del q, k, v
170
+ del attn_mask
171
+
172
+ elif mode == "xformers":
173
+ # B, M, H, K: M is the sequence length, H is the number of heads, K is the dimension of the heads -> it is same as input dimension
174
+ # currently only support batch_size = 1
175
+ assert split_attn, "Xformers only supports splitting"
176
+ x = []
177
+ for i in range(len(q)):
178
+ x_i = xops.memory_efficient_attention(q[i], k[i], v[i], p=drop_rate) # , causal=causal)
179
+ q[i], k[i], v[i] = None, None, None
180
+ x.append(x_i)
181
+ del q, k, v
182
+
183
+ elif mode == "flash":
184
+ x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
185
+ del q, k, v
186
+ # x with shape [(bxs), a, d]
187
+ x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]) # reshape x to [b, s, a, d]
188
+ elif mode == "flash_fixlen":
189
+ x = []
190
+ for i in range(len(q)):
191
+ # q: (batch_size, seqlen, nheads, headdim), k: (batch_size, seqlen, nheads_k, headdim), v: (batch_size, seqlen, nheads_k, headdim)
192
+ x_i = flash_attn_func(q[i], k[i], v[i], dropout_p=drop_rate, causal=causal)
193
+ q[i], k[i], v[i] = None, None, None
194
+ x.append(x_i)
195
+ del q, k, v
196
+ elif mode == "sageattn":
197
+ x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
198
+ del q, k, v
199
+ # x with shape [(bxs), a, d]
200
+ x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]) # reshape x to [b, s, a, d]
201
+ elif mode == "sageattn_fixlen":
202
+ x = []
203
+ for i in range(len(q)):
204
+ # HND seems to cause an error
205
+ x_i = sageattn(q[i], k[i], v[i]) # (batch_size, seq_len, head_num, head_dim)
206
+ q[i], k[i], v[i] = None, None, None
207
+ x.append(x_i)
208
+ del q, k, v
209
+ elif mode == "vanilla":
210
+ assert not split_attn, "Vanilla attention does not support trimming"
211
+ scale_factor = 1 / math.sqrt(q.size(-1))
212
+
213
+ b, a, s, _ = q.shape
214
+ s1 = k.size(2)
215
+ attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
216
+ if causal:
217
+ # Only applied to self attention
218
+ assert attn_mask is None, "Causal mask and attn_mask cannot be used together"
219
+ temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(diagonal=0)
220
+ attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
221
+ attn_bias.to(q.dtype)
222
+
223
+ if attn_mask is not None:
224
+ if attn_mask.dtype == torch.bool:
225
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
226
+ else:
227
+ attn_bias += attn_mask
228
+
229
+ # TODO: Maybe force q and k to be float32 to avoid numerical overflow
230
+ attn = (q @ k.transpose(-2, -1)) * scale_factor
231
+ attn += attn_bias
232
+ attn = attn.softmax(dim=-1)
233
+ attn = torch.dropout(attn, p=drop_rate, train=True)
234
+ x = attn @ v
235
+ else:
236
+ raise NotImplementedError(f"Unsupported attention mode: {mode}")
237
+
238
+ if split_attn:
239
+ x = [post_attn_layout(x_i) for x_i in x]
240
+ for i in range(len(x)):
241
+ x[i] = F.pad(x[i], (0, 0, 0, 0, 0, trimmed_len[i]))
242
+ x = torch.cat(x, dim=0)
243
+ else:
244
+ x = post_attn_layout(x)
245
+
246
+ b, s, a, d = x.shape
247
+ out = x.reshape(b, s, -1)
248
+ return out
249
+
250
+
251
+ def parallel_attention(hybrid_seq_parallel_attn, q, k, v, img_q_len, img_kv_len, cu_seqlens_q, cu_seqlens_kv):
252
+ attn1 = hybrid_seq_parallel_attn(
253
+ None,
254
+ q[:, :img_q_len, :, :],
255
+ k[:, :img_kv_len, :, :],
256
+ v[:, :img_kv_len, :, :],
257
+ dropout_p=0.0,
258
+ causal=False,
259
+ joint_tensor_query=q[:, img_q_len : cu_seqlens_q[1]],
260
+ joint_tensor_key=k[:, img_kv_len : cu_seqlens_kv[1]],
261
+ joint_tensor_value=v[:, img_kv_len : cu_seqlens_kv[1]],
262
+ joint_strategy="rear",
263
+ )
264
+ if flash_attn.__version__ >= "2.7.0":
265
+ attn2, *_ = _flash_attn_forward(
266
+ q[:, cu_seqlens_q[1] :],
267
+ k[:, cu_seqlens_kv[1] :],
268
+ v[:, cu_seqlens_kv[1] :],
269
+ dropout_p=0.0,
270
+ softmax_scale=q.shape[-1] ** (-0.5),
271
+ causal=False,
272
+ window_size_left=-1,
273
+ window_size_right=-1,
274
+ softcap=0.0,
275
+ alibi_slopes=None,
276
+ return_softmax=False,
277
+ )
278
+ else:
279
+ attn2, *_ = _flash_attn_forward(
280
+ q[:, cu_seqlens_q[1] :],
281
+ k[:, cu_seqlens_kv[1] :],
282
+ v[:, cu_seqlens_kv[1] :],
283
+ dropout_p=0.0,
284
+ softmax_scale=q.shape[-1] ** (-0.5),
285
+ causal=False,
286
+ window_size=(-1, -1),
287
+ softcap=0.0,
288
+ alibi_slopes=None,
289
+ return_softmax=False,
290
+ )
291
+ attn = torch.cat([attn1, attn2], dim=1)
292
+ b, s, a, d = attn.shape
293
+ attn = attn.reshape(b, s, -1)
294
+
295
+ return attn
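
The dispatcher above can be exercised without flash-attn, SageAttention, or xformers by selecting the pure-PyTorch path. A small shape check (a sketch, assuming `hunyuan_model.attention` is importable with only `torch` installed):

```python
# Sketch: calling attention() in "torch" (scaled_dot_product_attention) mode.
import torch
from hunyuan_model.attention import attention

b, s, heads, head_dim = 1, 16, 4, 32
q = torch.randn(b, s, heads, head_dim)
k = torch.randn(b, s, heads, head_dim)
v = torch.randn(b, s, heads, head_dim)

# Passing q/k/v as a list lets the function drop its references as early as possible.
out = attention([q, k, v], mode="torch")
print(out.shape)  # torch.Size([1, 16, 128]): heads and head_dim are flattened into the last axis
```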
hunyuan_model/autoencoder_kl_causal_3d.py ADDED
@@ -0,0 +1,609 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ #
16
+ # Modified from diffusers==0.29.2
17
+ #
18
+ # ==============================================================================
19
+ from typing import Dict, Optional, Tuple, Union
20
+ from dataclasses import dataclass
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+
25
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
26
+
27
+ # try:
28
+ # # This diffusers is modified and packed in the mirror.
29
+ # from diffusers.loaders import FromOriginalVAEMixin
30
+ # except ImportError:
31
+ # # Use this to be compatible with the original diffusers.
32
+ # from diffusers.loaders.single_file_model import FromOriginalModelMixin as FromOriginalVAEMixin
33
+ from diffusers.utils.accelerate_utils import apply_forward_hook
34
+ from diffusers.models.attention_processor import (
35
+ ADDED_KV_ATTENTION_PROCESSORS,
36
+ CROSS_ATTENTION_PROCESSORS,
37
+ Attention,
38
+ AttentionProcessor,
39
+ AttnAddedKVProcessor,
40
+ AttnProcessor,
41
+ )
42
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
43
+ from diffusers.models.modeling_utils import ModelMixin
44
+ from .vae import DecoderCausal3D, BaseOutput, DecoderOutput, DiagonalGaussianDistribution, EncoderCausal3D
45
+
46
+
47
+ @dataclass
48
+ class DecoderOutput2(BaseOutput):
49
+ sample: torch.FloatTensor
50
+ posterior: Optional[DiagonalGaussianDistribution] = None
51
+
52
+
53
+ class AutoencoderKLCausal3D(ModelMixin, ConfigMixin):
54
+ r"""
55
+ A VAE model with KL loss for encoding images/videos into latents and decoding latent representations into images/videos.
56
+
57
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
58
+ for all models (such as downloading or saving).
59
+ """
60
+
61
+ _supports_gradient_checkpointing = True
62
+
63
+ @register_to_config
64
+ def __init__(
65
+ self,
66
+ in_channels: int = 3,
67
+ out_channels: int = 3,
68
+ down_block_types: Tuple[str] = ("DownEncoderBlockCausal3D",),
69
+ up_block_types: Tuple[str] = ("UpDecoderBlockCausal3D",),
70
+ block_out_channels: Tuple[int] = (64,),
71
+ layers_per_block: int = 1,
72
+ act_fn: str = "silu",
73
+ latent_channels: int = 4,
74
+ norm_num_groups: int = 32,
75
+ sample_size: int = 32,
76
+ sample_tsize: int = 64,
77
+ scaling_factor: float = 0.18215,
78
+ force_upcast: float = True,
79
+ spatial_compression_ratio: int = 8,
80
+ time_compression_ratio: int = 4,
81
+ mid_block_add_attention: bool = True,
82
+ ):
83
+ super().__init__()
84
+
85
+ self.time_compression_ratio = time_compression_ratio
86
+
87
+ self.encoder = EncoderCausal3D(
88
+ in_channels=in_channels,
89
+ out_channels=latent_channels,
90
+ down_block_types=down_block_types,
91
+ block_out_channels=block_out_channels,
92
+ layers_per_block=layers_per_block,
93
+ act_fn=act_fn,
94
+ norm_num_groups=norm_num_groups,
95
+ double_z=True,
96
+ time_compression_ratio=time_compression_ratio,
97
+ spatial_compression_ratio=spatial_compression_ratio,
98
+ mid_block_add_attention=mid_block_add_attention,
99
+ )
100
+
101
+ self.decoder = DecoderCausal3D(
102
+ in_channels=latent_channels,
103
+ out_channels=out_channels,
104
+ up_block_types=up_block_types,
105
+ block_out_channels=block_out_channels,
106
+ layers_per_block=layers_per_block,
107
+ norm_num_groups=norm_num_groups,
108
+ act_fn=act_fn,
109
+ time_compression_ratio=time_compression_ratio,
110
+ spatial_compression_ratio=spatial_compression_ratio,
111
+ mid_block_add_attention=mid_block_add_attention,
112
+ )
113
+
114
+ self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
115
+ self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)
116
+
117
+ self.use_slicing = False
118
+ self.use_spatial_tiling = False
119
+ self.use_temporal_tiling = False
120
+
121
+ # only relevant if vae tiling is enabled
122
+ self.tile_sample_min_tsize = sample_tsize
123
+ self.tile_latent_min_tsize = sample_tsize // time_compression_ratio
124
+
125
+ self.tile_sample_min_size = self.config.sample_size
126
+ sample_size = self.config.sample_size[0] if isinstance(self.config.sample_size, (list, tuple)) else self.config.sample_size
127
+ self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
128
+ self.tile_overlap_factor = 0.25
129
+
130
+ def _set_gradient_checkpointing(self, module, value=False):
131
+ if isinstance(module, (EncoderCausal3D, DecoderCausal3D)):
132
+ module.gradient_checkpointing = value
133
+
134
+ def enable_temporal_tiling(self, use_tiling: bool = True):
135
+ self.use_temporal_tiling = use_tiling
136
+
137
+ def disable_temporal_tiling(self):
138
+ self.enable_temporal_tiling(False)
139
+
140
+ def enable_spatial_tiling(self, use_tiling: bool = True):
141
+ self.use_spatial_tiling = use_tiling
142
+
143
+ def disable_spatial_tiling(self):
144
+ self.enable_spatial_tiling(False)
145
+
146
+ def enable_tiling(self, use_tiling: bool = True):
147
+ r"""
148
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
149
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
150
+ processing larger videos.
151
+ """
152
+ self.enable_spatial_tiling(use_tiling)
153
+ self.enable_temporal_tiling(use_tiling)
154
+
155
+ def disable_tiling(self):
156
+ r"""
157
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
158
+ decoding in one step.
159
+ """
160
+ self.disable_spatial_tiling()
161
+ self.disable_temporal_tiling()
162
+
163
+ def enable_slicing(self):
164
+ r"""
165
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
166
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
167
+ """
168
+ self.use_slicing = True
169
+
170
+ def disable_slicing(self):
171
+ r"""
172
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
173
+ decoding in one step.
174
+ """
175
+ self.use_slicing = False
176
+
177
+ def set_chunk_size_for_causal_conv_3d(self, chunk_size: int):
178
+ # set chunk_size to CausalConv3d recursively
179
+ def set_chunk_size(module):
180
+ if hasattr(module, "chunk_size"):
181
+ module.chunk_size = chunk_size
182
+
183
+ self.apply(set_chunk_size)
184
+
185
+ @property
186
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
187
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
188
+ r"""
189
+ Returns:
190
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
191
+ indexed by its weight name.
192
+ """
193
+ # set recursively
194
+ processors = {}
195
+
196
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
197
+ if hasattr(module, "get_processor"):
198
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
199
+
200
+ for sub_name, child in module.named_children():
201
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
202
+
203
+ return processors
204
+
205
+ for name, module in self.named_children():
206
+ fn_recursive_add_processors(name, module, processors)
207
+
208
+ return processors
209
+
210
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
211
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False):
212
+ r"""
213
+ Sets the attention processor to use to compute attention.
214
+
215
+ Parameters:
216
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
217
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
218
+ for **all** `Attention` layers.
219
+
220
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
221
+ processor. This is strongly recommended when setting trainable attention processors.
222
+
223
+ """
224
+ count = len(self.attn_processors.keys())
225
+
226
+ if isinstance(processor, dict) and len(processor) != count:
227
+ raise ValueError(
228
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
229
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
230
+ )
231
+
232
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
233
+ if hasattr(module, "set_processor"):
234
+ if not isinstance(processor, dict):
235
+ module.set_processor(processor, _remove_lora=_remove_lora)
236
+ else:
237
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
238
+
239
+ for sub_name, child in module.named_children():
240
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
241
+
242
+ for name, module in self.named_children():
243
+ fn_recursive_attn_processor(name, module, processor)
244
+
245
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
246
+ def set_default_attn_processor(self):
247
+ """
248
+ Disables custom attention processors and sets the default attention implementation.
249
+ """
250
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
251
+ processor = AttnAddedKVProcessor()
252
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
253
+ processor = AttnProcessor()
254
+ else:
255
+ raise ValueError(
256
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
257
+ )
258
+
259
+ self.set_attn_processor(processor, _remove_lora=True)
260
+
261
+ @apply_forward_hook
262
+ def encode(
263
+ self, x: torch.FloatTensor, return_dict: bool = True
264
+ ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
265
+ """
266
+ Encode a batch of images/videos into latents.
267
+
268
+ Args:
269
+ x (`torch.FloatTensor`): Input batch of images/videos.
270
+ return_dict (`bool`, *optional*, defaults to `True`):
271
+ Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
272
+
273
+ Returns:
274
+ The latent representations of the encoded images/videos. If `return_dict` is True, a
275
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
276
+ """
277
+ assert len(x.shape) == 5, "The input tensor should have 5 dimensions."
278
+
279
+ if self.use_temporal_tiling and x.shape[2] > self.tile_sample_min_tsize:
280
+ return self.temporal_tiled_encode(x, return_dict=return_dict)
281
+
282
+ if self.use_spatial_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
283
+ return self.spatial_tiled_encode(x, return_dict=return_dict)
284
+
285
+ if self.use_slicing and x.shape[0] > 1:
286
+ encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
287
+ h = torch.cat(encoded_slices)
288
+ else:
289
+ h = self.encoder(x)
290
+
291
+ moments = self.quant_conv(h)
292
+ posterior = DiagonalGaussianDistribution(moments)
293
+
294
+ if not return_dict:
295
+ return (posterior,)
296
+
297
+ return AutoencoderKLOutput(latent_dist=posterior)
298
+
299
+ def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
300
+ assert len(z.shape) == 5, "The input tensor should have 5 dimensions."
301
+
302
+ if self.use_temporal_tiling and z.shape[2] > self.tile_latent_min_tsize:
303
+ return self.temporal_tiled_decode(z, return_dict=return_dict)
304
+
305
+ if self.use_spatial_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
306
+ return self.spatial_tiled_decode(z, return_dict=return_dict)
307
+
308
+ z = self.post_quant_conv(z)
309
+ dec = self.decoder(z)
310
+
311
+ if not return_dict:
312
+ return (dec,)
313
+
314
+ return DecoderOutput(sample=dec)
315
+
316
+ @apply_forward_hook
317
+ def decode(self, z: torch.FloatTensor, return_dict: bool = True, generator=None) -> Union[DecoderOutput, torch.FloatTensor]:
318
+ """
319
+ Decode a batch of images/videos.
320
+
321
+ Args:
322
+ z (`torch.FloatTensor`): Input batch of latent vectors.
323
+ return_dict (`bool`, *optional*, defaults to `True`):
324
+ Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
325
+
326
+ Returns:
327
+ [`~models.vae.DecoderOutput`] or `tuple`:
328
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
329
+ returned.
330
+
331
+ """
332
+ if self.use_slicing and z.shape[0] > 1:
333
+ decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
334
+ decoded = torch.cat(decoded_slices)
335
+ else:
336
+ decoded = self._decode(z).sample
337
+
338
+ if not return_dict:
339
+ return (decoded,)
340
+
341
+ return DecoderOutput(sample=decoded)
342
+
343
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
344
+ blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
345
+ for y in range(blend_extent):
346
+ b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
347
+ return b
348
+
349
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
350
+ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
351
+ for x in range(blend_extent):
352
+ b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
353
+ return b
354
+
355
+ def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
356
+ blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
357
+ for x in range(blend_extent):
358
+ b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (x / blend_extent)
359
+ return b
360
+
361
+ def spatial_tiled_encode(
362
+ self, x: torch.FloatTensor, return_dict: bool = True, return_moments: bool = False
363
+ ) -> AutoencoderKLOutput:
364
+ r"""Encode a batch of images/videos using a tiled encoder.
365
+
366
+ When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
367
+ steps. This is useful to keep memory use constant regardless of image/videos size. The end result of tiled encoding is
368
+ different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
369
+ tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
370
+ output, but they should be much less noticeable.
371
+
372
+ Args:
373
+ x (`torch.FloatTensor`): Input batch of images/videos.
374
+ return_dict (`bool`, *optional*, defaults to `True`):
375
+ Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
376
+
377
+ Returns:
378
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
379
+ If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
380
+ `tuple` is returned.
381
+ """
382
+ overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
383
+ blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
384
+ row_limit = self.tile_latent_min_size - blend_extent
385
+
386
+ # Split video into tiles and encode them separately.
387
+ rows = []
388
+ for i in range(0, x.shape[-2], overlap_size):
389
+ row = []
390
+ for j in range(0, x.shape[-1], overlap_size):
391
+ tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
392
+ tile = self.encoder(tile)
393
+ tile = self.quant_conv(tile)
394
+ row.append(tile)
395
+ rows.append(row)
396
+ result_rows = []
397
+ for i, row in enumerate(rows):
398
+ result_row = []
399
+ for j, tile in enumerate(row):
400
+ # blend the above tile and the left tile
401
+ # to the current tile and add the current tile to the result row
402
+ if i > 0:
403
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
404
+ if j > 0:
405
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
406
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
407
+ result_rows.append(torch.cat(result_row, dim=-1))
408
+
409
+ moments = torch.cat(result_rows, dim=-2)
410
+ if return_moments:
411
+ return moments
412
+
413
+ posterior = DiagonalGaussianDistribution(moments)
414
+ if not return_dict:
415
+ return (posterior,)
416
+
417
+ return AutoencoderKLOutput(latent_dist=posterior)
418
+
419
+ def spatial_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
420
+ r"""
421
+ Decode a batch of images/videos using a tiled decoder.
422
+
423
+ Args:
424
+ z (`torch.FloatTensor`): Input batch of latent vectors.
425
+ return_dict (`bool`, *optional*, defaults to `True`):
426
+ Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
427
+
428
+ Returns:
429
+ [`~models.vae.DecoderOutput`] or `tuple`:
430
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
431
+ returned.
432
+ """
433
+ overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
434
+ blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
435
+ row_limit = self.tile_sample_min_size - blend_extent
436
+
437
+ # Split z into overlapping tiles and decode them separately.
438
+ # The tiles have an overlap to avoid seams between tiles.
439
+ rows = []
440
+ for i in range(0, z.shape[-2], overlap_size):
441
+ row = []
442
+ for j in range(0, z.shape[-1], overlap_size):
443
+ tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
444
+ tile = self.post_quant_conv(tile)
445
+ decoded = self.decoder(tile)
446
+ row.append(decoded)
447
+ rows.append(row)
448
+ result_rows = []
449
+ for i, row in enumerate(rows):
450
+ result_row = []
451
+ for j, tile in enumerate(row):
452
+ # blend the above tile and the left tile
453
+ # to the current tile and add the current tile to the result row
454
+ if i > 0:
455
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
456
+ if j > 0:
457
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
458
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
459
+ result_rows.append(torch.cat(result_row, dim=-1))
460
+
461
+ dec = torch.cat(result_rows, dim=-2)
462
+ if not return_dict:
463
+ return (dec,)
464
+
465
+ return DecoderOutput(sample=dec)
466
+
467
+ def temporal_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
468
+
469
+ B, C, T, H, W = x.shape
470
+ overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))
471
+ blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)
472
+ t_limit = self.tile_latent_min_tsize - blend_extent
473
+
474
+ # Split the video into tiles and encode them separately.
475
+ row = []
476
+ for i in range(0, T, overlap_size):
477
+ tile = x[:, :, i : i + self.tile_sample_min_tsize + 1, :, :]
478
+ if self.use_spatial_tiling and (
479
+ tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size
480
+ ):
481
+ tile = self.spatial_tiled_encode(tile, return_moments=True)
482
+ else:
483
+ tile = self.encoder(tile)
484
+ tile = self.quant_conv(tile)
485
+ if i > 0:
486
+ tile = tile[:, :, 1:, :, :]
487
+ row.append(tile)
488
+ result_row = []
489
+ for i, tile in enumerate(row):
490
+ if i > 0:
491
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
492
+ result_row.append(tile[:, :, :t_limit, :, :])
493
+ else:
494
+ result_row.append(tile[:, :, : t_limit + 1, :, :])
495
+
496
+ moments = torch.cat(result_row, dim=2)
497
+ posterior = DiagonalGaussianDistribution(moments)
498
+
499
+ if not return_dict:
500
+ return (posterior,)
501
+
502
+ return AutoencoderKLOutput(latent_dist=posterior)
503
+
504
+ def temporal_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
505
+ # Split z into overlapping tiles and decode them separately.
506
+
507
+ B, C, T, H, W = z.shape
508
+ overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))
509
+ blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)
510
+ t_limit = self.tile_sample_min_tsize - blend_extent
511
+
512
+ row = []
513
+ for i in range(0, T, overlap_size):
514
+ tile = z[:, :, i : i + self.tile_latent_min_tsize + 1, :, :]
515
+ if self.use_spatial_tiling and (
516
+ tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size
517
+ ):
518
+ decoded = self.spatial_tiled_decode(tile, return_dict=True).sample
519
+ else:
520
+ tile = self.post_quant_conv(tile)
521
+ decoded = self.decoder(tile)
522
+ if i > 0:
523
+ decoded = decoded[:, :, 1:, :, :]
524
+ row.append(decoded)
525
+ result_row = []
526
+ for i, tile in enumerate(row):
527
+ if i > 0:
528
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
529
+ result_row.append(tile[:, :, :t_limit, :, :])
530
+ else:
531
+ result_row.append(tile[:, :, : t_limit + 1, :, :])
532
+
533
+ dec = torch.cat(result_row, dim=2)
534
+ if not return_dict:
535
+ return (dec,)
536
+
537
+ return DecoderOutput(sample=dec)
538
+
539
+ def forward(
540
+ self,
541
+ sample: torch.FloatTensor,
542
+ sample_posterior: bool = False,
543
+ return_dict: bool = True,
544
+ return_posterior: bool = False,
545
+ generator: Optional[torch.Generator] = None,
546
+ ) -> Union[DecoderOutput2, torch.FloatTensor]:
547
+ r"""
548
+ Args:
549
+ sample (`torch.FloatTensor`): Input sample.
550
+ sample_posterior (`bool`, *optional*, defaults to `False`):
551
+ Whether to sample from the posterior.
552
+ return_dict (`bool`, *optional*, defaults to `True`):
553
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
554
+ """
555
+ x = sample
556
+ posterior = self.encode(x).latent_dist
557
+ if sample_posterior:
558
+ z = posterior.sample(generator=generator)
559
+ else:
560
+ z = posterior.mode()
561
+ dec = self.decode(z).sample
562
+
563
+ if not return_dict:
564
+ if return_posterior:
565
+ return (dec, posterior)
566
+ else:
567
+ return (dec,)
568
+ if return_posterior:
569
+ return DecoderOutput2(sample=dec, posterior=posterior)
570
+ else:
571
+ return DecoderOutput2(sample=dec)
572
+
573
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
574
+ def fuse_qkv_projections(self):
575
+ """
576
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
577
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
578
+
579
+ <Tip warning={true}>
580
+
581
+ This API is 🧪 experimental.
582
+
583
+ </Tip>
584
+ """
585
+ self.original_attn_processors = None
586
+
587
+ for _, attn_processor in self.attn_processors.items():
588
+ if "Added" in str(attn_processor.__class__.__name__):
589
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
590
+
591
+ self.original_attn_processors = self.attn_processors
592
+
593
+ for module in self.modules():
594
+ if isinstance(module, Attention):
595
+ module.fuse_projections(fuse=True)
596
+
597
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
598
+ def unfuse_qkv_projections(self):
599
+ """Disables the fused QKV projection if enabled.
600
+
601
+ <Tip warning={true}>
602
+
603
+ This API is 🧪 experimental.
604
+
605
+ </Tip>
606
+
607
+ """
608
+ if self.original_attn_processors is not None:
609
+ self.set_attn_processor(self.original_attn_processors)
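
The thresholds used by the tiled encode/decode paths above follow directly from the config. The arithmetic below reproduces the `spatial_tiled_decode` bookkeeping for illustrative values (`sample_size=256`, four entries in `block_out_channels`, the default overlap factor of 0.25); these are not the model's real settings:

```python
# Pure-arithmetic sketch of the spatial tiling bookkeeping (illustrative values only).
sample_size = 256            # stands in for config.sample_size
num_blocks = 4               # stands in for len(config.block_out_channels)
tile_overlap_factor = 0.25

tile_sample_min_size = sample_size
tile_latent_min_size = int(sample_size / (2 ** (num_blocks - 1)))     # 32

overlap_size = int(tile_latent_min_size * (1 - tile_overlap_factor))  # stride over the latent grid: 24
blend_extent = int(tile_sample_min_size * tile_overlap_factor)        # rows/cols blended between decoded tiles: 64
row_limit = tile_sample_min_size - blend_extent                       # pixels kept from each decoded tile: 192
print(overlap_size, blend_extent, row_limit)
```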
hunyuan_model/embed_layers.py ADDED
@@ -0,0 +1,132 @@
1
+ import collections
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ from einops import rearrange, repeat
6
+
7
+ from .helpers import to_2tuple
8
+
9
+ class PatchEmbed(nn.Module):
10
+ """2D Image to Patch Embedding
11
+
12
+ Image/video to patch embedding using Conv3d
13
+
14
+ A convolution based approach to patchifying a 2D image w/ embedding projection.
15
+
16
+ Based on the impl in https://github.com/google-research/vision_transformer
17
+
18
+ Hacked together by / Copyright 2020 Ross Wightman
19
+
20
+ The _assert call was removed from forward() to stay compatible with multi-resolution images.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ patch_size=16,
26
+ in_chans=3,
27
+ embed_dim=768,
28
+ norm_layer=None,
29
+ flatten=True,
30
+ bias=True,
31
+ dtype=None,
32
+ device=None,
33
+ ):
34
+ factory_kwargs = {"dtype": dtype, "device": device}
35
+ super().__init__()
36
+ patch_size = to_2tuple(patch_size)
37
+ self.patch_size = patch_size
38
+ self.flatten = flatten
39
+
40
+ self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias, **factory_kwargs)
41
+ nn.init.xavier_uniform_(self.proj.weight.view(self.proj.weight.size(0), -1))
42
+ if bias:
43
+ nn.init.zeros_(self.proj.bias)
44
+
45
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
46
+
47
+ def forward(self, x):
48
+ x = self.proj(x)
49
+ if self.flatten:
50
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
51
+ x = self.norm(x)
52
+ return x
53
+
54
+
55
+ class TextProjection(nn.Module):
56
+ """
57
+ Projects text embeddings. Also handles dropout for classifier-free guidance.
58
+
59
+ Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
60
+ """
61
+
62
+ def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
63
+ factory_kwargs = {"dtype": dtype, "device": device}
64
+ super().__init__()
65
+ self.linear_1 = nn.Linear(in_features=in_channels, out_features=hidden_size, bias=True, **factory_kwargs)
66
+ self.act_1 = act_layer()
67
+ self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True, **factory_kwargs)
68
+
69
+ def forward(self, caption):
70
+ hidden_states = self.linear_1(caption)
71
+ hidden_states = self.act_1(hidden_states)
72
+ hidden_states = self.linear_2(hidden_states)
73
+ return hidden_states
74
+
75
+
76
+ def timestep_embedding(t, dim, max_period=10000):
77
+ """
78
+ Create sinusoidal timestep embeddings.
79
+
80
+ Args:
81
+ t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
82
+ dim (int): the dimension of the output.
83
+ max_period (int): controls the minimum frequency of the embeddings.
84
+
85
+ Returns:
86
+ embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.
87
+
88
+ .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
89
+ """
90
+ half = dim // 2
91
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(device=t.device)
92
+ args = t[:, None].float() * freqs[None]
93
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
94
+ if dim % 2:
95
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
96
+ return embedding
97
+
98
+
99
+ class TimestepEmbedder(nn.Module):
100
+ """
101
+ Embeds scalar timesteps into vector representations.
102
+ """
103
+
104
+ def __init__(
105
+ self,
106
+ hidden_size,
107
+ act_layer,
108
+ frequency_embedding_size=256,
109
+ max_period=10000,
110
+ out_size=None,
111
+ dtype=None,
112
+ device=None,
113
+ ):
114
+ factory_kwargs = {"dtype": dtype, "device": device}
115
+ super().__init__()
116
+ self.frequency_embedding_size = frequency_embedding_size
117
+ self.max_period = max_period
118
+ if out_size is None:
119
+ out_size = hidden_size
120
+
121
+ self.mlp = nn.Sequential(
122
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs),
123
+ act_layer(),
124
+ nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
125
+ )
126
+ nn.init.normal_(self.mlp[0].weight, std=0.02)
127
+ nn.init.normal_(self.mlp[2].weight, std=0.02)
128
+
129
+ def forward(self, t):
130
+ t_freq = timestep_embedding(t, self.frequency_embedding_size, self.max_period).type(self.mlp[0].weight.dtype)
131
+ t_emb = self.mlp(t_freq)
132
+ return t_emb
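
A quick shape check for the sinusoidal helper above (a sketch, assuming `hunyuan_model.embed_layers` and its dependencies such as `einops` are installed):

```python
# Sketch: timestep_embedding returns the cosine half followed by the sine half.
import torch
from hunyuan_model.embed_layers import timestep_embedding

t = torch.tensor([0.0, 250.0, 999.0])
emb = timestep_embedding(t, dim=256)
print(emb.shape)  # torch.Size([3, 256])
```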
hunyuan_model/fp8_optimization.py ADDED
@@ -0,0 +1,39 @@
1
+ # based on ComfyUI's and MinusZoneAI's fp8_linear optimization
2
+ # further borrowed from HunyuanVideoWrapper for Musubi Tuner
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ def fp8_linear_forward(cls, original_dtype, input):
7
+ weight_dtype = cls.weight.dtype
8
+ if weight_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
9
+ if len(input.shape) == 3:
10
+ target_dtype = torch.float8_e5m2 if weight_dtype == torch.float8_e4m3fn else torch.float8_e4m3fn
11
+ inn = input.reshape(-1, input.shape[2]).to(target_dtype)
12
+ w = cls.weight.t()
13
+
14
+ scale = torch.ones((1), device=input.device, dtype=torch.float32)
15
+ bias = cls.bias.to(original_dtype) if cls.bias is not None else None
16
+
17
+ if bias is not None:
18
+ o = torch._scaled_mm(inn, w, out_dtype=original_dtype, bias=bias, scale_a=scale, scale_b=scale)
19
+ else:
20
+ o = torch._scaled_mm(inn, w, out_dtype=original_dtype, scale_a=scale, scale_b=scale)
21
+
22
+ if isinstance(o, tuple):
23
+ o = o[0]
24
+
25
+ return o.reshape((-1, input.shape[1], cls.weight.shape[0]))
26
+ else:
27
+ return cls.original_forward(input.to(original_dtype))
28
+ else:
29
+ return cls.original_forward(input)
30
+
31
+ def convert_fp8_linear(module, original_dtype, params_to_keep={}):
32
+ setattr(module, "fp8_matmul_enabled", True)
33
+
34
+ for name, module in module.named_modules():
35
+ if not any(keyword in name for keyword in params_to_keep):
36
+ if isinstance(module, nn.Linear):
37
+ original_forward = module.forward
38
+ setattr(module, "original_forward", original_forward)
39
+ setattr(module, "forward", lambda input, m=module: fp8_linear_forward(m, original_dtype, input))
hunyuan_model/helpers.py ADDED
@@ -0,0 +1,40 @@
1
+ import collections.abc
2
+
3
+ from itertools import repeat
4
+
5
+
6
+ def _ntuple(n):
7
+ def parse(x):
8
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
9
+ x = tuple(x)
10
+ if len(x) == 1:
11
+ x = tuple(repeat(x[0], n))
12
+ return x
13
+ return tuple(repeat(x, n))
14
+ return parse
15
+
16
+
17
+ to_1tuple = _ntuple(1)
18
+ to_2tuple = _ntuple(2)
19
+ to_3tuple = _ntuple(3)
20
+ to_4tuple = _ntuple(4)
21
+
22
+
23
+ def as_tuple(x):
24
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
25
+ return tuple(x)
26
+ if x is None or isinstance(x, (int, float, str)):
27
+ return (x,)
28
+ else:
29
+ raise ValueError(f"Unknown type {type(x)}")
30
+
31
+
32
+ def as_list_of_2tuple(x):
33
+ x = as_tuple(x)
34
+ if len(x) == 1:
35
+ x = (x[0], x[0])
36
+ assert len(x) % 2 == 0, f"Expect even length, got {len(x)}."
37
+ lst = []
38
+ for i in range(0, len(x), 2):
39
+ lst.append((x[i], x[i + 1]))
40
+ return lst
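
A tiny demonstration of the tuple helpers above (assuming the module path `hunyuan_model.helpers`):

```python
# Sketch: to_2tuple broadcasts scalars, as_list_of_2tuple pairs up a flat sequence.
from hunyuan_model.helpers import to_2tuple, as_list_of_2tuple

print(to_2tuple(16))                     # (16, 16)
print(to_2tuple((3, 5)))                 # (3, 5)
print(as_list_of_2tuple(4))              # [(4, 4)]
print(as_list_of_2tuple((1, 2, 3, 4)))   # [(1, 2), (3, 4)]
```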
hunyuan_model/mlp_layers.py ADDED
@@ -0,0 +1,118 @@
1
+ # Modified from timm library:
2
+ # https://github.com/huggingface/pytorch-image-models/blob/648aaa41233ba83eb38faf5ba9d415d574823241/timm/layers/mlp.py#L13
3
+
4
+ from functools import partial
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from .modulate_layers import modulate
10
+ from .helpers import to_2tuple
11
+
12
+
13
+ class MLP(nn.Module):
14
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
15
+
16
+ def __init__(
17
+ self,
18
+ in_channels,
19
+ hidden_channels=None,
20
+ out_features=None,
21
+ act_layer=nn.GELU,
22
+ norm_layer=None,
23
+ bias=True,
24
+ drop=0.0,
25
+ use_conv=False,
26
+ device=None,
27
+ dtype=None,
28
+ ):
29
+ factory_kwargs = {"device": device, "dtype": dtype}
30
+ super().__init__()
31
+ out_features = out_features or in_channels
32
+ hidden_channels = hidden_channels or in_channels
33
+ bias = to_2tuple(bias)
34
+ drop_probs = to_2tuple(drop)
35
+ linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
36
+
37
+ self.fc1 = linear_layer(
38
+ in_channels, hidden_channels, bias=bias[0], **factory_kwargs
39
+ )
40
+ self.act = act_layer()
41
+ self.drop1 = nn.Dropout(drop_probs[0])
42
+ self.norm = (
43
+ norm_layer(hidden_channels, **factory_kwargs)
44
+ if norm_layer is not None
45
+ else nn.Identity()
46
+ )
47
+ self.fc2 = linear_layer(
48
+ hidden_channels, out_features, bias=bias[1], **factory_kwargs
49
+ )
50
+ self.drop2 = nn.Dropout(drop_probs[1])
51
+
52
+ def forward(self, x):
53
+ x = self.fc1(x)
54
+ x = self.act(x)
55
+ x = self.drop1(x)
56
+ x = self.norm(x)
57
+ x = self.fc2(x)
58
+ x = self.drop2(x)
59
+ return x
60
+
61
+
62
+ #
63
+ class MLPEmbedder(nn.Module):
64
+ """copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py"""
65
+ def __init__(self, in_dim: int, hidden_dim: int, device=None, dtype=None):
66
+ factory_kwargs = {"device": device, "dtype": dtype}
67
+ super().__init__()
68
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True, **factory_kwargs)
69
+ self.silu = nn.SiLU()
70
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True, **factory_kwargs)
71
+
72
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
73
+ return self.out_layer(self.silu(self.in_layer(x)))
74
+
75
+
76
+ class FinalLayer(nn.Module):
77
+ """The final layer of DiT."""
78
+
79
+ def __init__(
80
+ self, hidden_size, patch_size, out_channels, act_layer, device=None, dtype=None
81
+ ):
82
+ factory_kwargs = {"device": device, "dtype": dtype}
83
+ super().__init__()
84
+
85
+ # Just use LayerNorm for the final layer
86
+ self.norm_final = nn.LayerNorm(
87
+ hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
88
+ )
89
+ if isinstance(patch_size, int):
90
+ self.linear = nn.Linear(
91
+ hidden_size,
92
+ patch_size * patch_size * out_channels,
93
+ bias=True,
94
+ **factory_kwargs
95
+ )
96
+ else:
97
+ self.linear = nn.Linear(
98
+ hidden_size,
99
+ patch_size[0] * patch_size[1] * patch_size[2] * out_channels,
100
+ bias=True,
101
+ )
102
+ nn.init.zeros_(self.linear.weight)
103
+ nn.init.zeros_(self.linear.bias)
104
+
105
+ # Here we don't distinguish between the modulate types. Just use the simple one.
106
+ self.adaLN_modulation = nn.Sequential(
107
+ act_layer(),
108
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
109
+ )
110
+ # Zero-initialize the modulation
111
+ nn.init.zeros_(self.adaLN_modulation[1].weight)
112
+ nn.init.zeros_(self.adaLN_modulation[1].bias)
113
+
114
+ def forward(self, x, c):
115
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
116
+ x = modulate(self.norm_final(x), shift=shift, scale=scale)
117
+ x = self.linear(x)
118
+ return x
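
A quick forward-shape sketch for the blocks above (assuming `hunyuan_model.mlp_layers` is importable; the sizes are illustrative, not the model's real dimensions):

```python
# Sketch: forward shapes of MLP and MLPEmbedder with made-up sizes.
import torch
from hunyuan_model.mlp_layers import MLP, MLPEmbedder

mlp = MLP(in_channels=64, hidden_channels=256)
print(mlp(torch.randn(2, 10, 64)).shape)   # torch.Size([2, 10, 64])

emb = MLPEmbedder(in_dim=256, hidden_dim=3072)
print(emb(torch.randn(2, 256)).shape)      # torch.Size([2, 3072])
```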
hunyuan_model/models.py ADDED
@@ -0,0 +1,1044 @@
1
+ import os
2
+ from typing import Any, List, Tuple, Optional, Union, Dict
3
+ import accelerate
4
+ from einops import rearrange
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.utils.checkpoint import checkpoint
9
+
10
+ from .activation_layers import get_activation_layer
11
+ from .norm_layers import get_norm_layer
12
+ from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
13
+ from .attention import attention, parallel_attention, get_cu_seqlens
14
+ from .posemb_layers import apply_rotary_emb
15
+ from .mlp_layers import MLP, MLPEmbedder, FinalLayer
16
+ from .modulate_layers import ModulateDiT, modulate, apply_gate
17
+ from .token_refiner import SingleTokenRefiner
18
+ from modules.custom_offloading_utils import ModelOffloader, synchronize_device, clean_memory_on_device
19
+ from hunyuan_model.posemb_layers import get_nd_rotary_pos_embed
20
+
21
+ from utils.safetensors_utils import MemoryEfficientSafeOpen
22
+
23
+
24
+ class MMDoubleStreamBlock(nn.Module):
25
+ """
26
+ A multimodal DiT block with separate modulation for
27
+ text and image/video, see more details (SD3): https://arxiv.org/abs/2403.03206
28
+ (Flux.1): https://github.com/black-forest-labs/flux
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ hidden_size: int,
34
+ heads_num: int,
35
+ mlp_width_ratio: float,
36
+ mlp_act_type: str = "gelu_tanh",
37
+ qk_norm: bool = True,
38
+ qk_norm_type: str = "rms",
39
+ qkv_bias: bool = False,
40
+ dtype: Optional[torch.dtype] = None,
41
+ device: Optional[torch.device] = None,
42
+ attn_mode: str = "flash",
43
+ split_attn: bool = False,
44
+ ):
45
+ factory_kwargs = {"device": device, "dtype": dtype}
46
+ super().__init__()
47
+ self.attn_mode = attn_mode
48
+ self.split_attn = split_attn
49
+
50
+ self.deterministic = False
51
+ self.heads_num = heads_num
52
+ head_dim = hidden_size // heads_num
53
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
54
+
55
+ self.img_mod = ModulateDiT(
56
+ hidden_size,
57
+ factor=6,
58
+ act_layer=get_activation_layer("silu"),
59
+ **factory_kwargs,
60
+ )
61
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
62
+
63
+ self.img_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
64
+ qk_norm_layer = get_norm_layer(qk_norm_type)
65
+ self.img_attn_q_norm = (
66
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
67
+ )
68
+ self.img_attn_k_norm = (
69
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
70
+ )
71
+ self.img_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
72
+
73
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
74
+ self.img_mlp = MLP(
75
+ hidden_size,
76
+ mlp_hidden_dim,
77
+ act_layer=get_activation_layer(mlp_act_type),
78
+ bias=True,
79
+ **factory_kwargs,
80
+ )
81
+
82
+ self.txt_mod = ModulateDiT(
83
+ hidden_size,
84
+ factor=6,
85
+ act_layer=get_activation_layer("silu"),
86
+ **factory_kwargs,
87
+ )
88
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
89
+
90
+ self.txt_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
91
+ self.txt_attn_q_norm = (
92
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
93
+ )
94
+ self.txt_attn_k_norm = (
95
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
96
+ )
97
+ self.txt_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
98
+
99
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
100
+ self.txt_mlp = MLP(
101
+ hidden_size,
102
+ mlp_hidden_dim,
103
+ act_layer=get_activation_layer(mlp_act_type),
104
+ bias=True,
105
+ **factory_kwargs,
106
+ )
107
+ self.hybrid_seq_parallel_attn = None
108
+
109
+ self.gradient_checkpointing = False
110
+
111
+ def enable_deterministic(self):
112
+ self.deterministic = True
113
+
114
+ def disable_deterministic(self):
115
+ self.deterministic = False
116
+
117
+ def enable_gradient_checkpointing(self):
118
+ self.gradient_checkpointing = True
119
+
120
+ def disable_gradient_checkpointing(self):
121
+ self.gradient_checkpointing = False
122
+
123
+ def _forward(
124
+ self,
125
+ img: torch.Tensor,
126
+ txt: torch.Tensor,
127
+ vec: torch.Tensor,
128
+ attn_mask: Optional[torch.Tensor] = None,
129
+ total_len: Optional[torch.Tensor] = None,
130
+ cu_seqlens_q: Optional[torch.Tensor] = None,
131
+ cu_seqlens_kv: Optional[torch.Tensor] = None,
132
+ max_seqlen_q: Optional[int] = None,
133
+ max_seqlen_kv: Optional[int] = None,
134
+ freqs_cis: tuple = None,
135
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
136
+ (img_mod1_shift, img_mod1_scale, img_mod1_gate, img_mod2_shift, img_mod2_scale, img_mod2_gate) = self.img_mod(vec).chunk(
137
+ 6, dim=-1
138
+ )
139
+ (txt_mod1_shift, txt_mod1_scale, txt_mod1_gate, txt_mod2_shift, txt_mod2_scale, txt_mod2_gate) = self.txt_mod(vec).chunk(
140
+ 6, dim=-1
141
+ )
142
+
143
+ # Prepare image for attention.
144
+ img_modulated = self.img_norm1(img)
145
+ img_modulated = modulate(img_modulated, shift=img_mod1_shift, scale=img_mod1_scale)
146
+ img_qkv = self.img_attn_qkv(img_modulated)
147
+ img_modulated = None
148
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
149
+ img_qkv = None
150
+ # Apply QK-Norm if needed
151
+ img_q = self.img_attn_q_norm(img_q).to(img_v)
152
+ img_k = self.img_attn_k_norm(img_k).to(img_v)
153
+
154
+ # Apply RoPE if needed.
155
+ if freqs_cis is not None:
156
+ img_q_shape = img_q.shape
157
+ img_k_shape = img_k.shape
158
+ img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
159
+ assert (
160
+ img_q.shape == img_q_shape and img_k.shape == img_k_shape
161
+ ), f"img_q: {img_q.shape} (expected {img_q_shape}), img_k: {img_k.shape} (expected {img_k_shape})"
162
+ # img_q, img_k = img_qq, img_kk
163
+
164
+ # Prepare txt for attention.
165
+ txt_modulated = self.txt_norm1(txt)
166
+ txt_modulated = modulate(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale)
167
+ txt_qkv = self.txt_attn_qkv(txt_modulated)
168
+ txt_modulated = None
169
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
170
+ txt_qkv = None
171
+ # Apply QK-Norm if needed.
172
+ txt_q = self.txt_attn_q_norm(txt_q).to(txt_v)
173
+ txt_k = self.txt_attn_k_norm(txt_k).to(txt_v)
174
+
175
+ # Run actual attention.
176
+ img_q_len = img_q.shape[1]
177
+ img_kv_len = img_k.shape[1]
178
+ batch_size = img_k.shape[0]
179
+ q = torch.cat((img_q, txt_q), dim=1)
180
+ img_q = txt_q = None
181
+ k = torch.cat((img_k, txt_k), dim=1)
182
+ img_k = txt_k = None
183
+ v = torch.cat((img_v, txt_v), dim=1)
184
+ img_v = txt_v = None
185
+
186
+ assert (
187
+ cu_seqlens_q.shape[0] == 2 * img.shape[0] + 1
188
+ ), f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, img.shape[0]:{img.shape[0]}"
189
+
190
+ # attention computation start
191
+ if not self.hybrid_seq_parallel_attn:
192
+ l = [q, k, v]
193
+ q = k = v = None
194
+ attn = attention(
195
+ l,
196
+ mode=self.attn_mode,
197
+ attn_mask=attn_mask,
198
+ total_len=total_len,
199
+ cu_seqlens_q=cu_seqlens_q,
200
+ cu_seqlens_kv=cu_seqlens_kv,
201
+ max_seqlen_q=max_seqlen_q,
202
+ max_seqlen_kv=max_seqlen_kv,
203
+ batch_size=batch_size,
204
+ )
205
+ else:
206
+ attn = parallel_attention(
207
+ self.hybrid_seq_parallel_attn,
208
+ q,
209
+ k,
210
+ v,
211
+ img_q_len=img_q_len,
212
+ img_kv_len=img_kv_len,
213
+ cu_seqlens_q=cu_seqlens_q,
214
+ cu_seqlens_kv=cu_seqlens_kv,
215
+ )
216
+
217
+ # attention computation end
218
+
219
+ img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1] :]
220
+ attn = None
221
+
222
+ # Calculate the img blocks.
223
+ img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate)
224
+ img_attn = None
225
+ img = img + apply_gate(
226
+ self.img_mlp(modulate(self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale)),
227
+ gate=img_mod2_gate,
228
+ )
229
+
230
+ # Calculate the txt blocks.
231
+ txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate)
232
+ txt_attn = None
233
+ txt = txt + apply_gate(
234
+ self.txt_mlp(modulate(self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale)),
235
+ gate=txt_mod2_gate,
236
+ )
237
+
238
+ return img, txt
239
+
240
+ # def forward(
241
+ # self,
242
+ # img: torch.Tensor,
243
+ # txt: torch.Tensor,
244
+ # vec: torch.Tensor,
245
+ # attn_mask: Optional[torch.Tensor] = None,
246
+ # cu_seqlens_q: Optional[torch.Tensor] = None,
247
+ # cu_seqlens_kv: Optional[torch.Tensor] = None,
248
+ # max_seqlen_q: Optional[int] = None,
249
+ # max_seqlen_kv: Optional[int] = None,
250
+ # freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
251
+ # ) -> Tuple[torch.Tensor, torch.Tensor]:
252
+ def forward(self, *args, **kwargs):
253
+ if self.training and self.gradient_checkpointing:
254
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
255
+ else:
256
+ return self._forward(*args, **kwargs)
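
A brief usage note (editorial addition, not part of the committed file): the `forward` wrapper above only routes through `torch.utils.checkpoint` when the module is in training mode and checkpointing has been enabled. A minimal sketch, assuming the repository root is on `PYTHONPATH`; the small hidden size is illustrative:

```python
# Minimal sketch: enable activation checkpointing on a double-stream block so that
# _forward is recomputed during the backward pass to save memory.
import torch
from hunyuan_model.models import MMDoubleStreamBlock

block = MMDoubleStreamBlock(hidden_size=128, heads_num=8, mlp_width_ratio=4.0, attn_mode="torch")
block.enable_gradient_checkpointing()  # checkpoint() is used only while block.training is True
block.train()
```
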
257
+
258
+
259
+ class MMSingleStreamBlock(nn.Module):
260
+ """
261
+ A DiT block with parallel linear layers as described in
262
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
263
+ Also refer to (SD3): https://arxiv.org/abs/2403.03206
264
+ (Flux.1): https://github.com/black-forest-labs/flux
265
+ """
266
+
267
+ def __init__(
268
+ self,
269
+ hidden_size: int,
270
+ heads_num: int,
271
+ mlp_width_ratio: float = 4.0,
272
+ mlp_act_type: str = "gelu_tanh",
273
+ qk_norm: bool = True,
274
+ qk_norm_type: str = "rms",
275
+ qk_scale: float = None,
276
+ dtype: Optional[torch.dtype] = None,
277
+ device: Optional[torch.device] = None,
278
+ attn_mode: str = "flash",
279
+ split_attn: bool = False,
280
+ ):
281
+ factory_kwargs = {"device": device, "dtype": dtype}
282
+ super().__init__()
283
+ self.attn_mode = attn_mode
284
+ self.split_attn = split_attn
285
+
286
+ self.deterministic = False
287
+ self.hidden_size = hidden_size
288
+ self.heads_num = heads_num
289
+ head_dim = hidden_size // heads_num
290
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
291
+ self.mlp_hidden_dim = mlp_hidden_dim
292
+ self.scale = qk_scale or head_dim**-0.5
293
+
294
+ # qkv and mlp_in
295
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs)
296
+ # proj and mlp_out
297
+ self.linear2 = nn.Linear(hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs)
298
+
299
+ qk_norm_layer = get_norm_layer(qk_norm_type)
300
+ self.q_norm = qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
301
+ self.k_norm = qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
302
+
303
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
304
+
305
+ self.mlp_act = get_activation_layer(mlp_act_type)()
306
+ self.modulation = ModulateDiT(hidden_size, factor=3, act_layer=get_activation_layer("silu"), **factory_kwargs)
307
+ self.hybrid_seq_parallel_attn = None
308
+
309
+ self.gradient_checkpointing = False
310
+
311
+ def enable_deterministic(self):
312
+ self.deterministic = True
313
+
314
+ def disable_deterministic(self):
315
+ self.deterministic = False
316
+
317
+ def enable_gradient_checkpointing(self):
318
+ self.gradient_checkpointing = True
319
+
320
+ def disable_gradient_checkpointing(self):
321
+ self.gradient_checkpointing = False
322
+
323
+ def _forward(
324
+ self,
325
+ x: torch.Tensor,
326
+ vec: torch.Tensor,
327
+ txt_len: int,
328
+ attn_mask: Optional[torch.Tensor] = None,
329
+ total_len: Optional[torch.Tensor] = None,
330
+ cu_seqlens_q: Optional[torch.Tensor] = None,
331
+ cu_seqlens_kv: Optional[torch.Tensor] = None,
332
+ max_seqlen_q: Optional[int] = None,
333
+ max_seqlen_kv: Optional[int] = None,
334
+ freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
335
+ ) -> torch.Tensor:
336
+ mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1)
337
+ x_mod = modulate(self.pre_norm(x), shift=mod_shift, scale=mod_scale)
338
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
339
+ x_mod = None
340
+ # mlp = mlp.to("cpu", non_blocking=True)
341
+ # clean_memory_on_device(x.device)
342
+
343
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
344
+ qkv = None
345
+
346
+ # Apply QK-Norm if needed.
347
+ q = self.q_norm(q).to(v)
348
+ k = self.k_norm(k).to(v)
349
+
350
+ # Apply RoPE if needed.
351
+ if freqs_cis is not None:
352
+ img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :]
353
+ img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :]
354
+ q = k = None
355
+ img_q_shape = img_q.shape
356
+ img_k_shape = img_k.shape
357
+ img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
358
+ assert (
359
+ img_q.shape == img_q_shape and img_k_shape == img_k.shape
360
+ ), f"img_q: {img_q.shape} (expected {img_q_shape}), img_k: {img_k.shape} (expected {img_k_shape})"
361
+ # img_q, img_k = img_qq, img_kk
362
+ # del img_qq, img_kk
363
+ q = torch.cat((img_q, txt_q), dim=1)
364
+ k = torch.cat((img_k, txt_k), dim=1)
365
+ del img_q, txt_q, img_k, txt_k
366
+
367
+ # Compute attention.
368
+ assert cu_seqlens_q.shape[0] == 2 * x.shape[0] + 1, f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, x.shape[0]:{x.shape[0]}"
369
+
370
+ # attention computation start
371
+ if not self.hybrid_seq_parallel_attn:
372
+ l = [q, k, v]
373
+ q = k = v = None
374
+ attn = attention(
375
+ l,
376
+ mode=self.attn_mode,
377
+ attn_mask=attn_mask,
378
+ total_len=total_len,
379
+ cu_seqlens_q=cu_seqlens_q,
380
+ cu_seqlens_kv=cu_seqlens_kv,
381
+ max_seqlen_q=max_seqlen_q,
382
+ max_seqlen_kv=max_seqlen_kv,
383
+ batch_size=x.shape[0],
384
+ )
385
+ else:
386
+ attn = parallel_attention(
387
+ self.hybrid_seq_parallel_attn,
388
+ q,
389
+ k,
390
+ v,
391
+ img_q_len=img_q.shape[1],
392
+ img_kv_len=img_k.shape[1],
393
+ cu_seqlens_q=cu_seqlens_q,
394
+ cu_seqlens_kv=cu_seqlens_kv,
395
+ )
396
+ # attention computation end
397
+
398
+ # Compute activation in mlp stream, cat again and run second linear layer.
399
+ # mlp = mlp.to(x.device)
400
+ mlp = self.mlp_act(mlp)
401
+ attn_mlp = torch.cat((attn, mlp), 2)
402
+ attn = None
403
+ mlp = None
404
+ output = self.linear2(attn_mlp)
405
+ attn_mlp = None
406
+ return x + apply_gate(output, gate=mod_gate)
407
+
408
+ # def forward(
409
+ # self,
410
+ # x: torch.Tensor,
411
+ # vec: torch.Tensor,
412
+ # txt_len: int,
413
+ # attn_mask: Optional[torch.Tensor] = None,
414
+ # cu_seqlens_q: Optional[torch.Tensor] = None,
415
+ # cu_seqlens_kv: Optional[torch.Tensor] = None,
416
+ # max_seqlen_q: Optional[int] = None,
417
+ # max_seqlen_kv: Optional[int] = None,
418
+ # freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
419
+ # ) -> torch.Tensor:
420
+ def forward(self, *args, **kwargs):
421
+ if self.training and self.gradient_checkpointing:
422
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
423
+ else:
424
+ return self._forward(*args, **kwargs)
425
+
426
+
427
+ class HYVideoDiffusionTransformer(nn.Module): # ModelMixin, ConfigMixin):
428
+ """
429
+ HunyuanVideo Transformer backbone
430
+
431
+ Originally inherited from ModelMixin and ConfigMixin for compatibility with diffusers' StableDiffusionPipeline sampler; in this port the class subclasses nn.Module directly.
432
+
433
+ Reference:
434
+ [1] Flux.1: https://github.com/black-forest-labs/flux
435
+ [2] MMDiT: http://arxiv.org/abs/2403.03206
436
+
437
+ Parameters
438
+ ----------
439
+ args: argparse.Namespace
440
+ The arguments parsed by argparse.
441
+ patch_size: list
442
+ The size of the patch.
443
+ in_channels: int
444
+ The number of input channels.
445
+ out_channels: int
446
+ The number of output channels.
447
+ hidden_size: int
448
+ The hidden size of the transformer backbone.
449
+ heads_num: int
450
+ The number of attention heads.
451
+ mlp_width_ratio: float
452
+ The ratio of the hidden size of the MLP in the transformer block.
453
+ mlp_act_type: str
454
+ The activation function of the MLP in the transformer block.
455
+ depth_double_blocks: int
456
+ The number of transformer blocks in the double blocks.
457
+ depth_single_blocks: int
458
+ The number of transformer blocks in the single blocks.
459
+ rope_dim_list: list
460
+ The dimension of the rotary embedding for t, h, w.
461
+ qkv_bias: bool
462
+ Whether to use bias in the qkv linear layer.
463
+ qk_norm: bool
464
+ Whether to use qk norm.
465
+ qk_norm_type: str
466
+ The type of qk norm.
467
+ guidance_embed: bool
468
+ Whether to use guidance embedding for distillation.
469
+ text_projection: str
470
+ The type of the text projection, default is single_refiner.
471
+ use_attention_mask: bool
472
+ Whether to use attention mask for text encoder.
473
+ dtype: torch.dtype
474
+ The dtype of the model.
475
+ device: torch.device
476
+ The device of the model.
477
+ attn_mode: str
478
+ The mode of the attention, default is flash.
479
+ split_attn: bool
480
+ Whether to use split attention (make attention as batch size 1).
481
+ """
482
+
483
+ # @register_to_config
484
+ def __init__(
485
+ self,
486
+ text_states_dim: int,
487
+ text_states_dim_2: int,
488
+ patch_size: list = [1, 2, 2],
489
+ in_channels: int = 4, # Should be VAE.config.latent_channels.
490
+ out_channels: int = None,
491
+ hidden_size: int = 3072,
492
+ heads_num: int = 24,
493
+ mlp_width_ratio: float = 4.0,
494
+ mlp_act_type: str = "gelu_tanh",
495
+ mm_double_blocks_depth: int = 20,
496
+ mm_single_blocks_depth: int = 40,
497
+ rope_dim_list: List[int] = [16, 56, 56],
498
+ qkv_bias: bool = True,
499
+ qk_norm: bool = True,
500
+ qk_norm_type: str = "rms",
501
+ guidance_embed: bool = False, # For modulation.
502
+ text_projection: str = "single_refiner",
503
+ use_attention_mask: bool = True,
504
+ dtype: Optional[torch.dtype] = None,
505
+ device: Optional[torch.device] = None,
506
+ attn_mode: str = "flash",
507
+ split_attn: bool = False,
508
+ ):
509
+ factory_kwargs = {"device": device, "dtype": dtype}
510
+ super().__init__()
511
+
512
+ self.patch_size = patch_size
513
+ self.in_channels = in_channels
514
+ self.out_channels = in_channels if out_channels is None else out_channels
515
+ self.unpatchify_channels = self.out_channels
516
+ self.guidance_embed = guidance_embed
517
+ self.rope_dim_list = rope_dim_list
518
+
519
+ # Text projection. Default to linear projection.
520
+ # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
521
+ self.use_attention_mask = use_attention_mask
522
+ self.text_projection = text_projection
523
+
524
+ self.text_states_dim = text_states_dim
525
+ self.text_states_dim_2 = text_states_dim_2
526
+
527
+ if hidden_size % heads_num != 0:
528
+ raise ValueError(f"Hidden size {hidden_size} must be divisible by heads_num {heads_num}")
529
+ pe_dim = hidden_size // heads_num
530
+ if sum(rope_dim_list) != pe_dim:
531
+ raise ValueError(f"Got {rope_dim_list} but expected positional dim {pe_dim}")
532
+ self.hidden_size = hidden_size
533
+ self.heads_num = heads_num
534
+
535
+ self.attn_mode = attn_mode
536
+ self.split_attn = split_attn
537
+ print(f"Using {self.attn_mode} attention mode, split_attn: {self.split_attn}")
538
+
539
+ # image projection
540
+ self.img_in = PatchEmbed(self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs)
541
+
542
+ # text projection
543
+ if self.text_projection == "linear":
544
+ self.txt_in = TextProjection(
545
+ self.text_states_dim,
546
+ self.hidden_size,
547
+ get_activation_layer("silu"),
548
+ **factory_kwargs,
549
+ )
550
+ elif self.text_projection == "single_refiner":
551
+ self.txt_in = SingleTokenRefiner(self.text_states_dim, hidden_size, heads_num, depth=2, **factory_kwargs)
552
+ else:
553
+ raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")
554
+
555
+ # time modulation
556
+ self.time_in = TimestepEmbedder(self.hidden_size, get_activation_layer("silu"), **factory_kwargs)
557
+
558
+ # text modulation
559
+ self.vector_in = MLPEmbedder(self.text_states_dim_2, self.hidden_size, **factory_kwargs)
560
+
561
+ # guidance modulation
562
+ self.guidance_in = (
563
+ TimestepEmbedder(self.hidden_size, get_activation_layer("silu"), **factory_kwargs) if guidance_embed else None
564
+ )
565
+
566
+ # double blocks
567
+ self.double_blocks = nn.ModuleList(
568
+ [
569
+ MMDoubleStreamBlock(
570
+ self.hidden_size,
571
+ self.heads_num,
572
+ mlp_width_ratio=mlp_width_ratio,
573
+ mlp_act_type=mlp_act_type,
574
+ qk_norm=qk_norm,
575
+ qk_norm_type=qk_norm_type,
576
+ qkv_bias=qkv_bias,
577
+ attn_mode=attn_mode,
578
+ split_attn=split_attn,
579
+ **factory_kwargs,
580
+ )
581
+ for _ in range(mm_double_blocks_depth)
582
+ ]
583
+ )
584
+
585
+ # single blocks
586
+ self.single_blocks = nn.ModuleList(
587
+ [
588
+ MMSingleStreamBlock(
589
+ self.hidden_size,
590
+ self.heads_num,
591
+ mlp_width_ratio=mlp_width_ratio,
592
+ mlp_act_type=mlp_act_type,
593
+ qk_norm=qk_norm,
594
+ qk_norm_type=qk_norm_type,
595
+ attn_mode=attn_mode,
596
+ split_attn=split_attn,
597
+ **factory_kwargs,
598
+ )
599
+ for _ in range(mm_single_blocks_depth)
600
+ ]
601
+ )
602
+
603
+ self.final_layer = FinalLayer(
604
+ self.hidden_size,
605
+ self.patch_size,
606
+ self.out_channels,
607
+ get_activation_layer("silu"),
608
+ **factory_kwargs,
609
+ )
610
+
611
+ self.gradient_checkpointing = False
612
+ self.blocks_to_swap = None
613
+ self.offloader_double = None
614
+ self.offloader_single = None
615
+ self._enable_img_in_txt_in_offloading = False
616
+
617
+ @property
618
+ def device(self):
619
+ return next(self.parameters()).device
620
+
621
+ @property
622
+ def dtype(self):
623
+ return next(self.parameters()).dtype
624
+
625
+ def enable_gradient_checkpointing(self):
626
+ self.gradient_checkpointing = True
627
+
628
+ self.txt_in.enable_gradient_checkpointing()
629
+
630
+ for block in self.double_blocks + self.single_blocks:
631
+ block.enable_gradient_checkpointing()
632
+
633
+ print(f"HYVideoDiffusionTransformer: Gradient checkpointing enabled.")
634
+
635
+ def disable_gradient_checkpointing(self):
636
+ self.gradient_checkpointing = False
637
+
638
+ self.txt_in.disable_gradient_checkpointing()
639
+
640
+ for block in self.double_blocks + self.single_blocks:
641
+ block.disable_gradient_checkpointing()
642
+
643
+ print(f"HYVideoDiffusionTransformer: Gradient checkpointing disabled.")
644
+
645
+ def enable_img_in_txt_in_offloading(self):
646
+ self._enable_img_in_txt_in_offloading = True
647
+
648
+ def enable_block_swap(self, num_blocks: int, device: torch.device, supports_backward: bool):
649
+ self.blocks_to_swap = num_blocks
650
+ self.num_double_blocks = len(self.double_blocks)
651
+ self.num_single_blocks = len(self.single_blocks)
652
+ double_blocks_to_swap = num_blocks // 2
653
+ single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + 1
654
+
655
+ assert double_blocks_to_swap <= self.num_double_blocks - 1 and single_blocks_to_swap <= self.num_single_blocks - 1, (
656
+ f"Cannot swap more than {self.num_double_blocks - 1} double blocks and {self.num_single_blocks - 1} single blocks. "
657
+ f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks."
658
+ )
659
+
660
+ self.offloader_double = ModelOffloader(
661
+ "double", self.double_blocks, self.num_double_blocks, double_blocks_to_swap, supports_backward, device # , debug=True
662
+ )
663
+ self.offloader_single = ModelOffloader(
664
+ "single", self.single_blocks, self.num_single_blocks, single_blocks_to_swap, supports_backward, device # , debug=True
665
+ )
666
+ print(
667
+ f"HYVideoDiffusionTransformer: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}."
668
+ )
669
+
670
+ def switch_block_swap_for_inference(self):
671
+ if self.blocks_to_swap:
672
+ self.offloader_double.set_forward_only(True)
673
+ self.offloader_single.set_forward_only(True)
674
+ self.prepare_block_swap_before_forward()
675
+ print(f"HYVideoDiffusionTransformer: Block swap set to forward only.")
676
+
677
+ def switch_block_swap_for_training(self):
678
+ if self.blocks_to_swap:
679
+ self.offloader_double.set_forward_only(False)
680
+ self.offloader_single.set_forward_only(False)
681
+ self.prepare_block_swap_before_forward()
682
+ print(f"HYVideoDiffusionTransformer: Block swap set to forward and backward.")
683
+
684
+ def move_to_device_except_swap_blocks(self, device: torch.device):
685
+ # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
686
+ if self.blocks_to_swap:
687
+ save_double_blocks = self.double_blocks
688
+ save_single_blocks = self.single_blocks
689
+ self.double_blocks = None
690
+ self.single_blocks = None
691
+
692
+ self.to(device)
693
+
694
+ if self.blocks_to_swap:
695
+ self.double_blocks = save_double_blocks
696
+ self.single_blocks = save_single_blocks
697
+
698
+ def prepare_block_swap_before_forward(self):
699
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
700
+ return
701
+ self.offloader_double.prepare_block_devices_before_forward(self.double_blocks)
702
+ self.offloader_single.prepare_block_devices_before_forward(self.single_blocks)
703
+
704
+ def enable_deterministic(self):
705
+ for block in self.double_blocks:
706
+ block.enable_deterministic()
707
+ for block in self.single_blocks:
708
+ block.enable_deterministic()
709
+
710
+ def disable_deterministic(self):
711
+ for block in self.double_blocks:
712
+ block.disable_deterministic()
713
+ for block in self.single_blocks:
714
+ block.disable_deterministic()
715
+
716
+ def forward(
717
+ self,
718
+ x: torch.Tensor,
719
+ t: torch.Tensor, # Should be in range(0, 1000).
720
+ text_states: torch.Tensor = None,
721
+ text_mask: torch.Tensor = None, # Now we don't use it.
722
+ text_states_2: Optional[torch.Tensor] = None, # Text embedding for modulation.
723
+ freqs_cos: Optional[torch.Tensor] = None,
724
+ freqs_sin: Optional[torch.Tensor] = None,
725
+ guidance: torch.Tensor = None, # Guidance for modulation, should be cfg_scale x 1000.
726
+ return_dict: bool = True,
727
+ ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
728
+ out = {}
729
+ img = x
730
+ txt = text_states
731
+ _, _, ot, oh, ow = x.shape
732
+ tt, th, tw = (
733
+ ot // self.patch_size[0],
734
+ oh // self.patch_size[1],
735
+ ow // self.patch_size[2],
736
+ )
737
+
738
+ # Prepare modulation vectors.
739
+ vec = self.time_in(t)
740
+
741
+ # text modulation
742
+ vec = vec + self.vector_in(text_states_2)
743
+
744
+ # guidance modulation
745
+ if self.guidance_embed:
746
+ if guidance is None:
747
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
748
+
749
+ # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
750
+ vec = vec + self.guidance_in(guidance)
751
+
752
+ # Embed image and text.
753
+ if self._enable_img_in_txt_in_offloading:
754
+ self.img_in.to(x.device, non_blocking=True)
755
+ self.txt_in.to(x.device, non_blocking=True)
756
+ synchronize_device(x.device)
757
+
758
+ img = self.img_in(img)
759
+ if self.text_projection == "linear":
760
+ txt = self.txt_in(txt)
761
+ elif self.text_projection == "single_refiner":
762
+ txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
763
+ else:
764
+ raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")
765
+
766
+ if self._enable_img_in_txt_in_offloading:
767
+ self.img_in.to(torch.device("cpu"), non_blocking=True)
768
+ self.txt_in.to(torch.device("cpu"), non_blocking=True)
769
+ synchronize_device(x.device)
770
+ clean_memory_on_device(x.device)
771
+
772
+ txt_seq_len = txt.shape[1]
773
+ img_seq_len = img.shape[1]
774
+
775
+ # Compute cu_seqlens and max_seqlen for flash attention
776
+ cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
777
+ cu_seqlens_kv = cu_seqlens_q
778
+ max_seqlen_q = img_seq_len + txt_seq_len
779
+ max_seqlen_kv = max_seqlen_q
780
+
781
+ attn_mask = total_len = None
782
+ if self.split_attn or self.attn_mode == "torch":
783
+ # calculate text length and total length
784
+ text_len = text_mask.sum(dim=1) # (bs, )
785
+ total_len = img_seq_len + text_len # (bs, )
786
+ if self.attn_mode == "torch" and not self.split_attn:
787
+ # initialize attention mask: bool tensor for sdpa, (b, 1, n, n)
788
+ bs = img.shape[0]
789
+ attn_mask = torch.zeros((bs, 1, max_seqlen_q, max_seqlen_q), dtype=torch.bool, device=text_mask.device)
790
+
791
+ # set attention mask with total_len
792
+ for i in range(bs):
793
+ attn_mask[i, :, : total_len[i], : total_len[i]] = True
794
+ total_len = None # means we don't use split_attn
795
+
796
+ freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
797
+ # --------------------- Pass through DiT blocks ------------------------
798
+ for block_idx, block in enumerate(self.double_blocks):
799
+ double_block_args = [
800
+ img,
801
+ txt,
802
+ vec,
803
+ attn_mask,
804
+ total_len,
805
+ cu_seqlens_q,
806
+ cu_seqlens_kv,
807
+ max_seqlen_q,
808
+ max_seqlen_kv,
809
+ freqs_cis,
810
+ ]
811
+
812
+ if self.blocks_to_swap:
813
+ self.offloader_double.wait_for_block(block_idx)
814
+
815
+ img, txt = block(*double_block_args)
816
+
817
+ if self.blocks_to_swap:
818
+ self.offloader_double.submit_move_blocks_forward(self.double_blocks, block_idx)
819
+
820
+ # Merge txt and img to pass through single stream blocks.
821
+ x = torch.cat((img, txt), 1)
822
+ if self.blocks_to_swap:
823
+ # delete img, txt to reduce memory usage
824
+ del img, txt
825
+ clean_memory_on_device(x.device)
826
+
827
+ if len(self.single_blocks) > 0:
828
+ for block_idx, block in enumerate(self.single_blocks):
829
+ single_block_args = [
830
+ x,
831
+ vec,
832
+ txt_seq_len,
833
+ attn_mask,
834
+ total_len,
835
+ cu_seqlens_q,
836
+ cu_seqlens_kv,
837
+ max_seqlen_q,
838
+ max_seqlen_kv,
839
+ freqs_cis,
840
+ ]
841
+ if self.blocks_to_swap:
842
+ self.offloader_single.wait_for_block(block_idx)
843
+
844
+ x = block(*single_block_args)
845
+
846
+ if self.blocks_to_swap:
847
+ self.offloader_single.submit_move_blocks_forward(self.single_blocks, block_idx)
848
+
849
+ img = x[:, :img_seq_len, ...]
850
+ x = None
851
+
852
+ # ---------------------------- Final layer ------------------------------
853
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
854
+
855
+ img = self.unpatchify(img, tt, th, tw)
856
+ if return_dict:
857
+ out["x"] = img
858
+ return out
859
+ return img
860
+
861
+ def unpatchify(self, x, t, h, w):
862
+ """
863
+ x: (N, T, patch_size**2 * C)
864
+ imgs: (N, C, t * pt, h * ph, w * pw)
865
+ """
866
+ c = self.unpatchify_channels
867
+ pt, ph, pw = self.patch_size
868
+ assert t * h * w == x.shape[1]
869
+
870
+ x = x.reshape(shape=(x.shape[0], t, h, w, c, pt, ph, pw))
871
+ x = torch.einsum("nthwcopq->nctohpwq", x)
872
+ imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
873
+
874
+ return imgs
875
+
876
+ def params_count(self):
877
+ counts = {
878
+ "double": sum(
879
+ [
880
+ sum(p.numel() for p in block.img_attn_qkv.parameters())
881
+ + sum(p.numel() for p in block.img_attn_proj.parameters())
882
+ + sum(p.numel() for p in block.img_mlp.parameters())
883
+ + sum(p.numel() for p in block.txt_attn_qkv.parameters())
884
+ + sum(p.numel() for p in block.txt_attn_proj.parameters())
885
+ + sum(p.numel() for p in block.txt_mlp.parameters())
886
+ for block in self.double_blocks
887
+ ]
888
+ ),
889
+ "single": sum(
890
+ [
891
+ sum(p.numel() for p in block.linear1.parameters()) + sum(p.numel() for p in block.linear2.parameters())
892
+ for block in self.single_blocks
893
+ ]
894
+ ),
895
+ "total": sum(p.numel() for p in self.parameters()),
896
+ }
897
+ counts["attn+mlp"] = counts["double"] + counts["single"]
898
+ return counts
899
+
900
+
901
+ #################################################################################
902
+ # HunyuanVideo Configs #
903
+ #################################################################################
904
+
905
+ HUNYUAN_VIDEO_CONFIG = {
906
+ "HYVideo-T/2": {
907
+ "mm_double_blocks_depth": 20,
908
+ "mm_single_blocks_depth": 40,
909
+ "rope_dim_list": [16, 56, 56],
910
+ "hidden_size": 3072,
911
+ "heads_num": 24,
912
+ "mlp_width_ratio": 4,
913
+ },
914
+ "HYVideo-T/2-cfgdistill": {
915
+ "mm_double_blocks_depth": 20,
916
+ "mm_single_blocks_depth": 40,
917
+ "rope_dim_list": [16, 56, 56],
918
+ "hidden_size": 3072,
919
+ "heads_num": 24,
920
+ "mlp_width_ratio": 4,
921
+ "guidance_embed": True,
922
+ },
923
+ }
924
+
925
+
926
+ def load_dit_model(text_states_dim, text_states_dim_2, in_channels, out_channels, factor_kwargs):
927
+ """load hunyuan video model
928
+
929
+ NOTE: Only HYVideo-T/2-cfgdistill is supported for now.
930
+
931
+ Args:
932
+ text_states_dim (int): text state dimension
933
+ text_states_dim_2 (int): text state dimension 2
934
+ in_channels (int): input channels number
935
+ out_channels (int): output channels number
936
+ factor_kwargs (dict): factor kwargs
937
+
938
+ Returns:
939
+ model (nn.Module): The hunyuan video model
940
+ """
941
+ # if args.model in HUNYUAN_VIDEO_CONFIG.keys():
942
+ model = HYVideoDiffusionTransformer(
943
+ text_states_dim=text_states_dim,
944
+ text_states_dim_2=text_states_dim_2,
945
+ in_channels=in_channels,
946
+ out_channels=out_channels,
947
+ **HUNYUAN_VIDEO_CONFIG["HYVideo-T/2-cfgdistill"],
948
+ **factor_kwargs,
949
+ )
950
+ return model
951
+ # else:
952
+ # raise NotImplementedError()
953
+
954
+
955
+ def load_state_dict(model, model_path):
956
+ state_dict = torch.load(model_path, map_location=lambda storage, loc: storage, weights_only=True)
957
+
958
+ load_key = "module"
959
+ if load_key in state_dict:
960
+ state_dict = state_dict[load_key]
961
+ else:
962
+ raise KeyError(
963
+ f"Missing key: `{load_key}` in the checkpoint: {model_path}. The keys in the checkpoint "
964
+ f"are: {list(state_dict.keys())}."
965
+ )
966
+ model.load_state_dict(state_dict, strict=True, assign=True)
967
+ return model
968
+
969
+
970
+ def load_transformer(dit_path, attn_mode, split_attn, device, dtype, in_channels=16) -> HYVideoDiffusionTransformer:
971
+ # =========================== Build main model ===========================
972
+ factor_kwargs = {"device": device, "dtype": dtype, "attn_mode": attn_mode, "split_attn": split_attn}
973
+ latent_channels = 16
974
+ out_channels = latent_channels
975
+
976
+ with accelerate.init_empty_weights():
977
+ transformer = load_dit_model(
978
+ text_states_dim=4096,
979
+ text_states_dim_2=768,
980
+ in_channels=in_channels,
981
+ out_channels=out_channels,
982
+ factor_kwargs=factor_kwargs,
983
+ )
984
+
985
+ if os.path.splitext(dit_path)[-1] == ".safetensors":
986
+ # loading safetensors: may be already fp8
987
+ with MemoryEfficientSafeOpen(dit_path) as f:
988
+ state_dict = {}
989
+ for k in f.keys():
990
+ tensor = f.get_tensor(k)
991
+ tensor = tensor.to(device=device, dtype=dtype)
992
+ # TODO support comfy model
993
+ # if k.startswith("model.model."):
994
+ # k = convert_comfy_model_key(k)
995
+ state_dict[k] = tensor
996
+ transformer.load_state_dict(state_dict, strict=True, assign=True)
997
+ else:
998
+ transformer = load_state_dict(transformer, dit_path)
999
+
1000
+ return transformer
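
A hedged usage sketch of the loader above (editorial addition); the checkpoint path is a placeholder and `attn_mode="torch"` is chosen so no extra attention kernels are required:

```python
# Sketch: load the DiT weights from a local safetensors file (path is illustrative).
import torch
from hunyuan_model.models import load_transformer

transformer = load_transformer(
    dit_path="path/to/dit.safetensors",  # hypothetical checkpoint path
    attn_mode="torch",                   # SDPA; "flash" requires flash-attn to be installed
    split_attn=False,
    device=torch.device("cuda"),         # requires a CUDA device
    dtype=torch.bfloat16,
)
transformer.eval()
```
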
1001
+
1002
+
1003
+ def get_rotary_pos_embed_by_shape(model, latents_size):
1004
+ target_ndim = 3
1005
+ ndim = 5 - 2
1006
+
1007
+ if isinstance(model.patch_size, int):
1008
+ assert all(s % model.patch_size == 0 for s in latents_size), (
1009
+ f"Latent size (last {ndim} dimensions) should be divisible by patch size ({model.patch_size}), "
1010
+ f"but got {latents_size}."
1011
+ )
1012
+ rope_sizes = [s // model.patch_size for s in latents_size]
1013
+ elif isinstance(model.patch_size, list):
1014
+ assert all(s % model.patch_size[idx] == 0 for idx, s in enumerate(latents_size)), (
1015
+ f"Latent size (last {ndim} dimensions) should be divisible by patch size ({model.patch_size}), "
1016
+ f"but got {latents_size}."
1017
+ )
1018
+ rope_sizes = [s // model.patch_size[idx] for idx, s in enumerate(latents_size)]
1019
+
1020
+ if len(rope_sizes) != target_ndim:
1021
+ rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes # time axis
1022
+ head_dim = model.hidden_size // model.heads_num
1023
+ rope_dim_list = model.rope_dim_list
1024
+ if rope_dim_list is None:
1025
+ rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
1026
+ assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal the head_dim of the attention layer"
1027
+
1028
+ rope_theta = 256
1029
+ freqs_cos, freqs_sin = get_nd_rotary_pos_embed(
1030
+ rope_dim_list, rope_sizes, theta=rope_theta, use_real=True, theta_rescale_factor=1
1031
+ )
1032
+ return freqs_cos, freqs_sin
1033
+
1034
+
1035
+ def get_rotary_pos_embed(vae_name, model, video_length, height, width):
1036
+ # 884
1037
+ if "884" in vae_name:
1038
+ latents_size = [(video_length - 1) // 4 + 1, height // 8, width // 8]
1039
+ elif "888" in vae_name:
1040
+ latents_size = [(video_length - 1) // 8 + 1, height // 8, width // 8]
1041
+ else:
1042
+ latents_size = [video_length, height // 8, width // 8]
1043
+
1044
+ return get_rotary_pos_embed_by_shape(model, latents_size)
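
Continuing the loading sketch above, a hedged example of how the RoPE helpers are typically called; the VAE name and clip geometry are illustrative:

```python
# Sketch: RoPE tables for a 33-frame, 544x960 clip with an "884" VAE
# (4x temporal / 8x spatial compression). These become freqs_cos / freqs_sin in forward().
from hunyuan_model.models import get_rotary_pos_embed

freqs_cos, freqs_sin = get_rotary_pos_embed(
    vae_name="884-16c-hy",  # any name containing "884" selects the (t - 1) // 4 + 1 latent-frame formula
    model=transformer,       # transformer from the load_transformer sketch above
    video_length=33,         # -> (33 - 1) // 4 + 1 = 9 latent frames
    height=544,
    width=960,
)
# transformer(latents, t, text_states=..., text_mask=..., text_states_2=...,
#             freqs_cos=freqs_cos, freqs_sin=freqs_sin, guidance=guidance)
```
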
hunyuan_model/modulate_layers.py ADDED
@@ -0,0 +1,76 @@
1
+ from typing import Callable
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ class ModulateDiT(nn.Module):
8
+ """Modulation layer for DiT."""
9
+ def __init__(
10
+ self,
11
+ hidden_size: int,
12
+ factor: int,
13
+ act_layer: Callable,
14
+ dtype=None,
15
+ device=None,
16
+ ):
17
+ factory_kwargs = {"dtype": dtype, "device": device}
18
+ super().__init__()
19
+ self.act = act_layer()
20
+ self.linear = nn.Linear(
21
+ hidden_size, factor * hidden_size, bias=True, **factory_kwargs
22
+ )
23
+ # Zero-initialize the modulation
24
+ nn.init.zeros_(self.linear.weight)
25
+ nn.init.zeros_(self.linear.bias)
26
+
27
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
28
+ return self.linear(self.act(x))
29
+
30
+
31
+ def modulate(x, shift=None, scale=None):
32
+ """modulate by shift and scale
33
+
34
+ Args:
35
+ x (torch.Tensor): input tensor.
36
+ shift (torch.Tensor, optional): shift tensor. Defaults to None.
37
+ scale (torch.Tensor, optional): scale tensor. Defaults to None.
38
+
39
+ Returns:
40
+ torch.Tensor: the output tensor after modulation.
41
+ """
42
+ if scale is None and shift is None:
43
+ return x
44
+ elif shift is None:
45
+ return x * (1 + scale.unsqueeze(1))
46
+ elif scale is None:
47
+ return x + shift.unsqueeze(1)
48
+ else:
49
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
50
+
51
+
52
+ def apply_gate(x, gate=None, tanh=False):
53
+ """Apply an optional gating tensor to the input.
54
+
55
+ Args:
56
+ x (torch.Tensor): input tensor.
57
+ gate (torch.Tensor, optional): gate tensor. Defaults to None.
58
+ tanh (bool, optional): whether to use tanh function. Defaults to False.
59
+
60
+ Returns:
61
+ torch.Tensor: the output tensor after applying the gate.
62
+ """
63
+ if gate is None:
64
+ return x
65
+ if tanh:
66
+ return x * gate.unsqueeze(1).tanh()
67
+ else:
68
+ return x * gate.unsqueeze(1)
69
+
70
+
71
+ def ckpt_wrapper(module):
72
+ def ckpt_forward(*inputs):
73
+ outputs = module(*inputs)
74
+ return outputs
75
+
76
+ return ckpt_forward
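
A small self-contained sketch of the helpers above (editorial addition); shapes are illustrative:

```python
# Toy example: modulate() and apply_gate() broadcast per-sample (B, D) vectors
# over the sequence dimension of a (B, L, D) activation.
import torch
from hunyuan_model.modulate_layers import modulate, apply_gate

B, L, D = 2, 16, 8
x = torch.randn(B, L, D)
shift, scale, gate = torch.randn(B, D), torch.randn(B, D), torch.randn(B, D)

y = modulate(x, shift=shift, scale=scale)  # x * (1 + scale[:, None, :]) + shift[:, None, :]
out = apply_gate(y, gate=gate)             # y * gate[:, None, :]
assert out.shape == (B, L, D)
```
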
hunyuan_model/norm_layers.py ADDED
@@ -0,0 +1,79 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class RMSNorm(nn.Module):
6
+ def __init__(
7
+ self,
8
+ dim: int,
9
+ elementwise_affine=True,
10
+ eps: float = 1e-6,
11
+ device=None,
12
+ dtype=None,
13
+ ):
14
+ """
15
+ Initialize the RMSNorm normalization layer.
16
+
17
+ Args:
18
+ dim (int): The dimension of the input tensor.
19
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
20
+
21
+ Attributes:
22
+ eps (float): A small value added to the denominator for numerical stability.
23
+ weight (nn.Parameter): Learnable scaling parameter.
24
+
25
+ """
26
+ factory_kwargs = {"device": device, "dtype": dtype}
27
+ super().__init__()
28
+ self.eps = eps
29
+ if elementwise_affine:
30
+ self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))
31
+
32
+ def _norm(self, x):
33
+ """
34
+ Apply the RMSNorm normalization to the input tensor.
35
+
36
+ Args:
37
+ x (torch.Tensor): The input tensor.
38
+
39
+ Returns:
40
+ torch.Tensor: The normalized tensor.
41
+
42
+ """
43
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
44
+
45
+ def forward(self, x):
46
+ """
47
+ Forward pass through the RMSNorm layer.
48
+
49
+ Args:
50
+ x (torch.Tensor): The input tensor.
51
+
52
+ Returns:
53
+ torch.Tensor: The output tensor after applying RMSNorm.
54
+
55
+ """
56
+ output = self._norm(x.float()).type_as(x)
57
+ if hasattr(self, "weight"):
58
+ # output = output * self.weight
59
+ # support fp8
60
+ output = output * self.weight.to(output.dtype)
61
+ return output
62
+
63
+
64
+ def get_norm_layer(norm_layer):
65
+ """
66
+ Get the normalization layer.
67
+
68
+ Args:
69
+ norm_layer (str): The type of normalization layer.
70
+
71
+ Returns:
72
+ norm_layer (nn.Module): The normalization layer.
73
+ """
74
+ if norm_layer == "layer":
75
+ return nn.LayerNorm
76
+ elif norm_layer == "rms":
77
+ return RMSNorm
78
+ else:
79
+ raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
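
A brief sketch of how the attention blocks above use this factory for QK-Norm (editorial addition; the head dimension and tensor shapes are illustrative):

```python
# Sketch: build the per-head RMSNorm used as QK-Norm in the attention blocks.
import torch
from hunyuan_model.norm_layers import get_norm_layer

head_dim = 128
q_norm = get_norm_layer("rms")(head_dim, elementwise_affine=True, eps=1e-6)

q = torch.randn(2, 256, 24, head_dim)  # (B, L, heads, head_dim)
q = q_norm(q)                          # RMS-normalized over the last dimension
```
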
hunyuan_model/pipeline_hunyuan_video.py ADDED
@@ -0,0 +1,1100 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ #
16
+ # Modified from diffusers==0.29.2
17
+ #
18
+ # ==============================================================================
19
+ import inspect
20
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
21
+ import torch
22
+ import torch.distributed as dist
23
+ import numpy as np
24
+ from dataclasses import dataclass
25
+ from packaging import version
26
+
27
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
28
+ from diffusers.configuration_utils import FrozenDict
29
+ from diffusers.image_processor import VaeImageProcessor
30
+ from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
31
+ from diffusers.models import AutoencoderKL
32
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
33
+ from diffusers.schedulers import KarrasDiffusionSchedulers
34
+ from diffusers.utils import (
35
+ USE_PEFT_BACKEND,
36
+ deprecate,
37
+ logging,
38
+ replace_example_docstring,
39
+ scale_lora_layers,
40
+ unscale_lora_layers,
41
+ )
42
+ from diffusers.utils.torch_utils import randn_tensor
43
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
44
+ from diffusers.utils import BaseOutput
45
+
46
+ from ...constants import PRECISION_TO_TYPE
47
+ from ...vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
48
+ from ...text_encoder import TextEncoder
49
+ from ...modules import HYVideoDiffusionTransformer
50
+
51
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
52
+
53
+ EXAMPLE_DOC_STRING = """"""
54
+
55
+
56
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
57
+ """
58
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
59
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
60
+ """
61
+ std_text = noise_pred_text.std(
62
+ dim=list(range(1, noise_pred_text.ndim)), keepdim=True
63
+ )
64
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
65
+ # rescale the results from guidance (fixes overexposure)
66
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
67
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
68
+ noise_cfg = (
69
+ guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
70
+ )
71
+ return noise_cfg
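
A hedged sketch of where this rescale sits inside a classifier-free-guidance step (editorial addition); tensor shapes and guidance values are illustrative, and `rescale_noise_cfg` refers to the function defined above:

```python
# Sketch: standard CFG combination followed by the std rescale described above.
import torch

noise_pred_uncond = torch.randn(1, 16, 9, 68, 120)
noise_pred_text = torch.randn(1, 16, 9, 68, 120)
guidance_scale, guidance_rescale = 6.0, 0.7

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
if guidance_rescale > 0.0:
    # pull the guided prediction's std back toward the text-conditional std
    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
```
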
72
+
73
+
74
+ def retrieve_timesteps(
75
+ scheduler,
76
+ num_inference_steps: Optional[int] = None,
77
+ device: Optional[Union[str, torch.device]] = None,
78
+ timesteps: Optional[List[int]] = None,
79
+ sigmas: Optional[List[float]] = None,
80
+ **kwargs,
81
+ ):
82
+ """
83
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
84
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
85
+
86
+ Args:
87
+ scheduler (`SchedulerMixin`):
88
+ The scheduler to get timesteps from.
89
+ num_inference_steps (`int`):
90
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
91
+ must be `None`.
92
+ device (`str` or `torch.device`, *optional*):
93
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
94
+ timesteps (`List[int]`, *optional*):
95
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
96
+ `num_inference_steps` and `sigmas` must be `None`.
97
+ sigmas (`List[float]`, *optional*):
98
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
99
+ `num_inference_steps` and `timesteps` must be `None`.
100
+
101
+ Returns:
102
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
103
+ second element is the number of inference steps.
104
+ """
105
+ if timesteps is not None and sigmas is not None:
106
+ raise ValueError(
107
+ "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
108
+ )
109
+ if timesteps is not None:
110
+ accepts_timesteps = "timesteps" in set(
111
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
112
+ )
113
+ if not accepts_timesteps:
114
+ raise ValueError(
115
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
116
+ f" timestep schedules. Please check whether you are using the correct scheduler."
117
+ )
118
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
119
+ timesteps = scheduler.timesteps
120
+ num_inference_steps = len(timesteps)
121
+ elif sigmas is not None:
122
+ accept_sigmas = "sigmas" in set(
123
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
124
+ )
125
+ if not accept_sigmas:
126
+ raise ValueError(
127
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
128
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
129
+ )
130
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
131
+ timesteps = scheduler.timesteps
132
+ num_inference_steps = len(timesteps)
133
+ else:
134
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
135
+ timesteps = scheduler.timesteps
136
+ return timesteps, num_inference_steps
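
A hedged usage sketch of `retrieve_timesteps` with a stock diffusers scheduler (editorial addition); the scheduler class and step count are illustrative:

```python
# Sketch: default timestep schedule retrieval (retrieve_timesteps is defined above).
from diffusers import DDIMScheduler

scheduler = DDIMScheduler()
timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=30, device="cpu")
print(num_inference_steps, timesteps[:5])
```
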
137
+
138
+
139
+ @dataclass
140
+ class HunyuanVideoPipelineOutput(BaseOutput):
141
+ videos: Union[torch.Tensor, np.ndarray]
142
+
143
+
144
+ class HunyuanVideoPipeline(DiffusionPipeline):
145
+ r"""
146
+ Pipeline for text-to-video generation using HunyuanVideo.
147
+
148
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
149
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
150
+
151
+ Args:
152
+ vae ([`AutoencoderKL`]):
153
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
154
+ text_encoder ([`TextEncoder`]):
155
+ Frozen text-encoder.
156
+ text_encoder_2 ([`TextEncoder`]):
157
+ Frozen text-encoder_2.
158
+ transformer ([`HYVideoDiffusionTransformer`]):
159
+ A `HYVideoDiffusionTransformer` to denoise the encoded video latents.
160
+ scheduler ([`SchedulerMixin`]):
161
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
162
+ """
163
+
164
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
165
+ _optional_components = ["text_encoder_2"]
166
+ _exclude_from_cpu_offload = ["transformer"]
167
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
168
+
169
+ def __init__(
170
+ self,
171
+ vae: AutoencoderKL,
172
+ text_encoder: TextEncoder,
173
+ transformer: HYVideoDiffusionTransformer,
174
+ scheduler: KarrasDiffusionSchedulers,
175
+ text_encoder_2: Optional[TextEncoder] = None,
176
+ progress_bar_config: Dict[str, Any] = None,
177
+ args=None,
178
+ ):
179
+ super().__init__()
180
+
181
+ # ==========================================================================================
182
+ if progress_bar_config is None:
183
+ progress_bar_config = {}
184
+ if not hasattr(self, "_progress_bar_config"):
185
+ self._progress_bar_config = {}
186
+ self._progress_bar_config.update(progress_bar_config)
187
+
188
+ self.args = args
189
+ # ==========================================================================================
190
+
191
+ if (
192
+ hasattr(scheduler.config, "steps_offset")
193
+ and scheduler.config.steps_offset != 1
194
+ ):
195
+ deprecation_message = (
196
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
197
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
198
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
199
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
200
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
201
+ " file"
202
+ )
203
+ deprecate(
204
+ "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False
205
+ )
206
+ new_config = dict(scheduler.config)
207
+ new_config["steps_offset"] = 1
208
+ scheduler._internal_dict = FrozenDict(new_config)
209
+
210
+ if (
211
+ hasattr(scheduler.config, "clip_sample")
212
+ and scheduler.config.clip_sample is True
213
+ ):
214
+ deprecation_message = (
215
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
216
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
217
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
218
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
219
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
220
+ )
221
+ deprecate(
222
+ "clip_sample not set", "1.0.0", deprecation_message, standard_warn=False
223
+ )
224
+ new_config = dict(scheduler.config)
225
+ new_config["clip_sample"] = False
226
+ scheduler._internal_dict = FrozenDict(new_config)
227
+
228
+ self.register_modules(
229
+ vae=vae,
230
+ text_encoder=text_encoder,
231
+ transformer=transformer,
232
+ scheduler=scheduler,
233
+ text_encoder_2=text_encoder_2,
234
+ )
235
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
236
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
237
+
238
+ def encode_prompt(
239
+ self,
240
+ prompt,
241
+ device,
242
+ num_videos_per_prompt,
243
+ do_classifier_free_guidance,
244
+ negative_prompt=None,
245
+ prompt_embeds: Optional[torch.Tensor] = None,
246
+ attention_mask: Optional[torch.Tensor] = None,
247
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
248
+ negative_attention_mask: Optional[torch.Tensor] = None,
249
+ lora_scale: Optional[float] = None,
250
+ clip_skip: Optional[int] = None,
251
+ text_encoder: Optional[TextEncoder] = None,
252
+ data_type: Optional[str] = "image",
253
+ ):
254
+ r"""
255
+ Encodes the prompt into text encoder hidden states.
256
+
257
+ Args:
258
+ prompt (`str` or `List[str]`, *optional*):
259
+ prompt to be encoded
260
+ device: (`torch.device`):
261
+ torch device
262
+ num_videos_per_prompt (`int`):
263
+ number of videos that should be generated per prompt
264
+ do_classifier_free_guidance (`bool`):
265
+ whether to use classifier free guidance or not
266
+ negative_prompt (`str` or `List[str]`, *optional*):
267
+ The prompt or prompts not to guide the video generation. If not defined, one has to pass
268
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
269
+ less than `1`).
270
+ prompt_embeds (`torch.Tensor`, *optional*):
271
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
272
+ provided, text embeddings will be generated from `prompt` input argument.
273
+ attention_mask (`torch.Tensor`, *optional*):
274
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
275
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
276
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
277
+ argument.
278
+ negative_attention_mask (`torch.Tensor`, *optional*):
279
+ lora_scale (`float`, *optional*):
280
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
281
+ clip_skip (`int`, *optional*):
282
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
283
+ the output of the pre-final layer will be used for computing the prompt embeddings.
284
+ text_encoder (TextEncoder, *optional*):
285
+ data_type (`str`, *optional*):
286
+ """
287
+ if text_encoder is None:
288
+ text_encoder = self.text_encoder
289
+
290
+ # set lora scale so that monkey patched LoRA
291
+ # function of text encoder can correctly access it
292
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
293
+ self._lora_scale = lora_scale
294
+
295
+ # dynamically adjust the LoRA scale
296
+ if not USE_PEFT_BACKEND:
297
+ adjust_lora_scale_text_encoder(text_encoder.model, lora_scale)
298
+ else:
299
+ scale_lora_layers(text_encoder.model, lora_scale)
300
+
301
+ if prompt is not None and isinstance(prompt, str):
302
+ batch_size = 1
303
+ elif prompt is not None and isinstance(prompt, list):
304
+ batch_size = len(prompt)
305
+ else:
306
+ batch_size = prompt_embeds.shape[0]
307
+
308
+ if prompt_embeds is None:
309
+ # textual inversion: process multi-vector tokens if necessary
310
+ if isinstance(self, TextualInversionLoaderMixin):
311
+ prompt = self.maybe_convert_prompt(prompt, text_encoder.tokenizer)
312
+
313
+ text_inputs = text_encoder.text2tokens(prompt, data_type=data_type)
314
+
315
+ if clip_skip is None:
316
+ prompt_outputs = text_encoder.encode(
317
+ text_inputs, data_type=data_type, device=device
318
+ )
319
+ prompt_embeds = prompt_outputs.hidden_state
320
+ else:
321
+ prompt_outputs = text_encoder.encode(
322
+ text_inputs,
323
+ output_hidden_states=True,
324
+ data_type=data_type,
325
+ device=device,
326
+ )
327
+ # Access the `hidden_states` first, that contains a tuple of
328
+ # all the hidden states from the encoder layers. Then index into
329
+ # the tuple to access the hidden states from the desired layer.
330
+ prompt_embeds = prompt_outputs.hidden_states_list[-(clip_skip + 1)]
331
+ # We also need to apply the final LayerNorm here to not mess with the
332
+ # representations. The `last_hidden_states` that we typically use for
333
+ # obtaining the final prompt representations passes through the LayerNorm
334
+ # layer.
335
+ prompt_embeds = text_encoder.model.text_model.final_layer_norm(
336
+ prompt_embeds
337
+ )
338
+
339
+ attention_mask = prompt_outputs.attention_mask
340
+ if attention_mask is not None:
341
+ attention_mask = attention_mask.to(device)
342
+ bs_embed, seq_len = attention_mask.shape
343
+ attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
344
+ attention_mask = attention_mask.view(
345
+ bs_embed * num_videos_per_prompt, seq_len
346
+ )
347
+
348
+ if text_encoder is not None:
349
+ prompt_embeds_dtype = text_encoder.dtype
350
+ elif self.transformer is not None:
351
+ prompt_embeds_dtype = self.transformer.dtype
352
+ else:
353
+ prompt_embeds_dtype = prompt_embeds.dtype
354
+
355
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
356
+
357
+ if prompt_embeds.ndim == 2:
358
+ bs_embed, _ = prompt_embeds.shape
359
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
360
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
361
+ prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, -1)
362
+ else:
363
+ bs_embed, seq_len, _ = prompt_embeds.shape
364
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
365
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
366
+ prompt_embeds = prompt_embeds.view(
367
+ bs_embed * num_videos_per_prompt, seq_len, -1
368
+ )
369
+
370
+ # get unconditional embeddings for classifier free guidance
371
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
372
+ uncond_tokens: List[str]
373
+ if negative_prompt is None:
374
+ uncond_tokens = [""] * batch_size
375
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
376
+ raise TypeError(
377
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
378
+ f" {type(prompt)}."
379
+ )
380
+ elif isinstance(negative_prompt, str):
381
+ uncond_tokens = [negative_prompt]
382
+ elif batch_size != len(negative_prompt):
383
+ raise ValueError(
384
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
385
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
386
+ " the batch size of `prompt`."
387
+ )
388
+ else:
389
+ uncond_tokens = negative_prompt
390
+
391
+ # textual inversion: process multi-vector tokens if necessary
392
+ if isinstance(self, TextualInversionLoaderMixin):
393
+ uncond_tokens = self.maybe_convert_prompt(
394
+ uncond_tokens, text_encoder.tokenizer
395
+ )
396
+
397
+ # max_length = prompt_embeds.shape[1]
398
+ uncond_input = text_encoder.text2tokens(uncond_tokens, data_type=data_type)
399
+
400
+ negative_prompt_outputs = text_encoder.encode(
401
+ uncond_input, data_type=data_type, device=device
402
+ )
403
+ negative_prompt_embeds = negative_prompt_outputs.hidden_state
404
+
405
+ negative_attention_mask = negative_prompt_outputs.attention_mask
406
+ if negative_attention_mask is not None:
407
+ negative_attention_mask = negative_attention_mask.to(device)
408
+ _, seq_len = negative_attention_mask.shape
409
+ negative_attention_mask = negative_attention_mask.repeat(
410
+ 1, num_videos_per_prompt
411
+ )
412
+ negative_attention_mask = negative_attention_mask.view(
413
+ batch_size * num_videos_per_prompt, seq_len
414
+ )
415
+
416
+ if do_classifier_free_guidance:
417
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
418
+ seq_len = negative_prompt_embeds.shape[1]
419
+
420
+ negative_prompt_embeds = negative_prompt_embeds.to(
421
+ dtype=prompt_embeds_dtype, device=device
422
+ )
423
+
424
+ if negative_prompt_embeds.ndim == 2:
425
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
426
+ 1, num_videos_per_prompt
427
+ )
428
+ negative_prompt_embeds = negative_prompt_embeds.view(
429
+ batch_size * num_videos_per_prompt, -1
430
+ )
431
+ else:
432
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
433
+ 1, num_videos_per_prompt, 1
434
+ )
435
+ negative_prompt_embeds = negative_prompt_embeds.view(
436
+ batch_size * num_videos_per_prompt, seq_len, -1
437
+ )
438
+
439
+ if text_encoder is not None:
440
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
441
+ # Retrieve the original scale by scaling back the LoRA layers
442
+ unscale_lora_layers(text_encoder.model, lora_scale)
443
+
444
+ return (
445
+ prompt_embeds,
446
+ negative_prompt_embeds,
447
+ attention_mask,
448
+ negative_attention_mask,
449
+ )
450
+
451
+ def decode_latents(self, latents, enable_tiling=True):
452
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
453
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
454
+
455
+ latents = 1 / self.vae.config.scaling_factor * latents
456
+ if enable_tiling:
457
+ self.vae.enable_tiling()
458
+ image = self.vae.decode(latents, return_dict=False)[0]
459
+ else:
460
+ image = self.vae.decode(latents, return_dict=False)[0]
461
+ image = (image / 2 + 0.5).clamp(0, 1)
462
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
463
+ if image.ndim == 4:
464
+ image = image.cpu().permute(0, 2, 3, 1).float()
465
+ else:
466
+ image = image.cpu().float()
467
+ return image
468
+
469
+ def prepare_extra_func_kwargs(self, func, kwargs):
470
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
471
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
472
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
473
+ # and should be between [0, 1]
474
+ extra_step_kwargs = {}
475
+
476
+ for k, v in kwargs.items():
477
+ accepts = k in set(inspect.signature(func).parameters.keys())
478
+ if accepts:
479
+ extra_step_kwargs[k] = v
480
+ return extra_step_kwargs
481
+
482
+ def check_inputs(
483
+ self,
484
+ prompt,
485
+ height,
486
+ width,
487
+ video_length,
488
+ callback_steps,
489
+ negative_prompt=None,
490
+ prompt_embeds=None,
491
+ negative_prompt_embeds=None,
492
+ callback_on_step_end_tensor_inputs=None,
493
+ vae_ver="88-4c-sd",
494
+ ):
495
+ if height % 8 != 0 or width % 8 != 0:
496
+ raise ValueError(
497
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
498
+ )
499
+
500
+ if video_length is not None:
501
+ if "884" in vae_ver:
502
+ if video_length != 1 and (video_length - 1) % 4 != 0:
503
+ raise ValueError(
504
+ f"`video_length` has to be 1 or of the form 4k+1 (i.e. (video_length - 1) divisible by 4), but is {video_length}."
505
+ )
506
+ elif "888" in vae_ver:
507
+ if video_length != 1 and (video_length - 1) % 8 != 0:
508
+ raise ValueError(
509
+ f"`video_length` has to be 1 or of the form 8k+1 (i.e. (video_length - 1) divisible by 8), but is {video_length}."
510
+ )
511
+
512
+ if callback_steps is not None and (
513
+ not isinstance(callback_steps, int) or callback_steps <= 0
514
+ ):
515
+ raise ValueError(
516
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
517
+ f" {type(callback_steps)}."
518
+ )
519
+ if callback_on_step_end_tensor_inputs is not None and not all(
520
+ k in self._callback_tensor_inputs
521
+ for k in callback_on_step_end_tensor_inputs
522
+ ):
523
+ raise ValueError(
524
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
525
+ )
526
+
527
+ if prompt is not None and prompt_embeds is not None:
528
+ raise ValueError(
529
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
530
+ " only forward one of the two."
531
+ )
532
+ elif prompt is None and prompt_embeds is None:
533
+ raise ValueError(
534
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
535
+ )
536
+ elif prompt is not None and (
537
+ not isinstance(prompt, str) and not isinstance(prompt, list)
538
+ ):
539
+ raise ValueError(
540
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
541
+ )
542
+
543
+ if negative_prompt is not None and negative_prompt_embeds is not None:
544
+ raise ValueError(
545
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
546
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
547
+ )
548
+
549
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
550
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
551
+ raise ValueError(
552
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
553
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
554
+ f" {negative_prompt_embeds.shape}."
555
+ )
556
+
557
+
558
+ def prepare_latents(
559
+ self,
560
+ batch_size,
561
+ num_channels_latents,
562
+ height,
563
+ width,
564
+ video_length,
565
+ dtype,
566
+ device,
567
+ generator,
568
+ latents=None,
569
+ ):
570
+ shape = (
571
+ batch_size,
572
+ num_channels_latents,
573
+ video_length,
574
+ int(height) // self.vae_scale_factor,
575
+ int(width) // self.vae_scale_factor,
576
+ )
577
+ if isinstance(generator, list) and len(generator) != batch_size:
578
+ raise ValueError(
579
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
580
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
581
+ )
582
+
583
+ if latents is None:
584
+ latents = randn_tensor(
585
+ shape, generator=generator, device=device, dtype=dtype
586
+ )
587
+ else:
588
+ latents = latents.to(device)
589
+
590
+ # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
591
+ if hasattr(self.scheduler, "init_noise_sigma"):
592
+ # scale the initial noise by the standard deviation required by the scheduler
593
+ latents = latents * self.scheduler.init_noise_sigma
594
+ return latents
595
+
596
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
597
+ def get_guidance_scale_embedding(
598
+ self,
599
+ w: torch.Tensor,
600
+ embedding_dim: int = 512,
601
+ dtype: torch.dtype = torch.float32,
602
+ ) -> torch.Tensor:
603
+ """
604
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
605
+
606
+ Args:
607
+ w (`torch.Tensor`):
608
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
609
+ embedding_dim (`int`, *optional*, defaults to 512):
610
+ Dimension of the embeddings to generate.
611
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
612
+ Data type of the generated embeddings.
613
+
614
+ Returns:
615
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
616
+ """
617
+ assert len(w.shape) == 1
618
+ w = w * 1000.0
619
+
620
+ half_dim = embedding_dim // 2
621
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
622
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
623
+ emb = w.to(dtype)[:, None] * emb[None, :]
624
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
625
+ if embedding_dim % 2 == 1: # zero pad
626
+ emb = torch.nn.functional.pad(emb, (0, 1))
627
+ assert emb.shape == (w.shape[0], embedding_dim)
628
+ return emb
629
+
630
+ @property
631
+ def guidance_scale(self):
632
+ return self._guidance_scale
633
+
634
+ @property
635
+ def guidance_rescale(self):
636
+ return self._guidance_rescale
637
+
638
+ @property
639
+ def clip_skip(self):
640
+ return self._clip_skip
641
+
642
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
643
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
644
+ # corresponds to doing no classifier free guidance.
645
+ @property
646
+ def do_classifier_free_guidance(self):
647
+ # return self._guidance_scale > 1 and self.transformer.config.time_cond_proj_dim is None
648
+ return self._guidance_scale > 1
649
+
650
+ @property
651
+ def cross_attention_kwargs(self):
652
+ return self._cross_attention_kwargs
653
+
654
+ @property
655
+ def num_timesteps(self):
656
+ return self._num_timesteps
657
+
658
+ @property
659
+ def interrupt(self):
660
+ return self._interrupt
661
+
662
+ @torch.no_grad()
663
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
664
+ def __call__(
665
+ self,
666
+ prompt: Union[str, List[str]],
667
+ height: int,
668
+ width: int,
669
+ video_length: int,
670
+ data_type: str = "video",
671
+ num_inference_steps: int = 50,
672
+ timesteps: List[int] = None,
673
+ sigmas: List[float] = None,
674
+ guidance_scale: float = 7.5,
675
+ negative_prompt: Optional[Union[str, List[str]]] = None,
676
+ num_videos_per_prompt: Optional[int] = 1,
677
+ eta: float = 0.0,
678
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
679
+ latents: Optional[torch.Tensor] = None,
680
+ prompt_embeds: Optional[torch.Tensor] = None,
681
+ attention_mask: Optional[torch.Tensor] = None,
682
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
683
+ negative_attention_mask: Optional[torch.Tensor] = None,
684
+ output_type: Optional[str] = "pil",
685
+ return_dict: bool = True,
686
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
687
+ guidance_rescale: float = 0.0,
688
+ clip_skip: Optional[int] = None,
689
+ callback_on_step_end: Optional[
690
+ Union[
691
+ Callable[[int, int, Dict], None],
692
+ PipelineCallback,
693
+ MultiPipelineCallbacks,
694
+ ]
695
+ ] = None,
696
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
697
+ freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
698
+ vae_ver: str = "88-4c-sd",
699
+ enable_tiling: bool = False,
700
+ n_tokens: Optional[int] = None,
701
+ embedded_guidance_scale: Optional[float] = None,
702
+ **kwargs,
703
+ ):
704
+ r"""
705
+ The call function to the pipeline for generation.
706
+
707
+ Args:
708
+ prompt (`str` or `List[str]`):
709
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
710
+ height (`int`):
711
+ The height in pixels of the generated image.
712
+ width (`int`):
713
+ The width in pixels of the generated image.
714
+ video_length (`int`):
715
+ The number of frames in the generated video.
716
+ num_inference_steps (`int`, *optional*, defaults to 50):
717
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
718
+ expense of slower inference.
719
+ timesteps (`List[int]`, *optional*):
720
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
721
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
722
+ passed will be used. Must be in descending order.
723
+ sigmas (`List[float]`, *optional*):
724
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
725
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
726
+ will be used.
727
+ guidance_scale (`float`, *optional*, defaults to 7.5):
728
+ A higher guidance scale value encourages the model to generate images closely linked to the text
729
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
730
+ negative_prompt (`str` or `List[str]`, *optional*):
731
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
732
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale <= 1`).
733
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
734
+ The number of images to generate per prompt.
735
+ eta (`float`, *optional*, defaults to 0.0):
736
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
737
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
738
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
739
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
740
+ generation deterministic.
741
+ latents (`torch.Tensor`, *optional*):
742
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
743
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
744
+ tensor is generated by sampling using the supplied random `generator`.
745
+ prompt_embeds (`torch.Tensor`, *optional*):
746
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
747
+ provided, text embeddings are generated from the `prompt` input argument.
748
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
749
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
750
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
751
+
752
+ output_type (`str`, *optional*, defaults to `"pil"`):
753
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
754
+ return_dict (`bool`, *optional*, defaults to `True`):
755
+ Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a
756
+ plain tuple.
757
+ cross_attention_kwargs (`dict`, *optional*):
758
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
759
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
760
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
761
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
762
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
763
+ using zero terminal SNR.
764
+ clip_skip (`int`, *optional*):
765
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
766
+ the output of the pre-final layer will be used for computing the prompt embeddings.
767
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
768
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
769
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
770
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
771
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
772
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
773
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
774
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
775
+ `._callback_tensor_inputs` attribute of your pipeline class.
776
+
777
+ Examples:
778
+
779
+ Returns:
780
+ [`~HunyuanVideoPipelineOutput`] or `tuple`:
781
+ If `return_dict` is `True`, a [`HunyuanVideoPipelineOutput`] is returned;
+ otherwise, the generated videos are returned directly.
785
+ """
786
+ callback = kwargs.pop("callback", None)
787
+ callback_steps = kwargs.pop("callback_steps", None)
788
+
789
+ if callback is not None:
790
+ deprecate(
791
+ "callback",
792
+ "1.0.0",
793
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
794
+ )
795
+ if callback_steps is not None:
796
+ deprecate(
797
+ "callback_steps",
798
+ "1.0.0",
799
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
800
+ )
801
+
802
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
803
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
804
+
805
+ # 0. Default height and width to unet
806
+ # height = height or self.transformer.config.sample_size * self.vae_scale_factor
807
+ # width = width or self.transformer.config.sample_size * self.vae_scale_factor
808
+ # to deal with lora scaling and other possible forward hooks
809
+
810
+ # 1. Check inputs. Raise error if not correct
811
+ self.check_inputs(
812
+ prompt,
813
+ height,
814
+ width,
815
+ video_length,
816
+ callback_steps,
817
+ negative_prompt,
818
+ prompt_embeds,
819
+ negative_prompt_embeds,
820
+ callback_on_step_end_tensor_inputs,
821
+ vae_ver=vae_ver,
822
+ )
823
+
824
+ self._guidance_scale = guidance_scale
825
+ self._guidance_rescale = guidance_rescale
826
+ self._clip_skip = clip_skip
827
+ self._cross_attention_kwargs = cross_attention_kwargs
828
+ self._interrupt = False
829
+
830
+ # 2. Define call parameters
831
+ if prompt is not None and isinstance(prompt, str):
832
+ batch_size = 1
833
+ elif prompt is not None and isinstance(prompt, list):
834
+ batch_size = len(prompt)
835
+ else:
836
+ batch_size = prompt_embeds.shape[0]
837
+
838
+ device = torch.device(f"cuda:{dist.get_rank()}") if dist.is_initialized() else self._execution_device
839
+
840
+ # 3. Encode input prompt
841
+ lora_scale = (
842
+ self.cross_attention_kwargs.get("scale", None)
843
+ if self.cross_attention_kwargs is not None
844
+ else None
845
+ )
846
+
847
+ (
848
+ prompt_embeds,
849
+ negative_prompt_embeds,
850
+ prompt_mask,
851
+ negative_prompt_mask,
852
+ ) = self.encode_prompt(
853
+ prompt,
854
+ device,
855
+ num_videos_per_prompt,
856
+ self.do_classifier_free_guidance,
857
+ negative_prompt,
858
+ prompt_embeds=prompt_embeds,
859
+ attention_mask=attention_mask,
860
+ negative_prompt_embeds=negative_prompt_embeds,
861
+ negative_attention_mask=negative_attention_mask,
862
+ lora_scale=lora_scale,
863
+ clip_skip=self.clip_skip,
864
+ data_type=data_type,
865
+ )
866
+ if self.text_encoder_2 is not None:
867
+ (
868
+ prompt_embeds_2,
869
+ negative_prompt_embeds_2,
870
+ prompt_mask_2,
871
+ negative_prompt_mask_2,
872
+ ) = self.encode_prompt(
873
+ prompt,
874
+ device,
875
+ num_videos_per_prompt,
876
+ self.do_classifier_free_guidance,
877
+ negative_prompt,
878
+ prompt_embeds=None,
879
+ attention_mask=None,
880
+ negative_prompt_embeds=None,
881
+ negative_attention_mask=None,
882
+ lora_scale=lora_scale,
883
+ clip_skip=self.clip_skip,
884
+ text_encoder=self.text_encoder_2,
885
+ data_type=data_type,
886
+ )
887
+ else:
888
+ prompt_embeds_2 = None
889
+ negative_prompt_embeds_2 = None
890
+ prompt_mask_2 = None
891
+ negative_prompt_mask_2 = None
892
+
893
+ # For classifier free guidance, we need to do two forward passes.
894
+ # Here we concatenate the unconditional and text embeddings into a single batch
895
+ # to avoid doing two forward passes
896
+ if self.do_classifier_free_guidance:
897
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
898
+ if prompt_mask is not None:
899
+ prompt_mask = torch.cat([negative_prompt_mask, prompt_mask])
900
+ if prompt_embeds_2 is not None:
901
+ prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
902
+ if prompt_mask_2 is not None:
903
+ prompt_mask_2 = torch.cat([negative_prompt_mask_2, prompt_mask_2])
904
+
905
+
906
+ # 4. Prepare timesteps
907
+ extra_set_timesteps_kwargs = self.prepare_extra_func_kwargs(
908
+ self.scheduler.set_timesteps, {"n_tokens": n_tokens}
909
+ )
910
+ timesteps, num_inference_steps = retrieve_timesteps(
911
+ self.scheduler,
912
+ num_inference_steps,
913
+ device,
914
+ timesteps,
915
+ sigmas,
916
+ **extra_set_timesteps_kwargs,
917
+ )
918
+
919
+ if "884" in vae_ver:
920
+ video_length = (video_length - 1) // 4 + 1
921
+ elif "888" in vae_ver:
922
+ video_length = (video_length - 1) // 8 + 1
923
+ else:
924
+ video_length = video_length
925
+
926
+ # 5. Prepare latent variables
927
+ num_channels_latents = self.transformer.config.in_channels
928
+ latents = self.prepare_latents(
929
+ batch_size * num_videos_per_prompt,
930
+ num_channels_latents,
931
+ height,
932
+ width,
933
+ video_length,
934
+ prompt_embeds.dtype,
935
+ device,
936
+ generator,
937
+ latents,
938
+ )
939
+
940
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
941
+ extra_step_kwargs = self.prepare_extra_func_kwargs(
942
+ self.scheduler.step,
943
+ {"generator": generator, "eta": eta},
944
+ )
945
+
946
+ target_dtype = PRECISION_TO_TYPE[self.args.precision]
947
+ autocast_enabled = (
948
+ target_dtype != torch.float32
949
+ ) and not self.args.disable_autocast
950
+ vae_dtype = PRECISION_TO_TYPE[self.args.vae_precision]
951
+ vae_autocast_enabled = (
952
+ vae_dtype != torch.float32
953
+ ) and not self.args.disable_autocast
954
+
955
+ # 7. Denoising loop
956
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
957
+ self._num_timesteps = len(timesteps)
958
+
959
+ # if is_progress_bar:
960
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
961
+ for i, t in enumerate(timesteps):
962
+ if self.interrupt:
963
+ continue
964
+
965
+ # expand the latents if we are doing classifier free guidance
966
+ latent_model_input = (
967
+ torch.cat([latents] * 2)
968
+ if self.do_classifier_free_guidance
969
+ else latents
970
+ )
971
+ latent_model_input = self.scheduler.scale_model_input(
972
+ latent_model_input, t
973
+ )
974
+
975
+ t_expand = t.repeat(latent_model_input.shape[0])
976
+ guidance_expand = (
977
+ torch.tensor(
978
+ [embedded_guidance_scale] * latent_model_input.shape[0],
979
+ dtype=torch.float32,
980
+ device=device,
981
+ ).to(target_dtype)
982
+ * 1000.0
983
+ if embedded_guidance_scale is not None
984
+ else None
985
+ )
986
+
987
+ # predict the noise residual
988
+ with torch.autocast(
989
+ device_type="cuda", dtype=target_dtype, enabled=autocast_enabled
990
+ ):
991
+ noise_pred = self.transformer( # For an input image (129, 192, 336) (1, 256, 256)
992
+ latent_model_input, # [2, 16, 33, 24, 42]
993
+ t_expand, # [2]
994
+ text_states=prompt_embeds, # [2, 256, 4096]
995
+ text_mask=prompt_mask, # [2, 256]
996
+ text_states_2=prompt_embeds_2, # [2, 768]
997
+ freqs_cos=freqs_cis[0], # [seqlen, head_dim]
998
+ freqs_sin=freqs_cis[1], # [seqlen, head_dim]
999
+ guidance=guidance_expand,
1000
+ return_dict=True,
1001
+ )[
1002
+ "x"
1003
+ ]
1004
+
1005
+ # perform guidance
1006
+ if self.do_classifier_free_guidance:
1007
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1008
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
1009
+ noise_pred_text - noise_pred_uncond
1010
+ )
1011
+
1012
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
1013
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1014
+ noise_pred = rescale_noise_cfg(
1015
+ noise_pred,
1016
+ noise_pred_text,
1017
+ guidance_rescale=self.guidance_rescale,
1018
+ )
1019
+
1020
+ # compute the previous noisy sample x_t -> x_t-1
1021
+ latents = self.scheduler.step(
1022
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
1023
+ )[0]
1024
+
1025
+ if callback_on_step_end is not None:
1026
+ callback_kwargs = {}
1027
+ for k in callback_on_step_end_tensor_inputs:
1028
+ callback_kwargs[k] = locals()[k]
1029
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1030
+
1031
+ latents = callback_outputs.pop("latents", latents)
1032
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1033
+ negative_prompt_embeds = callback_outputs.pop(
1034
+ "negative_prompt_embeds", negative_prompt_embeds
1035
+ )
1036
+
1037
+ # call the callback, if provided
1038
+ if i == len(timesteps) - 1 or (
1039
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
1040
+ ):
1041
+ if progress_bar is not None:
1042
+ progress_bar.update()
1043
+ if callback is not None and i % callback_steps == 0:
1044
+ step_idx = i // getattr(self.scheduler, "order", 1)
1045
+ callback(step_idx, t, latents)
1046
+
1047
+ if not output_type == "latent":
1048
+ expand_temporal_dim = False
1049
+ if len(latents.shape) == 4:
1050
+ if isinstance(self.vae, AutoencoderKLCausal3D):
1051
+ latents = latents.unsqueeze(2)
1052
+ expand_temporal_dim = True
1053
+ elif len(latents.shape) == 5:
1054
+ pass
1055
+ else:
1056
+ raise ValueError(
1057
+ f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}."
1058
+ )
1059
+
1060
+ if (
1061
+ hasattr(self.vae.config, "shift_factor")
1062
+ and self.vae.config.shift_factor
1063
+ ):
1064
+ latents = (
1065
+ latents / self.vae.config.scaling_factor
1066
+ + self.vae.config.shift_factor
1067
+ )
1068
+ else:
1069
+ latents = latents / self.vae.config.scaling_factor
1070
+
1071
+ with torch.autocast(
1072
+ device_type="cuda", dtype=vae_dtype, enabled=vae_autocast_enabled
1073
+ ):
1074
+ if enable_tiling:
1075
+ self.vae.enable_tiling()
1076
+ image = self.vae.decode(
1077
+ latents, return_dict=False, generator=generator
1078
+ )[0]
1079
+ else:
1080
+ image = self.vae.decode(
1081
+ latents, return_dict=False, generator=generator
1082
+ )[0]
1083
+
1084
+ if expand_temporal_dim or image.shape[2] == 1:
1085
+ image = image.squeeze(2)
1086
+
1087
+ else:
1088
+ image = latents
1089
+
1090
+ image = (image / 2 + 0.5).clamp(0, 1)
1091
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
1092
+ image = image.cpu().float()
1093
+
1094
+ # Offload all models
1095
+ self.maybe_free_model_hooks()
1096
+
1097
+ if not return_dict:
1098
+ return image
1099
+
1100
+ return HunyuanVideoPipelineOutput(videos=image)
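For orientation, the guidance arithmetic in the denoising loop above reduces to the small helper sketched below. This is a hedged re-statement for readers, not the pipeline's actual `rescale_noise_cfg` import (which is defined elsewhere in the repository); the function name and shapes are illustrative.

```python
import torch

def cfg_combine(noise_pred_uncond, noise_pred_text, guidance_scale, guidance_rescale=0.0):
    # Classifier-free guidance: push the unconditional prediction toward the
    # text-conditioned one by `guidance_scale` (step 7 of __call__ above).
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    if guidance_rescale > 0.0:
        # Rescale toward the std of the text-conditioned prediction,
        # following Sec. 3.4 of arXiv:2305.08891 (zero-terminal-SNR fix).
        dims = list(range(1, noise_pred_text.ndim))
        std_text = noise_pred_text.std(dim=dims, keepdim=True)
        std_cfg = noise_pred.std(dim=dims, keepdim=True)
        rescaled = noise_pred * (std_text / std_cfg)
        noise_pred = guidance_rescale * rescaled + (1.0 - guidance_rescale) * noise_pred
    return noise_pred
```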
hunyuan_model/posemb_layers.py ADDED
@@ -0,0 +1,310 @@
1
+ import torch
2
+ from typing import Union, Tuple, List
3
+
4
+
5
+ def _to_tuple(x, dim=2):
6
+ if isinstance(x, int):
7
+ return (x,) * dim
8
+ elif len(x) == dim:
9
+ return x
10
+ else:
11
+ raise ValueError(f"Expected length {dim} or int, but got {x}")
12
+
13
+
14
+ def get_meshgrid_nd(start, *args, dim=2):
15
+ """
16
+ Get n-D meshgrid with start, stop and num.
17
+
18
+ Args:
19
+ start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop,
20
+ step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num
21
+ should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in
22
+ n-tuples.
23
+ *args: See above.
24
+ dim (int): Dimension of the meshgrid. Defaults to 2.
25
+
26
+ Returns:
27
+ grid (torch.Tensor): [dim, ...]
28
+ """
29
+ if len(args) == 0:
30
+ # start is grid_size
31
+ num = _to_tuple(start, dim=dim)
32
+ start = (0,) * dim
33
+ stop = num
34
+ elif len(args) == 1:
35
+ # start is start, args[0] is stop, step is 1
36
+ start = _to_tuple(start, dim=dim)
37
+ stop = _to_tuple(args[0], dim=dim)
38
+ num = [stop[i] - start[i] for i in range(dim)]
39
+ elif len(args) == 2:
40
+ # start is start, args[0] is stop, args[1] is num
41
+ start = _to_tuple(start, dim=dim) # Left-Top eg: 12,0
42
+ stop = _to_tuple(args[0], dim=dim) # Right-Bottom eg: 20,32
43
+ num = _to_tuple(args[1], dim=dim) # Target Size eg: 32,124
44
+ else:
45
+ raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
46
+
47
+ # PyTorch implement of np.linspace(start[i], stop[i], num[i], endpoint=False)
48
+ axis_grid = []
49
+ for i in range(dim):
50
+ a, b, n = start[i], stop[i], num[i]
51
+ g = torch.linspace(a, b, n + 1, dtype=torch.float32)[:n]
52
+ axis_grid.append(g)
53
+ grid = torch.meshgrid(*axis_grid, indexing="ij") # dim x [W, H, D]
54
+ grid = torch.stack(grid, dim=0) # [dim, W, H, D]
55
+
56
+ return grid
57
+
58
+
59
+ #################################################################################
60
+ # Rotary Positional Embedding Functions #
61
+ #################################################################################
62
+ # https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L80
63
+
64
+
65
+ def reshape_for_broadcast(
66
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
67
+ x: torch.Tensor,
68
+ head_first=False,
69
+ ):
70
+ """
71
+ Reshape frequency tensor for broadcasting it with another tensor.
72
+
73
+ This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
74
+ for the purpose of broadcasting the frequency tensor during element-wise operations.
75
+
76
+ Notes:
77
+ When using FlashMHAModified, head_first should be False.
78
+ When using Attention, head_first should be True.
79
+
80
+ Args:
81
+ freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped.
82
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
83
+ head_first (bool): head dimension first (except batch dim) or not.
84
+
85
+ Returns:
86
+ torch.Tensor: Reshaped frequency tensor.
87
+
88
+ Raises:
89
+ AssertionError: If the frequency tensor doesn't match the expected shape.
90
+ AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
91
+ """
92
+ ndim = x.ndim
93
+ assert 0 <= 1 < ndim
94
+
95
+ if isinstance(freqs_cis, tuple):
96
+ # freqs_cis: (cos, sin) in real space
97
+ if head_first:
98
+ assert freqs_cis[0].shape == (
99
+ x.shape[-2],
100
+ x.shape[-1],
101
+ ), f"freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}"
102
+ shape = [
103
+ d if i == ndim - 2 or i == ndim - 1 else 1
104
+ for i, d in enumerate(x.shape)
105
+ ]
106
+ else:
107
+ assert freqs_cis[0].shape == (
108
+ x.shape[1],
109
+ x.shape[-1],
110
+ ), f"freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}"
111
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
112
+ return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
113
+ else:
114
+ # freqs_cis: values in complex space
115
+ if head_first:
116
+ assert freqs_cis.shape == (
117
+ x.shape[-2],
118
+ x.shape[-1],
119
+ ), f"freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}"
120
+ shape = [
121
+ d if i == ndim - 2 or i == ndim - 1 else 1
122
+ for i, d in enumerate(x.shape)
123
+ ]
124
+ else:
125
+ assert freqs_cis.shape == (
126
+ x.shape[1],
127
+ x.shape[-1],
128
+ ), f"freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}"
129
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
130
+ return freqs_cis.view(*shape)
131
+
132
+
133
+ def rotate_half(x):
134
+ x_real, x_imag = (
135
+ x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
136
+ ) # [B, S, H, D//2]
137
+ return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
138
+
139
+
140
+ def apply_rotary_emb(
141
+ xq: torch.Tensor,
142
+ xk: torch.Tensor,
143
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
144
+ head_first: bool = False,
145
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
146
+ """
147
+ Apply rotary embeddings to input tensors using the given frequency tensor.
148
+
149
+ This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
150
+ frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
151
+ is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
152
+ returned as real tensors.
153
+
154
+ Args:
155
+ xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
156
+ xk (torch.Tensor): Key tensor to apply rotary embeddings. [B, S, H, D]
157
+ freqs_cis (torch.Tensor or tuple): Precomputed frequency tensor for complex exponential.
158
+ head_first (bool): head dimension first (except batch dim) or not.
159
+
160
+ Returns:
161
+ Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
162
+
163
+ """
164
+ xk_out = None
165
+ if isinstance(freqs_cis, tuple):
166
+ cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first) # [S, D]
167
+ cos, sin = cos.to(xq.device), sin.to(xq.device)
168
+ # real * cos - imag * sin
169
+ # imag * cos + real * sin
170
+ xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq)
171
+ xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk)
172
+ else:
173
+ # view_as_complex will pack [..., D/2, 2](real) to [..., D/2](complex)
174
+ xq_ = torch.view_as_complex(
175
+ xq.float().reshape(*xq.shape[:-1], -1, 2)
176
+ ) # [B, S, H, D//2]
177
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(
178
+ xq.device
179
+ ) # [S, D//2] --> [1, S, 1, D//2]
180
+ # (real, imag) * (cos, sin) = (real * cos - imag * sin, imag * cos + real * sin)
181
+ # view_as_real will expand [..., D/2](complex) to [..., D/2, 2](real)
182
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
183
+ xk_ = torch.view_as_complex(
184
+ xk.float().reshape(*xk.shape[:-1], -1, 2)
185
+ ) # [B, S, H, D//2]
186
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
187
+
188
+ return xq_out, xk_out
189
+
190
+
191
+ def get_nd_rotary_pos_embed(
192
+ rope_dim_list,
193
+ start,
194
+ *args,
195
+ theta=10000.0,
196
+ use_real=False,
197
+ theta_rescale_factor: Union[float, List[float]] = 1.0,
198
+ interpolation_factor: Union[float, List[float]] = 1.0,
199
+ ):
200
+ """
201
+ This is a n-d version of precompute_freqs_cis, which is a RoPE for tokens with n-d structure.
202
+
203
+ Args:
204
+ rope_dim_list (list of int): Dimension of each rope. len(rope_dim_list) should equal to n.
205
+ sum(rope_dim_list) should equal to head_dim of attention layer.
206
+ start (int | tuple of int | list of int): If len(args) == 0, start is num; If len(args) == 1, start is start,
207
+ args[0] is stop, step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num.
208
+ *args: See above.
209
+ theta (float): Scaling factor for frequency computation. Defaults to 10000.0.
210
+ use_real (bool): If True, return real part and imaginary part separately. Otherwise, return complex numbers.
211
+ Some libraries such as TensorRT does not support complex64 data type. So it is useful to provide a real
212
+ part and an imaginary part separately.
213
+ theta_rescale_factor (float): Rescale factor for theta. Defaults to 1.0.
214
+
215
+ Returns:
216
+ pos_embed (torch.Tensor): [HW, D/2]
217
+ """
218
+
219
+ grid = get_meshgrid_nd(
220
+ start, *args, dim=len(rope_dim_list)
221
+ ) # [3, W, H, D] / [2, W, H]
222
+
223
+ if isinstance(theta_rescale_factor, int) or isinstance(theta_rescale_factor, float):
224
+ theta_rescale_factor = [theta_rescale_factor] * len(rope_dim_list)
225
+ elif isinstance(theta_rescale_factor, list) and len(theta_rescale_factor) == 1:
226
+ theta_rescale_factor = [theta_rescale_factor[0]] * len(rope_dim_list)
227
+ assert len(theta_rescale_factor) == len(
228
+ rope_dim_list
229
+ ), "len(theta_rescale_factor) should equal to len(rope_dim_list)"
230
+
231
+ if isinstance(interpolation_factor, int) or isinstance(interpolation_factor, float):
232
+ interpolation_factor = [interpolation_factor] * len(rope_dim_list)
233
+ elif isinstance(interpolation_factor, list) and len(interpolation_factor) == 1:
234
+ interpolation_factor = [interpolation_factor[0]] * len(rope_dim_list)
235
+ assert len(interpolation_factor) == len(
236
+ rope_dim_list
237
+ ), "len(interpolation_factor) should equal to len(rope_dim_list)"
238
+
239
+ # use 1/ndim of dimensions to encode grid_axis
240
+ embs = []
241
+ for i in range(len(rope_dim_list)):
242
+ emb = get_1d_rotary_pos_embed(
243
+ rope_dim_list[i],
244
+ grid[i].reshape(-1),
245
+ theta,
246
+ use_real=use_real,
247
+ theta_rescale_factor=theta_rescale_factor[i],
248
+ interpolation_factor=interpolation_factor[i],
249
+ ) # 2 x [WHD, rope_dim_list[i]]
250
+ embs.append(emb)
251
+
252
+ if use_real:
253
+ cos = torch.cat([emb[0] for emb in embs], dim=1) # (WHD, D/2)
254
+ sin = torch.cat([emb[1] for emb in embs], dim=1) # (WHD, D/2)
255
+ return cos, sin
256
+ else:
257
+ emb = torch.cat(embs, dim=1) # (WHD, D/2)
258
+ return emb
259
+
260
+
261
+ def get_1d_rotary_pos_embed(
262
+ dim: int,
263
+ pos: Union[torch.FloatTensor, int],
264
+ theta: float = 10000.0,
265
+ use_real: bool = False,
266
+ theta_rescale_factor: float = 1.0,
267
+ interpolation_factor: float = 1.0,
268
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
269
+ """
270
+ Precompute the frequency tensor for complex exponential (cis) with given dimensions.
271
+ (Note: `cis` means `cos + i * sin`, where i is the imaginary unit.)
272
+
273
+ This function calculates a frequency tensor with complex exponential using the given dimension 'dim'
274
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
275
+ The returned tensor contains complex values in complex64 data type.
276
+
277
+ Args:
278
+ dim (int): Dimension of the frequency tensor.
279
+ pos (int or torch.FloatTensor): Position indices for the frequency tensor. [S] or scalar
280
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
281
+ use_real (bool, optional): If True, return real part and imaginary part separately.
282
+ Otherwise, return complex numbers.
283
+ theta_rescale_factor (float, optional): Rescale factor for theta. Defaults to 1.0.
284
+
285
+ Returns:
286
+ freqs_cis: Precomputed frequency tensor with complex exponential. [S, D/2]
287
+ freqs_cos, freqs_sin: Precomputed frequency tensor with real and imaginary parts separately. [S, D]
288
+ """
289
+ if isinstance(pos, int):
290
+ pos = torch.arange(pos).float()
291
+
292
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
293
+ # has some connection to NTK literature
294
+ if theta_rescale_factor != 1.0:
295
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
296
+
297
+ freqs = 1.0 / (
298
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
299
+ ) # [D/2]
300
+ # assert interpolation_factor == 1.0, f"interpolation_factor: {interpolation_factor}"
301
+ freqs = torch.outer(pos * interpolation_factor, freqs) # [S, D/2]
302
+ if use_real:
303
+ freqs_cos = freqs.cos().repeat_interleave(2, dim=1) # [S, D]
304
+ freqs_sin = freqs.sin().repeat_interleave(2, dim=1) # [S, D]
305
+ return freqs_cos, freqs_sin
306
+ else:
307
+ freqs_cis = torch.polar(
308
+ torch.ones_like(freqs), freqs
309
+ ) # complex64 # [S, D/2]
310
+ return freqs_cis
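A minimal usage sketch for the RoPE helpers above. The grid size, head count, and the 16/56/56 split of a 128-dim head are illustrative assumptions, not values read from this repository's configs.

```python
import torch
from hunyuan_model.posemb_layers import get_nd_rotary_pos_embed, apply_rotary_emb

rope_dim_list = [16, 56, 56]      # per-axis RoPE dims; sum = head_dim = 128 (assumed)
grid_size = (4, 16, 16)           # (T, H, W) latent token grid -> 1024 tokens (assumed)

freqs_cos, freqs_sin = get_nd_rotary_pos_embed(rope_dim_list, grid_size, use_real=True)
# freqs_cos, freqs_sin: [T*H*W, sum(rope_dim_list)] = [1024, 128]

q = torch.randn(1, 1024, 8, 128)  # [B, S, num_heads, head_dim], head_first=False
k = torch.randn(1, 1024, 8, 128)
q_rot, k_rot = apply_rotary_emb(q, k, (freqs_cos, freqs_sin), head_first=False)
print(q_rot.shape, k_rot.shape)   # both torch.Size([1, 1024, 8, 128])
```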
hunyuan_model/text_encoder.py ADDED
@@ -0,0 +1,710 @@
1
+ from dataclasses import dataclass
2
+ import json
3
+ import os
4
+ from typing import Optional, Tuple, Union
5
+ from copy import deepcopy
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import (
10
+ CLIPTextModel,
11
+ CLIPTokenizer,
12
+ AutoTokenizer,
13
+ AutoModel,
14
+ CLIPConfig,
15
+ LlamaForCausalLM,
16
+ LlamaConfig,
17
+ )
18
+ from transformers.utils import ModelOutput
19
+ from transformers.models.llama import LlamaModel
20
+ from safetensors.torch import load_file
21
+ from accelerate import init_empty_weights
22
+
23
+ import logging
24
+
25
+ logger = logging.getLogger(__name__)
26
+ logging.basicConfig(level=logging.INFO)
27
+
28
+
29
+ CLIP_L_HUGGINGFACE_MODEL_ID = "openai/clip-vit-large-patch14"
30
+ LLAVA_HUGGINGFACE_MODEL_ID = "xtuner/llava-llama-3-8b-v1_1-transformers"
31
+
32
+ CLIP_CONFIG = {
33
+ "_name_or_path": "clip-vit-large-patch14/",
34
+ "architectures": ["CLIPModel"],
35
+ "initializer_factor": 1.0,
36
+ "logit_scale_init_value": 2.6592,
37
+ "model_type": "clip",
38
+ "projection_dim": 768,
39
+ # "text_config": {
40
+ "_name_or_path": "",
41
+ "add_cross_attention": False,
42
+ "architectures": None,
43
+ "attention_dropout": 0.0,
44
+ "bad_words_ids": None,
45
+ "bos_token_id": 0,
46
+ "chunk_size_feed_forward": 0,
47
+ "cross_attention_hidden_size": None,
48
+ "decoder_start_token_id": None,
49
+ "diversity_penalty": 0.0,
50
+ "do_sample": False,
51
+ "dropout": 0.0,
52
+ "early_stopping": False,
53
+ "encoder_no_repeat_ngram_size": 0,
54
+ "eos_token_id": 2,
55
+ "finetuning_task": None,
56
+ "forced_bos_token_id": None,
57
+ "forced_eos_token_id": None,
58
+ "hidden_act": "quick_gelu",
59
+ "hidden_size": 768,
60
+ "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
61
+ "initializer_factor": 1.0,
62
+ "initializer_range": 0.02,
63
+ "intermediate_size": 3072,
64
+ "is_decoder": False,
65
+ "is_encoder_decoder": False,
66
+ "label2id": {"LABEL_0": 0, "LABEL_1": 1},
67
+ "layer_norm_eps": 1e-05,
68
+ "length_penalty": 1.0,
69
+ "max_length": 20,
70
+ "max_position_embeddings": 77,
71
+ "min_length": 0,
72
+ "model_type": "clip_text_model",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 12,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 12,
78
+ "num_return_sequences": 1,
79
+ "output_attentions": False,
80
+ "output_hidden_states": False,
81
+ "output_scores": False,
82
+ "pad_token_id": 1,
83
+ "prefix": None,
84
+ "problem_type": None,
85
+ "projection_dim": 768,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": False,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": True,
90
+ "return_dict_in_generate": False,
91
+ "sep_token_id": None,
92
+ "task_specific_params": None,
93
+ "temperature": 1.0,
94
+ "tie_encoder_decoder": False,
95
+ "tie_word_embeddings": True,
96
+ "tokenizer_class": None,
97
+ "top_k": 50,
98
+ "top_p": 1.0,
99
+ "torch_dtype": None,
100
+ "torchscript": False,
101
+ "transformers_version": "4.16.0.dev0",
102
+ "use_bfloat16": False,
103
+ "vocab_size": 49408,
104
+ # },
105
+ # "text_config_dict": {
106
+ "hidden_size": 768,
107
+ "intermediate_size": 3072,
108
+ "num_attention_heads": 12,
109
+ "num_hidden_layers": 12,
110
+ "projection_dim": 768,
111
+ # },
112
+ # "torch_dtype": "float32",
113
+ # "transformers_version": null
114
+ }
115
+
116
+ LLAMA_CONFIG = {
117
+ "architectures": ["LlamaForCausalLM"],
118
+ "attention_bias": False,
119
+ "attention_dropout": 0.0,
120
+ "bos_token_id": 128000,
121
+ "eos_token_id": 128001,
122
+ "head_dim": 128,
123
+ "hidden_act": "silu",
124
+ "hidden_size": 4096,
125
+ "initializer_range": 0.02,
126
+ "intermediate_size": 14336,
127
+ "max_position_embeddings": 8192,
128
+ "mlp_bias": False,
129
+ "model_type": "llama",
130
+ "num_attention_heads": 32,
131
+ "num_hidden_layers": 32,
132
+ "num_key_value_heads": 8,
133
+ "pretraining_tp": 1,
134
+ "rms_norm_eps": 1e-05,
135
+ "rope_scaling": None,
136
+ "rope_theta": 500000.0,
137
+ "tie_word_embeddings": False,
138
+ "torch_dtype": "float16",
139
+ "transformers_version": "4.46.3",
140
+ "use_cache": True,
141
+ "vocab_size": 128320,
142
+ }
143
+
144
+ # When using decoder-only models, we must provide a prompt template to instruct the text encoder
145
+ # on how to interpret the input text.
146
+ # --------------------------------------------------------------------
147
+ PROMPT_TEMPLATE_ENCODE = (
148
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
149
+ "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
150
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
151
+ )
152
+ PROMPT_TEMPLATE_ENCODE_VIDEO = (
153
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
154
+ "1. The main content and theme of the video."
155
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
156
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
157
+ "4. background environment, light, style and atmosphere."
158
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
159
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
160
+ )
161
+
162
+ NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
163
+
164
+ PROMPT_TEMPLATE = {
165
+ "dit-llm-encode": {
166
+ "template": PROMPT_TEMPLATE_ENCODE,
167
+ "crop_start": 36,
168
+ },
169
+ "dit-llm-encode-video": {
170
+ "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
171
+ "crop_start": 95,
172
+ },
173
+ }
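To illustrate how these templates are consumed (the user prompt below is made up): `TextEncoder.apply_text_to_template` further down in this file simply calls `template.format(text)`, and `crop_start` is presumably the number of leading template tokens later cropped from the LLM hidden states so that only the user text's embeddings remain.

```python
user_prompt = "a dancer spinning under neon lights"  # illustrative prompt
entry = PROMPT_TEMPLATE["dit-llm-encode-video"]
full_prompt = entry["template"].format(user_prompt)
# full_prompt now wraps the user text in the system/user chat markup above;
# entry["crop_start"] == 95 marks how much of that prefix is dropped downstream.
```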
174
+
175
+
176
+ def use_default(value, default):
177
+ return value if value is not None else default
178
+
179
+
180
+ def load_clip_l(text_encoder_path: str, dtype: Optional[Union[str, torch.dtype]] = None):
181
+ if os.path.isdir(text_encoder_path):
182
+ # load from directory, configs are in the directory
183
+ text_encoder = CLIPTextModel.from_pretrained(text_encoder_path, torch_dtype=dtype)
184
+ else:
185
+ # load from file, we create the model with the appropriate config
186
+ config = CLIPConfig(**CLIP_CONFIG)
187
+ with init_empty_weights():
188
+ text_encoder = CLIPTextModel._from_config(config, torch_dtype=dtype)
189
+
190
+ state_dict = load_file(text_encoder_path)
191
+
192
+ text_encoder.load_state_dict(state_dict, strict=True, assign=True)
193
+ # if dtype is not None:
194
+ # text_encoder.to(dtype=dtype)
195
+
196
+ return text_encoder
197
+
198
+
199
+ def load_clip_l_tokenizer(tokenizer_path: str):
200
+ if os.path.isdir(tokenizer_path):
201
+ tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path, max_length=77)
202
+ else:
203
+ # load from Hugging Face
204
+ logger.info(f"Loading tokenizer from Hugging Face: {CLIP_L_HUGGINGFACE_MODEL_ID}")
205
+ tokenizer = CLIPTokenizer.from_pretrained(CLIP_L_HUGGINGFACE_MODEL_ID, max_length=77)
206
+
207
+ return tokenizer
208
+
209
+
210
+ def load_llm(text_encoder_path: str, dtype: Optional[Union[str, torch.dtype]] = None):
211
+ if os.path.isdir(text_encoder_path):
212
+ # load from directory, configs are in the directory
213
+ text_encoder = AutoModel.from_pretrained(text_encoder_path, low_cpu_mem_usage=True, torch_dtype=dtype)
214
+ else:
215
+ # load from file, we create the model with the appropriate config
216
+ config = LlamaConfig(**LLAMA_CONFIG)
217
+ with init_empty_weights():
218
+ text_encoder = LlamaForCausalLM._from_config(config, torch_dtype=dtype)
219
+
220
+ state_dict = load_file(text_encoder_path)
221
+
222
+ # support weights from ComfyUI
223
+ if "tokenizer" in state_dict:
224
+ state_dict.pop("tokenizer")
225
+
226
+ text_encoder.load_state_dict(state_dict, strict=True, assign=True)
227
+
228
+ return text_encoder
229
+
230
+
231
+ def load_llm_tokenizer(tokenizer_path: str, padding_side="right"):
232
+ if os.path.isdir(tokenizer_path):
233
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
234
+ else:
235
+ # load from Hugging Face
236
+ logger.info(f"Loading tokenizer from Hugging Face: {LLAVA_HUGGINGFACE_MODEL_ID}")
237
+ tokenizer = AutoTokenizer.from_pretrained(LLAVA_HUGGINGFACE_MODEL_ID, padding_side=padding_side)
238
+
239
+ return tokenizer
240
+
241
+
242
+ def load_text_encoder(
243
+ text_encoder_type: str,
244
+ text_encoder_path: str,
245
+ text_encoder_dtype: Optional[Union[str, torch.dtype]] = None,
246
+ ):
247
+ logger.info(f"Loading text encoder model ({text_encoder_type}) from: {text_encoder_path}")
248
+
249
+ # reduce peak memory usage by specifying the dtype of the model
250
+ dtype = text_encoder_dtype
251
+ if text_encoder_type == "clipL":
252
+ text_encoder = load_clip_l(text_encoder_path, dtype=dtype)
253
+ text_encoder.final_layer_norm = text_encoder.text_model.final_layer_norm
254
+ elif text_encoder_type == "llm":
255
+ text_encoder = load_llm(text_encoder_path, dtype=dtype)
256
+ if hasattr(text_encoder, "norm"):
257
+ text_encoder.final_layer_norm = text_encoder.norm # by from_pretrained
258
+ else:
259
+ text_encoder.final_layer_norm = text_encoder.model.norm # by _from_config
260
+ else:
261
+ raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
262
+ # from_pretrained will ensure that the model is in eval mode.
263
+
264
+ if dtype is not None:
265
+ text_encoder = text_encoder.to(dtype=dtype)
266
+
267
+ text_encoder.requires_grad_(False)
268
+
269
+ logger.info(f"Text encoder to dtype: {text_encoder.dtype}")
270
+ return text_encoder, text_encoder_path
271
+
272
+
273
+ def load_tokenizer(tokenizer_type, tokenizer_path=None, padding_side="right"):
274
+ logger.info(f"Loading tokenizer ({tokenizer_type}) from: {tokenizer_path}")
275
+
276
+ if tokenizer_type == "clipL":
277
+ tokenizer = load_clip_l_tokenizer(tokenizer_path)
278
+ elif tokenizer_type == "llm":
279
+ tokenizer = load_llm_tokenizer(tokenizer_path, padding_side=padding_side)
280
+ else:
281
+ raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
282
+
283
+ return tokenizer, tokenizer_path
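A hedged loading sketch using the two helpers above; the file paths are placeholders. When a single `.safetensors` file is passed, these functions fall back to the built-in configs and the Hugging Face tokenizer IDs, as shown in their bodies.

```python
import torch

# CLIP-L text encoder and tokenizer (paths are placeholders)
clip_l, _ = load_text_encoder("clipL", "/path/to/clip_l.safetensors", torch.float16)
clip_tok, _ = load_tokenizer("clipL", "/path/to/clip_l.safetensors")

# LLaMA-based LLM text encoder and tokenizer
llm, _ = load_text_encoder("llm", "/path/to/llava_llama3_fp16.safetensors", torch.float16)
llm_tok, _ = load_tokenizer("llm", "/path/to/llava_llama3_fp16.safetensors", padding_side="right")
```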
284
+
285
+
286
+ @dataclass
287
+ class TextEncoderModelOutput(ModelOutput):
288
+ """
289
+ Base class for model's outputs that also contains a pooling of the last hidden states.
290
+
291
+ Args:
292
+ hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
293
+ Sequence of hidden-states at the output of the last layer of the model.
294
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
295
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
296
+ hidden_states_list (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed):
297
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
298
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
299
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
300
+ text_outputs (`list`, *optional*, returned when `return_texts=True` is passed):
301
+ List of decoded texts.
302
+ """
303
+
304
+ hidden_state: torch.FloatTensor = None
305
+ attention_mask: Optional[torch.LongTensor] = None
306
+ hidden_states_list: Optional[Tuple[torch.FloatTensor, ...]] = None
307
+ text_outputs: Optional[list] = None
308
+
309
+
310
+ class TextEncoder(nn.Module):
311
+ def __init__(
312
+ self,
313
+ text_encoder_type: str,
314
+ max_length: int,
315
+ text_encoder_dtype: Optional[Union[str, torch.dtype]] = None,
316
+ text_encoder_path: Optional[str] = None,
317
+ tokenizer_type: Optional[str] = None,
318
+ tokenizer_path: Optional[str] = None,
319
+ output_key: Optional[str] = None,
320
+ use_attention_mask: bool = True,
321
+ input_max_length: Optional[int] = None,
322
+ prompt_template: Optional[dict] = None,
323
+ prompt_template_video: Optional[dict] = None,
324
+ hidden_state_skip_layer: Optional[int] = None,
325
+ apply_final_norm: bool = False,
326
+ reproduce: bool = False,
327
+ ):
328
+ super().__init__()
329
+ self.text_encoder_type = text_encoder_type
330
+ self.max_length = max_length
331
+ # self.precision = text_encoder_precision
332
+ self.model_path = text_encoder_path
333
+ self.tokenizer_type = tokenizer_type if tokenizer_type is not None else text_encoder_type
334
+ self.tokenizer_path = tokenizer_path if tokenizer_path is not None else text_encoder_path
335
+ self.use_attention_mask = use_attention_mask
336
+ if prompt_template_video is not None:
337
+ assert use_attention_mask is True, "use_attention_mask must be True when a video prompt template is used."
338
+ self.input_max_length = input_max_length if input_max_length is not None else max_length
339
+ self.prompt_template = prompt_template
340
+ self.prompt_template_video = prompt_template_video
341
+ self.hidden_state_skip_layer = hidden_state_skip_layer
342
+ self.apply_final_norm = apply_final_norm
343
+ self.reproduce = reproduce
344
+
345
+ self.use_template = self.prompt_template is not None
346
+ if self.use_template:
347
+ assert (
348
+ isinstance(self.prompt_template, dict) and "template" in self.prompt_template
349
+ ), f"`prompt_template` must be a dictionary with a key 'template', got {self.prompt_template}"
350
+ assert "{}" in str(self.prompt_template["template"]), (
351
+ "`prompt_template['template']` must contain a placeholder `{}` for the input text, "
352
+ f"got {self.prompt_template['template']}"
353
+ )
354
+
355
+ self.use_video_template = self.prompt_template_video is not None
356
+ if self.use_video_template:
357
+ if self.prompt_template_video is not None:
358
+ assert (
359
+ isinstance(self.prompt_template_video, dict) and "template" in self.prompt_template_video
360
+ ), f"`prompt_template_video` must be a dictionary with a key 'template', got {self.prompt_template_video}"
361
+ assert "{}" in str(self.prompt_template_video["template"]), (
362
+ "`prompt_template_video['template']` must contain a placeholder `{}` for the input text, "
363
+ f"got {self.prompt_template_video['template']}"
364
+ )
365
+
366
+ if "t5" in text_encoder_type:
367
+ self.output_key = output_key or "last_hidden_state"
368
+ elif "clip" in text_encoder_type:
369
+ self.output_key = output_key or "pooler_output"
370
+ elif "llm" in text_encoder_type or "glm" in text_encoder_type:
371
+ self.output_key = output_key or "last_hidden_state"
372
+ else:
373
+ raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
374
+
375
+ self.model, self.model_path = load_text_encoder(
376
+ text_encoder_type=self.text_encoder_type, text_encoder_path=self.model_path, text_encoder_dtype=text_encoder_dtype
377
+ )
378
+ self.dtype = self.model.dtype
379
+
380
+ self.tokenizer, self.tokenizer_path = load_tokenizer(
381
+ tokenizer_type=self.tokenizer_type, tokenizer_path=self.tokenizer_path, padding_side="right"
382
+ )
383
+
384
+ def __repr__(self):
385
+ return f"{self.text_encoder_type} ({self.precision} - {self.model_path})"
386
+
387
+ @property
388
+ def device(self):
389
+ return self.model.device
390
+
391
+ @staticmethod
392
+ def apply_text_to_template(text, template, prevent_empty_text=True):
393
+ """
394
+ Apply text to template.
395
+
396
+ Args:
397
+ text (str): Input text.
398
+ template (str or list): Template string or list of chat conversation messages.
399
+ prevent_empty_text (bool): If True, prevent the user text from being empty
400
+ by adding a space. Defaults to True.
401
+ """
402
+ if isinstance(template, str):
403
+ # Will send string to tokenizer. Used for llm
404
+ return template.format(text)
405
+ else:
406
+ raise TypeError(f"Unsupported template type: {type(template)}")
407
+
408
+ def text2tokens(self, text, data_type="image"):
409
+ """
410
+ Tokenize the input text.
411
+
412
+ Args:
413
+ text (str or list): Input text.
414
+ """
415
+ tokenize_input_type = "str"
416
+ if self.use_template:
417
+ if data_type == "image":
418
+ prompt_template = self.prompt_template["template"]
419
+ elif data_type == "video":
420
+ prompt_template = self.prompt_template_video["template"]
421
+ else:
422
+ raise ValueError(f"Unsupported data type: {data_type}")
423
+ if isinstance(text, (list, tuple)):
424
+ text = [self.apply_text_to_template(one_text, prompt_template) for one_text in text]
425
+ if isinstance(text[0], list):
426
+ tokenize_input_type = "list"
427
+ elif isinstance(text, str):
428
+ text = self.apply_text_to_template(text, prompt_template)
429
+ if isinstance(text, list):
430
+ tokenize_input_type = "list"
431
+ else:
432
+ raise TypeError(f"Unsupported text type: {type(text)}")
433
+
434
+ kwargs = dict(
435
+ truncation=True,
436
+ max_length=self.max_length,
437
+ padding="max_length",
438
+ return_tensors="pt",
439
+ )
440
+ if tokenize_input_type == "str":
441
+ return self.tokenizer(
442
+ text,
443
+ return_length=False,
444
+ return_overflowing_tokens=False,
445
+ return_attention_mask=True,
446
+ **kwargs,
447
+ )
448
+ elif tokenize_input_type == "list":
449
+ return self.tokenizer.apply_chat_template(
450
+ text,
451
+ add_generation_prompt=True,
452
+ tokenize=True,
453
+ return_dict=True,
454
+ **kwargs,
455
+ )
456
+ else:
457
+ raise ValueError(f"Unsupported tokenize_input_type: {tokenize_input_type}")
458
+
459
+ def encode(
460
+ self,
461
+ batch_encoding,
462
+ use_attention_mask=None,
463
+ output_hidden_states=False,
464
+ do_sample=None,
465
+ hidden_state_skip_layer=None,
466
+ return_texts=False,
467
+ data_type="image",
468
+ device=None,
469
+ ):
470
+ """
471
+ Args:
472
+ batch_encoding (dict): Batch encoding from tokenizer.
473
+ use_attention_mask (bool): Whether to use attention mask. If None, use self.use_attention_mask.
474
+ Defaults to None.
475
+ output_hidden_states (bool): Whether to output hidden states. If False, return the value of
476
+ self.output_key. If True, return the entire output. If set self.hidden_state_skip_layer,
477
+ output_hidden_states will be set True. Defaults to False.
478
+ do_sample (bool): Whether to sample from the model. Used for Decoder-Only LLMs. Defaults to None.
479
+ When self.reproduce is False, do_sample is set to True by default.
480
+ hidden_state_skip_layer (int): Number of layers to skip from the end when selecting the hidden state; 0 means the last layer.
482
+ If None, self.hidden_state_skip_layer is used; if that is also None, the output selected by self.output_key is returned. Defaults to None.
482
+ return_texts (bool): Whether to return the decoded texts. Defaults to False.
483
+ """
484
+ device = self.model.device if device is None else device
485
+ use_attention_mask = use_default(use_attention_mask, self.use_attention_mask)
486
+ hidden_state_skip_layer = use_default(hidden_state_skip_layer, self.hidden_state_skip_layer)
487
+ do_sample = use_default(do_sample, not self.reproduce)
488
+ attention_mask = batch_encoding["attention_mask"].to(device) if use_attention_mask else None
489
+ outputs = self.model(
490
+ input_ids=batch_encoding["input_ids"].to(device),
491
+ attention_mask=attention_mask,
492
+ output_hidden_states=output_hidden_states or hidden_state_skip_layer is not None,
493
+ )
494
+ if hidden_state_skip_layer is not None:
495
+ last_hidden_state = outputs.hidden_states[-(hidden_state_skip_layer + 1)]
496
+ # Real last hidden state already has layer norm applied. So here we only apply it
497
+ # for intermediate layers.
498
+ if hidden_state_skip_layer > 0 and self.apply_final_norm:
499
+ last_hidden_state = self.model.final_layer_norm(last_hidden_state)
500
+ else:
501
+ last_hidden_state = outputs[self.output_key]
502
+
503
+ # Remove hidden states of instruction tokens, only keep prompt tokens.
504
+ if self.use_template:
505
+ if data_type == "image":
506
+ crop_start = self.prompt_template.get("crop_start", -1)
507
+ elif data_type == "video":
508
+ crop_start = self.prompt_template_video.get("crop_start", -1)
509
+ else:
510
+ raise ValueError(f"Unsupported data type: {data_type}")
511
+ if crop_start > 0:
512
+ last_hidden_state = last_hidden_state[:, crop_start:]
513
+ attention_mask = attention_mask[:, crop_start:] if use_attention_mask else None
514
+
515
+ if output_hidden_states:
516
+ return TextEncoderModelOutput(last_hidden_state, attention_mask, outputs.hidden_states)
517
+ return TextEncoderModelOutput(last_hidden_state, attention_mask)
518
+
519
+ def forward(
520
+ self,
521
+ text,
522
+ use_attention_mask=None,
523
+ output_hidden_states=False,
524
+ do_sample=False,
525
+ hidden_state_skip_layer=None,
526
+ return_texts=False,
527
+ ):
528
+ batch_encoding = self.text2tokens(text)
529
+ return self.encode(
530
+ batch_encoding,
531
+ use_attention_mask=use_attention_mask,
532
+ output_hidden_states=output_hidden_states,
533
+ do_sample=do_sample,
534
+ hidden_state_skip_layer=hidden_state_skip_layer,
535
+ return_texts=return_texts,
536
+ )
537
+
538
+
539
+ # region HunyuanVideo architecture
540
+
541
+
542
+ def load_text_encoder_1(
543
+ text_encoder_dir: str, device: torch.device, fp8_llm: bool, dtype: Optional[Union[str, torch.dtype]] = None
544
+ ) -> TextEncoder:
545
+ text_encoder_dtype = dtype or torch.float16
546
+ text_encoder_type = "llm"
547
+ text_len = 256
548
+ hidden_state_skip_layer = 2
549
+ apply_final_norm = False
550
+ reproduce = False
551
+
552
+ prompt_template = "dit-llm-encode"
553
+ prompt_template = PROMPT_TEMPLATE[prompt_template]
554
+ prompt_template_video = "dit-llm-encode-video"
555
+ prompt_template_video = PROMPT_TEMPLATE[prompt_template_video]
556
+
557
+ crop_start = prompt_template_video["crop_start"] # .get("crop_start", 0)
558
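+ # the template prefix (crop_start tokens) is cropped off after encoding, so max_length covers the template plus 256 prompt tokens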
+ max_length = text_len + crop_start
559
+
560
+ text_encoder_1 = TextEncoder(
561
+ text_encoder_type=text_encoder_type,
562
+ max_length=max_length,
563
+ text_encoder_dtype=text_encoder_dtype,
564
+ text_encoder_path=text_encoder_dir,
565
+ tokenizer_type=text_encoder_type,
566
+ prompt_template=prompt_template,
567
+ prompt_template_video=prompt_template_video,
568
+ hidden_state_skip_layer=hidden_state_skip_layer,
569
+ apply_final_norm=apply_final_norm,
570
+ reproduce=reproduce,
571
+ )
572
+ text_encoder_1.eval()
573
+
574
+ if fp8_llm:
575
+ org_dtype = text_encoder_1.dtype
576
+ logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
577
+ text_encoder_1.to(device=device, dtype=torch.float8_e4m3fn)
578
+
579
+ # prepare LLM for fp8
580
+ def prepare_fp8(llama_model: LlamaModel, target_dtype):
581
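+ # After the whole model is cast to fp8, move Embedding back to the original dtype and patch
+ # LlamaRMSNorm to compute in fp32, so only the Linear weights remain in fp8.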
+ def forward_hook(module):
582
+ def forward(hidden_states):
583
+ input_dtype = hidden_states.dtype
584
+ hidden_states = hidden_states.to(torch.float32)
585
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
586
+ hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
587
+ return module.weight.to(input_dtype) * hidden_states.to(input_dtype)
588
+
589
+ return forward
590
+
591
+ for module in llama_model.modules():
592
+ if module.__class__.__name__ in ["Embedding"]:
593
+ # print("set", module.__class__.__name__, "to", target_dtype)
594
+ module.to(target_dtype)
595
+ if module.__class__.__name__ in ["LlamaRMSNorm"]:
596
+ # print("set", module.__class__.__name__, "hooks")
597
+ module.forward = forward_hook(module)
598
+
599
+ prepare_fp8(text_encoder_1.model, org_dtype)
600
+ else:
601
+ text_encoder_1.to(device=device)
602
+
603
+ return text_encoder_1
604
+
605
+
606
+ def load_text_encoder_2(
607
+ text_encoder_dir: str, device: torch.device, dtype: Optional[Union[str, torch.dtype]] = None
608
+ ) -> TextEncoder:
609
+ text_encoder_dtype = dtype or torch.float16
610
+ reproduce = False
611
+
612
+ text_encoder_2_type = "clipL"
613
+ text_len_2 = 77
614
+
615
+ text_encoder_2 = TextEncoder(
616
+ text_encoder_type=text_encoder_2_type,
617
+ max_length=text_len_2,
618
+ text_encoder_dtype=text_encoder_dtype,
619
+ text_encoder_path=text_encoder_dir,
620
+ tokenizer_type=text_encoder_2_type,
621
+ reproduce=reproduce,
622
+ )
623
+ text_encoder_2.eval()
624
+
625
+ text_encoder_2.to(device=device)
626
+
627
+ return text_encoder_2
628
+
629
+
630
+ # endregion
631
+
632
+
633
+ if __name__ == "__main__":
634
+ import argparse
635
+ from utils.model_utils import str_to_dtype
636
+
637
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
638
+
639
+ parser = argparse.ArgumentParser()
640
+ parser.add_argument("type", type=str, help="Text Encoder type")
641
+ parser.add_argument("path1", type=str, help="Text Encoder directory or file 1")
642
+ parser.add_argument("path2", type=str, help="Text Encoder directory or file 2")
643
+ parser.add_argument("--dtype", type=str, default=None, help="Data type for Text Encoder")
644
+ args = parser.parse_args()
645
+
646
+ dtype = str_to_dtype(args.dtype) if args.dtype is not None else torch.float16
647
+
648
+ """
649
+ if args.type == "clipL":
650
+ text_encoder_1st = load_clip_l(args.path1, dtype=dtype)
651
+ tokenizer_1st = load_clip_l_tokenizer(args.path1)
652
+ text_encoder_2nd = load_clip_l(args.path2, dtype=dtype)
653
+ tokenizer_2nd = load_clip_l_tokenizer(args.path2)
654
+ elif args.type == "llm":
655
+ text_encoder_1st = load_llm(args.path1, dtype=dtype)
656
+ tokenizer_1st = load_llm_tokenizer(args.path1)
657
+ text_encoder_2nd = load_llm(args.path2, dtype=dtype)
658
+ tokenizer_2nd = load_llm_tokenizer(args.path2)
659
+
660
+ print(f"1st Text Encoder dtype: {text_encoder_1st.dtype}")
661
+ print(f"2nd Text Encoder dtype: {text_encoder_2nd.dtype}")
662
+
663
+ text_encoder_1st.to(device=device)
664
+ text_encoder_2nd.to(device=device)
665
+
666
+ test_text = "A cat sitting on a table"
667
+ token_ids_1st = tokenizer_1st(test_text, return_tensors="pt")["input_ids"]
668
+ token_ids_2nd = tokenizer_2nd(test_text, return_tensors="pt")["input_ids"]
669
+ assert torch.allclose(token_ids_1st, token_ids_2nd)
670
+ print(f"Token IDs are the same: {token_ids_1st}")
671
+
672
+ with torch.no_grad():
673
+ text_encoder_1st_output = text_encoder_1st(token_ids_1st.to(device), output_hidden_states=True)
674
+ text_encoder_2nd_output = text_encoder_2nd(token_ids_2nd.to(device), output_hidden_states=True)
675
+ print(f"1st Text Encoder output keys: {text_encoder_1st_output.keys()}")
676
+ print(f"2nd Text Encoder output keys: {text_encoder_2nd_output.keys()}")
677
+ for key in text_encoder_1st_output:
678
+ print(f"Checking output: {key}")
679
+ assert key in text_encoder_2nd_output, f"Key {key} not in 2nd Text Encoder output"
680
+ assert torch.allclose(text_encoder_1st_output[key], text_encoder_2nd_output[key])
681
+ print(f"Outputs are the same: {key}")
682
+ print("All outputs are the same.")
683
+ """
684
+
685
+ if args.type == "clipL":
686
+ text_encoder_1st = load_text_encoder_2(args.path1, device, dtype)
687
+ text_encoder_2nd = load_text_encoder_2(args.path2, device, dtype)
688
+ elif args.type == "llm":
689
+ text_encoder_1st = load_text_encoder_1(args.path1, device, False, dtype)
690
+ text_encoder_2nd = load_text_encoder_1(args.path2, device, False, dtype)
691
+ print(f"1st Text Encoder dtype: {text_encoder_1st.dtype}")
692
+ print(f"2nd Text Encoder dtype: {text_encoder_2nd.dtype}")
693
+
694
+ prompt = "A cat sitting on a table"
695
+ data_type = "video" # video only, image is not supported
696
+ text_inputs_1st = text_encoder_1st.text2tokens(prompt, data_type=data_type)
697
+ text_inputs_2nd = text_encoder_2nd.text2tokens(prompt, data_type=data_type)
698
+ print(text_inputs_1st)
699
+ assert torch.allclose(text_inputs_1st["input_ids"], text_inputs_2nd["input_ids"])
700
+
701
+ with torch.no_grad():
702
+ prompt_outputs_1st = text_encoder_1st.encode(text_inputs_1st, data_type=data_type)
703
+ prompt_outputs_2nd = text_encoder_2nd.encode(text_inputs_2nd, data_type=data_type)
704
+
705
+ # prompt_outputs.hidden_state, prompt_outputs.attention_mask
706
+ assert torch.allclose(prompt_outputs_1st.hidden_state, prompt_outputs_2nd.hidden_state)
707
+ print("Hidden states are the same.")
708
+ assert torch.allclose(prompt_outputs_1st.attention_mask, prompt_outputs_2nd.attention_mask)
709
+ print("Attention masks are the same.")
710
+ print("All outputs are the same.")
hunyuan_model/token_refiner.py ADDED
@@ -0,0 +1,245 @@
1
+ from typing import Optional
2
+
3
+ from einops import rearrange
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.utils.checkpoint import checkpoint
7
+
8
+ from .activation_layers import get_activation_layer
9
+ from .attention import attention
10
+ from .norm_layers import get_norm_layer
11
+ from .embed_layers import TimestepEmbedder, TextProjection
12
+ from .mlp_layers import MLP
13
+ from .modulate_layers import modulate, apply_gate
14
+
15
+
16
+ class IndividualTokenRefinerBlock(nn.Module):
17
+ def __init__(
18
+ self,
19
+ hidden_size,
20
+ heads_num,
21
+ mlp_width_ratio: float = 4.0,
22
+ mlp_drop_rate: float = 0.0,
23
+ act_type: str = "silu",
24
+ qk_norm: bool = False,
25
+ qk_norm_type: str = "layer",
26
+ qkv_bias: bool = True,
27
+ dtype: Optional[torch.dtype] = None,
28
+ device: Optional[torch.device] = None,
29
+ ):
30
+ factory_kwargs = {"device": device, "dtype": dtype}
31
+ super().__init__()
32
+ self.heads_num = heads_num
33
+ head_dim = hidden_size // heads_num
34
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
35
+
36
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
37
+ self.self_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
38
+ qk_norm_layer = get_norm_layer(qk_norm_type)
39
+ self.self_attn_q_norm = (
40
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
41
+ )
42
+ self.self_attn_k_norm = (
43
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
44
+ )
45
+ self.self_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
46
+
47
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
48
+ act_layer = get_activation_layer(act_type)
49
+ self.mlp = MLP(
50
+ in_channels=hidden_size,
51
+ hidden_channels=mlp_hidden_dim,
52
+ act_layer=act_layer,
53
+ drop=mlp_drop_rate,
54
+ **factory_kwargs,
55
+ )
56
+
57
+ self.adaLN_modulation = nn.Sequential(
58
+ act_layer(),
59
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
60
+ )
61
+ # Zero-initialize the modulation
62
+ nn.init.zeros_(self.adaLN_modulation[1].weight)
63
+ nn.init.zeros_(self.adaLN_modulation[1].bias)
64
+
65
+ self.gradient_checkpointing = False
66
+
67
+ def enable_gradient_checkpointing(self):
68
+ self.gradient_checkpointing = True
69
+
70
+ def disable_gradient_checkpointing(self):
71
+ self.gradient_checkpointing = False
72
+
73
+ def _forward(
74
+ self,
75
+ x: torch.Tensor,
76
+ c: torch.Tensor, # timestep_aware_representations + context_aware_representations
77
+ attn_mask: torch.Tensor = None,
78
+ ):
79
+ gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
80
+
81
+ norm_x = self.norm1(x)
82
+ qkv = self.self_attn_qkv(norm_x)
83
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
84
+ # Apply QK-Norm if needed
85
+ q = self.self_attn_q_norm(q).to(v)
86
+ k = self.self_attn_k_norm(k).to(v)
87
+
88
+ # Self-Attention
89
+ attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
90
+
91
+ x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
92
+
93
+ # FFN Layer
94
+ x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)
95
+
96
+ return x
97
+
98
+ def forward(self, *args, **kwargs):
99
+ if self.training and self.gradient_checkpointing:
100
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
101
+ else:
102
+ return self._forward(*args, **kwargs)
103
+
104
+
105
+ class IndividualTokenRefiner(nn.Module):
106
+ def __init__(
107
+ self,
108
+ hidden_size,
109
+ heads_num,
110
+ depth,
111
+ mlp_width_ratio: float = 4.0,
112
+ mlp_drop_rate: float = 0.0,
113
+ act_type: str = "silu",
114
+ qk_norm: bool = False,
115
+ qk_norm_type: str = "layer",
116
+ qkv_bias: bool = True,
117
+ dtype: Optional[torch.dtype] = None,
118
+ device: Optional[torch.device] = None,
119
+ ):
120
+ factory_kwargs = {"device": device, "dtype": dtype}
121
+ super().__init__()
122
+ self.blocks = nn.ModuleList(
123
+ [
124
+ IndividualTokenRefinerBlock(
125
+ hidden_size=hidden_size,
126
+ heads_num=heads_num,
127
+ mlp_width_ratio=mlp_width_ratio,
128
+ mlp_drop_rate=mlp_drop_rate,
129
+ act_type=act_type,
130
+ qk_norm=qk_norm,
131
+ qk_norm_type=qk_norm_type,
132
+ qkv_bias=qkv_bias,
133
+ **factory_kwargs,
134
+ )
135
+ for _ in range(depth)
136
+ ]
137
+ )
138
+
139
+ def enable_gradient_checkpointing(self):
140
+ for block in self.blocks:
141
+ block.enable_gradient_checkpointing()
142
+
143
+ def disable_gradient_checkpointing(self):
144
+ for block in self.blocks:
145
+ block.disable_gradient_checkpointing()
146
+
147
+ def forward(
148
+ self,
149
+ x: torch.Tensor,
150
+ c: torch.LongTensor,
151
+ mask: Optional[torch.Tensor] = None,
152
+ ):
153
+ self_attn_mask = None
154
+ if mask is not None:
155
+ batch_size = mask.shape[0]
156
+ seq_len = mask.shape[1]
157
+ mask = mask.to(x.device)
158
+ # batch_size x 1 x seq_len x seq_len
159
+ self_attn_mask_1 = mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
160
+ # batch_size x 1 x seq_len x seq_len
161
+ self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
162
+ # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of heads_num
163
+ self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
164
+ # avoids self-attention weight being NaN for padding tokens
165
+ self_attn_mask[:, :, :, 0] = True
166
+
167
+ for block in self.blocks:
168
+ x = block(x, c, self_attn_mask)
169
+ return x
170
+
171
+
172
+ class SingleTokenRefiner(nn.Module):
173
+ """
174
+ A single token refiner block for refining LLM text embeddings.
175
+ """
176
+
177
+ def __init__(
178
+ self,
179
+ in_channels,
180
+ hidden_size,
181
+ heads_num,
182
+ depth,
183
+ mlp_width_ratio: float = 4.0,
184
+ mlp_drop_rate: float = 0.0,
185
+ act_type: str = "silu",
186
+ qk_norm: bool = False,
187
+ qk_norm_type: str = "layer",
188
+ qkv_bias: bool = True,
189
+ attn_mode: str = "torch",
190
+ dtype: Optional[torch.dtype] = None,
191
+ device: Optional[torch.device] = None,
192
+ ):
193
+ factory_kwargs = {"device": device, "dtype": dtype}
194
+ super().__init__()
195
+ self.attn_mode = attn_mode
196
+ assert self.attn_mode == "torch", "Only support 'torch' mode for token refiner."
197
+
198
+ self.input_embedder = nn.Linear(in_channels, hidden_size, bias=True, **factory_kwargs)
199
+
200
+ act_layer = get_activation_layer(act_type)
201
+ # Build timestep embedding layer
202
+ self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
203
+ # Build context embedding layer
204
+ self.c_embedder = TextProjection(in_channels, hidden_size, act_layer, **factory_kwargs)
205
+
206
+ self.individual_token_refiner = IndividualTokenRefiner(
207
+ hidden_size=hidden_size,
208
+ heads_num=heads_num,
209
+ depth=depth,
210
+ mlp_width_ratio=mlp_width_ratio,
211
+ mlp_drop_rate=mlp_drop_rate,
212
+ act_type=act_type,
213
+ qk_norm=qk_norm,
214
+ qk_norm_type=qk_norm_type,
215
+ qkv_bias=qkv_bias,
216
+ **factory_kwargs,
217
+ )
218
+
219
+ def enable_gradient_checkpointing(self):
220
+ self.individual_token_refiner.enable_gradient_checkpointing()
221
+
222
+ def disable_gradient_checkpointing(self):
223
+ self.individual_token_refiner.disable_gradient_checkpointing()
224
+
225
+ def forward(
226
+ self,
227
+ x: torch.Tensor,
228
+ t: torch.LongTensor,
229
+ mask: Optional[torch.LongTensor] = None,
230
+ ):
231
+ timestep_aware_representations = self.t_embedder(t)
232
+
233
+ if mask is None:
234
+ context_aware_representations = x.mean(dim=1)
235
+ else:
236
+ mask_float = mask.float().unsqueeze(-1) # [b, s1, 1]
237
+ context_aware_representations = (x * mask_float).sum(dim=1) / mask_float.sum(dim=1)
238
+ context_aware_representations = self.c_embedder(context_aware_representations)
239
+ c = timestep_aware_representations + context_aware_representations
240
+
241
+ x = self.input_embedder(x)
242
+
243
+ x = self.individual_token_refiner(x, c, mask)
244
+
245
+ return x
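
To make the tensor shapes in `SingleTokenRefiner` concrete, here is a small self-contained shape check. The hyperparameters are illustrative only (they are not the values used by the HunyuanVideo checkpoints), and the import assumes this repository layout.

```python
# Hedged shape-check sketch for hunyuan_model/token_refiner.py (illustrative sizes).
import torch
from hunyuan_model.token_refiner import SingleTokenRefiner

refiner = SingleTokenRefiner(in_channels=4096, hidden_size=1024, heads_num=8, depth=2)

x = torch.randn(2, 256, 4096)                # [batch, seq_len, in_channels] LLM text embeddings
t = torch.zeros(2)                           # one timestep per sample
mask = torch.ones(2, 256, dtype=torch.long)  # 1 = real token, 0 = padding
mask[:, 200:] = 0                            # pretend the tail is padding

refined = refiner(x, t, mask)                # masked mean + timestep drive the adaLN gates; output [2, 256, 1024]
print(refined.shape)
```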