jbilcke-hf (HF Staff) committed (verified)
Commit f8498f5 · Parent(s): e0425c1

Upload 5 files

Files changed (5):
  1. README.md +18 -0
  2. handler.py +255 -0
  3. model_index.json +32 -0
  4. requirements.txt +20 -0
  5. teacache.py +146 -0
README.md ADDED
@@ -0,0 +1,18 @@
+ ---
+ language:
+ - en
+ base_model:
+ - tencent/HunyuanVideo
+ pipeline_tag: text-to-video
+ library_name: diffusers
+ tags:
+ - HunyuanVideo
+ - Tencent
+ - Video
+ license: other
+ license_name: tencent-hunyuan-community
+ license_link: LICENSE
+ ---
+
+ This model is [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo) adapted to run on Hugging Face Inference Endpoints.
+
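For reference, a minimal sketch of calling a deployed endpoint with the payload format that handler.py expects; the endpoint URL and token are placeholders, and the response layout follows the dictionary returned by EndpointHandler.__call__:

import base64
import requests

API_URL = "https://YOUR-ENDPOINT.endpoints.huggingface.cloud"  # placeholder endpoint URL
headers = {"Authorization": "Bearer hf_xxx"}                   # placeholder token

payload = {
    "inputs": "A cat walks on the grass, realistic style.",
    "parameters": {
        "num_frames": 49,            # adjusted to the 4k + 1 format by the handler
        "width": 576,
        "height": 320,
        "num_inference_steps": 50,
        "fps": 30,
        "seed": -1,                  # -1 lets the handler pick a random seed
    },
}

response = requests.post(API_URL, headers=headers, json=payload, timeout=600)
response.raise_for_status()
result = response.json()

# The handler returns the MP4 under "video", presumably as a base64 data URI.
video_uri = result["video"]
video_bytes = base64.b64decode(video_uri.split(",", 1)[1])
with open("output.mp4", "wb") as f:
    f.write(video_bytes)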
handler.py ADDED
@@ -0,0 +1,255 @@
+ from dataclasses import dataclass
+ from typing import Dict, Any, Optional
+ import base64
+ import logging
+ import random
+ import traceback
+ import torch
+ from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
+ from varnish import Varnish
+
+ from enhance_a_video import enable_enhance, inject_enhance_for_hunyuanvideo, set_enhance_weight
+ from teacache import enable_teacache, disable_teacache
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ @dataclass
+ class GenerationConfig:
+     """Configuration for video generation"""
+     # Content settings
+     prompt: str
+     negative_prompt: str = ""
+
+     # Model settings
+     num_frames: int = 49  # should follow the 4k + 1 format
+     height: int = 320
+     width: int = 576
+     num_inference_steps: int = 50
+     guidance_scale: float = 7.0
+
+     # Reproducibility
+     seed: int = -1
+
+     # Varnish post-processing settings
+     fps: int = 30
+     double_num_frames: bool = False
+     super_resolution: bool = False
+     grain_amount: float = 0.0
+     quality: int = 18  # CRF scale (0-51, lower is better)
+
+     # Audio settings
+     enable_audio: bool = False
+     audio_prompt: str = ""
+     audio_negative_prompt: str = "voices, voice, talking, speaking, speech"
+
+     # TeaCache settings
+     enable_teacache: bool = True
+     teacache_threshold: float = 0.15  # values: 0 (original), 0.1 (1.6x speedup), 0.15 (2.1x speedup)
+
+     # Enhance-A-Video settings
+     enable_enhance_a_video: bool = True
+     enhance_a_video_weight: float = 4.0
+
+     def validate_and_adjust(self) -> 'GenerationConfig':
+         """Validate and adjust parameters"""
+         # Ensure num_frames follows the 4k + 1 format
+         k = (self.num_frames - 1) // 4
+         self.num_frames = (k * 4) + 1
+
+         # Set a random seed if not specified
+         if self.seed == -1:
+             self.seed = random.randint(0, 2**32 - 1)
+
+         return self
+
+ class EndpointHandler:
+     """Handles video generation requests using HunyuanVideo and Varnish"""
+
+     def __init__(self, path: str = ""):
+         """Initialize the handler with models
+
+         Args:
+             path: Path to model weights
+         """
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         # Initialize the transformer with the Enhance-A-Video injection first
+         transformer = HunyuanVideoTransformer3DModel.from_pretrained(
+             path,
+             subfolder="transformer",
+             torch_dtype=torch.bfloat16
+         )
+         inject_enhance_for_hunyuanvideo(transformer)
+
+         # Initialize the HunyuanVideo pipeline with the enhanced transformer
+         self.pipeline = HunyuanVideoPipeline.from_pretrained(
+             path,
+             transformer=transformer,
+             torch_dtype=torch.float16,
+         ).to(self.device)
+
+         # Keep the text encoders in float16
+         self.pipeline.text_encoder = self.pipeline.text_encoder.half()
+         self.pipeline.text_encoder_2 = self.pipeline.text_encoder_2.half()
+
+         # Keep the transformer in bfloat16
+         self.pipeline.transformer = self.pipeline.transformer.to(torch.bfloat16)
+
+         # Keep the VAE in float16
+         self.pipeline.vae = self.pipeline.vae.half()
+
+         # Initialize Varnish for post-processing
+         self.varnish = Varnish(
+             device=self.device,
+             model_base_dir="/repository/varnish"
+         )
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """Process video generation requests
+
+         Args:
+             data: Request data containing:
+                 - inputs (str): Prompt for video generation
+                 - parameters (dict): Generation parameters
+
+         Returns:
+             Dictionary containing:
+                 - video: Base64-encoded MP4 data URI
+                 - content-type: MIME type
+                 - metadata: Generation metadata
+         """
+         # Extract inputs
+         inputs = data.pop("inputs", data)
+         if isinstance(inputs, dict):
+             prompt = inputs.get("prompt", "")
+         else:
+             prompt = inputs
+
+         params = data.get("parameters", {})
+
+         # Create and validate the config
+         config = GenerationConfig(
+             prompt=prompt,
+             negative_prompt=params.get("negative_prompt", ""),
+             num_frames=params.get("num_frames", 49),
+             height=params.get("height", 320),
+             width=params.get("width", 576),
+             num_inference_steps=params.get("num_inference_steps", 50),
+             guidance_scale=params.get("guidance_scale", 7.0),
+             seed=params.get("seed", -1),
+             fps=params.get("fps", 30),
+             double_num_frames=params.get("double_num_frames", False),
+             super_resolution=params.get("super_resolution", False),
+             grain_amount=params.get("grain_amount", 0.0),
+             quality=params.get("quality", 18),
+             enable_audio=params.get("enable_audio", False),
+             audio_prompt=params.get("audio_prompt", ""),
+             audio_negative_prompt=params.get("audio_negative_prompt", "voices, voice, talking, speaking, speech"),
+             enable_teacache=params.get("enable_teacache", True),
+             # values: 0 (original), 0.1 (1.6x speedup), 0.15 (2.1x speedup)
+             teacache_threshold=params.get("teacache_threshold", 0.15),
+             enable_enhance_a_video=params.get("enable_enhance_a_video", True),
+             enhance_a_video_weight=params.get("enhance_a_video_weight", 4.0)
+         ).validate_and_adjust()
+
+         try:
+             # Set random seeds
+             if config.seed != -1:
+                 torch.manual_seed(config.seed)
+                 random.seed(config.seed)
+                 generator = torch.Generator(device=self.device).manual_seed(config.seed)
+             else:
+                 generator = None
+
+             # Configure TeaCache
+             #if config.enable_teacache:
+             #    enable_teacache(
+             #        self.pipeline.transformer,
+             #        num_inference_steps=config.num_inference_steps,
+             #        rel_l1_thresh=config.teacache_threshold
+             #    )
+             #else:
+             #    disable_teacache(self.pipeline.transformer)
+
+             # Configure the Enhance-A-Video weight if enabled
+             if config.enable_enhance_a_video:
+                 set_enhance_weight(config.enhance_a_video_weight)
+                 enable_enhance()
+             else:
+                 # Reset the enhance weight to 0 to effectively disable it
+                 set_enhance_weight(0)
+
+             # Generate video frames
+             with torch.inference_mode():
+                 output = self.pipeline(
+                     prompt=config.prompt,
+                     # Failed to generate video: HunyuanVideoPipeline.__call__() got an unexpected keyword argument 'negative_prompt'
+                     #negative_prompt=config.negative_prompt,
+                     num_frames=config.num_frames,
+                     height=config.height,
+                     width=config.width,
+                     num_inference_steps=config.num_inference_steps,
+                     guidance_scale=config.guidance_scale,
+                     generator=generator,
+                     output_type="pt",
+                 ).frames
+
+             # Process with Varnish
+             import asyncio
+             try:
+                 loop = asyncio.get_event_loop()
+             except RuntimeError:
+                 loop = asyncio.new_event_loop()
+                 asyncio.set_event_loop(loop)
+
+             result = loop.run_until_complete(
+                 self.varnish(
+                     input_data=output,
+                     fps=config.fps,
+                     double_num_frames=config.double_num_frames,
+                     super_resolution=config.super_resolution,
+                     grain_amount=config.grain_amount,
+                     enable_audio=config.enable_audio,
+                     audio_prompt=config.audio_prompt,
+                     audio_negative_prompt=config.audio_negative_prompt,
+                 )
+             )
+
+             # Get the video data URI
+             video_uri = loop.run_until_complete(
+                 result.write(
+                     type="data-uri",
+                     quality=config.quality
+                 )
+             )
+
+             return {
+                 "video": video_uri,
+                 "content-type": "video/mp4",
+                 "metadata": {
+                     "width": result.metadata.width,
+                     "height": result.metadata.height,
+                     "num_frames": result.metadata.frame_count,
+                     "fps": result.metadata.fps,
+                     "duration": result.metadata.duration,
+                     "seed": config.seed,
+                     "enable_teacache": config.enable_teacache,
+                     "teacache_threshold": config.teacache_threshold if config.enable_teacache else 0,
+                     "enable_enhance_a_video": config.enable_enhance_a_video,
+                     "enhance_a_video_weight": config.enhance_a_video_weight if config.enable_enhance_a_video else 0,
+                 }
+             }
+
+         except Exception as e:
+             message = f"Error generating video ({str(e)})\n{traceback.format_exc()}"
+             logger.error(message)
+             raise RuntimeError(message)
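For local testing, a minimal sketch of exercising EndpointHandler directly; the "/repository" path is an assumption mirroring the location used for the Varnish weights above, and the prompt is a placeholder:

from handler import EndpointHandler

handler = EndpointHandler(path="/repository")  # assumed weights location on the endpoint

response = handler({
    "inputs": "A cat walks on the grass, realistic style.",
    "parameters": {"num_frames": 49, "width": 576, "height": 320, "fps": 30},
})

print(response["metadata"])    # width, height, num_frames, fps, duration, seed, ...
video_uri = response["video"]  # base64-encoded MP4 data URI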
model_index.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_class_name": "HunyuanVideoPipeline",
+   "_diffusers_version": "0.32.0.dev0",
+   "scheduler": [
+     "diffusers",
+     "FlowMatchEulerDiscreteScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "LlamaModel"
+   ],
+   "text_encoder_2": [
+     "transformers",
+     "CLIPTextModel"
+   ],
+   "tokenizer": [
+     "transformers",
+     "LlamaTokenizerFast"
+   ],
+   "tokenizer_2": [
+     "transformers",
+     "CLIPTokenizer"
+   ],
+   "transformer": [
+     "diffusers",
+     "HunyuanVideoTransformer3DModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKLHunyuanVideo"
+   ]
+ }
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ diffusers @ git+https://github.com/huggingface/diffusers.git@main
+ varnish @ git+https://github.com/jbilcke-hf/varnish.git@main
+
+ opencv-python>=4.10.0.84
+
+ transformers==4.48.0
+ huggingface_hub==0.27.1
+
+ tokenizers>=0.20.3
+ accelerate>=1.1.1
+ pandas>=2.0.3
+ numpy
+ einops==0.7.0
+ tqdm>=4.66.5
+ loguru>=0.7.2
+ imageio>=2.34.2
+ imageio-ffmpeg>=0.5.1
+ safetensors>=0.4.5
+
+ moviepy==1.0.3
teacache.py ADDED
@@ -0,0 +1,146 @@
+ # teacache.py
+ import torch
+ import numpy as np
+ from typing import Optional, Dict, Union, Any
+ from functools import wraps
+
+ class TeaCacheConfig:
+     """Configuration for TeaCache acceleration"""
+     def __init__(
+         self,
+         rel_l1_thresh: float = 0.15,
+         enable: bool = True
+     ):
+         self.rel_l1_thresh = rel_l1_thresh
+         self.enable = enable
+         self._reset_state()
+
+     def _reset_state(self):
+         """Reset internal state"""
+         self.cnt = 0
+         self.accumulated_rel_l1_distance = 0
+         self.previous_modulated_input = None
+         self.previous_residual = None
+
+ def create_teacache_forward(original_forward):
+     """Factory function to create a TeaCache-enabled forward pass"""
+     @wraps(original_forward)
+     def teacache_forward(
+         self,
+         hidden_states: torch.Tensor,
+         timestep: torch.Tensor,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         encoder_attention_mask: Optional[torch.Tensor] = None,
+         pooled_projections: Optional[torch.Tensor] = None,
+         guidance: Optional[torch.Tensor] = None,
+         attention_kwargs: Optional[Dict[str, Any]] = None,
+         return_dict: bool = True,
+     ):
+         # Skip TeaCache if not enabled
+         # (original_forward is already bound to the model, so `self` is not passed explicitly)
+         if not hasattr(self, 'teacache_config') or not self.teacache_config.enable:
+             return original_forward(
+                 hidden_states=hidden_states,
+                 timestep=timestep,
+                 encoder_hidden_states=encoder_hidden_states,
+                 encoder_attention_mask=encoder_attention_mask,
+                 pooled_projections=pooled_projections,
+                 guidance=guidance,
+                 attention_kwargs=attention_kwargs,
+                 return_dict=return_dict
+             )
+
+         config = self.teacache_config
+
+         # Prepare modulation vectors similar to the HunyuanVideo implementation
+         vec = None
+         if pooled_projections is not None:
+             vec = self.vector_in(pooled_projections)
+
+         if guidance is not None:
+             if vec is None:
+                 vec = self.guidance_in(guidance)
+             else:
+                 vec = vec + self.guidance_in(guidance)
+
+         # TeaCache optimization logic
+         inp = hidden_states.clone()
+         if hasattr(self.double_blocks[0], 'img_norm1'):
+             # HunyuanVideo-specific modulation
+             img_mod1_shift, img_mod1_scale, _, _, _, _ = self.double_blocks[0].img_mod(vec).chunk(6, dim=-1)
+             normed_inp = self.double_blocks[0].img_norm1(inp)
+             modulated_inp = normed_inp * (1 + img_mod1_scale) + img_mod1_shift
+         else:
+             # Fallback modulation
+             normed_inp = self.transformer_blocks[0].norm1(inp)
+             modulated_inp = normed_inp
+
+         # Determine whether to recalculate or reuse the cache
+         should_calc = True
+         if config.cnt == 0 or config.cnt == self.num_inference_steps - 1:
+             should_calc = True
+             config.accumulated_rel_l1_distance = 0
+         elif config.previous_modulated_input is not None:
+             coefficients = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01,
+                             -3.14987800e+00, 9.61237896e-02]
+             rescale_func = np.poly1d(coefficients)
+
+             rel_l1 = ((modulated_inp - config.previous_modulated_input).abs().mean() /
+                       config.previous_modulated_input.abs().mean()).cpu().item()
+             config.accumulated_rel_l1_distance += rescale_func(rel_l1)
+
+             should_calc = config.accumulated_rel_l1_distance >= config.rel_l1_thresh
+             if should_calc:
+                 config.accumulated_rel_l1_distance = 0
+
+         config.previous_modulated_input = modulated_inp
+         config.cnt += 1
+         if config.cnt >= self.num_inference_steps:
+             config.cnt = 0
+
+         # Reuse the cached residual or calculate a new result
+         if not should_calc and config.previous_residual is not None:
+             hidden_states += config.previous_residual
+         else:
+             ori_hidden_states = hidden_states.clone()
+
+             # Use the original forward pass
+             out = original_forward(
+                 hidden_states=hidden_states,
+                 timestep=timestep,
+                 encoder_hidden_states=encoder_hidden_states,
+                 encoder_attention_mask=encoder_attention_mask,
+                 pooled_projections=pooled_projections,
+                 guidance=guidance,
+                 attention_kwargs=attention_kwargs,
+                 return_dict=True
+             )
+             hidden_states = out["sample"]
+
+             # Store the residual for future reuse
+             config.previous_residual = hidden_states - ori_hidden_states
+
+         if not return_dict:
+             return (hidden_states,)
+
+         return {"sample": hidden_states}
+
+     return teacache_forward
+
+ def enable_teacache(model: Any, num_inference_steps: int, rel_l1_thresh: float = 0.15):
+     """Enable TeaCache acceleration for a model"""
+     if not hasattr(model, '_original_forward'):
+         model._original_forward = model.forward
+
+     model.teacache_config = TeaCacheConfig(rel_l1_thresh=rel_l1_thresh)
+     model.num_inference_steps = num_inference_steps
+     model.forward = create_teacache_forward(model._original_forward).__get__(model)
+
+ def disable_teacache(model: Any):
+     """Disable TeaCache acceleration for a model"""
+     if hasattr(model, '_original_forward'):
+         model.forward = model._original_forward
+         del model._original_forward
+
+     if hasattr(model, 'teacache_config'):
+         del model.teacache_config
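For context, a minimal sketch of how these helpers would be wired in, mirroring the block that is currently commented out in handler.py; the `pipeline` object is assumed to be the HunyuanVideoPipeline built there:

from teacache import enable_teacache, disable_teacache

num_inference_steps = 50

# Patch the transformer's forward pass: steps whose modulated input barely changes
# reuse the previous residual instead of recomputing the full forward.
enable_teacache(
    pipeline.transformer,
    num_inference_steps=num_inference_steps,
    rel_l1_thresh=0.15,  # per the handler's comment: 0 (original), 0.1 (1.6x speedup), 0.15 (2.1x speedup)
)

frames = pipeline(
    prompt="A cat walks on the grass, realistic style.",
    num_inference_steps=num_inference_steps,
    output_type="pt",
).frames

# Restore the original forward pass when caching is no longer wanted.
disable_teacache(pipeline.transformer)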