import gc import os import numpy as np import spaces import gradio as gr import torch from diffusers.training_utils import set_seed from diffusers import AutoencoderKLTemporalDecoder from normalcrafter.normal_crafter_ppl import NormalCrafterPipeline from normalcrafter.unet import DiffusersUNetSpatioTemporalConditionModelNormalCrafter import uuid import random from huggingface_hub import hf_hub_download from normalcrafter.utils import read_video_frames, vis_sequence_normal, save_video examples = [ ["examples/example_01.mp4", 1024, -1, -1], ["examples/example_02.mp4", 1024, -1, -1], ["examples/example_03.mp4", 1024, -1, -1], ["examples/example_04.mp4", 1024, -1, -1], # ["examples/example_05.mp4", 1024, -1, -1], # ["examples/example_06.mp4", 1024, -1, -1], ] pretrained_model_name_or_path = "Yanrui95/NormalCrafter" weight_dtype = torch.float16 unet = DiffusersUNetSpatioTemporalConditionModelNormalCrafter.from_pretrained( pretrained_model_name_or_path, subfolder="unet", low_cpu_mem_usage=True, ) vae = AutoencoderKLTemporalDecoder.from_pretrained( pretrained_model_name_or_path, subfolder="vae") vae.to(dtype=weight_dtype) unet.to(dtype=weight_dtype) pipe = NormalCrafterPipeline.from_pretrained( "stabilityai/stable-video-diffusion-img2vid-xt", unet=unet, vae=vae, torch_dtype=weight_dtype, variant="fp16", ) pipe.to("cuda") @spaces.GPU(duration=120) def infer_depth( video: str, max_res: int = 1024, process_length: int = -1, target_fps: int = -1, # save_folder: str = "./demo_output", window_size: int = 14, time_step_size: int = 10, decode_chunk_size: int = 7, seed: int = 42, save_npz: bool = False, ): set_seed(seed) pipe.enable_xformers_memory_efficient_attention() frames, target_fps = read_video_frames(video, process_length, target_fps, max_res) # inference the depth map using the DepthCrafter pipeline with torch.inference_mode(): res = pipe( frames, decode_chunk_size=decode_chunk_size, time_step_size=time_step_size, window_size=window_size, ).frames[0] # visualize the depth map and save the results vis = vis_sequence_normal(res) # save the depth map and visualization with the target FPS save_path = os.path.join(save_folder, os.path.splitext(os.path.basename(video))[0]) print(f"==> saving results to {save_path}") os.makedirs(os.path.dirname(save_path), exist_ok=True) if save_npz: np.savez_compressed(save_path + ".npz", normal=res) save_video(vis, save_path + "_vis.mp4", fps=target_fps) save_video(frames, save_path + "_input.mp4", fps=target_fps) # clear the cache for the next video gc.collect() torch.cuda.empty_cache() return [ save_path + "_input.mp4", save_path + "_vis.mp4", ] def construct_demo(): with gr.Blocks(analytics_enabled=False) as depthcrafter_iface: gr.Markdown( """