---
license: apache-2.0
---
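The snippet below loads the Wan image-to-video pipeline from this repository, conditions it on a single input image, and generates an 81-frame video at 480p before exporting it to MP4 at 16 fps. The commented-out lines show the corresponding settings for the 720p checkpoint.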


```python
import torch
from diffusers import WanI2VPipeline, WanTransformer3DModel
from diffusers.utils import load_image, export_to_video
from transformers import CLIPVisionModel, CLIPImageProcessor, UMT5EncoderModel

pretrained_model_name_or_path = "./wan_i2v"  # TODO replace with our hf id

# Load the image encoder in fp16 and the text encoder in bf16 to reduce memory use.
image_encoder = CLIPVisionModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="image_encoder", torch_dtype=torch.float16
)
transformer_i2v = WanTransformer3DModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="transformer_i2v_480p"
)
# For 720p, load the 720p transformer instead:
# transformer_i2v = WanTransformer3DModel.from_pretrained(
#     pretrained_model_name_or_path, subfolder="transformer_i2v_720p", torch_dtype=torch.bfloat16
# )

image_processor = CLIPImageProcessor.from_pretrained(
    pretrained_model_name_or_path, subfolder="image_processor"
)
text_encoder = UMT5EncoderModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="text_encoder", torch_dtype=torch.bfloat16
)

pipe = WanI2VPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_i2v,
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    image_processor=image_processor,
)

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)
device = "cuda"
seed = 0
prompt = (
    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
    "the background. High quality, ultrarealistic detail and breathtaking movie-like camera shot."
)
generator = torch.Generator(device=device).manual_seed(seed)

# Offload model components to the CPU when idle instead of keeping the whole
# pipeline resident on the GPU (`pipe.to(device)`).
pipe.enable_model_cpu_offload()

# Recommended negative prompt, in Chinese. Roughly: "vivid colors, overexposed, static,
# blurry details, subtitles, style, artwork, painting, frame, still, overall gray, worst
# quality, low quality, JPEG compression artifacts, ugly, mutilated, extra fingers, poorly
# drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fused fingers,
# motionless frame, cluttered background, three legs, many people in the background,
# walking backwards"
negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"

inputs = {
    "image": image,
    "prompt": prompt,
    "negative_prompt": negative_prompt,
    "max_area": 480 * 832,  # use 720 * 1280 for 720p
    "generator": generator,
    "num_inference_steps": 40,
    "guidance_scale": 5.0,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
    "flow_shift": 3.0,  # use 5.0 for 720p
}

output = pipe(**inputs).frames[0]
export_to_video(output, "output.mp4", fps=16)
```
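To generate at 720p instead, load the `transformer_i2v_720p` weights before constructing the pipeline and raise the target area and flow shift, as the commented lines above indicate. A minimal sketch of just the changed values:

```python
# 720p variant: swap in the 720p transformer (bf16) before building the pipeline,
# then adjust the resolution and flow shift in the call arguments.
transformer_i2v = WanTransformer3DModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="transformer_i2v_720p", torch_dtype=torch.bfloat16
)
inputs["max_area"] = 720 * 1280
inputs["flow_shift"] = 5.0
```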