LPX55 committed
Commit a102a01 · verified · 1 Parent(s): 1b616c4

Update optimized.py

Files changed (1):
  1. optimized.py +38 -17
optimized.py CHANGED
@@ -11,26 +11,38 @@ from accelerate import dispatch_model, infer_auto_device_map
 def self_attention_slicing(module, slice_size=3):
     """Modified from Diffusers' original for Flux compatibility"""
     def sliced_attention(*args, **kwargs):
-        return module(*args, **kwargs) # Remove dummy implementation <source_id data="pipeline_flux_controlnet.py" />
+        if "dim" in kwargs:
+            dim = kwargs["dim"]
+        else:
+            dim = 1
+
+        if slice_size == "auto":
+            # Automatic slicing based on Flux architecture
+            return module(*args, **kwargs)
+
+        output = torch.cat([
+            module(
+                *[arg[:, :, i:i+slice_size] if i == dim else arg
+                  for arg in args],
+                **{k: v[:, :, i:i+slice_size] if k == dim else v
+                   for k,v in kwargs.items()}
+            )
+            for i in range(0, args[0].shape[dim], slice_size)
+        ], dim=dim)
+
+        return output
+    return sliced_attention
 
-huggingface_token = os.getenv("HUGGINFACE_TOKEN")
-# good_vae = AutoencoderKL.from_pretrained(
-#     "black-forest-labs/FLUX.1-dev",
-#     subfolder="vae",
-#     torch_dtype=torch.bfloat16,
-#     use_safetensors=True,
-#     device_map=None, # Disable automatic mapping
-#     token=huggingface_token
-# )
-
+
+huggingface_token = os.getenv("HUGGINFACE_TOKEN")
 good_vae = AutoencoderKL.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="vae",
     torch_dtype=torch.bfloat16,
     use_safetensors=True,
-    token=huggingface_token # Fix typo in variable name
+    device_map=None, # Disable automatic mapping
+    token=huggingface_token
 )
-
 # 2. Main Pipeline Initialization WITH VAE SCOPE
 pipe = FluxControlNetPipeline.from_pretrained(
     "LPX55/FLUX.1-merged_uncensored",
@@ -47,15 +59,24 @@ pipe = FluxControlNetPipeline.from_pretrained(
 
 # 3. Strict Order for Optimization Steps
 # A. Apply CPU Offloading FIRST
-pipe.enable_sequential_cpu_offload()
+pipe.enable_sequential_cpu_offload() # No arguments for new API
+# 2. Then apply custom VAE slicing
+if getattr(pipe, "vae", None) is not None:
+    # Method 1: Use official implementation if available
+    try:
+        pipe.vae.enable_slicing()
+    except AttributeError:
+        # Method 2: Apply manual slicing for Flux compatibility [source_id]pipeline_flux_controlnet.py
+        pipe.vae.decode = self_attention_slicing(pipe.vae.decode, 2)
 
+pipe.enable_attention_slicing(1)
 # B. Enable Memory Optimizations
-pipe.enable_vae_tiling()
-pipe.enable_xformers_memory_efficient_attention()
+# pipe.enable_vae_tiling()
+# pipe.enable_xformers_memory_efficient_attention()
 
 # C. Unified Precision Handling
-for comp in [pipe.unet, pipe.vae, pipe.controlnet]:
-    comp.to(dtype=torch.bfloat16)
+# for comp in [pipe.unet, pipe.vae, pipe.controlnet]:
+#     comp.to(dtype=torch.bfloat16)
 
 print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
 @spaces.GPU
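The second hunk prefers diffusers' built-in `AutoencoderKL.enable_slicing()` and only falls back to wrapping `pipe.vae.decode` with the generic helper. One caveat with that fallback: in recent diffusers releases `decode` returns a `DecoderOutput` wrapper rather than a bare tensor, so the `torch.cat` inside `self_attention_slicing` would receive objects, not tensors. A hedged sketch of a decode-specific fallback under that assumption; `sliced_vae_decode` is an illustrative name, not part of the commit:

```python
import torch

def sliced_vae_decode(vae, slice_size=1):
    """Decode latents `slice_size` samples at a time. Assumes a diffusers-style
    AutoencoderKL whose .decode returns an object with a .sample tensor."""
    original_decode = vae.decode
    def decode(z, *args, **kwargs):
        chunks = [
            original_decode(z[i:i + slice_size], *args, **kwargs).sample
            for i in range(0, z.shape[0], slice_size)
        ]
        # Returns a bare tensor; callers expecting the DecoderOutput interface
        # would need the result rewrapped before use.
        return torch.cat(chunks, dim=0)
    return decode

# Usage mirroring the commit's fallback branch:
# pipe.vae.decode = sliced_vae_decode(pipe.vae, slice_size=2)
```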
 
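Both sides of the diff end by printing `torch.cuda.memory_allocated()`. That counter tracks only tensors PyTorch has allocated on the current CUDA device, so with sequential CPU offload enabled it typically reads near zero until the first forward pass moves modules onto the GPU. A small helper for a fuller picture; illustrative, not part of the commit:

```python
import torch

def report_vram(tag: str) -> None:
    """Print allocated and reserved CUDA memory in GB."""
    if not torch.cuda.is_available():
        print(f"[{tag}] CUDA not available")
        return
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    print(f"[{tag}] VRAM allocated: {allocated:.2f} GB, reserved: {reserved:.2f} GB")

# e.g. report_vram("after offload setup") right after the optimization block,
# and again after the first generation to see the real working set.
```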