# model_loader.py
import os

import torch
import spaces
from diffusers import FluxControlNetPipeline
from transformers import T5EncoderModel
from moondream import vl


@spaces.GPU()
def safe_model_load():
    """Load models in a single GPU invocation to keep them warm."""
    try:
        # Set max memory usage for ZeroGPU and allow TF32 matmuls.
        torch.cuda.set_per_process_memory_fraction(1.0)
        torch.set_float32_matmul_precision("high")

        # Credentials come from the environment (e.g. Space secrets).
        huggingface_token = os.getenv("HUGGINFACE_TOKEN")
        md_api_key = os.getenv("MD_KEY")  # key for the moondream vl client

        # Load the second text encoder separately so it is shared with the
        # pipeline in bfloat16 rather than loaded twice.
        text_encoder = T5EncoderModel.from_pretrained(
            "LPX55/FLUX.1-merged_uncensored",
            subfolder="text_encoder_2",
            torch_dtype=torch.bfloat16,
            token=huggingface_token,
        )
        pipe = FluxControlNetPipeline.from_pretrained(
            "LPX55/FLUX.1M-8step_upscaler-cnet",
            torch_dtype=torch.bfloat16,
            text_encoder_2=text_encoder,
            token=huggingface_token,
        )

        # Apply memory optimizations; fall back to attention slicing when
        # xFormers is not installed.
        try:
            pipe.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(f"XFormers not available: {e}")
            pipe.enable_attention_slicing()

        # pipe.enable_sequential_cpu_offload()
        pipe.to("cuda")

        # For memory-sensitive environments: share tensors between worker
        # processes via the filesystem instead of file descriptors.
        try:
            torch.multiprocessing.set_sharing_strategy("file_system")
        except Exception as e:
            print(f"Exception raised (torch.multiprocessing): {e}")

        return pipe
    except Exception as e:
        print(f"Model loading failed: {e}")
        # Return a placeholder so the UI can handle the failure gracefully.
        return {"error": str(e)}
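
# --- Usage sketch ------------------------------------------------------------
# Illustrative only: a minimal example of how a caller might consume
# safe_model_load(), including the error-dict placeholder it returns on
# failure. The file name "input.jpg" and the inference parameter values are
# assumptions, not part of the original module; num_inference_steps=8 simply
# matches the 8-step upscaler checkpoint name.

if __name__ == "__main__":
    from diffusers.utils import load_image

    pipe = safe_model_load()
    if isinstance(pipe, dict) and "error" in pipe:
        # Loading failed; surface the message instead of crashing the UI.
        print(f"Startup error: {pipe['error']}")
    else:
        control_image = load_image("input.jpg")  # hypothetical input image
        result = pipe(
            prompt="high quality, detailed photograph",
            control_image=control_image,
            controlnet_conditioning_scale=0.6,
            num_inference_steps=8,
            guidance_scale=3.5,
        ).images[0]
        result.save("upscaled.jpg")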