import os

import streamlit as st
import torch
from accelerate import Accelerator
from diffusers import AutoencoderKL, DDIMScheduler
from diffusers.utils import check_min_version
from transformers import CLIPTextModel, CLIPTokenizer

from src.mgd_pipelines.mgd_pipe import MGDPipe
from src.mgd_pipelines.mgd_pipe_disentangled import MGDPipeDisentangled
from src.utils.image_from_pipe import generate_images_from_mgd_pipe
from src.utils.set_seeds import set_seed

# Will error if the minimal version of diffusers is not installed. Remove at your own risk.
check_min_version("0.10.0.dev0")

# Environment variables for Hugging Face Spaces
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["WANDB_START_METHOD"] = "thread"

# Streamlit interface components
st.title("Fashion Image Generation with Multimodal Garment Designer")

# Streamlit input parameters
category = st.selectbox("Select Category", ["dresses", "upper_body", "lower_body", "all"])
guidance_scale = st.slider("Guidance Scale", min_value=0.1, max_value=20.0, value=7.5, step=0.1)
guidance_scale_pose = st.slider("Guidance Scale (Pose)", min_value=0.1, max_value=20.0, value=7.5, step=0.1)
guidance_scale_sketch = st.slider("Guidance Scale (Sketch)", min_value=0.1, max_value=20.0, value=7.5, step=0.1)
sketch_cond_rate = st.slider("Sketch Conditioning Rate", min_value=0.1, max_value=1.0, value=0.5, step=0.05)
start_cond_rate = st.slider("Start Conditioning Rate", min_value=0.1, max_value=1.0, value=0.5, step=0.05)
seed = st.number_input("Seed", value=42, min_value=1)

# Button to run the image generation
if st.button("Generate Image"):
    # Initialize Accelerator (handles device placement and mixed precision)
    accelerator = Accelerator()
    device = accelerator.device

    # Set the seed for reproducibility
    set_seed(seed)

    # Base Stable Diffusion model from the Hugging Face Hub
    model_name = "stabilityai/stable-diffusion-2-1-base"

    # Load scheduler, tokenizer, and pre-trained models
    val_scheduler = DDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
    val_scheduler.set_timesteps(50, device=device)
    tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder")
    vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae")

    # Load the MGD UNet from torch.hub (extra kwargs are forwarded to the hub entrypoint)
    unet = torch.hub.load(
        repo_or_dir="aimagelab/multimodal-garment-designer",
        source="github",
        model="mgd",
        pretrained=True,
        dataset="dresscode",  # the entrypoint expects the dataset name, not the repo path
    )

    # Freeze VAE and text encoder
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)

    # Select pipeline (see the disentangled variant sketched at the bottom of this file)
    val_pipe = MGDPipe(
        text_encoder=text_encoder,
        vae=vae,
        unet=unet.to(vae.dtype),
        tokenizer=tokenizer,
        scheduler=val_scheduler,
    ).to(device)

    # Run image generation using the pipeline
    with torch.no_grad():
        images = generate_images_from_mgd_pipe(
            test_order="test",  # or some predefined order
            pipe=val_pipe,
            test_dataloader=None,  # adjust accordingly, or use a pre-existing dataset
            save_name="generated_image",
            dataset="dresscode",  # adjust if needed
            output_dir=".",  # save location
            guidance_scale=guidance_scale,
            guidance_scale_pose=guidance_scale_pose,
            guidance_scale_sketch=guidance_scale_sketch,
            sketch_cond_rate=sketch_cond_rate,
            start_cond_rate=start_cond_rate,
            no_pose=False,
            disentagle=False,  # spelling follows the repo's keyword
            seed=seed,
        )

    # Display the generated image
    st.image(images[0], caption="Generated Fashion Image", use_column_width=True)
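
# Optional: a minimal sketch of switching to the disentangled pipeline imported above.
# This assumes MGDPipeDisentangled accepts the same constructor arguments as MGDPipe and
# that generate_images_from_mgd_pipe honors disentagle=True; verify both against the
# aimagelab/multimodal-garment-designer repository before enabling.
#
# val_pipe = MGDPipeDisentangled(
#     text_encoder=text_encoder,
#     vae=vae,
#     unet=unet.to(vae.dtype),
#     tokenizer=tokenizer,
#     scheduler=val_scheduler,
# ).to(device)
#
# ...then call generate_images_from_mgd_pipe with the same arguments as above,
# passing disentagle=True.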