ritutweets46 committed on
Commit ff7f769 · verified · 1 Parent(s): 8c24e45

Update app.py

Files changed (1): app.py +9 -65
app.py CHANGED
@@ -5,17 +5,6 @@ from diffusers import DiffusionPipeline
 import torch
 from huggingface_hub import login
 import os
-import bitsandbytes as bnb
-import onnx
-import onnxruntime as ort
-from onnxruntime.quantization import quantize_dynamic, QuantType
-import psutil
-from accelerate import infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch
-
-def get_memory_usage():
-    process = psutil.Process(os.getpid())
-    mem_info = process.memory_info()
-    return f"{mem_info.rss / (1024 ** 2):.2f} MB"
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -27,52 +16,16 @@ login(token=HUGGINGFACE_TOKEN)
 base_model_repo = "stabilityai/stable-diffusion-3-medium-diffusers"
 lora_weights_path = "./pytorch_lora_weights.safetensors"
 
-memory_log = []
-
-def log_memory(step):
-    memory_log.append(f"{step}: {get_memory_usage()}")
-
-log_memory("Before loading the model")
-
-# Choose the appropriate dtype
-dtype = torch.float16 if torch.cuda.is_available() else torch.bfloat16
-
-# Load the base model with 16-bit precision if available
-with init_empty_weights():
-    pipeline = DiffusionPipeline.from_pretrained(
-        base_model_repo,
-        torch_dtype=dtype,
-        use_auth_token=HUGGINGFACE_TOKEN
-    )
-log_memory("After loading the model")
-
-bnb.optim.load_int8_model(pipeline.model, device=device)
-log_memory("After loading 8-bit model")
-
+# Load the base model
+pipeline = DiffusionPipeline.from_pretrained(
+    base_model_repo,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    use_auth_token=HUGGINGFACE_TOKEN
+)
 pipeline.load_lora_weights(lora_weights_path)
 pipeline.enable_sequential_cpu_offload()  # Efficient memory usage
 pipeline.enable_xformers_memory_efficient_attention()  # Enable xformers memory efficient attention
-
-# Use accelerate to handle model offloading
-device_map = infer_auto_device_map(pipeline.model)
-pipeline.model = load_checkpoint_and_dispatch(pipeline.model, device_map=device_map)
-log_memory("After enabling optimizations")
-
-# Export to ONNX
-onnx_model_path = "model.onnx"
-pipeline.model.eval()
-dummy_input = torch.randn(1, 3, 512, 512, device=device)
-torch.onnx.export(pipeline.model, dummy_input, onnx_model_path, export_params=True, opset_version=11, do_constant_folding=True, input_names=['input'], output_names=['output'])
-log_memory("After exporting to ONNX")
-
-# Quantize ONNX model to 8-bit
-quantized_model_path = "model_quantized.onnx"
-quantize_dynamic(onnx_model_path, quantized_model_path, weight_type=QuantType.QUInt8)
-log_memory("After quantizing ONNX model")
-
-# Load quantized ONNX model
-session = ort.InferenceSession(quantized_model_path)
-log_memory("After loading quantized ONNX model")
+pipeline = pipeline.to(device)
 
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 768  # Reduce max image size to fit within memory constraints
@@ -92,10 +45,8 @@ def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance
         height=height,
         generator=generator
     ).images[0]
-
-    log_memory("After inference")
 
-    return image, "\n".join(memory_log)
+    return image
 
 examples = [
     "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
@@ -108,12 +59,6 @@ css = """
     margin: 0 auto;
     max-width: 520px;
 }
-#memory-log {
-    white-space: pre-wrap;
-    background: #f8f9fa;
-    padding: 10px;
-    border-radius: 5px;
-}
 """
 
 if torch.cuda.is_available():
@@ -140,7 +85,6 @@ with gr.Blocks(css=css) as demo:
             run_button = gr.Button("Run", scale=0)
 
         result = gr.Image(label="Result", show_label=False)
-        memory_log_output = gr.Textbox(label="Memory Log", elem_id="memory-log", lines=10, interactive=False)
 
         with gr.Accordion("Advanced Settings", open=False):
            negative_prompt = gr.Textbox(
@@ -202,7 +146,7 @@ with gr.Blocks(css=css) as demo:
    run_button.click(
        fn=infer,
        inputs=[prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
-       outputs=[result, memory_log_output]
+       outputs=[result]
    )
 
 demo.queue().launch()
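
The net effect of the commit is that the empty-weights / bitsandbytes / ONNX experiments are dropped and the pipeline is loaded directly. Condensed into a standalone sketch, assuming the HUGGINGFACE_TOKEN environment variable is set and the LoRA file exists locally; the prompt and sampler settings below are illustrative, not taken from the commit:

# Minimal sketch of the loading path this commit leaves in place.
# Assumes HUGGINGFACE_TOKEN is exported in the environment and the
# LoRA file is present; prompt and sampler settings are illustrative.
import os

import torch
from diffusers import DiffusionPipeline
from huggingface_hub import login

login(token=os.environ["HUGGINGFACE_TOKEN"])

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
pipeline.load_lora_weights("./pytorch_lora_weights.safetensors")
# Sequential offload streams submodules to the GPU one at a time and
# manages device placement itself, so no explicit .to(device) follows.
pipeline.enable_sequential_cpu_offload()

image = pipeline(
    prompt="Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    num_inference_steps=28,
    guidance_scale=7.0,
    generator=torch.Generator().manual_seed(0),
).images[0]
image.save("out.png")

Two caveats on the committed version: recent diffusers releases raise an error when pipeline.to(device) is called after enable_sequential_cpu_offload(), so one of those two lines likely has to go, and enable_xformers_memory_efficient_attention() requires the xformers package; the sketch therefore omits both. Since infer now returns only the image, outputs=[result] matches the function's single return value.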
 
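For context on the removed ONNX path: quantize_dynamic operates on an ONNX file exported from a single torch.nn.Module, and a DiffusionPipeline has no .model attribute to export (nor does bnb.optim.load_int8_model appear in bitsandbytes' public API), which is one reason that code could never run. A sketch of the intended flow on a toy module, assuming onnx and onnxruntime are installed; for SD3 one would export a single component such as pipeline.transformer, and the module here is purely illustrative:

# Sketch of the dynamic-quantization flow the removed code attempted,
# applied to a toy module; a whole DiffusionPipeline cannot be passed
# to torch.onnx.export in one call.
import torch
import onnxruntime as ort
from onnxruntime.quantization import QuantType, quantize_dynamic

module = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.ReLU()).eval()
dummy_input = torch.randn(1, 16)
torch.onnx.export(
    module,
    dummy_input,
    "module.onnx",
    export_params=True,
    opset_version=17,
    input_names=["input"],
    output_names=["output"],
)

# Rewrite the exported graph with 8-bit weights; activations stay float
# and are quantized on the fly at inference time.
quantize_dynamic("module.onnx", "module_quantized.onnx", weight_type=QuantType.QUInt8)

session = ort.InferenceSession("module_quantized.onnx")
print(session.run(None, {"input": dummy_input.numpy()})[0].shape)  # (1, 32)

Even with a valid export, a quantized ONNX session would not plug back into the diffusers sampling loop without custom glue that the removed code did not include, so dropping the whole experiment is the simpler fix.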