nickyreinert-vml
committed on
Commit · e4a20eb
1 Parent(s): ddd360b
adding pre compile feature
app.py CHANGED
```diff
@@ -131,6 +131,12 @@ def attention_slicing_change(attention_slicing, config):
     config = set_config(config, 'attention_slicing', attention_slicing)
 
     return config, config, assemble_code(config)
+
+def pre_compile_unet_change(pre_compile_unet, config):
+
+    config = set_config(config, 'pre_compile_unet', pre_compile_unet)
+
+    return config, config, assemble_code(config)
 
 def safety_checker_change(safety_checker, config):
 
@@ -298,6 +304,9 @@ def run_inference(config, config_history, pipeline, progress=gr.Progress(track_t
     # ATTENTION SLICING
     if str(config["attention_slicing"]).lower() == 'true': pipeline.enable_attention_slicing()
 
+    # PRE COMPILE UNET
+    if str(config["pre_compile_unet"]).lower() == 'true': pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
+
     # AUTO ENCODER
     if str(config["auto_encoder"]).lower() != 'none' and str(config["auto_encoder"]).lower() != 'null' and str(config["auto_encoder"]).lower() != '':
         pipeline.vae = AutoencoderKL.from_pretrained(config["auto_encoder"], torch_dtype=get_data_type(config["data_type"])).to(config["device"])
@@ -445,12 +454,13 @@ with gr.Blocks(analytics_enabled=False) as demo:
             gr.Column("")
             with gr.Accordion("Device specific settings", open=False):
                 with gr.Row():
-                    in_cpu_offload = gr.Radio(label="CPU Offload:", value=config.value["cpu_offload"], choices=["True", "False"], info="This may increase performance, as it offloads computations from the GPU to the CPU. But it can also lead to slower execution and lower effectiveness. Compare running time and outputs to make sure this setting helps you")
+                    in_cpu_offload = gr.Radio(label="CPU Offload:", value=config.value["cpu_offload"], choices=["True", "False"], info="This may increase performance, as it offloads computations from the GPU to the CPU. But it can also lead to slower execution and lower effectiveness. Compare running time and outputs to make sure this setting helps you; not supported on MPS")
                     in_data_type = gr.Radio(label="Data Type:", value=config.value["data_type"], choices=["bfloat16", "float16", "float32"], info="`bfloat16` is not supported on MPS devices right now; `float16` may also not be supported on all devices. Half-precision weights save GPU memory, see https://huggingface.co/docs/diffusers/main/en/optimization/fp16")
                     in_allow_tensorfloat32 = gr.Radio(label="Allow TensorFloat32:", value=config.value["allow_tensorfloat32"], choices=["True", "False"], info="Not supported on MPS devices right now; TensorFloat-32 is faster, but results in slightly less accurate computations, see https://huggingface.co/docs/diffusers/main/en/optimization/fp16")
                 with gr.Row():
                     in_variant = gr.Radio(label="Variant:", value=config.value["variant"], choices=["fp16", None], info="Using half-precision weights saves GPU memory, but not all models support it, see https://huggingface.co/docs/diffusers/main/en/optimization/fp16")
                     in_attention_slicing = gr.Radio(label="Attention slicing:", value=config.value["attention_slicing"], choices=["True", "False"], info="The attention operation will be cut into multiple steps, see https://huggingface.co/docs/diffusers/optimization/mps")
+                    in_pre_compile_unet = gr.Radio(label="Pre-Compile UNet:", value=config.value["pre_compile_unet"], choices=["True", "False"], info="Can speed up inference, but compilation itself takes time, so only apply this option when you finalize your inference; does not work on MPS, see https://huggingface.co/docs/diffusers/optimization/torch2.0")
             gr.Column("")
 
         gr.Markdown("### Model")
@@ -553,6 +563,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
     in_allow_tensorfloat32.change(tensorfloat32_change, inputs=[in_allow_tensorfloat32, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('allow_tensorfloat32', value, config)")
     in_variant.change(variant_change, inputs=[in_variant, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('variant', value, config)")
     in_attention_slicing.change(attention_slicing_change, inputs=[in_attention_slicing, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('attention_slicing', value, config)")
+    in_pre_compile_unet.change(pre_compile_unet_change, inputs=[in_pre_compile_unet, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('pre_compile_unet', value, config)")
     in_model_refiner.change(model_refiner_change, inputs=[in_model_refiner, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('model_refiner', value, config)")
     in_cpu_offload.change(cpu_offload_change, inputs=[in_cpu_offload, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('cpu_offload', value, config)")
     in_safety_checker.change(safety_checker_change, inputs=[in_safety_checker, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('safety_checker', value, config)")
@@ -586,6 +597,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
         in_model_refiner,
         in_variant,
         in_attention_slicing,
+        in_pre_compile_unet,
         in_safety_checker,
         in_requires_safety_checker,
         in_auto_encoders,
```
config.py CHANGED
```diff
@@ -41,6 +41,7 @@ def get_initial_config():
         "scheduler": None,
         "variant": None,
         "attention_slicing": "False",
+        "pre_compile_unet": "False",
         "allow_tensorfloat32": allow_tensorfloat32,
         "use_safetensors": "False",
         "data_type": data_type,
@@ -104,6 +105,7 @@ def get_config_from_url(initial_config, request: Request):
         return_config['refiner'],
         return_config['variant'],
         return_config['attention_slicing'],
+        return_config['pre_compile_unet'],
         return_config['safety_checker'],
         return_config['requires_safety_checker'],
         return_config['auto_encoder'],
@@ -175,6 +177,7 @@ def assemble_code(str_config):
         variant=variant).to(device)''')
 
     if str(config["attention_slicing"]).lower() != 'false': code.append("pipeline.enable_attention_slicing()")
+    if str(config["pre_compile_unet"]).lower() != 'false': code.append('pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)')
 
     if str(config["cpu_offload"]).lower() != 'false': code.append("pipeline.enable_model_cpu_offload()")
 
@@ -191,7 +194,7 @@ def assemble_code(str_config):
         "{config['refiner']}",
         text_encoder_2 = base.text_encoder_2,
         vae = base.vae,
-        torch_dtype =
+        torch_dtype = data_type,
         use_safetensors = use_safetensors,
         variant=variant,
         ).to(device)''')
```