Spaces: Running on Zero
Apply for community grant: Academic project (gpu and storage) #1
by jiangyzy - opened
Hi,
We are preparing the HF demo for CustomNet: https://jiangyzy.github.io/CustomNet/
Could you please provide a GPU grant for this project?
Thanks!
Hi @hysts. I'm having some problems with ZeroGPU: it sometimes throws the error "GPU task aborted". Why would this happen?
terminate called after throwing an instance of 'c10::Error'
what(): could not close file descriptor 18 :Bad file descriptor (9)
Exception raised from close at ../aten/src/ATen/MapAllocator.cpp:402 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8303581d87 in /usr/local/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7f830353275f in /usr/local/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #2: at::MapAllocator::close() + 0x238 (0x7f82ebd5ba48 in /usr/local/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #3: at::MapAllocator::~MapAllocator() + 0x1b (0x7f82ebd5ba8b in /usr/local/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #4: at::MapAllocator::~MapAllocator() + 0x9 (0x7f82ebd5bb09 in /usr/local/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0x53f798 (0x7f8301edc798 in /usr/local/lib/python3.10/site-packages/torch/lib/libtorch_python.so)
frame #6: <unknown function> + 0x6498d (0x7f830356698d in /usr/local/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #7: c10::TensorImpl::~TensorImpl() + 0x21b (0x7f830355fc8b in /usr/local/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #8: c10::TensorImpl::~TensorImpl() + 0x9 (0x7f830355fe39 in /usr/local/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #9: <unknown function> + 0x802b98 (0x7f830219fb98 in /usr/local/lib/python3.10/site-packages/torch/lib/libtorch_python.so)
frame #10: THPVariable_subclass_dealloc(_object*) + 0x2f6 (0x7f830219ff16 in /usr/local/lib/python3.10/site-packages/torch/lib/libtorch_python.so)
<omitting python frames>
frame #40: <unknown function> + 0x89134 (0x7f8304505134 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #41: <unknown function> + 0x1097dc (0x7f83045857dc in /usr/lib/x86_64-linux-gnu/libc.so.6)
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 522, in process_events
response = await route_utils.call_process_api(
File "/usr/local/lib/python3.10/site-packages/gradio/route_utils.py", line 260, in call_process_api
output = await app.get_blocks().process_api(
File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1741, in process_api
result = await self.call_function(
File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1296, in call_function
prediction = await anyio.to_thread.run_sync(
File "/usr/local/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2144, in run_sync_in_worker_thread
return await future
File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 851, in run
result = context.run(func, *args)
File "/usr/local/lib/python3.10/site-packages/gradio/utils.py", line 751, in wrapper
response = f(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/spaces/zero/wrappers.py", line 157, in gradio_handler
raise gr.Error("GPU task aborted")
gradio.exceptions.Error: 'GPU task aborted'
@jiangyzy Thanks for testing ZeroGPU!
Can you try this diff? ZeroGPU doesn't work well with functools.partial, so the diff moves the model loading to module level and stops passing the model and device through partial.
diff --git a/app.py b/app.py
index 1e3daf6..8a0b5ed 100644
--- a/app.py
+++ b/app.py
@@ -65,6 +65,23 @@ If you have any questions, please feel free to reach me out at <b>yuanzy22@mails
negtive_prompt = ""
+# load model
+device = torch.device("cuda")
+preprocess_model = load_preprocess_model()
+config = OmegaConf.load("configs/config_customnet.yaml")
+model = instantiate_from_config(config.model)
+
+model_path='./customnet_v1.pt?download=true'
+if not os.path.exists(model_path):
+ os.system(f'wget https://huggingface.co/TencentARC/CustomNet/resolve/main/customnet_v1.pt?download=true -P .')
+ckpt = torch.load(model_path, map_location="cpu")
+model.load_state_dict(ckpt)
+del ckpt
+
+model = model.to(device)
+sampler = None
+
+
def send_input_to_concat(input_image):
W, H = input_image.size
# image_array[:, 0, :] = image_array[:, 0, :]
@@ -130,10 +147,9 @@ def prepare_data(device, input_image, x0, y0, x1, y1, polar, azimuth, text):
@spaces.GPU(enable_queue=True, duration=180)
-def run_generation(sampler, model, device, input_image, x0, y0, x1, y1, polar, azimuth, text, seed):
+def run_generation(sampler, input_image, x0, y0, x1, y1, polar, azimuth, text, seed):
seed_everything(seed)
batch = prepare_data(device, input_image, x0, y0, x1, y1, polar, azimuth, text)
- model = model.to(device)
sampler = DDIMSampler(model, device=device)
c = model.get_learned_conditioning(batch["image_cond"])
@@ -189,21 +205,7 @@ def load_example(input_image, x0, y0, x1, y1, polar, azimuth, prompt):
@torch.no_grad()
def main(args):
- # load model
- device = torch.device("cuda")
- preprocess_model = load_preprocess_model()
- config = OmegaConf.load("configs/config_customnet.yaml")
- model = instantiate_from_config(config.model)
-
- model_path='./customnet_v1.pt?download=true'
- if not os.path.exists(model_path):
- os.system(f'wget https://huggingface.co/TencentARC/CustomNet/resolve/main/customnet_v1.pt?download=true -P .')
- ckpt = torch.load(model_path, map_location="cpu")
- model.load_state_dict(ckpt)
- del ckpt
-
- model = model.to(device)
- sampler = None
+
# load demo
demo = gr.Blocks()
@@ -279,7 +281,7 @@ def main(args):
inputs=[x0, y0, x1, y1, input_image],
outputs=[x0, y0, x1, y1, location_image])
- start.click(partial(run_generation, sampler, model, device),
+ start.click(partial(run_generation, sampler),
inputs=[input_image, x0, y0, x1, y1, polar, azimuth, prompt, seed],
outputs=output_image)
It seemed to work on Zero for me.
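For reference, here is the pattern distilled into a minimal, self-contained sketch (illustrative names, not the actual CustomNet code; assumes the spaces package on a ZeroGPU Space): the model lives at module scope and the @spaces.GPU-decorated handler picks it up as a global, so nothing heavy is bound into the handler via functools.partial.

import gradio as gr
import spaces
import torch

# Load everything heavy at import time; as in the diff above, moving the
# model to CUDA at module level is fine on ZeroGPU.
device = torch.device("cuda")
model = torch.nn.Linear(4, 4).to(device)  # stand-in for the real model

@spaces.GPU(duration=180)
def generate(seed):
    # Only lightweight, picklable inputs are passed in; the model is
    # referenced from module scope inside the GPU task.
    torch.manual_seed(int(seed))
    with torch.no_grad():
        out = model(torch.randn(1, 4, device=device))
    return str(out.cpu().tolist())

with gr.Blocks() as demo:
    seed = gr.Number(value=0, label="seed")
    output = gr.Textbox(label="output")
    start = gr.Button("Run")
    # Bind the handler directly instead of wrapping it with functools.partial.
    start.click(generate, inputs=[seed], outputs=output)

demo.launch()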
It seems there have been no commits other than this one in the last few days, and the Space was working a few days ago, so it might be an infra issue. cc @cbensimon