Spaces:
Paused
Paused
alessandro trinca tornidor
committed on
Commit
·
acbbf71
1
Parent(s):
4d19eb4
[test] update inference function to return also output mask, useful for tests (now on saturncloud test.ipynb notebook)
Browse files- README.md +34 -1
- notebooks/test.ipynb +0 -0
- requirements_jupyter.txt +4 -0
- tests/__init__.py +0 -0
- tests/imgs/example1_mask_0.png +3 -0
- tests/test_app_helpers.py +88 -0
- utils/app_helpers.py +26 -23
README.md
CHANGED
@@ -7,7 +7,40 @@ sdk: docker
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
[![Gradio](https://img.shields.io/badge/Gradio-Online%20Demo-blue)](http://103.170.5.190:7860/)
|
13 |
[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/openxlab-app/LISA)
|
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
+
# exec jupyter on the remote server with port forwarding on localhost
|
11 |
+
|
12 |
+
1. checkout repo, install venv with jupyter
|
13 |
+
2. set up port forwarding on localhost with the private key: `ssh -i ~/.ssh/id_ecdsa_saturncloud [email protected] -L 8889:localhost:8889 -N -f`
|
14 |
+
3. start the jupyter-lab server
|
15 |
+
4. connect to page in localhost
|
16 |
+
|
17 |
+
## Commands to work on saturncloud after clone and git lfs install
|
18 |
+
```bash
|
19 |
+
cd ~/workspace/lisa-on-gpu/
|
20 |
+
rm -rf lisa_venv
|
21 |
+
python3 -m venv lisa_venv
|
22 |
+
ln -s lisa_venv/ venv
|
23 |
+
source venv/bin/activate
|
24 |
+
pip --version
|
25 |
+
which python
|
26 |
+
python -m pip install pip wheel --upgrade
|
27 |
+
python -m pip install pytest pytest-cov jupyterlab
|
28 |
+
python -m pip install -r requirements.txt
|
29 |
+
nohup jupyter-lab &
|
30 |
+
tail -F nohup.out
|
31 |
+
```
|
32 |
+
|
33 |
+
# Jupyterlab Howto
|
34 |
+
|
35 |
+
To run the `test.ipynb` notebook you should already have:
|
36 |
+
- cloned project https://huggingface.co/spaces/aletrn/lisa-on-gpu with active git lfs
|
37 |
+
- created and activated a virtualenv
|
38 |
+
- installed jupyterlab dependencies from requirements_jupyter.txt
|
39 |
+
- installed dependencies from requirements.txt
|
40 |
+
|
41 |
+
## Hardware requirements
|
42 |
+
- an NVIDIA GPU with 10 or 12 GB of memory (a T4 should suffice)
|
43 |
+
- at least 16 GB of system RAM
|
44 |
|
45 |
[![Gradio](https://img.shields.io/badge/Gradio-Online%20Demo-blue)](http://103.170.5.190:7860/)
|
46 |
[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/openxlab-app/LISA)
|
notebooks/test.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements_jupyter.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
jupyterlab
|
2 |
+
ipywidgets
|
3 |
+
pytest
|
4 |
+
pytest-cov
|
tests/__init__.py
ADDED
File without changes
|
tests/imgs/example1_mask_0.png
ADDED
Git LFS Details
|
tests/test_app_helpers.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import unittest
|
3 |
+
|
4 |
+
|
5 |
+
class TestAppBuilders(unittest.TestCase):
|
6 |
+
|
7 |
+
def test_default_creation(self):
|
8 |
+
from utils import utils
|
9 |
+
|
10 |
+
placeholders = utils.create_placeholder_variables()
|
11 |
+
self.assertIsInstance(placeholders, dict)
|
12 |
+
assert placeholders["no_seg_out"].shape == (512, 512, 3)
|
13 |
+
assert placeholders["error_happened"].shape == (512, 512, 3)
|
14 |
+
|
15 |
+
def test_parse_args(self):
|
16 |
+
from utils import app_helpers
|
17 |
+
|
18 |
+
test_args_parse = app_helpers.parse_args([])
|
19 |
+
assert vars(test_args_parse) == {
|
20 |
+
'version': 'xinlai/LISA-13B-llama2-v1-explanatory',
|
21 |
+
'vis_save_path': './vis_output',
|
22 |
+
'precision': 'fp16',
|
23 |
+
'image_size': 1024,
|
24 |
+
'model_max_length': 512,
|
25 |
+
'lora_r': 8,
|
26 |
+
'vision_tower': 'openai/clip-vit-large-patch14',
|
27 |
+
'local_rank': 0,
|
28 |
+
'load_in_8bit': False,
|
29 |
+
'load_in_4bit': True,
|
30 |
+
'use_mm_start_end': True,
|
31 |
+
'conv_type': 'llava_v1'
|
32 |
+
}
|
33 |
+
|
34 |
+
def test_inference(self):
|
35 |
+
import cv2
|
36 |
+
import numpy as np
|
37 |
+
from utils import app_helpers, constants, utils
|
38 |
+
|
39 |
+
max_diff = 0.02
|
40 |
+
|
41 |
+
logging.info("starting...")
|
42 |
+
logging.warning("Remember: before running again 'get_inference_model_by_args(test_args_parse)' free some memory")
|
43 |
+
test_args_parse = app_helpers.parse_args([])
|
44 |
+
inference_fn = app_helpers.get_inference_model_by_args(test_args_parse)
|
45 |
+
idx_example = 0
|
46 |
+
input_prompt, input_image_path = constants.examples[idx_example]
|
47 |
+
logging.info("running inference function with input prompt '{}'.".format(input_prompt))
|
48 |
+
_, output_mask, output_str = inference_fn(
|
49 |
+
input_prompt,
|
50 |
+
utils.ROOT / input_image_path
|
51 |
+
)
|
52 |
+
logging.info(f"output_str: {output_str}.")
|
53 |
+
expected_mask = cv2.imread(
|
54 |
+
str(utils.ROOT / "tests" / "imgs" / f"example{idx_example}_mask_0.png"),
|
55 |
+
cv2.IMREAD_GRAYSCALE
|
56 |
+
)
|
57 |
+
|
58 |
+
tot = output_mask.size
|
59 |
+
count = np.sum(output_mask != expected_mask)
|
60 |
+
perc = 100 * count / tot
|
61 |
+
|
62 |
+
logging.info(f"diff 1 vs 1b: {perc:.2f}!")
|
63 |
+
try:
|
64 |
+
assert np.array_equal(output_mask, expected_mask)
|
65 |
+
except AssertionError:
|
66 |
+
try:
|
67 |
+
logging.error("failed equality assertion!")
|
68 |
+
logging.info(f"assert now that perc diff between ndarrays is minor than {max_diff}.")
|
69 |
+
assert perc < max_diff
|
70 |
+
except AssertionError as ae:
|
71 |
+
logging.error("failed all assertions, writing debug files...")
|
72 |
+
import datetime
|
73 |
+
now_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
|
74 |
+
output_folder = utils.ROOT / "tests" / "imgs"
|
75 |
+
prefix = f"broken_test_example{idx_example + 1}_{now_str}"
|
76 |
+
cv2.imwrite(
|
77 |
+
str(output_folder / f"{prefix}.png"),
|
78 |
+
output_mask
|
79 |
+
)
|
80 |
+
with open(output_folder / f"{prefix}__input_prompt.txt",
|
81 |
+
"w") as dst:
|
82 |
+
dst.write(input_prompt)
|
83 |
+
with open(output_folder / f"{prefix}__output_str.txt",
|
84 |
+
"w") as dst:
|
85 |
+
dst.write(output_str)
|
86 |
+
logging.info(f"Written files with prefix '{prefix}' in {output_folder} folder.")
|
87 |
+
raise ae
|
88 |
+
logging.info("end")
|
utils/app_helpers.py
CHANGED
@@ -17,7 +17,6 @@ from model.llava import conversation as conversation_lib
|
|
17 |
from model.llava.mm_utils import tokenizer_image_token
|
18 |
from model.segment_anything.utils.transforms import ResizeLongestSide
|
19 |
|
20 |
-
|
21 |
placeholders = utils.create_placeholder_variables()
|
22 |
|
23 |
|
@@ -96,10 +95,10 @@ def set_image_precision_by_args(input_image, precision):
|
|
96 |
|
97 |
@session_logger.set_uuid_logging
|
98 |
def preprocess(
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
) -> torch.Tensor:
|
104 |
"""Normalize pixel values and pad to a square input."""
|
105 |
logging.info("preprocess started")
|
@@ -161,7 +160,8 @@ def get_model(args_to_parse):
|
|
161 |
}
|
162 |
)
|
163 |
_model = LISAForCausalLM.from_pretrained(
|
164 |
-
args_to_parse.version, low_cpu_mem_usage=True, vision_tower=args_to_parse.vision_tower,
|
|
|
165 |
)
|
166 |
_model.config.eos_token_id = _tokenizer.eos_token_id
|
167 |
_model.config.bos_token_id = _tokenizer.bos_token_id
|
@@ -207,7 +207,6 @@ def get_inference_model_by_args(args_to_parse):
|
|
207 |
@session_logger.set_uuid_logging
|
208 |
def inference(input_str, input_image_pathname):
|
209 |
## filter out special chars
|
210 |
-
|
211 |
input_str = get_cleaned_input(input_str)
|
212 |
logging.info(f"input_str type: {type(input_str)}, input_image type: {type(input_image_pathname)}.")
|
213 |
logging.info(f"input_str: {input_str}, input_image: {type(input_image_pathname)}.")
|
@@ -225,7 +224,7 @@ def get_inference_model_by_args(args_to_parse):
|
|
225 |
prompt = utils.DEFAULT_IMAGE_TOKEN + "\n" + prompt
|
226 |
if args_to_parse.use_mm_start_end:
|
227 |
replace_token = (
|
228 |
-
|
229 |
)
|
230 |
prompt = prompt.replace(utils.DEFAULT_IMAGE_TOKEN, replace_token)
|
231 |
|
@@ -276,25 +275,28 @@ def get_inference_model_by_args(args_to_parse):
|
|
276 |
text_output = text_output.replace("\n", "").replace(" ", " ")
|
277 |
text_output = text_output.split("ASSISTANT: ")[-1]
|
278 |
|
279 |
-
logging.info(
|
280 |
-
|
|
|
|
|
|
|
|
|
281 |
for i, pred_mask in enumerate(pred_masks):
|
282 |
-
if pred_mask.shape[0] == 0:
|
283 |
continue
|
284 |
-
|
285 |
pred_mask = pred_mask.detach().cpu().numpy()[0]
|
286 |
-
|
|
|
287 |
|
288 |
-
|
289 |
-
|
290 |
image_np * 0.5
|
291 |
-
+
|
292 |
-
)[
|
293 |
|
294 |
-
output_str = f"ASSISTANT: {text_output}"
|
295 |
-
output_image
|
296 |
-
|
297 |
-
return output_image, output_str
|
298 |
|
299 |
logging.info("prepared inference function!")
|
300 |
return inference
|
@@ -303,7 +305,7 @@ def get_inference_model_by_args(args_to_parse):
|
|
303 |
@session_logger.set_uuid_logging
|
304 |
def get_gradio_interface(
|
305 |
fn_inference: Callable
|
306 |
-
|
307 |
return gr.Interface(
|
308 |
fn_inference,
|
309 |
inputs=[
|
@@ -311,7 +313,8 @@ def get_gradio_interface(
|
|
311 |
gr.Image(type="filepath", label="Input Image")
|
312 |
],
|
313 |
outputs=[
|
314 |
-
gr.Image(type="pil", label="
|
|
|
315 |
gr.Textbox(lines=1, placeholder=None, label="Text Output")
|
316 |
],
|
317 |
title=constants.title,
|
|
|
17 |
from model.llava.mm_utils import tokenizer_image_token
|
18 |
from model.segment_anything.utils.transforms import ResizeLongestSide
|
19 |
|
|
|
20 |
placeholders = utils.create_placeholder_variables()
|
21 |
|
22 |
|
|
|
95 |
|
96 |
@session_logger.set_uuid_logging
|
97 |
def preprocess(
|
98 |
+
x,
|
99 |
+
pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
|
100 |
+
pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
|
101 |
+
img_size=1024,
|
102 |
) -> torch.Tensor:
|
103 |
"""Normalize pixel values and pad to a square input."""
|
104 |
logging.info("preprocess started")
|
|
|
160 |
}
|
161 |
)
|
162 |
_model = LISAForCausalLM.from_pretrained(
|
163 |
+
args_to_parse.version, low_cpu_mem_usage=True, vision_tower=args_to_parse.vision_tower,
|
164 |
+
seg_token_idx=args_to_parse.seg_token_idx, **kwargs
|
165 |
)
|
166 |
_model.config.eos_token_id = _tokenizer.eos_token_id
|
167 |
_model.config.bos_token_id = _tokenizer.bos_token_id
|
|
|
207 |
@session_logger.set_uuid_logging
|
208 |
def inference(input_str, input_image_pathname):
|
209 |
## filter out special chars
|
|
|
210 |
input_str = get_cleaned_input(input_str)
|
211 |
logging.info(f"input_str type: {type(input_str)}, input_image type: {type(input_image_pathname)}.")
|
212 |
logging.info(f"input_str: {input_str}, input_image: {type(input_image_pathname)}.")
|
|
|
224 |
prompt = utils.DEFAULT_IMAGE_TOKEN + "\n" + prompt
|
225 |
if args_to_parse.use_mm_start_end:
|
226 |
replace_token = (
|
227 |
+
utils.DEFAULT_IM_START_TOKEN + utils.DEFAULT_IMAGE_TOKEN + utils.DEFAULT_IM_END_TOKEN
|
228 |
)
|
229 |
prompt = prompt.replace(utils.DEFAULT_IMAGE_TOKEN, replace_token)
|
230 |
|
|
|
275 |
text_output = text_output.replace("\n", "").replace(" ", " ")
|
276 |
text_output = text_output.split("ASSISTANT: ")[-1]
|
277 |
|
278 |
+
logging.info(
|
279 |
+
f"found n {len(pred_masks)} prediction masks, "
|
280 |
+
f"text_output type: {type(text_output)}, text_output: {text_output}."
|
281 |
+
)
|
282 |
+
output_image = no_seg_out
|
283 |
+
output_mask = no_seg_out
|
284 |
for i, pred_mask in enumerate(pred_masks):
|
285 |
+
if pred_mask.shape[0] == 0 or pred_mask.shape[1] == 0:
|
286 |
continue
|
|
|
287 |
pred_mask = pred_mask.detach().cpu().numpy()[0]
|
288 |
+
pred_mask_bool = pred_mask > 0
|
289 |
+
output_mask = pred_mask_bool.astype(np.uint8) * 255
|
290 |
|
291 |
+
output_image = image_np.copy()
|
292 |
+
output_image[pred_mask_bool] = (
|
293 |
image_np * 0.5
|
294 |
+
+ pred_mask_bool[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
|
295 |
+
)[pred_mask_bool]
|
296 |
|
297 |
+
output_str = f"ASSISTANT: {text_output} ..."
|
298 |
+
logging.info(f"output_image type: {type(output_mask)}.")
|
299 |
+
return output_image, output_mask, output_str
|
|
|
300 |
|
301 |
logging.info("prepared inference function!")
|
302 |
return inference
|
|
|
305 |
@session_logger.set_uuid_logging
|
306 |
def get_gradio_interface(
|
307 |
fn_inference: Callable
|
308 |
+
):
|
309 |
return gr.Interface(
|
310 |
fn_inference,
|
311 |
inputs=[
|
|
|
313 |
gr.Image(type="filepath", label="Input Image")
|
314 |
],
|
315 |
outputs=[
|
316 |
+
gr.Image(type="pil", label="segmentation Output"),
|
317 |
+
gr.Image(type="pil", label="mask Output"),
|
318 |
gr.Textbox(lines=1, placeholder=None, label="Text Output")
|
319 |
],
|
320 |
title=constants.title,
|