alessandro trinca tornidor committed
Commit ca06190 · 1 parent: 9d7a440

[refactor] more refactor app.py/1
app.py
CHANGED
@@ -4,9 +4,9 @@ import re
 import sys
 from typing import Callable
 
-import nh3
 import cv2
 import gradio as gr
+import nh3
 import numpy as np
 import torch
 import torch.nn.functional as F
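This hunk only reorders the nh3 import alphabetically among the third-party imports; nh3 is an HTML/text sanitizer and presumably backs the "filter out special chars" step in inference() further down. A minimal sketch of that kind of call, assuming a hypothetical sanitize_prompt helper that is not part of this commit:

import nh3

def sanitize_prompt(raw_prompt: str) -> str:
    # Strip all HTML markup from user input before it reaches the prompt
    # template; an empty allow-list of tags removes every element.
    return nh3.clean(raw_prompt, tags=set())

# e.g. sanitize_prompt("<b>segment the dog</b>") -> "segment the dog"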
@@ -20,7 +20,56 @@ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
                          DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
 
 
-
+# Gradio
+examples = [
+    [
+        "Where can the driver see the car speed in this image? Please output segmentation mask.",
+        "./resources/imgs/example1.jpg",
+    ],
+    [
+        "Can you segment the food that tastes spicy and hot?",
+        "./resources/imgs/example2.jpg",
+    ],
+    [
+        "Assuming you are an autonomous driving robot, what part of the diagram would you manipulate to control the direction of travel? Please output segmentation mask and explain why.",
+        "./resources/imgs/example1.jpg",
+    ],
+    [
+        "What can make the woman stand higher? Please output segmentation mask and explain why.",
+        "./resources/imgs/example3.jpg",
+    ],
+]
+output_labels = ["Segmentation Output"]
+
+title = "LISA: Reasoning Segmentation via Large Language Model"
+
+description = """
+<font size=4>
+This is the online demo of LISA. \n
+If multiple users are using it at the same time, they will enter a queue, which may delay some time. \n
+**Note**: **Different prompts can lead to significantly varied results**. \n
+**Note**: Please try to **standardize** your input text prompts to **avoid ambiguity**, and also pay attention to whether the **punctuations** of the input are correct. \n
+**Note**: Current model is **LISA-13B-llama2-v0-explanatory**, and 4-bit quantization may impair text-generation quality. \n
+**Usage**: <br>
+ (1) To let LISA **segment something**, input prompt like: "Can you segment xxx in this image?", "What is xxx in this image? Please output segmentation mask."; <br>
+ (2) To let LISA **output an explanation**, input prompt like: "What is xxx in this image? Please output segmentation mask and explain why."; <br>
+ (3) To obtain **solely language output**, you can input like what you should do in current multi-modal LLM (e.g., LLaVA). <br>
+Hope you can enjoy our work!
+</font>
+"""
+
+article = """
+<p style='text-align: center'>
+<a href='https://arxiv.org/abs/2308.00692' target='_blank'>
+Preprint Paper
+</a>
+\n
+<p style='text-align: center'>
+<a href='https://github.com/dvlab-research/LISA' target='_blank'> Github Repo </a></p>
+"""
+
+
+def parse_args(args_to_parse):
     parser = argparse.ArgumentParser(description="LISA chat")
     parser.add_argument("--version", default="xinlai/LISA-13B-llama2-v1")
     parser.add_argument("--vis_save_path", default="./vis_output", type=str)
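This hunk moves the Gradio UI constants (examples, output_labels, title, description, article) to module level, ahead of the new parse_args(args_to_parse) definition. Each row of examples pairs a text prompt with an image path, in the order the demo's two inputs expect; a quick sanity check along those lines (not part of the commit) could be:

from pathlib import Path

# Every example row should be a [prompt, image_path] pair whose image file
# actually exists on disk before it is offered in the demo.
for prompt, image_path in examples:
    assert isinstance(prompt, str) and prompt
    assert Path(image_path).is_file(), f"missing example image: {image_path}"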
@@ -47,7 +96,7 @@ def parse_args(args):
         type=str,
         choices=["llava_v1", "llava_llama_2"],
     )
-    return parser.parse_args(
+    return parser.parse_args(args_to_parse)
 
 
 def preprocess(
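The return statement now parses the explicit args_to_parse list instead of relying on sys.argv implicitly, which makes the function easy to call both from the entry point and from tests. The call sites below are a sketch, not part of the diff:

import sys

# Production entry point: forward the real command-line arguments.
args = parse_args(sys.argv[1:])

# In a test, pass a synthetic argv instead of touching sys.argv.
test_args = parse_args(["--vis_save_path", "/tmp/vis_output"])
assert test_args.vis_save_path == "/tmp/vis_output"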
@@ -154,55 +203,6 @@ transform = ResizeLongestSide(args.image_size)
 model.eval()
 
 
-# Gradio
-examples = [
-    [
-        "Where can the driver see the car speed in this image? Please output segmentation mask.",
-        "./resources/imgs/example1.jpg",
-    ],
-    [
-        "Can you segment the food that tastes spicy and hot?",
-        "./resources/imgs/example2.jpg",
-    ],
-    [
-        "Assuming you are an autonomous driving robot, what part of the diagram would you manipulate to control the direction of travel? Please output segmentation mask and explain why.",
-        "./resources/imgs/example1.jpg",
-    ],
-    [
-        "What can make the woman stand higher? Please output segmentation mask and explain why.",
-        "./resources/imgs/example3.jpg",
-    ],
-]
-output_labels = ["Segmentation Output"]
-
-title = "LISA: Reasoning Segmentation via Large Language Model"
-
-description = """
-<font size=4>
-This is the online demo of LISA. \n
-If multiple users are using it at the same time, they will enter a queue, which may delay some time. \n
-**Note**: **Different prompts can lead to significantly varied results**. \n
-**Note**: Please try to **standardize** your input text prompts to **avoid ambiguity**, and also pay attention to whether the **punctuations** of the input are correct. \n
-**Note**: Current model is **LISA-13B-llama2-v0-explanatory**, and 4-bit quantization may impair text-generation quality. \n
-**Usage**: <br>
- (1) To let LISA **segment something**, input prompt like: "Can you segment xxx in this image?", "What is xxx in this image? Please output segmentation mask."; <br>
- (2) To let LISA **output an explanation**, input prompt like: "What is xxx in this image? Please output segmentation mask and explain why."; <br>
- (3) To obtain **solely language output**, you can input like what you should do in current multi-modal LLM (e.g., LLaVA). <br>
-Hope you can enjoy our work!
-</font>
-"""
-
-article = """
-<p style='text-align: center'>
-<a href='https://arxiv.org/abs/2308.00692' target='_blank'>
-Preprint Paper
-</a>
-\n
-<p style='text-align: center'>
-<a href='https://github.com/dvlab-research/LISA' target='_blank'> Github Repo </a></p>
-"""
-
-
 ## to be implemented
 def inference(input_str, input_image):
     ## filter out special chars
@@ -291,7 +291,6 @@ def inference(input_str, input_image):
     else:
         image = image.float()
 
-
     input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
     input_ids = input_ids.unsqueeze(0).cuda()
 
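Only a stray blank line is dropped here, but the surrounding lines show the pattern: tokenizer_image_token returns a 1-D tensor of token ids, and unsqueeze(0) adds the batch dimension the model expects before the tensor is moved to the GPU. A tiny shape-only illustration with dummy ids (not the real tokenizer output):

import torch

input_ids = torch.tensor([1, 32000, 29871])   # dummy token ids, shapes only
print(input_ids.shape)               # torch.Size([3])
print(input_ids.unsqueeze(0).shape)  # torch.Size([1, 3])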
@@ -334,7 +333,6 @@ def inference(input_str, input_image):
     return output_image, output_str
 
 
-
 def server_runner(
     fn_inference: Callable,
     debug: bool = False,
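server_runner takes the inference callable plus launch options such as debug, so it presumably just builds the Gradio interface from the module-level constants and launches it. A minimal sketch under that assumption; only the signature above appears in the diff, everything in the body is assumed:

from typing import Callable

import gradio as gr


def server_runner(
    fn_inference: Callable,
    debug: bool = False,
) -> None:
    # Assumed body: wire the module-level UI constants to the inference
    # callable and launch the demo.
    io = gr.Interface(
        fn=fn_inference,
        inputs=[gr.Textbox(label="Prompt"), gr.Image(type="filepath")],
        outputs=[gr.Image(label=output_labels[0]), gr.Textbox(label="Text Output")],
        examples=examples,
        title=title,
        description=description,
        article=article,
    )
    io.queue().launch(debug=debug)


# server_runner(inference, debug=True)  # hypothetical call site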