alessandro trinca tornidor committed
Commit ca06190 · 1 Parent(s): 9d7a440

[refactor] more refactor app.py/1

Files changed (1): app.py (+52, -54)
app.py CHANGED
@@ -4,9 +4,9 @@ import re
 import sys
 from typing import Callable

-import nh3
 import cv2
 import gradio as gr
+import nh3
 import numpy as np
 import torch
 import torch.nn.functional as F
@@ -20,7 +20,56 @@ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
                          DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)


-def parse_args(args):
+# Gradio
+examples = [
+    [
+        "Where can the driver see the car speed in this image? Please output segmentation mask.",
+        "./resources/imgs/example1.jpg",
+    ],
+    [
+        "Can you segment the food that tastes spicy and hot?",
+        "./resources/imgs/example2.jpg",
+    ],
+    [
+        "Assuming you are an autonomous driving robot, what part of the diagram would you manipulate to control the direction of travel? Please output segmentation mask and explain why.",
+        "./resources/imgs/example1.jpg",
+    ],
+    [
+        "What can make the woman stand higher? Please output segmentation mask and explain why.",
+        "./resources/imgs/example3.jpg",
+    ],
+]
+output_labels = ["Segmentation Output"]
+
+title = "LISA: Reasoning Segmentation via Large Language Model"
+
+description = """
+<font size=4>
+This is the online demo of LISA. \n
+If multiple users are using it at the same time, they will enter a queue, which may delay some time. \n
+**Note**: **Different prompts can lead to significantly varied results**. \n
+**Note**: Please try to **standardize** your input text prompts to **avoid ambiguity**, and also pay attention to whether the **punctuations** of the input are correct. \n
+**Note**: Current model is **LISA-13B-llama2-v0-explanatory**, and 4-bit quantization may impair text-generation quality. \n
+**Usage**: <br>
+&ensp;(1) To let LISA **segment something**, input prompt like: "Can you segment xxx in this image?", "What is xxx in this image? Please output segmentation mask."; <br>
+&ensp;(2) To let LISA **output an explanation**, input prompt like: "What is xxx in this image? Please output segmentation mask and explain why."; <br>
+&ensp;(3) To obtain **solely language output**, you can input like what you should do in current multi-modal LLM (e.g., LLaVA). <br>
+Hope you can enjoy our work!
+</font>
+"""
+
+article = """
+<p style='text-align: center'>
+<a href='https://arxiv.org/abs/2308.00692' target='_blank'>
+Preprint Paper
+</a>
+\n
+<p style='text-align: center'>
+<a href='https://github.com/dvlab-research/LISA' target='_blank'> Github Repo </a></p>
+"""
+
+
+def parse_args(args_to_parse):
     parser = argparse.ArgumentParser(description="LISA chat")
     parser.add_argument("--version", default="xinlai/LISA-13B-llama2-v1")
     parser.add_argument("--vis_save_path", default="./vis_output", type=str)
@@ -47,7 +96,7 @@ def parse_args(args):
         type=str,
         choices=["llava_v1", "llava_llama_2"],
     )
-    return parser.parse_args(args)
+    return parser.parse_args(args_to_parse)


 def preprocess(
@@ -154,55 +203,6 @@ transform = ResizeLongestSide(args.image_size)
 model.eval()


-# Gradio
-examples = [
-    [
-        "Where can the driver see the car speed in this image? Please output segmentation mask.",
-        "./resources/imgs/example1.jpg",
-    ],
-    [
-        "Can you segment the food that tastes spicy and hot?",
-        "./resources/imgs/example2.jpg",
-    ],
-    [
-        "Assuming you are an autonomous driving robot, what part of the diagram would you manipulate to control the direction of travel? Please output segmentation mask and explain why.",
-        "./resources/imgs/example1.jpg",
-    ],
-    [
-        "What can make the woman stand higher? Please output segmentation mask and explain why.",
-        "./resources/imgs/example3.jpg",
-    ],
-]
-output_labels = ["Segmentation Output"]
-
-title = "LISA: Reasoning Segmentation via Large Language Model"
-
-description = """
-<font size=4>
-This is the online demo of LISA. \n
-If multiple users are using it at the same time, they will enter a queue, which may delay some time. \n
-**Note**: **Different prompts can lead to significantly varied results**. \n
-**Note**: Please try to **standardize** your input text prompts to **avoid ambiguity**, and also pay attention to whether the **punctuations** of the input are correct. \n
-**Note**: Current model is **LISA-13B-llama2-v0-explanatory**, and 4-bit quantization may impair text-generation quality. \n
-**Usage**: <br>
-&ensp;(1) To let LISA **segment something**, input prompt like: "Can you segment xxx in this image?", "What is xxx in this image? Please output segmentation mask."; <br>
-&ensp;(2) To let LISA **output an explanation**, input prompt like: "What is xxx in this image? Please output segmentation mask and explain why."; <br>
-&ensp;(3) To obtain **solely language output**, you can input like what you should do in current multi-modal LLM (e.g., LLaVA). <br>
-Hope you can enjoy our work!
-</font>
-"""
-
-article = """
-<p style='text-align: center'>
-<a href='https://arxiv.org/abs/2308.00692' target='_blank'>
-Preprint Paper
-</a>
-\n
-<p style='text-align: center'>
-<a href='https://github.com/dvlab-research/LISA' target='_blank'> Github Repo </a></p>
-"""
-
-
 ## to be implemented
 def inference(input_str, input_image):
     ## filter out special chars
@@ -291,7 +291,6 @@ def inference(input_str, input_image):
     else:
         image = image.float()

-
     input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
     input_ids = input_ids.unsqueeze(0).cuda()

@@ -334,7 +333,6 @@ def inference(input_str, input_image):
     return output_image, output_str


-
 def server_runner(
     fn_inference: Callable,
     debug: bool = False,
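
The commit moves `import nh3` to sit with the other third-party imports while keeping the `## filter out special chars` comment at the top of `inference()`. For context, below is a minimal sketch of how `nh3` is commonly used to sanitize a user-supplied prompt; it is an illustration only, not code from this commit, and the helper name `sanitize_prompt` is hypothetical.

```python
import nh3


def sanitize_prompt(input_str: str) -> str:
    # Hypothetical helper: nh3.clean() drops HTML markup that is not on
    # nh3's default allow-list (script/style content included), returning
    # a safer plain-text prompt.
    return nh3.clean(input_str)


if __name__ == "__main__":
    print(sanitize_prompt("Can you segment the <script>alert('x')</script> car?"))
```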
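The `parse_args` rename (`args` to `args_to_parse`) keeps the function parameter from being confused with the module-level `args` used later in the file (e.g. `transform = ResizeLongestSide(args.image_size)`). A minimal usage sketch, not taken from app.py, using only flags that appear in the diff and assuming `parse_args` is importable from the module:

```python
# Illustrative only: exercising the refactored parse_args() with an explicit
# argument list instead of sys.argv. Importing app also runs its module-level
# setup, so in practice this would be done inside the module itself.
from app import parse_args

args = parse_args(["--vis_save_path", "./vis_output"])
print(args.version)  # "xinlai/LISA-13B-llama2-v1" unless overridden on the CLI
```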