alessandro trinca tornidor committed
Commit ca06190 · 1 parent: 9d7a440

[refactor] more refactor app.py/1
app.py
CHANGED
@@ -4,9 +4,9 @@ import re
 import sys
 from typing import Callable
 
-import nh3
 import cv2
 import gradio as gr
+import nh3
 import numpy as np
 import torch
 import torch.nn.functional as F
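This hunk only reorders the nh3 import alphabetically among the third-party imports; nh3 is an HTML/text sanitizer and presumably backs the "filter out special chars" step in inference() further down. A minimal sketch of that kind of call, assuming a hypothetical sanitize_prompt helper that is not part of this commit:

import nh3

def sanitize_prompt(raw_prompt: str) -> str:
    # Strip all HTML markup from user input before it reaches the prompt
    # template; an empty allow-list of tags removes every element.
    return nh3.clean(raw_prompt, tags=set())

# e.g. sanitize_prompt("<b>segment the dog</b>") -> "segment the dog"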
@@ -20,7 +20,56 @@ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
                          DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
 
 
-
+# Gradio
+examples = [
+    [
+        "Where can the driver see the car speed in this image? Please output segmentation mask.",
+        "./resources/imgs/example1.jpg",
+    ],
+    [
+        "Can you segment the food that tastes spicy and hot?",
+        "./resources/imgs/example2.jpg",
+    ],
+    [
+        "Assuming you are an autonomous driving robot, what part of the diagram would you manipulate to control the direction of travel? Please output segmentation mask and explain why.",
+        "./resources/imgs/example1.jpg",
+    ],
+    [
+        "What can make the woman stand higher? Please output segmentation mask and explain why.",
+        "./resources/imgs/example3.jpg",
+    ],
+]
+output_labels = ["Segmentation Output"]
+
+title = "LISA: Reasoning Segmentation via Large Language Model"
+
+description = """
+<font size=4>
+This is the online demo of LISA. \n
+If multiple users are using it at the same time, they will enter a queue, which may delay some time. \n
+**Note**: **Different prompts can lead to significantly varied results**. \n
+**Note**: Please try to **standardize** your input text prompts to **avoid ambiguity**, and also pay attention to whether the **punctuations** of the input are correct. \n
+**Note**: Current model is **LISA-13B-llama2-v0-explanatory**, and 4-bit quantization may impair text-generation quality. \n
+**Usage**: <br>
+ (1) To let LISA **segment something**, input prompt like: "Can you segment xxx in this image?", "What is xxx in this image? Please output segmentation mask."; <br>
+ (2) To let LISA **output an explanation**, input prompt like: "What is xxx in this image? Please output segmentation mask and explain why."; <br>
+ (3) To obtain **solely language output**, you can input like what you should do in current multi-modal LLM (e.g., LLaVA). <br>
+Hope you can enjoy our work!
+</font>
+"""
+
+article = """
+<p style='text-align: center'>
+<a href='https://arxiv.org/abs/2308.00692' target='_blank'>
+Preprint Paper
+</a>
+\n
+<p style='text-align: center'>
+<a href='https://github.com/dvlab-research/LISA' target='_blank'> Github Repo </a></p>
+"""
+
+
+def parse_args(args_to_parse):
     parser = argparse.ArgumentParser(description="LISA chat")
     parser.add_argument("--version", default="xinlai/LISA-13B-llama2-v1")
     parser.add_argument("--vis_save_path", default="./vis_output", type=str)
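This hunk moves the Gradio UI constants (examples, output_labels, title, description, article) to module level, ahead of the new parse_args(args_to_parse) definition. Each row of examples pairs a text prompt with an image path, in the order the demo's two inputs expect; a quick sanity check along those lines (not part of the commit) could be:

from pathlib import Path

# Every example row should be a [prompt, image_path] pair whose image file
# actually exists on disk before it is offered in the demo.
for prompt, image_path in examples:
    assert isinstance(prompt, str) and prompt
    assert Path(image_path).is_file(), f"missing example image: {image_path}"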
@@ -47,7 +96,7 @@ def parse_args(args):
         type=str,
         choices=["llava_v1", "llava_llama_2"],
     )
-    return parser.parse_args(
+    return parser.parse_args(args_to_parse)
 
 
 def preprocess(
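The return statement now parses the explicit args_to_parse list instead of relying on sys.argv implicitly, which makes the function easy to call both from the entry point and from tests. The call sites below are a sketch, not part of the diff:

import sys

# Production entry point: forward the real command-line arguments.
args = parse_args(sys.argv[1:])

# In a test, pass a synthetic argv instead of touching sys.argv.
test_args = parse_args(["--vis_save_path", "/tmp/vis_output"])
assert test_args.vis_save_path == "/tmp/vis_output"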
@@ -154,55 +203,6 @@ transform = ResizeLongestSide(args.image_size)
 model.eval()
 
 
-# Gradio
-examples = [
-    [
-        "Where can the driver see the car speed in this image? Please output segmentation mask.",
-        "./resources/imgs/example1.jpg",
-    ],
-    [
-        "Can you segment the food that tastes spicy and hot?",
-        "./resources/imgs/example2.jpg",
-    ],
-    [
-        "Assuming you are an autonomous driving robot, what part of the diagram would you manipulate to control the direction of travel? Please output segmentation mask and explain why.",
-        "./resources/imgs/example1.jpg",
-    ],
-    [
-        "What can make the woman stand higher? Please output segmentation mask and explain why.",
-        "./resources/imgs/example3.jpg",
-    ],
-]
-output_labels = ["Segmentation Output"]
-
-title = "LISA: Reasoning Segmentation via Large Language Model"
-
-description = """
-<font size=4>
-This is the online demo of LISA. \n
-If multiple users are using it at the same time, they will enter a queue, which may delay some time. \n
-**Note**: **Different prompts can lead to significantly varied results**. \n
-**Note**: Please try to **standardize** your input text prompts to **avoid ambiguity**, and also pay attention to whether the **punctuations** of the input are correct. \n
-**Note**: Current model is **LISA-13B-llama2-v0-explanatory**, and 4-bit quantization may impair text-generation quality. \n
-**Usage**: <br>
- (1) To let LISA **segment something**, input prompt like: "Can you segment xxx in this image?", "What is xxx in this image? Please output segmentation mask."; <br>
- (2) To let LISA **output an explanation**, input prompt like: "What is xxx in this image? Please output segmentation mask and explain why."; <br>
- (3) To obtain **solely language output**, you can input like what you should do in current multi-modal LLM (e.g., LLaVA). <br>
-Hope you can enjoy our work!
-</font>
-"""
-
-article = """
-<p style='text-align: center'>
-<a href='https://arxiv.org/abs/2308.00692' target='_blank'>
-Preprint Paper
-</a>
-\n
-<p style='text-align: center'>
-<a href='https://github.com/dvlab-research/LISA' target='_blank'> Github Repo </a></p>
-"""
-
-
 ## to be implemented
 def inference(input_str, input_image):
     ## filter out special chars
@@ -291,7 +291,6 @@ def inference(input_str, input_image):
     else:
         image = image.float()
 
-
     input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
     input_ids = input_ids.unsqueeze(0).cuda()
 
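Only a stray blank line is dropped here, but the surrounding lines show the pattern: tokenizer_image_token returns a 1-D tensor of token ids, and unsqueeze(0) adds the batch dimension the model expects before the tensor is moved to the GPU. A tiny shape-only illustration with dummy ids (not the real tokenizer output):

import torch

input_ids = torch.tensor([1, 32000, 29871])   # dummy token ids, shapes only
print(input_ids.shape)               # torch.Size([3])
print(input_ids.unsqueeze(0).shape)  # torch.Size([1, 3])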
@@ -334,7 +333,6 @@ def inference(input_str, input_image):
     return output_image, output_str
 
 
-
 def server_runner(
     fn_inference: Callable,
     debug: bool = False,
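server_runner takes the inference callable plus launch options such as debug, so it presumably just builds the Gradio interface from the module-level constants and launches it. A minimal sketch under that assumption; only the signature above appears in the diff, everything in the body is assumed:

from typing import Callable

import gradio as gr


def server_runner(
    fn_inference: Callable,
    debug: bool = False,
) -> None:
    # Assumed body: wire the module-level UI constants to the inference
    # callable and launch the demo.
    io = gr.Interface(
        fn=fn_inference,
        inputs=[gr.Textbox(label="Prompt"), gr.Image(type="filepath")],
        outputs=[gr.Image(label=output_labels[0]), gr.Textbox(label="Text Output")],
        examples=examples,
        title=title,
        description=description,
        article=article,
    )
    io.queue().launch(debug=debug)


# server_runner(inference, debug=True)  # hypothetical call site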