Muhusystem committed
Commit a5ee181 · Parent: a0803c3

Split text prediction and attribution analysis into separate buttons

Files changed (1): app.py +24 -27
app.py CHANGED
@@ -61,11 +61,9 @@ feature_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-2
 
 # Define the inference function
 def predict_text(image, text):
-    # Process the image
     image = Image.fromarray(image)
     image_features = feature_extractor(images=image, return_tensors="pt")
 
-    # Process the text
     inputs = tokenizer.encode_plus(
         f"Question: {text} Answer:",
         return_tensors="pt",
@@ -78,7 +76,6 @@ def predict_text(image, text):
     attention_mask = inputs["attention_mask"].long()
     pixel_values = image_features["pixel_values"]
 
-    # Run inference
     with torch.no_grad():
         logits = model(input_ids, attention_mask, pixel_values)
         prediction = torch.argmax(logits, dim=1).item()
@@ -87,11 +84,9 @@ def predict_text(image, text):
 
 # Define the attribution analysis function
 def generate_attribution(image, text):
-    # Process the image
     image = Image.fromarray(image)
     image_features = feature_extractor(images=image, return_tensors="pt")
 
-    # Process the text
     inputs = tokenizer.encode_plus(
         f"Question: {text} Answer:",
         return_tensors="pt",
@@ -104,7 +99,6 @@ def generate_attribution(image, text):
     attention_mask = inputs["attention_mask"].long()
     pixel_values = image_features["pixel_values"]
 
-    # Inference and attribution analysis
     with torch.no_grad():
         logits = model(input_ids, attention_mask, pixel_values)
         prediction = torch.argmax(logits, dim=1).item()
@@ -117,36 +111,39 @@ def generate_attribution(image, text):
         return_convergence_delta=True
     )
 
-    # Post-process the attribution image
     attribution_image = attributions.squeeze().cpu().numpy()
     attribution_image = (attribution_image - attribution_image.min()) / (attribution_image.max() - attribution_image.min())
     attribution_image = np.uint8(255 * attribution_image)
     attribution_image_real = convert_tensor_to_pil(attribution_image)
 
-    # Contour detection
     attribution_gray = cv2.cvtColor(np.array(attribution_image_real), cv2.COLOR_RGB2GRAY)
     _, binary_mask = cv2.threshold(attribution_gray, 128, 255, cv2.THRESH_BINARY)
     contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    contour_image = np.array(attribution_image_real)
-    cv2.drawContours(contour_image, contours, -1, (255, 0, 0), 2)
 
-    # Original image, heatmap, and image with contours
-    return Image.fromarray(image_features['pixel_values'][0].byte().permute(1, 2, 0).numpy()), Image.fromarray(contour_image)
+    original_image = convert_tensor_to_pil(pixel_values.squeeze(0).numpy())
+    original_image_np = np.array(original_image)
+    cv2.drawContours(original_image_np, contours, -1, (255, 0, 0), 2)
+
+    return attribution_image_real, Image.fromarray(original_image_np)
 
 # Create the Gradio interface
-text_button = gr.Interface(
-    fn=predict_text,
-    inputs=["image", "text"],
-    outputs="text",
-    title="Multi-modal Inference: Text Prediction"
-)
-
-attribution_button = gr.Interface(
-    fn=generate_attribution,
-    inputs=["image", "text"],
-    outputs=[gr.Image(), gr.Image()],
-    title="Multi-modal Inference: Attribution Analysis"
-)
-
-demo = gr.TabbedInterface([text_button, attribution_button], ["Text Prediction", "Attribution Analysis"])
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(label="Input Image", interactive=True)
+            question_input = gr.Textbox(label="Question", lines=2, interactive=True)
+            clear_button = gr.Button("Clear")
+        with gr.Column():
+            predict_button = gr.Button("Answer")
+            prediction_output = gr.Textbox(label="Answer", lines=2, interactive=False)
+            attribution_button = gr.Button("Generate Attribution")
+    with gr.Row():
+        attribution_image_1 = gr.Image(label="Attribution Image", interactive=False)
+        attribution_image_2 = gr.Image(label="Attribution with Contours", interactive=False)
+
+    # Bind button events
+    predict_button.click(predict_text, inputs=[input_image, question_input], outputs=prediction_output)
+    attribution_button.click(generate_attribution, inputs=[input_image, question_input], outputs=[attribution_image_1, attribution_image_2])
+    clear_button.click(lambda: (None, "", ""), outputs=[input_image, question_input, prediction_output])
+
 demo.launch()
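Note: both the old and new versions of generate_attribution call a convert_tensor_to_pil helper that is defined earlier in app.py, outside these hunks. A minimal sketch of what such a helper might look like, assuming it must accept both the uint8 HWC attribution map and the float CHW pixel_values array (this is an illustration, not the file's actual implementation):

import numpy as np
from PIL import Image

def convert_tensor_to_pil(array):
    # Hypothetical helper: the real one lives outside the diff.
    array = np.asarray(array)
    # Move channels-first (C, H, W) data to channels-last (H, W, C),
    # as needed for the squeezed pixel_values tensor.
    if array.ndim == 3 and array.shape[0] in (1, 3):
        array = array.transpose(1, 2, 0)
    # Rescale non-uint8 data (e.g. normalized pixel values) into [0, 255].
    if array.dtype != np.uint8:
        array = array.astype(np.float32)
        array -= array.min()
        if array.max() > 0:
            array /= array.max()
        array = (255 * array).astype(np.uint8)
    return Image.fromarray(array.squeeze())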
 
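The attribute(...) call that produces attributions falls in lines elided between the last two hunks; only its closing return_convergence_delta=True is visible. A hedged sketch of a Captum IntegratedGradients setup consistent with that keyword, where the forward wrapper, target choice, and argument order are all assumptions rather than the app's actual code:

from captum.attr import IntegratedGradients

# Wrap the model so Captum attributes with respect to pixel_values,
# passing the text tensors through unchanged.
def forward_fn(pixel_values, input_ids, attention_mask):
    return model(input_ids, attention_mask, pixel_values)

ig = IntegratedGradients(forward_fn)
attributions, delta = ig.attribute(
    pixel_values,
    additional_forward_args=(input_ids, attention_mask),
    target=prediction,  # attribute the predicted class
    return_convergence_delta=True
)

One caveat: integrated gradients relies on autograd, so a call like this has to run with gradients enabled, outside any torch.no_grad() block.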