Muhusystem committed
Commit a5ee181 · Parent: a0803c3

Split text prediction and attribution analysis into separate buttons

Files changed (1): app.py +24 -27
app.py CHANGED
@@ -61,11 +61,9 @@ feature_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-2
 
 # Define the inference function
 def predict_text(image, text):
-    # Process the image
     image = Image.fromarray(image)
     image_features = feature_extractor(images=image, return_tensors="pt")
 
-    # Process the text
     inputs = tokenizer.encode_plus(
         f"Question: {text} Answer:",
         return_tensors="pt",
@@ -78,7 +76,6 @@ def predict_text(image, text):
     attention_mask = inputs["attention_mask"].long()
     pixel_values = image_features["pixel_values"]
 
-    # Run inference
     with torch.no_grad():
         logits = model(input_ids, attention_mask, pixel_values)
         prediction = torch.argmax(logits, dim=1).item()
@@ -87,11 +84,9 @@ def predict_text(image, text):
 
 # Define the attribution analysis function
 def generate_attribution(image, text):
-    # Process the image
     image = Image.fromarray(image)
     image_features = feature_extractor(images=image, return_tensors="pt")
 
-    # Process the text
     inputs = tokenizer.encode_plus(
         f"Question: {text} Answer:",
         return_tensors="pt",
@@ -104,7 +99,6 @@ def generate_attribution(image, text):
     attention_mask = inputs["attention_mask"].long()
     pixel_values = image_features["pixel_values"]
 
-    # Inference and attribution analysis
     with torch.no_grad():
         logits = model(input_ids, attention_mask, pixel_values)
         prediction = torch.argmax(logits, dim=1).item()
@@ -117,36 +111,39 @@ def generate_attribution(image, text):
         return_convergence_delta=True
     )
 
-    # Post-process the attribution image
     attribution_image = attributions.squeeze().cpu().numpy()
     attribution_image = (attribution_image - attribution_image.min()) / (attribution_image.max() - attribution_image.min())
     attribution_image = np.uint8(255 * attribution_image)
     attribution_image_real = convert_tensor_to_pil(attribution_image)
 
-    # Contour detection
     attribution_gray = cv2.cvtColor(np.array(attribution_image_real), cv2.COLOR_RGB2GRAY)
     _, binary_mask = cv2.threshold(attribution_gray, 128, 255, cv2.THRESH_BINARY)
     contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    contour_image = np.array(attribution_image_real)
-    cv2.drawContours(contour_image, contours, -1, (255, 0, 0), 2)
 
-    # Original image, heatmap, and image with contours
-    return Image.fromarray(image_features['pixel_values'][0].byte().permute(1, 2, 0).numpy()), Image.fromarray(contour_image)
+    original_image = convert_tensor_to_pil(pixel_values.squeeze(0).numpy())
+    original_image_np = np.array(original_image)
+    cv2.drawContours(original_image_np, contours, -1, (255, 0, 0), 2)
+
+    return attribution_image_real, Image.fromarray(original_image_np)
 
 # Create the Gradio interface
-text_button = gr.Interface(
-    fn=predict_text,
-    inputs=["image", "text"],
-    outputs="text",
-    title="Multi-modal Inference: Text Prediction"
-)
-
-attribution_button = gr.Interface(
-    fn=generate_attribution,
-    inputs=["image", "text"],
-    outputs=[gr.Image(), gr.Image()],
-    title="Multi-modal Inference: Attribution Analysis"
-)
-
-demo = gr.TabbedInterface([text_button, attribution_button], ["Text Prediction", "Attribution Analysis"])
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(label="Input Image", interactive=True)
+            question_input = gr.Textbox(label="Question", lines=2, interactive=True)
+            clear_button = gr.Button("Clear")
+        with gr.Column():
+            predict_button = gr.Button("Answer")
+            prediction_output = gr.Textbox(label="Answer", lines=2, interactive=False)
+            attribution_button = gr.Button("Generate Attribution")
+    with gr.Row():
+        attribution_image_1 = gr.Image(label="Attribution Image", interactive=False)
+        attribution_image_2 = gr.Image(label="Attribution with Contours", interactive=False)
+
+    # Bind button events
+    predict_button.click(predict_text, inputs=[input_image, question_input], outputs=prediction_output)
+    attribution_button.click(generate_attribution, inputs=[input_image, question_input], outputs=[attribution_image_1, attribution_image_2])
+    clear_button.click(lambda: (None, "", ""), outputs=[input_image, question_input, prediction_output])
+
 demo.launch()
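Note: both the old and new versions of generate_attribution call a convert_tensor_to_pil helper that is defined earlier in app.py, outside these hunks. A minimal sketch of what such a helper might look like, assuming it must accept both the uint8 HWC attribution map and the float CHW pixel_values array (this is an illustration, not the file's actual implementation):

import numpy as np
from PIL import Image

def convert_tensor_to_pil(array):
    # Hypothetical helper: the real one lives outside the diff.
    array = np.asarray(array)
    # Move channels-first (C, H, W) data to channels-last (H, W, C),
    # as needed for the squeezed pixel_values tensor.
    if array.ndim == 3 and array.shape[0] in (1, 3):
        array = array.transpose(1, 2, 0)
    # Rescale non-uint8 data (e.g. normalized pixel values) into [0, 255].
    if array.dtype != np.uint8:
        array = array.astype(np.float32)
        array -= array.min()
        if array.max() > 0:
            array /= array.max()
        array = (255 * array).astype(np.uint8)
    return Image.fromarray(array.squeeze())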
 
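The attribute(...) call that produces attributions falls in lines elided between the last two hunks; only its closing return_convergence_delta=True is visible. A hedged sketch of a Captum IntegratedGradients setup consistent with that keyword, where the forward wrapper, target choice, and argument order are all assumptions rather than the app's actual code:

from captum.attr import IntegratedGradients

# Wrap the model so Captum attributes with respect to pixel_values,
# passing the text tensors through unchanged.
def forward_fn(pixel_values, input_ids, attention_mask):
    return model(input_ids, attention_mask, pixel_values)

ig = IntegratedGradients(forward_fn)
attributions, delta = ig.attribute(
    pixel_values,
    additional_forward_args=(input_ids, attention_mask),
    target=prediction,  # attribute the predicted class
    return_convergence_delta=True
)

One caveat: integrated gradients relies on autograd, so a call like this has to run with gradients enabled, outside any torch.no_grad() block.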