Muhusystem committed on
Commit
a0803c3
·
1 Parent(s): fc7ef08

Split text prediction and attribution analysis into separate buttons

Browse files
Files changed (1) hide show
  1. app.py +27 -26
app.py CHANGED
@@ -61,9 +61,11 @@ feature_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-2
61
 
62
  # 定义推理函数
63
  def predict_text(image, text):
 
64
  image = Image.fromarray(image)
65
  image_features = feature_extractor(images=image, return_tensors="pt")
66
 
 
67
  inputs = tokenizer.encode_plus(
68
  f"Question: {text} Answer:",
69
  return_tensors="pt",
@@ -76,6 +78,7 @@ def predict_text(image, text):
76
  attention_mask = inputs["attention_mask"].long()
77
  pixel_values = image_features["pixel_values"]
78
 
 
79
  with torch.no_grad():
80
  logits = model(input_ids, attention_mask, pixel_values)
81
  prediction = torch.argmax(logits, dim=1).item()
@@ -84,9 +87,11 @@ def predict_text(image, text):
84
 
85
  # 定义归因分析函数
86
  def generate_attribution(image, text):
 
87
  image = Image.fromarray(image)
88
  image_features = feature_extractor(images=image, return_tensors="pt")
89
 
 
90
  inputs = tokenizer.encode_plus(
91
  f"Question: {text} Answer:",
92
  return_tensors="pt",
@@ -99,6 +104,7 @@ def generate_attribution(image, text):
99
  attention_mask = inputs["attention_mask"].long()
100
  pixel_values = image_features["pixel_values"]
101
 
 
102
  with torch.no_grad():
103
  logits = model(input_ids, attention_mask, pixel_values)
104
  prediction = torch.argmax(logits, dim=1).item()
@@ -111,41 +117,36 @@ def generate_attribution(image, text):
111
  return_convergence_delta=True
112
  )
113
 
 
114
  attribution_image = attributions.squeeze().cpu().numpy()
115
  attribution_image = (attribution_image - attribution_image.min()) / (attribution_image.max() - attribution_image.min())
116
  attribution_image = np.uint8(255 * attribution_image)
117
  attribution_image_real = convert_tensor_to_pil(attribution_image)
118
 
 
119
  attribution_gray = cv2.cvtColor(np.array(attribution_image_real), cv2.COLOR_RGB2GRAY)
120
  _, binary_mask = cv2.threshold(attribution_gray, 128, 255, cv2.THRESH_BINARY)
121
  contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 
 
122
 
123
- original_image = convert_tensor_to_pil(pixel_values.squeeze(0).numpy())
124
- original_image_np = np.array(original_image)
125
- cv2.drawContours(original_image_np, contours, -1, (255, 0, 0), 2)
126
-
127
- return attribution_image_real, Image.fromarray(original_image_np)
128
 
129
  # 创建 Gradio 界面
130
- with gr.Blocks() as demo:
131
- with gr.Row():
132
- with gr.Column():
133
- input_image = gr.Image(label="Input Image", type="pil", height=400)
134
- question_input = gr.Textbox(label="Question", lines=3, max_lines=3)
135
- clear_button = gr.Button("Clear")
136
- with gr.Column():
137
- with gr.Row():
138
- predict_button = gr.Button("Answer")
139
- prediction_output = gr.Textbox(label="Answer", lines=3, max_lines=3)
140
- with gr.Row():
141
- attribution_button = gr.Button("Generate Attribution")
142
- with gr.Row():
143
- attribution_image_1 = gr.Image(label="Attribution Image", height=400)
144
- attribution_image_2 = gr.Image(label="Attribution with Contours", height=400)
145
-
146
- predict_button.click(predict_text, inputs=[input_image, question_input], outputs=prediction_output)
147
- attribution_button.click(generate_attribution, inputs=[input_image, question_input], outputs=[attribution_image_1, attribution_image_2])
148
- clear_button.click(lambda: (None, "", ""), outputs=[input_image, question_input, prediction_output])
149
-
150
- # 启动 Gradio 界面
151
  demo.launch()
 
61
 
62
  # 定义推理函数
63
  def predict_text(image, text):
64
+ # 处理图像
65
  image = Image.fromarray(image)
66
  image_features = feature_extractor(images=image, return_tensors="pt")
67
 
68
+ # 处理文本
69
  inputs = tokenizer.encode_plus(
70
  f"Question: {text} Answer:",
71
  return_tensors="pt",
 
78
  attention_mask = inputs["attention_mask"].long()
79
  pixel_values = image_features["pixel_values"]
80
 
81
+ # 推理
82
  with torch.no_grad():
83
  logits = model(input_ids, attention_mask, pixel_values)
84
  prediction = torch.argmax(logits, dim=1).item()
 
87
 
88
  # 定义归因分析函数
89
  def generate_attribution(image, text):
90
+ # 处理图像
91
  image = Image.fromarray(image)
92
  image_features = feature_extractor(images=image, return_tensors="pt")
93
 
94
+ # 处理文本
95
  inputs = tokenizer.encode_plus(
96
  f"Question: {text} Answer:",
97
  return_tensors="pt",
 
104
  attention_mask = inputs["attention_mask"].long()
105
  pixel_values = image_features["pixel_values"]
106
 
107
+ # 推理和归因分析
108
  with torch.no_grad():
109
  logits = model(input_ids, attention_mask, pixel_values)
110
  prediction = torch.argmax(logits, dim=1).item()
 
117
  return_convergence_delta=True
118
  )
119
 
120
+ # 归因图像处理
121
  attribution_image = attributions.squeeze().cpu().numpy()
122
  attribution_image = (attribution_image - attribution_image.min()) / (attribution_image.max() - attribution_image.min())
123
  attribution_image = np.uint8(255 * attribution_image)
124
  attribution_image_real = convert_tensor_to_pil(attribution_image)
125
 
126
+ # 轮廓检测
127
  attribution_gray = cv2.cvtColor(np.array(attribution_image_real), cv2.COLOR_RGB2GRAY)
128
  _, binary_mask = cv2.threshold(attribution_gray, 128, 255, cv2.THRESH_BINARY)
129
  contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
130
+ contour_image = np.array(attribution_image_real)
131
+ cv2.drawContours(contour_image, contours, -1, (255, 0, 0), 2)
132
 
133
+ # 原始图像、热图和带轮廓的图像
134
+ return Image.fromarray(image_features['pixel_values'][0].byte().permute(1, 2, 0).numpy()), Image.fromarray(contour_image)
 
 
 
135
 
136
  # 创建 Gradio 界面
137
+ text_button = gr.Interface(
138
+ fn=predict_text,
139
+ inputs=["image", "text"],
140
+ outputs="text",
141
+ title="Multi-modal Inference: Text Prediction"
142
+ )
143
+
144
+ attribution_button = gr.Interface(
145
+ fn=generate_attribution,
146
+ inputs=["image", "text"],
147
+ outputs=[gr.Image(), gr.Image()],
148
+ title="Multi-modal Inference: Attribution Analysis"
149
+ )
150
+
151
+ demo = gr.TabbedInterface([text_button, attribution_button], ["Text Prediction", "Attribution Analysis"])
 
 
 
 
 
 
152
  demo.launch()