Spaces:
Sleeping
Sleeping
Muhusystem
committed on
Commit
·
a0803c3
1
Parent(s):
fc7ef08
Split text prediction and attribution analysis into separate buttons
Browse files
app.py
CHANGED
@@ -61,9 +61,11 @@ feature_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-2
|
|
61 |
|
62 |
# 定义推理函数
|
63 |
def predict_text(image, text):
|
|
|
64 |
image = Image.fromarray(image)
|
65 |
image_features = feature_extractor(images=image, return_tensors="pt")
|
66 |
|
|
|
67 |
inputs = tokenizer.encode_plus(
|
68 |
f"Question: {text} Answer:",
|
69 |
return_tensors="pt",
|
@@ -76,6 +78,7 @@ def predict_text(image, text):
|
|
76 |
attention_mask = inputs["attention_mask"].long()
|
77 |
pixel_values = image_features["pixel_values"]
|
78 |
|
|
|
79 |
with torch.no_grad():
|
80 |
logits = model(input_ids, attention_mask, pixel_values)
|
81 |
prediction = torch.argmax(logits, dim=1).item()
|
@@ -84,9 +87,11 @@ def predict_text(image, text):
|
|
84 |
|
85 |
# 定义归因分析函数
|
86 |
def generate_attribution(image, text):
|
|
|
87 |
image = Image.fromarray(image)
|
88 |
image_features = feature_extractor(images=image, return_tensors="pt")
|
89 |
|
|
|
90 |
inputs = tokenizer.encode_plus(
|
91 |
f"Question: {text} Answer:",
|
92 |
return_tensors="pt",
|
@@ -99,6 +104,7 @@ def generate_attribution(image, text):
|
|
99 |
attention_mask = inputs["attention_mask"].long()
|
100 |
pixel_values = image_features["pixel_values"]
|
101 |
|
|
|
102 |
with torch.no_grad():
|
103 |
logits = model(input_ids, attention_mask, pixel_values)
|
104 |
prediction = torch.argmax(logits, dim=1).item()
|
@@ -111,41 +117,36 @@ def generate_attribution(image, text):
|
|
111 |
return_convergence_delta=True
|
112 |
)
|
113 |
|
|
|
114 |
attribution_image = attributions.squeeze().cpu().numpy()
|
115 |
attribution_image = (attribution_image - attribution_image.min()) / (attribution_image.max() - attribution_image.min())
|
116 |
attribution_image = np.uint8(255 * attribution_image)
|
117 |
attribution_image_real = convert_tensor_to_pil(attribution_image)
|
118 |
|
|
|
119 |
attribution_gray = cv2.cvtColor(np.array(attribution_image_real), cv2.COLOR_RGB2GRAY)
|
120 |
_, binary_mask = cv2.threshold(attribution_gray, 128, 255, cv2.THRESH_BINARY)
|
121 |
contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
|
|
|
|
122 |
|
123 |
-
|
124 |
-
|
125 |
-
cv2.drawContours(original_image_np, contours, -1, (255, 0, 0), 2)
|
126 |
-
|
127 |
-
return attribution_image_real, Image.fromarray(original_image_np)
|
128 |
|
129 |
# 创建 Gradio 界面
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
predict_button.click(predict_text, inputs=[input_image, question_input], outputs=prediction_output)
|
147 |
-
attribution_button.click(generate_attribution, inputs=[input_image, question_input], outputs=[attribution_image_1, attribution_image_2])
|
148 |
-
clear_button.click(lambda: (None, "", ""), outputs=[input_image, question_input, prediction_output])
|
149 |
-
|
150 |
-
# 启动 Gradio 界面
|
151 |
demo.launch()
|
|
|
61 |
|
62 |
# 定义推理函数
|
63 |
def predict_text(image, text):
|
64 |
+
# 处理图像
|
65 |
image = Image.fromarray(image)
|
66 |
image_features = feature_extractor(images=image, return_tensors="pt")
|
67 |
|
68 |
+
# 处理文本
|
69 |
inputs = tokenizer.encode_plus(
|
70 |
f"Question: {text} Answer:",
|
71 |
return_tensors="pt",
|
|
|
78 |
attention_mask = inputs["attention_mask"].long()
|
79 |
pixel_values = image_features["pixel_values"]
|
80 |
|
81 |
+
# 推理
|
82 |
with torch.no_grad():
|
83 |
logits = model(input_ids, attention_mask, pixel_values)
|
84 |
prediction = torch.argmax(logits, dim=1).item()
|
|
|
87 |
|
88 |
# 定义归因分析函数
|
89 |
def generate_attribution(image, text):
|
90 |
+
# 处理图像
|
91 |
image = Image.fromarray(image)
|
92 |
image_features = feature_extractor(images=image, return_tensors="pt")
|
93 |
|
94 |
+
# 处理文本
|
95 |
inputs = tokenizer.encode_plus(
|
96 |
f"Question: {text} Answer:",
|
97 |
return_tensors="pt",
|
|
|
104 |
attention_mask = inputs["attention_mask"].long()
|
105 |
pixel_values = image_features["pixel_values"]
|
106 |
|
107 |
+
# 推理和归因分析
|
108 |
with torch.no_grad():
|
109 |
logits = model(input_ids, attention_mask, pixel_values)
|
110 |
prediction = torch.argmax(logits, dim=1).item()
|
|
|
117 |
return_convergence_delta=True
|
118 |
)
|
119 |
|
120 |
+
# 归因图像处理
|
121 |
attribution_image = attributions.squeeze().cpu().numpy()
|
122 |
attribution_image = (attribution_image - attribution_image.min()) / (attribution_image.max() - attribution_image.min())
|
123 |
attribution_image = np.uint8(255 * attribution_image)
|
124 |
attribution_image_real = convert_tensor_to_pil(attribution_image)
|
125 |
|
126 |
+
# 轮廓检测
|
127 |
attribution_gray = cv2.cvtColor(np.array(attribution_image_real), cv2.COLOR_RGB2GRAY)
|
128 |
_, binary_mask = cv2.threshold(attribution_gray, 128, 255, cv2.THRESH_BINARY)
|
129 |
contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
130 |
+
contour_image = np.array(attribution_image_real)
|
131 |
+
cv2.drawContours(contour_image, contours, -1, (255, 0, 0), 2)
|
132 |
|
133 |
+
# 原始图像、热图和带轮廓的图像
|
134 |
+
return Image.fromarray(image_features['pixel_values'][0].byte().permute(1, 2, 0).numpy()), Image.fromarray(contour_image)
|
|
|
|
|
|
|
135 |
|
136 |
# 创建 Gradio 界面
|
137 |
+
text_button = gr.Interface(
|
138 |
+
fn=predict_text,
|
139 |
+
inputs=["image", "text"],
|
140 |
+
outputs="text",
|
141 |
+
title="Multi-modal Inference: Text Prediction"
|
142 |
+
)
|
143 |
+
|
144 |
+
attribution_button = gr.Interface(
|
145 |
+
fn=generate_attribution,
|
146 |
+
inputs=["image", "text"],
|
147 |
+
outputs=[gr.Image(), gr.Image()],
|
148 |
+
title="Multi-modal Inference: Attribution Analysis"
|
149 |
+
)
|
150 |
+
|
151 |
+
demo = gr.TabbedInterface([text_button, attribution_button], ["Text Prediction", "Attribution Analysis"])
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
demo.launch()
|