output formatting
evaluation_logic.py  +4 -10  CHANGED
@@ -166,23 +166,17 @@ def run_evaluation(inference_api, model_name, prompt_format="duckdbinstgranitesh
         if metrics:
             yield "Overall Results:"
             overall_metrics = metrics['exec']['all']
-            yield f"
-            yield f"
-            yield f"Exact Match Accuracy: {overall_metrics['exact']:.3f}"
-            yield f"Equality: {metrics['equality']['equality']:.3f}"
-            yield f"Edit Distance: {metrics['edit_distance']['edit_distance']:.3f}"
+            yield f"All (n={overall_metrics['count']}) - Execution Accuracy: {overall_metrics['exec']:.3f}"
+            yield f"All (n={overall_metrics['count']}) - Edit Distance: {metrics['edit_distance']['edit_distance']:.3f}"
 
-            yield "\nResults by Category:"
             categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
 
             for category in categories:
                 if category in metrics['exec']:
-                    yield f"\n{category}:"
                     category_metrics = metrics['exec'][category]
-                    yield f"
-                    yield f"Execution Accuracy: {category_metrics['exec']:.3f}"
+                    yield f"{category} (n={category_metrics['count']}) - Execution Accuracy: {category_metrics['exec']:.3f}"
                 else:
-                    yield f"
+                    yield f"{category}: No data available"
         else:
             yield "No evaluation metrics returned."
     except Exception as e:
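
As a quick sanity check of the new format, here is a minimal runnable sketch of the post-commit yield logic. It assumes the metrics dict shape the diff implies (per-category 'count' and 'exec' entries under metrics['exec'], plus metrics['edit_distance']); the format_results helper name and every number below are hypothetical illustrations, not values from the Space.

    # Minimal sketch of the post-commit formatting, assuming the metrics dict
    # shape implied by the diff. The helper name and all values here are
    # hypothetical; real numbers come from an actual evaluation run.
    def format_results(metrics):
        if metrics:
            yield "Overall Results:"
            overall_metrics = metrics['exec']['all']
            yield f"All (n={overall_metrics['count']}) - Execution Accuracy: {overall_metrics['exec']:.3f}"
            yield f"All (n={overall_metrics['count']}) - Edit Distance: {metrics['edit_distance']['edit_distance']:.3f}"
            categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
            for category in categories:
                if category in metrics['exec']:
                    category_metrics = metrics['exec'][category]
                    yield f"{category} (n={category_metrics['count']}) - Execution Accuracy: {category_metrics['exec']:.3f}"
                else:
                    yield f"{category}: No data available"
        else:
            yield "No evaluation metrics returned."

    # Hypothetical input covering both branches of the category loop.
    example_metrics = {
        'exec': {
            'all':  {'count': 75, 'exec': 0.720},
            'easy': {'count': 25, 'exec': 0.880},
            'hard': {'count': 20, 'exec': 0.450},
        },
        'edit_distance': {'edit_distance': 0.310},
    }

    for line in format_results(example_metrics):
        print(line)

With these made-up numbers the stream reads:

    Overall Results:
    All (n=75) - Execution Accuracy: 0.720
    All (n=75) - Edit Distance: 0.310
    easy (n=25) - Execution Accuracy: 0.880
    medium: No data available
    hard (n=20) - Execution Accuracy: 0.450
    duckdb: No data available
    ddl: No data available
    all (n=75) - Execution Accuracy: 0.720

Folding the sample count into each line replaces the removed "Results by Category:" header and per-category sub-lines, so every yielded string is self-describing when streamed to the UI one line at a time.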