Spaces:
Running
Running
use concurrent futures instead of signal
Browse files- duckdb-nsql/eval/evaluate.py +29 -29
- evaluation_logic.py +2 -2
duckdb-nsql/eval/evaluate.py
CHANGED
@@ -12,6 +12,7 @@ import click
|
|
12 |
import pandas as pd
|
13 |
from rich.console import Console
|
14 |
from tqdm.auto import tqdm
|
|
|
15 |
|
16 |
sys.path.append(os.path.join(os.path.dirname(__file__), "."))
|
17 |
# from metrics.spider import evaluation as spider_evaluation # type: ignore # noqa: E402
|
@@ -113,15 +114,24 @@ def compute_exact_match_metric(
|
|
113 |
return exact_match
|
114 |
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
def compute_test_suite_metric(
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
) -> tuple[Any, list[int | None]]:
|
126 |
"""Compute test suite execution metric."""
|
127 |
evaluator = test_suite_evaluation.Evaluator(
|
@@ -135,37 +145,27 @@ def compute_test_suite_metric(
|
|
135 |
# Only used for Sparc/CoSQL
|
136 |
turn_scores: dict[str, list] = {"exec": [], "exact": []}
|
137 |
by_row_metrics: list[int | None] = []
|
|
|
138 |
for prediction, reference, gold_db, setup_sql, validate_sql, category in tqdm(
|
139 |
-
|
140 |
-
|
141 |
):
|
142 |
turn_idx = 0
|
143 |
# skip final utterance-query pairs
|
144 |
if turn_idx < 0:
|
145 |
continue
|
146 |
|
147 |
-
#
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
ex_metrics = evaluator.evaluate_one(
|
153 |
-
gold_db,
|
154 |
-
reference,
|
155 |
-
prediction,
|
156 |
-
setup_sql,
|
157 |
-
validate_sql,
|
158 |
-
turn_scores,
|
159 |
-
idx=turn_idx,
|
160 |
-
category=category,
|
161 |
-
)
|
162 |
-
signal.alarm(0)
|
163 |
|
|
|
164 |
by_row_metrics.append(int(ex_metrics["exec"]))
|
165 |
-
|
166 |
-
raise e
|
167 |
by_row_metrics.append(None)
|
168 |
-
|
169 |
evaluator.finalize()
|
170 |
return evaluator.scores, by_row_metrics
|
171 |
|
|
|
12 |
import pandas as pd
|
13 |
from rich.console import Console
|
14 |
from tqdm.auto import tqdm
|
15 |
+
from concurrent.futures import ThreadPoolExecutor, TimeoutError
|
16 |
|
17 |
sys.path.append(os.path.join(os.path.dirname(__file__), "."))
|
18 |
# from metrics.spider import evaluation as spider_evaluation # type: ignore # noqa: E402
|
|
|
114 |
return exact_match
|
115 |
|
116 |
|
117 |
+
def evaluate_with_timeout(evaluator, *args, timeout, **kwargs):
    """Run ``evaluator.evaluate_one`` in a worker thread with a time limit.

    Args:
        evaluator: Object exposing an ``evaluate_one`` callable.
        *args: Positional arguments forwarded to ``evaluate_one``.
        timeout: Maximum number of seconds to wait for the result.
        **kwargs: Keyword arguments forwarded to ``evaluate_one``.

    Returns:
        Whatever ``evaluate_one`` returns, or ``None`` if it did not finish
        within ``timeout`` seconds.

    Raises:
        Any exception raised by ``evaluate_one`` itself is re-raised here.

    Note:
        A Python thread cannot be killed. On timeout the call is abandoned,
        not stopped: it keeps running in the background until it finishes.
    """
    # Deliberately NOT used as a context manager: leaving a ``with`` block
    # calls ``shutdown(wait=True)``, which would block until the overrunning
    # call completes and silently defeat the timeout.
    executor = ThreadPoolExecutor(max_workers=1)
    future = executor.submit(evaluator.evaluate_one, *args, **kwargs)
    try:
        return future.result(timeout=timeout)
    except TimeoutError:
        # No-op if the call is already running; prevents a late start if not.
        future.cancel()
        return None
    finally:
        # Release the executor without waiting for an in-flight call.
        executor.shutdown(wait=False)
|
125 |
+
|
126 |
def compute_test_suite_metric(
|
127 |
+
predictions: list,
|
128 |
+
references: list,
|
129 |
+
gold_dbs: list,
|
130 |
+
setup_sqls: list,
|
131 |
+
validate_sqls: list,
|
132 |
+
kmaps: dict,
|
133 |
+
db_dir: str,
|
134 |
+
categories: list[str] = None,
|
135 |
) -> tuple[Any, list[int | None]]:
|
136 |
"""Compute test suite execution metric."""
|
137 |
evaluator = test_suite_evaluation.Evaluator(
|
|
|
145 |
# Only used for Sparc/CoSQL
|
146 |
turn_scores: dict[str, list] = {"exec": [], "exact": []}
|
147 |
by_row_metrics: list[int | None] = []
|
148 |
+
|
149 |
for prediction, reference, gold_db, setup_sql, validate_sql, category in tqdm(
|
150 |
+
zip(predictions, references, gold_dbs, setup_sqls, validate_sqls, categories),
|
151 |
+
total=len(predictions),
|
152 |
):
|
153 |
turn_idx = 0
|
154 |
# skip final utterance-query pairs
|
155 |
if turn_idx < 0:
|
156 |
continue
|
157 |
|
158 |
+
# Use the new function to evaluate with timeout
|
159 |
+
ex_metrics = evaluate_with_timeout(
|
160 |
+
evaluator, gold_db, reference, prediction, setup_sql, validate_sql,
|
161 |
+
turn_scores, timeout=TIMEOUT_SECONDS
|
162 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
+
if ex_metrics:
|
165 |
by_row_metrics.append(int(ex_metrics["exec"]))
|
166 |
+
else:
|
|
|
167 |
by_row_metrics.append(None)
|
168 |
+
|
169 |
evaluator.finalize()
|
170 |
return evaluator.scores, by_row_metrics
|
171 |
|
evaluation_logic.py
CHANGED
@@ -60,8 +60,8 @@ def run_prediction(model_name, prompt_format, output_file):
|
|
60 |
else:
|
61 |
table_params = []
|
62 |
|
63 |
-
if len(table_params) == 0:
|
64 |
-
yield f"[red] WARNING: No tables found for {db_id} [/red]"
|
65 |
|
66 |
text_to_sql_inputs.append(TextToSQLParams(
|
67 |
instruction=question,
|
|
|
60 |
else:
|
61 |
table_params = []
|
62 |
|
63 |
+
#if len(table_params) == 0:
|
64 |
+
#yield f"[red] WARNING: No tables found for {db_id} [/red]"
|
65 |
|
66 |
text_to_sql_inputs.append(TextToSQLParams(
|
67 |
instruction=question,
|