- app.py +0 -2
- evaluation_logic.py +5 -4
app.py
CHANGED
@@ -9,8 +9,6 @@ PROMPT_TEMPLATES = {
 }
 
 def gradio_run_evaluation(inference_api, model_name, prompt_format, openrouter_token=None, custom_prompt=None):
-    print(f"DEBUG - Prompt format: {prompt_format}, Custom prompt content: {custom_prompt}")
-
     # Set environment variable if OpenRouter token is provided
     if inference_api == "openrouter":
         os.environ["OPENROUTER_API_KEY"] = str(openrouter_token)
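
This hunk drops the debug print from gradio_run_evaluation; the evaluation_logic.py change below moves it into run_prediction. For context, a minimal sketch of how a handler like this is typically exposed in a Gradio Space; the component wiring, choices, and placeholder body are assumptions, not part of this commit:

# Hypothetical wiring, not from this commit: gradio_run_evaluation exposed as a
# Gradio app. Component names, dropdown choices, and the placeholder body are assumptions.
import os
import gradio as gr

def gradio_run_evaluation(inference_api, model_name, prompt_format,
                          openrouter_token=None, custom_prompt=None):
    # Set environment variable if OpenRouter token is provided
    if inference_api == "openrouter":
        os.environ["OPENROUTER_API_KEY"] = str(openrouter_token)
    # Placeholder: the real function delegates to the evaluation logic and
    # yields status strings that stream into the output textbox.
    yield f"Starting evaluation for {model_name} with prompt format {prompt_format}"

demo = gr.Interface(
    fn=gradio_run_evaluation,
    inputs=[
        gr.Dropdown(["openrouter", "other"], label="Inference API"),  # assumed options
        gr.Textbox(label="Model name"),
        gr.Textbox(label="Prompt format"),
        gr.Textbox(label="OpenRouter token", type="password"),
        gr.Textbox(label="Custom prompt", lines=4),
    ],
    outputs=gr.Textbox(label="Status"),
)

if __name__ == "__main__":
    demo.launch()
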
evaluation_logic.py
CHANGED
@@ -57,7 +57,7 @@ def save_prediction(inference_api, model_name, prompt_format, question, generate
 def save_evaluation(inference_api, model_name, prompt_format, custom_prompt, metrics):
     evaluation_file = evaluation_folder / f"evaluation_{file_uuid}.json"
     evaluation_folder.mkdir(parents=True, exist_ok=True)
-
+
     # Extract and flatten the category-specific execution metrics
     categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
     flattened_metrics = {
@@ -67,7 +67,7 @@ def save_evaluation(inference_api, model_name, prompt_format, custom_prompt, met
         "custom_prompt": str(custom_prompt),
         "timestamp": datetime.now().isoformat()
     }
-
+
     # Flatten each category's metrics into separate columns
     for category in categories:
         if category in metrics['exec']:
@@ -101,6 +101,7 @@ def run_prediction(inference_api, model_name, prompt_format, custom_prompt, outp
     try:
         # Initialize necessary components
         data_formatter = DefaultLoader()
+        print(f"DEBUG - Prompt format: {prompt_format}, Custom prompt content: {custom_prompt}")
         if prompt_format.startswith("custom"):
             prompt_formatter = PROMPT_FORMATTERS["custom"]()
             prompt_formatter.PROMPT_TEMPLATE = custom_prompt
@@ -235,7 +236,7 @@ def run_evaluation(inference_api, model_name, prompt_format="duckdbinstgranitesh
 
     # Save evaluation results to dataset
     save_evaluation(inference_api, model_name, prompt_format, custom_prompt, metrics)
-
+
     yield "Evaluation completed."
 
     if metrics:
@@ -245,7 +246,7 @@ def run_evaluation(inference_api, model_name, prompt_format="duckdbinstgranitesh
         yield f"All (n={overall_metrics['count']}) - Edit Distance: {metrics['edit_distance']['edit_distance']:.3f}"
 
         categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
-
+
         for category in categories:
             if category in metrics['exec']:
                 category_metrics = metrics['exec'][category]
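
The comments in save_evaluation refer to flattening each category's execution metrics into separate columns, but the flattening body itself lies outside these hunks. A rough, self-contained sketch of that pattern, with assumed metric key names (e.g. execution_accuracy) that may differ from the actual code:

# Illustration only: one plausible shape of the "flatten each category's metrics
# into separate columns" step referenced in save_evaluation. Key names are assumptions.
from datetime import datetime

def flatten_exec_metrics(metrics, base_record):
    """Merge per-category execution metrics into a single flat record."""
    categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
    flattened = dict(base_record)
    for category in categories:
        if category in metrics['exec']:
            category_metrics = metrics['exec'][category]
            # One column per (category, metric) pair, e.g. "easy_execution_accuracy".
            for name, value in category_metrics.items():
                flattened[f"{category}_{name}"] = value
    return flattened

# Example: a record like the one built in save_evaluation before flattening.
record = flatten_exec_metrics(
    {"exec": {"easy": {"count": 10, "execution_accuracy": 0.8}}},
    {"model_name": "example-model", "timestamp": datetime.now().isoformat()},
)
print(record)  # {'model_name': ..., 'timestamp': ..., 'easy_count': 10, 'easy_execution_accuracy': 0.8}
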