tdoehmen committed
Commit
a940235
1 Parent(s): 788509a
Files changed (2)
  1. app.py +0 -2
  2. evaluation_logic.py +5 -4
app.py CHANGED
@@ -9,8 +9,6 @@ PROMPT_TEMPLATES = {
 }
 
 def gradio_run_evaluation(inference_api, model_name, prompt_format, openrouter_token=None, custom_prompt=None):
-    print(f"DEBUG - Prompt format: {prompt_format}, Custom prompt content: {custom_prompt}")
-
     # Set environment variable if OpenRouter token is provided
     if inference_api == "openrouter":
         os.environ["OPENROUTER_API_KEY"] = str(openrouter_token)
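
For orientation, the Gradio wrapper around the evaluation looks roughly like the sketch below. Only the environment-variable block is shown in this commit; the import, the run_evaluation call, and the yield loop are assumptions based on the generator style of run_evaluation in evaluation_logic.py, not code from this diff.

import os
from evaluation_logic import run_evaluation

def gradio_run_evaluation(inference_api, model_name, prompt_format, openrouter_token=None, custom_prompt=None):
    # Set environment variable if OpenRouter token is provided (as in the diff above)
    if inference_api == "openrouter":
        os.environ["OPENROUTER_API_KEY"] = str(openrouter_token)
    # Assumption: forward run_evaluation's streamed status messages to the Gradio UI
    for status in run_evaluation(inference_api, model_name, prompt_format, custom_prompt=custom_prompt):
        yield status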
evaluation_logic.py CHANGED
@@ -57,7 +57,7 @@ def save_prediction(inference_api, model_name, prompt_format, question, generate
 def save_evaluation(inference_api, model_name, prompt_format, custom_prompt, metrics):
     evaluation_file = evaluation_folder / f"evaluation_{file_uuid}.json"
     evaluation_folder.mkdir(parents=True, exist_ok=True)
-
+
     # Extract and flatten the category-specific execution metrics
     categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
     flattened_metrics = {
@@ -67,7 +67,7 @@ def save_evaluation(inference_api, model_name, prompt_format, custom_prompt, met
         "custom_prompt": str(custom_prompt),
         "timestamp": datetime.now().isoformat()
     }
-
+
     # Flatten each category's metrics into separate columns
     for category in categories:
         if category in metrics['exec']:
@@ -101,6 +101,7 @@ def run_prediction(inference_api, model_name, prompt_format, custom_prompt, outp
     try:
         # Initialize necessary components
         data_formatter = DefaultLoader()
+        print(f"DEBUG - Prompt format: {prompt_format}, Custom prompt content: {custom_prompt}")
         if prompt_format.startswith("custom"):
             prompt_formatter = PROMPT_FORMATTERS["custom"]()
             prompt_formatter.PROMPT_TEMPLATE = custom_prompt
@@ -235,7 +236,7 @@ def run_evaluation(inference_api, model_name, prompt_format="duckdbinstgranitesh
 
     # Save evaluation results to dataset
     save_evaluation(inference_api, model_name, prompt_format, custom_prompt, metrics)
-
+
     yield "Evaluation completed."
 
     if metrics:
@@ -245,7 +246,7 @@ def run_evaluation(inference_api, model_name, prompt_format="duckdbinstgranitesh
     yield f"All (n={overall_metrics['count']}) - Edit Distance: {metrics['edit_distance']['edit_distance']:.3f}"
 
     categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
-
+
     for category in categories:
         if category in metrics['exec']:
            category_metrics = metrics['exec'][category]
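
A minimal sketch of the flattening that save_evaluation performs on the per-category execution metrics, pulled out into a hypothetical helper for illustration. The inner key/value loop and the "{category}_{key}" column naming are assumptions; the actual loop body is cut off in the hunk above, and the run-metadata fields elided between the first two hunks are not reproduced here.

from datetime import datetime

def flatten_evaluation_metrics(custom_prompt, metrics):
    # Hypothetical helper, not part of the repo: illustrates the flattening
    # that save_evaluation does inline before writing evaluation_{file_uuid}.json.
    categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
    flattened = {
        # ... run metadata fields elided between the two hunks above ...
        "custom_prompt": str(custom_prompt),
        "timestamp": datetime.now().isoformat(),
    }
    # Assumption: each category's metrics become prefixed columns,
    # e.g. "easy_execution_accuracy", "all_count", ...
    for category in categories:
        if category in metrics['exec']:
            for key, value in metrics['exec'][category].items():
                flattened[f"{category}_{key}"] = value
    return flattened

# Example with only two categories present:
flatten_evaluation_metrics(
    custom_prompt=None,
    metrics={"exec": {"easy": {"execution_accuracy": 0.82, "count": 40},
                      "all": {"execution_accuracy": 0.71, "count": 75}}},
)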