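"""Gradio app for the DuckDB SQL evaluation Space: runs duckdb-nsql inference
against an OpenRouter-hosted model, then evaluates the generated SQL."""
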
import gradio as gr
import subprocess
import os
import re
from datetime import datetime


def run_evaluation(model_name):
    results = []

    # Use the secret OpenRouter API key configured on the Hugging Face Space
    if "OPENROUTER_API_KEY" not in os.environ:
        return "Error: OPENROUTER_API_KEY not found in environment variables."

    try:
        # Set up the environment for the subprocess calls
        env = os.environ.copy()
        env["OPENROUTER_API_KEY"] = os.environ["OPENROUTER_API_KEY"]

        # Run inference
        current_date = datetime.now().strftime("%Y%m%d")
        inference_cmd = f"""
        cd duckdb-nsql/ &&
        python eval/predict.py \
            predict \
            eval/data/dev.json \
            eval/data/tables.json \
            --output-dir output/ \
            --stop-tokens ';' \
            --max-tokens 30000 \
            --overwrite-manifest \
            --manifest-client openrouter \
            --manifest-engine {model_name} \
            --prompt-format duckdbinstgraniteshort
        """
        inference_result = subprocess.run(
            inference_cmd, shell=True, check=True, capture_output=True, text=True, env=env
        )
        results.append("Inference completed.")

        # Extract the generated JSON file path from the inference output
        json_path_match = re.search(r'(.*\.json)', inference_result.stdout)
        if not json_path_match:
            raise ValueError("Could not find JSON file path in inference output")
        json_file = os.path.basename(json_path_match.group(1))
        results.append(f"Generated JSON file: {json_file}")

        # Run evaluation
        eval_cmd = f"""
        cd duckdb-nsql/ &&
        python eval/evaluate.py evaluate \
            --gold eval/data/dev.json \
            --db eval/data/databases/ \
            --tables eval/data/tables.json \
            --output-dir output/ \
            --pred output/{json_file}
        """
        eval_result = subprocess.run(eval_cmd, shell=True, check=True, capture_output=True, text=True)

        # Extract and format metrics from the evaluation output
        metrics = eval_result.stdout
        if metrics:
            results.append(f"Evaluation completed:\n{metrics}")
        else:
            results.append("Evaluation completed, but couldn't get metrics.")
    except subprocess.CalledProcessError as e:
        results.append(f"Error occurred: {str(e)}")
        results.append(f"Command output: {e.output}")
        results.append(f"Command stderr: {e.stderr}")
    except Exception as e:
        results.append(f"An unexpected error occurred: {str(e)}")

    return "\n\n".join(results)


with gr.Blocks() as demo:
    gr.Markdown("# DuckDB SQL Evaluation App (OpenRouter)")

    model_name = gr.Textbox(label="Model Name (e.g., qwen/qwen-2.5-72b-instruct)")
    start_btn = gr.Button("Start Evaluation")
    output = gr.Textbox(label="Output", lines=20)

    start_btn.click(fn=run_evaluation, inputs=[model_name], outputs=output)

demo.launch()