"""Export completions stored in Parquet result files into a DuckDB table."""

import argparse
import ast
import re
from pathlib import Path

import duckdb
from datasets import load_dataset

# Path fragment that precedes the per-run directory name in a result file path.
_DETAILS_MARKER = "/results/details/"

# SQL schema (matching results.completions)
_CREATE_TABLE_SQL = """CREATE TABLE IF NOT EXISTS completions (
    parent_dir TEXT NOT NULL,
    prompt_id INT NOT NULL,
    prompt TEXT NOT NULL,
    completion TEXT NOT NULL,
    count INT NOT NULL,
    temperature FLOAT NOT NULL,
    top_p FLOAT NOT NULL,
    max_tokens INT NOT NULL,
    PRIMARY KEY (parent_dir, prompt_id, prompt, completion)
)"""

_INSERT_SQL = "INSERT INTO completions VALUES (?, ?, ?, ?, ?, ?, ?, ?)"


def _parent_dir_from_path(parquet_file: str) -> str:
    """Return the first path component that follows ``/results/details/``.

    Args:
        parquet_file: Path of a Parquet file, using ``/`` as the separator.

    Raises:
        ValueError: If the path does not contain ``/results/details/``.
    """
    _, marker, tail = parquet_file.partition(_DETAILS_MARKER)
    if not marker:
        raise ValueError(
            f"Cannot derive parent_dir: {parquet_file!r} does not contain "
            f"{_DETAILS_MARKER!r}"
        )
    return tail.split("/")[0]


def _first_prediction(raw_predictions: str) -> str:
    """Parse a stringified predictions literal and return its first string.

    The literal is indexed once unconditionally and then unwrapped through
    any further nesting until a ``str`` is reached.
    """
    completion = ast.literal_eval(raw_predictions)[0]
    while not isinstance(completion, str):
        completion = completion[0]
    return completion


def export_to_duckdb(conn, results_dir: Path) -> None:
    """Insert one ``completions`` row per record found under *results_dir*.

    Creates the ``completions`` table if it does not exist, then walks
    *results_dir* recursively for ``*.parquet`` files, loads each one with
    ``load_dataset`` and inserts a row per record. ``count``, ``temperature``,
    ``top_p`` and ``max_tokens`` are filled with placeholder values. The
    number of inserted rows is printed at the end.

    Args:
        conn: Open DuckDB connection to write to.
        results_dir: Directory searched recursively for Parquet files; a
            plain string path is accepted as well.

    Raises:
        ValueError: If a Parquet file path lacks ``/results/details/``.
    """
    conn.execute(_CREATE_TABLE_SQL)

    created = 0  # Counter for inserted rows

    # Placeholder values: the source records do not carry these settings.
    count = 0
    temperature = 0.0
    top_p = 0.0
    max_tokens = 2048

    # Snapshot the file list before loading anything so the directory walk
    # is finished (and ordered) before any dataset is read.
    parquet_paths = sorted(Path(results_dir).rglob("*.parquet"))

    for parquet_path in parquet_paths:
        # Depends only on the file path, so resolve it once per file. The
        # POSIX form keeps the "/"-based marker valid on every platform.
        parent_dir = _parent_dir_from_path(parquet_path.as_posix())

        # Load the dataset from the Parquet file
        details = load_dataset(
            "parquet", data_files=str(parquet_path), split="train"
        )

        for completion_item in details:
            # 'specifics' holds a stringified dictionary
            specifics = ast.literal_eval(completion_item["specifics"])
            prompt_id = specifics.get("ID", -1)
            prompt = completion_item.get("full_prompt", "")
            completion = _first_prediction(
                completion_item.get("predictions", "")
            )

            # Insert into DuckDB
            conn.execute(
                _INSERT_SQL,
                (
                    parent_dir,
                    prompt_id,
                    prompt,
                    completion,
                    count,
                    temperature,
                    top_p,
                    max_tokens,
                ),
            )
            created += 1

    conn.commit()
    print(f"Created {created} rows.")


def to_duckdb(db_path: str, results_dir: Path) -> None:
    """Create a DuckDB connection and export data.

    The connection is closed even when the export raises.
    """
    conn = duckdb.connect(db_path)
    try:
        export_to_duckdb(conn, results_dir)
    finally:
        conn.close()


def main() -> None:
    """Command-line interface for exporting Parquet data to DuckDB."""
    parser = argparse.ArgumentParser()
    parser.add_argument("db_path", type=str, help="Path to DuckDB database file")
    parser.add_argument("results_dir", type=Path, help="Path to results dir")

    args = parser.parse_args()
    to_duckdb(args.db_path, args.results_dir)


if __name__ == "__main__":
    main()

# Example invocation:
# results_dir = "/mnt/ssd/aryawu/lighteval/results/details"
# python npr_to_duckdb.py "/mnt/ssd/aryawu/lighteval/results.duckdb" "/mnt/ssd/aryawu/lighteval/results/details"