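# verbal-reasoning-challenge / lighteval_to_duckdb.py
# (File-viewer page residue condensed into this comment: author avatar
# caption "Aryarya's picture", last commit "skip excluded tasks" (468b095),
# "raw" / "history blame" links, file size 2.78 kB.)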
import duckdb
from pathlib import Path
import argparse
import re
import ast
from datasets import load_dataset
def _parse_literal(value):
    """Return *value* decoded with ``ast.literal_eval`` if it is a string, else unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


def _first_string(value):
    """Follow first elements of nested lists/tuples until a string is reached.

    Raises:
        ValueError: if an empty or non-list/tuple value is hit before a string.
    """
    while not isinstance(value, str):
        if not isinstance(value, (list, tuple)) or not value:
            raise ValueError(f"cannot extract a completion string from {value!r}")
        value = value[0]
    return value


def _parent_dir_for(parquet_file: Path, results_dir: Path) -> str:
    """Return the path component that identifies which run a Parquet file belongs to."""
    parts = parquet_file.parts
    # Preferred rule (same as the historical string split): the component that
    # directly follows a ".../results/details/" pair anywhere in the path.
    for first, second, third in zip(parts, parts[1:], parts[2:]):
        if first == "results" and second == "details":
            return third
    # Fallback for layouts without that pair: first component below results_dir.
    return parquet_file.relative_to(results_dir).parts[0]


def export_to_duckdb(conn, results_dir: Path):
    """Load every Parquet file under *results_dir* into the ``completions`` table.

    The table is created if it does not exist. Each ``*.parquet`` file found
    recursively under *results_dir* is read with ``load_dataset("parquet", ...)``
    (split ``"train"``) and one row is inserted per record, using the record's
    ``specifics`` (for ``ID``), ``full_prompt`` and ``predictions`` fields.
    ``count``, ``temperature``, ``top_p`` and ``max_tokens`` are written as
    fixed placeholder values.

    Args:
        conn: Connection object providing ``execute(sql, params)`` and ``commit()``.
        results_dir: Directory searched recursively for Parquet files.

    Raises:
        ValueError: if a record's ``predictions`` does not contain a string.

    NOTE: rows are added with a plain INSERT, so importing a row whose
    (parent_dir, prompt_id, prompt, completion) already exists will fail on
    the table's primary key.
    """
    # SQL schema (matching results.completions)
    create_string = """CREATE TABLE IF NOT EXISTS completions (
        parent_dir TEXT NOT NULL,
        prompt_id INT NOT NULL,
        prompt TEXT NOT NULL,
        completion TEXT NOT NULL,
        count INT NOT NULL,
        temperature FLOAT NOT NULL,
        top_p FLOAT NOT NULL,
        max_tokens INT NOT NULL,
        PRIMARY KEY (parent_dir, prompt_id, prompt, completion)
    )"""
    conn.execute(create_string)

    results_dir = Path(results_dir)
    created = 0  # Counter for inserted rows

    # Sorted so that files are processed in a reproducible order.
    for parquet_file in sorted(results_dir.rglob("*.parquet")):
        # Depends only on the file location, so compute it once per file.
        parent_dir = _parent_dir_for(parquet_file, results_dir)

        # Load the dataset from the Parquet file
        details = load_dataset("parquet", data_files=str(parquet_file), split="train")

        for completion_item in details:
            # 'specifics' is normally the string form of a dictionary.
            specifics = _parse_literal(completion_item['specifics'])
            prompt_id = specifics.get('ID', -1)
            prompt = completion_item.get("full_prompt", "")

            # 'predictions' is normally the string form of a (possibly nested)
            # list; keep the first string found by following first elements.
            predictions = _parse_literal(completion_item.get("predictions", ""))
            completion = _first_string(predictions)

            count = 0  # Placeholder value
            temperature = 0.0  # Placeholder value
            top_p = 0.0  # Placeholder value
            max_tokens = 2048  # Placeholder value

            # Insert into DuckDB
            conn.execute(
                "INSERT INTO completions VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                (parent_dir, prompt_id, prompt, completion, count, temperature, top_p, max_tokens),
            )
            created += 1

    conn.commit()
    print(f"Created {created} rows.")
def to_duckdb(db_path: str, results_dir: Path):
    """Open the DuckDB database at *db_path* and export *results_dir* into it.

    Args:
        db_path: Path passed to ``duckdb.connect``.
        results_dir: Directory handed to ``export_to_duckdb``.
    """
    conn = duckdb.connect(db_path)
    try:
        export_to_duckdb(conn, results_dir)
    finally:
        # Close the connection even when the export raises, so the database
        # handle is not left open.
        conn.close()
def main():
    """Read ``db_path`` and ``results_dir`` from the command line and run the export."""
    cli = argparse.ArgumentParser()
    # (name, converter, help text) for each positional argument, in order.
    positionals = (
        ("db_path", str, "Path to DuckDB database file"),
        ("results_dir", Path, "Path to results dir"),
    )
    for arg_name, arg_type, arg_help in positionals:
        cli.add_argument(arg_name, type=arg_type, help=arg_help)
    options = cli.parse_args()
    to_duckdb(options.db_path, options.results_dir)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
# Example invocation (with results_dir = "/mnt/ssd/aryawu/lighteval/results/details"):
#   python lighteval_to_duckdb.py "/mnt/ssd/aryawu/lighteval/results.duckdb" "/mnt/ssd/aryawu/lighteval/results/details"