Spaces:
Running
Running
import duckdb | |
from pathlib import Path | |
import argparse | |
import re | |
import ast | |
from datasets import load_dataset | |
def _parent_dir_for(parquet_path: Path, results_dir: Path) -> str:
    """Return the name of the directory a Parquet file is grouped under.

    When the path contains ``/results/details/`` the component right after
    that marker is used. Otherwise the first component of the path relative
    to ``results_dir`` is used, so the export also works when the results
    live somewhere other than a ``results/details`` directory.
    """
    marker = "/results/details/"
    # as_posix() keeps the marker lookup working with non-POSIX separators.
    posix_path = parquet_path.as_posix()
    if marker in posix_path:
        return posix_path.split(marker)[1].split("/")[0]
    return parquet_path.relative_to(results_dir).parts[0]


def _first_prediction(raw_predictions: str) -> str:
    """Parse a stringified predictions literal and return its first string.

    The literal may nest the text inside several levels of sequences, so the
    first element is taken repeatedly until a string is reached.
    """
    completion = ast.literal_eval(raw_predictions)[0]
    while not isinstance(completion, str):
        completion = completion[0]
    return completion


def export_to_duckdb(conn, results_dir: Path):
    """Load every Parquet file under ``results_dir`` into ``completions``.

    Creates the ``completions`` table if it does not exist, inserts one row
    per record found in the Parquet files, commits, and prints the number of
    inserted rows.

    Args:
        conn: Open database connection; only ``execute`` and ``commit`` are
            called on it.
        results_dir: Directory searched recursively for ``*.parquet`` files.

    Note:
        Rows are added with a plain ``INSERT``, so a row whose
        (parent_dir, prompt_id, prompt, completion) already exists is
        expected to be rejected by the table's primary key.
    """
    # SQL schema (matching results.completions)
    create_string = """CREATE TABLE IF NOT EXISTS completions (
        parent_dir TEXT NOT NULL,
        prompt_id INT NOT NULL,
        prompt TEXT NOT NULL,
        completion TEXT NOT NULL,
        count INT NOT NULL,
        temperature FLOAT NOT NULL,
        top_p FLOAT NOT NULL,
        max_tokens INT NOT NULL,
        PRIMARY KEY (parent_dir, prompt_id, prompt, completion)
    )"""
    conn.execute(create_string)

    # Placeholder values: the Parquet records read here do not supply them.
    count = 0
    temperature = 0.0
    top_p = 0.0
    max_tokens = 2048

    created = 0  # Counter for inserted rows
    # Sorted so that rows are inserted in a deterministic order.
    for parquet_path in sorted(results_dir.rglob("*.parquet")):
        # The parent directory is the same for every record of a file.
        parent_dir = _parent_dir_for(parquet_path, results_dir)
        # Load the dataset from the Parquet file
        details = load_dataset(
            "parquet", data_files=str(parquet_path), split="train"
        )
        for completion_item in details:
            # 'specifics' is stored as the string form of a dictionary.
            specifics = ast.literal_eval(completion_item['specifics'])
            prompt_id = specifics.get('ID', -1)
            prompt = completion_item.get("full_prompt", "")
            completion = _first_prediction(
                completion_item.get("predictions", "")
            )
            # Insert into DuckDB
            conn.execute(
                "INSERT INTO completions VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                (
                    parent_dir,
                    prompt_id,
                    prompt,
                    completion,
                    count,
                    temperature,
                    top_p,
                    max_tokens,
                ),
            )
            created += 1
    conn.commit()
    print(f"Created {created} rows.")
def to_duckdb(db_path: str, results_dir: Path):
    """Create a DuckDB connection and export data.

    Args:
        db_path: Path of the DuckDB database file passed to ``duckdb.connect``.
        results_dir: Directory handed to ``export_to_duckdb``.
    """
    conn = duckdb.connect(db_path)
    try:
        export_to_duckdb(conn, results_dir)
    finally:
        # Close even when the export raises so the connection is not leaked.
        conn.close()
def main():
    """Parse the command-line arguments and run the Parquet-to-DuckDB export."""
    cli = argparse.ArgumentParser()
    # Positional arguments, registered in the order they appear on the CLI.
    positionals = (
        ("db_path", str, "Path to DuckDB database file"),
        ("results_dir", Path, "Path to results dir"),
    )
    for arg_name, arg_type, arg_help in positionals:
        cli.add_argument(arg_name, type=arg_type, help=arg_help)
    options = cli.parse_args()
    to_duckdb(options.db_path, options.results_dir)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
# Example usage (results_dir = "/mnt/ssd/aryawu/lighteval/results/details"):
#   python npr_to_duckdb.py "/mnt/ssd/aryawu/lighteval/results.duckdb" "/mnt/ssd/aryawu/lighteval/results/details"