Spaces:

nuprl
/

verbal-reasoning-challenge

Running

App Files Files

verbal-reasoning-challenge / lighteval_to_duckdb.py

Aryarya

skip excluded tasks

468b095 7 days ago

raw

history blame

2.78 kB

	import duckdb
	from pathlib import Path
	import argparse
	import re
	import ast
	from datasets import load_dataset

	def _parent_dir_from_path(parquet_path: str) -> str:
	    """Return the path component that directly follows ``/results/details/``.

	    *parquet_path* must use forward slashes. A ``ValueError`` with the
	    offending path is raised when the marker is absent, instead of the bare
	    ``IndexError`` an unchecked ``split(...)[1]`` would produce.
	    """
	    _, marker, tail = parquet_path.partition("/results/details/")
	    if not marker:
	        raise ValueError(
	            f"{parquet_path!r} does not contain '/results/details/'; "
	            "cannot derive parent_dir"
	        )
	    return tail.split("/")[0]


	def _first_completion(predictions) -> str:
	    """Descend through nested lists/tuples and return the first string found.

	    A value that is already a string is returned whole (indexing it would
	    silently keep only its first character). Empty sequences and
	    non-sequence leaves raise ``ValueError`` rather than an opaque
	    ``IndexError``/``TypeError``.
	    """
	    value = predictions
	    while not isinstance(value, str):
	        if not isinstance(value, (list, tuple)) or not value:
	            raise ValueError(
	                f"no completion string found in predictions: {predictions!r}"
	            )
	        value = value[0]
	    return value


	def export_to_duckdb(conn, results_dir: Path):
	    """Insert one ``completions`` row per record of every Parquet file found.

	    The ``completions`` table is created if it does not exist, then every
	    ``*.parquet`` file under *results_dir* (searched recursively) is loaded
	    and each of its records is inserted. The number of inserted rows is
	    printed at the end.

	    Args:
	        conn: Open database connection providing ``execute`` and ``commit``.
	        results_dir: Directory to search for Parquet files. Each file's path
	            must contain ``/results/details/<parent_dir>/``.

	    Raises:
	        ValueError: If a file path lacks the ``/results/details/`` marker, or
	            a record's predictions contain no string.
	        KeyError: If a record has no ``specifics`` or ``predictions`` field.
	    """
	    # SQL schema (matching results.completions)
	    create_string = """CREATE TABLE IF NOT EXISTS completions (
	    parent_dir TEXT NOT NULL,
	    prompt_id INT NOT NULL,
	    prompt TEXT NOT NULL,
	    completion TEXT NOT NULL,
	    count INT NOT NULL,
	    temperature FLOAT NOT NULL,
	    top_p FLOAT NOT NULL,
	    max_tokens INT NOT NULL,
	    PRIMARY KEY (parent_dir, prompt_id, prompt, completion)
	    )"""
	    conn.execute(create_string)

	    # Placeholder values: the code does not read these from the Parquet data.
	    count = 0
	    temperature = 0.0
	    top_p = 0.0
	    max_tokens = 2048

	    created = 0  # Counter for inserted rows
	    for parquet_path in results_dir.rglob("*.parquet"):
	        # parent_dir depends only on the file, so derive it once per file.
	        # absolute() lets a relative results_dir still match the marker, and
	        # as_posix() keeps the "/"-based marker valid on Windows.
	        parent_dir = _parent_dir_from_path(parquet_path.absolute().as_posix())

	        # Load the dataset from the Parquet file
	        details = load_dataset(
	            "parquet", data_files=str(parquet_path), split="train"
	        )

	        for completion_item in details:
	            # 'specifics' and 'predictions' are stored as Python-literal
	            # strings; index them directly so a missing column fails with a
	            # KeyError naming it rather than a SyntaxError from parsing "".
	            specifics = ast.literal_eval(completion_item["specifics"])
	            prompt_id = specifics.get("ID", -1)
	            prompt = completion_item.get("full_prompt", "")
	            completion = _first_completion(
	                ast.literal_eval(completion_item["predictions"])
	            )

	            # Insert into DuckDB
	            conn.execute(
	                "INSERT INTO completions VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
	                (
	                    parent_dir,
	                    prompt_id,
	                    prompt,
	                    completion,
	                    count,
	                    temperature,
	                    top_p,
	                    max_tokens,
	                ),
	            )
	            created += 1

	    conn.commit()
	    print(f"Created {created} rows.")


	def to_duckdb(db_path: str, results_dir: Path):
	    """Open the DuckDB database at *db_path* and export *results_dir* into it.

	    Args:
	        db_path: Path passed to ``duckdb.connect``.
	        results_dir: Directory handed to ``export_to_duckdb``.

	    Any exception raised by the export propagates to the caller; the
	    connection is closed either way.
	    """
	    conn = duckdb.connect(db_path)
	    try:
	        export_to_duckdb(conn, results_dir)
	    finally:
	        # Close even when the export fails so the connection is not leaked.
	        conn.close()

	def main():
	    """Read ``db_path`` and ``results_dir`` from the command line and export.

	    Both are positional arguments; ``results_dir`` is converted to a ``Path``
	    by the parser before being handed to ``to_duckdb``.
	    """
	    cli = argparse.ArgumentParser()
	    positionals = (
	        ("db_path", str, "Path to DuckDB database file"),
	        ("results_dir", Path, "Path to results dir"),
	    )
	    for arg_name, arg_type, arg_help in positionals:
	        cli.add_argument(arg_name, type=arg_type, help=arg_help)
	    parsed = cli.parse_args()
	    to_duckdb(parsed.db_path, parsed.results_dir)

	# Run the command-line interface only when executed as a script, so that
	# importing this module has no side effects.
	if __name__ == "__main__":
	    main()

	# Example invocation (results_dir = "/mnt/ssd/aryawu/lighteval/results/details"):
	#   python lighteval_to_duckdb.py "/mnt/ssd/aryawu/lighteval/results.duckdb" "/mnt/ssd/aryawu/lighteval/results/details"