File size: 2,775 Bytes
468b095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import duckdb
from pathlib import Path
import argparse
import re
import ast
from datasets import load_dataset

def _parent_dir_from_path(parquet_path: Path) -> str:
    """Return the path component directly below ``results/details``.

    The path is tried as given first and then in absolute form, so a
    relative ``results_dir`` (e.g. ``results/details``) works too.

    Raises:
        ValueError: if the path does not contain ``/results/details/``.
    """
    marker = "/results/details/"
    for candidate in (parquet_path, parquet_path.absolute()):
        # as_posix() keeps the "/"-based marker usable with Windows separators.
        _, found, tail = candidate.as_posix().partition(marker)
        if found:
            return tail.split("/")[0]
    raise ValueError(
        f"{parquet_path} is not located under a 'results/details' directory"
    )


def _first_string(value):
    """Follow the first element of nested sequences until a ``str`` is reached."""
    while not isinstance(value, str):
        value = value[0]
    return value


def export_to_duckdb(conn, results_dir: Path) -> None:
    """Load every Parquet file under *results_dir* into the ``completions`` table.

    The table is created if it does not exist yet. Each dataset row becomes
    one table row: ``parent_dir`` is derived from the file path, ``prompt_id``
    from the ``ID`` entry of the row's ``specifics`` literal (``-1`` when
    absent), ``prompt`` from ``full_prompt`` and ``completion`` from the first
    string found inside the ``predictions`` literal. ``count``,
    ``temperature``, ``top_p`` and ``max_tokens`` are placeholders.

    Args:
        conn: Open DuckDB connection used for all statements.
        results_dir: Directory that is searched recursively for ``*.parquet``.

    Raises:
        ValueError: if a Parquet file is not below a ``results/details``
            directory.

    NOTE(review): rows are added with a plain INSERT, so re-importing rows
    that are already present presumably violates the primary key - confirm
    before running twice against the same database.
    """
    # SQL schema (matching results.completions)
    create_string = """CREATE TABLE IF NOT EXISTS completions  (
        parent_dir TEXT NOT NULL,
        prompt_id INT NOT NULL,
        prompt TEXT NOT NULL,
        completion TEXT NOT NULL,
        count INT NOT NULL,
        temperature FLOAT NOT NULL,
        top_p FLOAT NOT NULL,
        max_tokens INT NOT NULL,
        PRIMARY KEY (parent_dir, prompt_id, prompt, completion)
    )"""
    conn.execute(create_string)

    # Placeholder values: the source files do not provide these columns.
    count = 0
    temperature = 0.0
    top_p = 0.0
    max_tokens = 2048

    created = 0  # Counter for inserted rows
    # Sorted so that files are processed in a deterministic order.
    for parquet_path in sorted(results_dir.rglob("*.parquet")):
        # Identical for every row of the file, so compute it once per file.
        parent_dir = _parent_dir_from_path(parquet_path)
        details = load_dataset(
            "parquet", data_files=str(parquet_path), split="train"
        )

        for completion_item in details:
            # 'specifics' is stored as the string form of a dict literal.
            specifics = ast.literal_eval(completion_item['specifics'])
            prompt_id = specifics.get('ID', -1)
            prompt = completion_item.get("full_prompt", "")
            # 'predictions' is a stringified, possibly nested, sequence;
            # only its first string is stored.
            predictions = ast.literal_eval(completion_item.get("predictions", ""))
            completion = _first_string(predictions[0])

            conn.execute(
                "INSERT INTO completions VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                (parent_dir, prompt_id, prompt, completion, count, temperature, top_p, max_tokens),
            )
            created += 1

    conn.commit()
    print(f"Created {created} rows.")


def to_duckdb(db_path: str, results_dir: Path):
    """Open the DuckDB database at *db_path* and export *results_dir* into it.

    Args:
        db_path: Path to the DuckDB database file.
        results_dir: Directory handed to ``export_to_duckdb``.

    The connection is closed even when the export raises, so a failed run
    does not leave the database handle open.
    """
    conn = duckdb.connect(db_path)
    try:
        export_to_duckdb(conn, results_dir)
    finally:
        conn.close()

def main(argv=None):
    """Command-line interface for exporting Parquet data to DuckDB.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the process arguments, so plain ``main()`` keeps
            working; passing a list allows calling the CLI programmatically.

    Raises:
        SystemExit: raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("db_path", type=str, help="Path to DuckDB database file")
    parser.add_argument("results_dir", type=Path, help="Path to results dir")
    args = parser.parse_args(argv)
    to_duckdb(args.db_path, args.results_dir)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# Example usage (with results_dir = "/mnt/ssd/aryawu/lighteval/results/details"):
#   python npr_to_duckdb.py "/mnt/ssd/aryawu/lighteval/results.duckdb" "/mnt/ssd/aryawu/lighteval/results/details"