Spaces:
Sleeping
Sleeping
- metrics.py +0 -37
metrics.py
CHANGED
|
@@ -100,11 +100,6 @@ def load_results_sample_one_only():
|
|
| 100 |
WHERE rn = 1;
|
| 101 |
"""
|
| 102 |
conn.execute(query).fetchall()
|
| 103 |
-
# #print how how many rows are in the table
|
| 104 |
-
# print(conn.execute("SELECT COUNT(*) FROM sampled").fetchall())
|
| 105 |
-
# #describe the sampled table
|
| 106 |
-
# print(conn.execute("DESCRIBE sampled").fetchall())
|
| 107 |
-
|
| 108 |
conn.execute("""
|
| 109 |
CREATE TABLE challenges AS
|
| 110 |
SELECT * FROM 'puzzles_cleaned.csv'
|
|
@@ -218,38 +213,6 @@ def accuracy_by_model(conn):
|
|
| 218 |
AnswerCheck
|
| 219 |
""")
|
| 220 |
|
| 221 |
-
def accuracy_by_model_only_one(conn):
|
| 222 |
-
query = """
|
| 223 |
-
WITH FirstResponses AS (
|
| 224 |
-
SELECT
|
| 225 |
-
parent_dir AS model,
|
| 226 |
-
prompt_id,
|
| 227 |
-
completion,
|
| 228 |
-
count,
|
| 229 |
-
ROW_NUMBER() OVER (PARTITION BY parent_dir, prompt_id) AS rn
|
| 230 |
-
FROM results.completions
|
| 231 |
-
WHERE parent_dir = 'completions-r1_cursor_hosted' -- Only consider rows where parent_dir is 'r1_cursor_hosted'
|
| 232 |
-
),
|
| 233 |
-
AnswerCheck AS (
|
| 234 |
-
SELECT
|
| 235 |
-
fr.model,
|
| 236 |
-
SUM(fr.count) AS total,
|
| 237 |
-
SUM(fr.count * CAST(check_answer(fr.completion, c.answer) AS INTEGER)) AS correct
|
| 238 |
-
FROM FirstResponses fr
|
| 239 |
-
JOIN challenges c ON fr.prompt_id = c.ID
|
| 240 |
-
WHERE fr.rn = 1 -- Select only the first response per model per prompt
|
| 241 |
-
GROUP BY fr.model
|
| 242 |
-
)
|
| 243 |
-
SELECT
|
| 244 |
-
model,
|
| 245 |
-
total,
|
| 246 |
-
correct,
|
| 247 |
-
ROUND(correct / total, 2) AS accuracy
|
| 248 |
-
FROM AnswerCheck;
|
| 249 |
-
"""
|
| 250 |
-
return conn.sql(query)
|
| 251 |
-
|
| 252 |
-
|
| 253 |
def main():
|
| 254 |
parser = argparse.ArgumentParser()
|
| 255 |
parser.add_argument("--by-model-and-time", action="store_true")
|
|
|
|
| 100 |
WHERE rn = 1;
|
| 101 |
"""
|
| 102 |
conn.execute(query).fetchall()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
conn.execute("""
|
| 104 |
CREATE TABLE challenges AS
|
| 105 |
SELECT * FROM 'puzzles_cleaned.csv'
|
|
|
|
| 213 |
AnswerCheck
|
| 214 |
""")
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
def main():
|
| 217 |
parser = argparse.ArgumentParser()
|
| 218 |
parser.add_argument("--by-model-and-time", action="store_true")
|