add llmonitor & start scoring
Files changed:
- pages/index.js +22 -16
- run/database.db +0 -0
- run/queriers.py +4 -0
- run/requirements.txt +2 -1
- run/run.py +86 -9
pages/index.js
CHANGED
@@ -47,7 +47,7 @@ export default function Home({ prompts, models }) {
       <meta name="viewport" content="width=device-width, initial-scale=1" />
     </Head>
     <main>
-      <h1>
+      <h1>Crowdsourced LLM Benchmark</h1>
       <br />
       <p>
         Benchmarks like HellaSwag are a bit too abstract for me to get a sense
@@ -69,13 +69,13 @@ export default function Home({ prompts, models }) {
       <br />
       <p>
         {`view: `}
+        <a href="#" onClick={() => changeView("model")}>
+          models
+        </a>{" "}
+        /
         <a href="#" onClick={() => changeView("prompt")}>
-          …
+          prompts
         </a>{" "}
-        /{" "}
-        <a href="#" onClick={() => changeView("model")}>
-          all models
-        </a>
       </p>
       <br />
       {viewBy === "prompt" ? (
@@ -103,16 +103,22 @@ export default function Home({ prompts, models }) {
         </>
       ) : (
         <ul>
-          {models
-            …
+          {models
+            .sort((a, b) => b.score - a.score)
+            .map((model, i) => (
+              <li key={i}>
+                {model.name} -{" "}
+                <Link
+                  href={`/model/${model.api_id
+                    .split("/")
+                    .pop()
+                    .toLowerCase()}`}
+                >
+                  results
+                </Link>{" "}
+                - score: {model.score}
+              </li>
+            ))}
         </ul>
       )}
       <br />
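The new leaderboard branch links each model to a results page whose slug is derived from its api_id: last path segment, lowercased. A quick sketch of the same derivation in Python for reference; the helper name and example IDs are illustrative, not part of this commit:

def model_slug(api_id):
    # Mirror the JSX logic: api_id.split("/").pop().toLowerCase()
    return api_id.split("/")[-1].lower()

assert model_slug("togethercomputer/llama-2-70b-chat") == "llama-2-70b-chat"
assert model_slug("gpt-4") == "gpt-4"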
run/database.db
CHANGED
Binary files a/run/database.db and b/run/database.db differ
run/queriers.py
CHANGED
@@ -4,6 +4,8 @@ import json
 import requests
 from dotenv import load_dotenv
 
+from llmonitor import monitor
+
 load_dotenv()
 
 TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
@@ -15,6 +17,8 @@ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 
 MAX_TOKENS = 300
 
+monitor(openai)
+
 def together(model, params):
     def format_prompt(prompt, prompt_type):
         if prompt_type == "language":
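The queriers change wires llmonitor into the existing OpenAI client: monitor(openai) patches the module so every completion call gets logged. A minimal sketch of that setup under the pre-1.0 openai SDK this repo appears to use; the LLMONITOR_APP_ID variable name is an assumption about the SDK's configuration, not something this diff shows:

import os
import openai
from llmonitor import monitor

os.environ.setdefault("LLMONITOR_APP_ID", "<your-app-id>")  # assumed env var name
openai.api_key = os.getenv("OPENAI_API_KEY")

monitor(openai)  # patch the module; subsequent calls are traced

# Any call made through the patched module now shows up in the dashboard:
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
)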
run/requirements.txt
CHANGED
@@ -2,4 +2,5 @@ openai
 pandas
 requests
 python-dotenv
-gradio
+gradio
+llmonitor
run/run.py
CHANGED
@@ -1,6 +1,7 @@
 import sqlite3
 import time
-
+from termcolor import colored
+from llmonitor import agent
 from queriers import together, cohere, openai_func, openrouter, ai21, alephalpha
 
 db = sqlite3.connect("./database.db")
@@ -22,6 +23,12 @@ models = [dict(model) for model in models]
 prompts = cursor.execute("SELECT * FROM prompts").fetchall()
 prompts = [dict(prompt) for prompt in prompts]
 
+
+def get_results():
+    results = cursor.execute("SELECT * FROM results").fetchall()
+    print(results[0].keys())
+    return [dict(result) for result in results]
+
 def insert_result(modelId, promptId, result, duration, rate):
     cursor.execute(
         "INSERT INTO results (model, prompt, result, duration, rate) VALUES (?, ?, ?, ?, ?)",
@@ -89,15 +96,85 @@ def ask_prompt(prompt, model):
 total_benchmarks = len(models) * len(prompts)
 print(f"Running {total_benchmarks} benchmarks")
 
-for model in models:
-    if model["type"] == "language":
-        continue
-    for prompt in prompts:
-        if prompt["type"] != "code" and model["type"] == "code":
-            print("Skipping non-code benchmark for code model")
-            continue
 
-        ask_prompt(prompt, model)
+# Run prompts
+# for model in models:
+#     if model["type"] == "language":
+#         continue
+#     for prompt in prompts:
+#         if prompt["type"] != "code" and model["type"] == "code":
+#             print("Skipping non-code benchmark for code model")
+#             continue
+
+#         ask_prompt(prompt, model)
+
+# Calculate scores
+results = get_results()
+
+@agent(name="RateResult")
+def rate_result(result):
+    rubrics = cursor.execute(
+        "SELECT * FROM rubrics WHERE prompt = ?",
+        (result["prompt"],)
+    ).fetchall()
+
+    has_rubrics = len(rubrics) > 0
+
+    if not has_rubrics:
+        return
+
+    print(colored('---------------------------', 'white'))
+    print(colored('----------RATING-----------', 'white'))
+    print(colored('---------------------------', 'white'))
+    print(colored(result["result"], 'cyan'))
+    print(colored('---------------------------', 'white'))
+
+    score = None
+
+    for rubric in rubrics:
+
+        print('Rubric: ' + colored(rubric["grading"], 'magenta'))
+
+        if result["result"].strip() == "":
+            score = 0
+        else:
+            grading_text = (
+                f'You help verify that the following answer matches this condition: the answer {rubric["grading"]}. Note: the answer might be incomplete, in which case do your best to assess based on what the full result would be.\n\n'
+                f'\n\n--START OF THE ANSWER--\n{result["result"]}\n--END OF THE ANSWER--\n\n'
+                f'Take a deep breath and explain step by step how you came to the conclusion. '
+                f'Finally, reply on the last line with YES if the answer matches this condition (otherwise reply NO).'
+            )
+
+            # get the gpt-4 model row to use as the grader
+            gpt4 = next((item for item in models if item['api_id'] == 'gpt-4'), None)
+
+            response_text = openai_func(gpt4, {"text": grading_text})
+
+            print(colored(f"-> {response_text}", 'yellow'))
+
+            last_line = response_text.splitlines()[-1]
+
+            # If the last line includes YES, the rubric is satisfied
+            if "YES" in last_line:
+                print(colored(f'Valid! + {rubric["points"]} points', 'green'))
+                score = rubric["points"] if score is None else score + rubric["points"]
+
+    print('Final score: ' + colored(score, 'cyan'))
+
+    return score
+
+
+for result in results:
+    if not result["score"]:
+        score = rate_result(result)
+
+        if score is not None:
+            cursor.execute(
+                "UPDATE results SET score = ? WHERE id == ?",
+                (score, result["id"])
+            )
+            db.commit()
 
 db.close()
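The grading flow added here reduces to a small contract: build one prompt per rubric, ask GPT-4 to reason step by step, then read the verdict off the last line of its reply. A condensed, self-contained sketch of that contract with the grader stubbed out; the function names are illustrative, and run.py's real grader is openai_func called with the gpt-4 model row:

def parse_verdict(response_text):
    # The grader is instructed to end with YES or NO on its last line.
    return "YES" in response_text.splitlines()[-1]

def score_against_rubrics(answer, rubrics, grade):
    # `grade` is any callable mapping a grading prompt to the model's reply.
    if not rubrics:
        return None  # nothing to grade against; leave the result unscored
    if answer.strip() == "":
        return 0  # empty answers score zero outright
    score = None
    for rubric in rubrics:
        grading_text = (
            f'You help verify that the following answer matches this condition: '
            f'the answer {rubric["grading"]}.\n\n'
            f'--START OF THE ANSWER--\n{answer}\n--END OF THE ANSWER--\n\n'
            f'Explain step by step, then reply on the last line with YES or NO.'
        )
        if parse_verdict(grade(grading_text)):
            score = rubric["points"] if score is None else score + rubric["points"]
    return score

# Example with a stub grader that always ends on YES:
rubrics = [{"grading": "mentions Paris", "points": 5}]
print(score_against_rubrics("The capital of France is Paris.", rubrics,
                            lambda text: "Reasoning...\nYES"))  # -> 5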