Initial design

- app.py +38 -3
- best_judges_single_agg.csv +49 -49
app.py
CHANGED
@@ -1,6 +1,41 @@
 import pandas as pd
 import streamlit as st
 
-
-
-
+st.set_page_config(
+    page_title="JuStRank",
+    page_icon="🏋️♂️",
+    layout="wide",
+    initial_sidebar_state="auto",
+    menu_items=None,
+)
+
+st.title("JuStRank")
+st.subheader("Judges for Ranking Candidate Systems")
+
+
+def prettify_judge_name(judge_name):
+    pretty_judge = (judge_name[0].upper()+judge_name[1:]).replace("Gpt", "GPT")
+    return pretty_judge
+
+
+def format_digits(flt, num_digits=3):
+    format_str = "{:."+str(num_digits-1)+"f}"
+    format_str_zeroes = "{:."+str(num_digits)+"f}"
+    return format_str_zeroes.format(flt)[1:] if (0 < flt < 1) else format_str.format(flt)
+
+
+df = pd.read_csv("./best_judges_single_agg.csv")[["Judge Model", "Realization", "Ranking Agreement", "Decisiveness", "Bias"]]
+df["Judge Model"] = df["Judge Model"].apply(prettify_judge_name)
+
+styled_data = (
+    df.style.background_gradient(subset=["Ranking Agreement"])
+    .background_gradient(
+        subset=["Ranking Agreement"],
+        cmap="RdYlGn",
+        vmin=0.5,
+        vmax=df["Ranking Agreement"].max(),
+    )
+    .format(subset=["Ranking Agreement", "Decisiveness", "Bias"], formatter=format_digits)
+    .set_properties(**{"text-align": "center"})
+)
+st.dataframe(styled_data, hide_index=True, use_container_width=True)
best_judges_single_agg.csv
CHANGED
@@ -1,49 +1,49 @@
-
-URM-LLaMa-3.1-8B,URM-LLaMa-3.1-8B_BT,Reward,0.
-Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#bad-good_textual-score_BT,Likert,0.
-Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#0-100_verbalized-score_BT,Numeric,0.
-mistral-large-instruct-2407,mistral-large-instruct-2407#bad-good_textual-score_BT,Likert,0.
-gpt-4o-2024-11-20,gpt-4o-2024-11-20#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-mistral-large-instruct-2407,mistral-large-instruct-2407#0-100_verbalized-score_BT,Numeric,0.
-llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#0-100_verbalized-score_BT,Numeric,0.
-gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#0-100_verbalized-score_BT,Numeric,0.
-gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#bad-good_textual-score_BT,Likert,0.
-llama-3-1-70b-instruct,llama-3-1-70b-instruct#0-100_verbalized-score_BT,Numeric,0.
-Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#bad-good_textual-score_BT,Likert,0.
-Skywork-Reward-Llama-3.1-8B-v0.2,Skywork-Reward-Llama-3.1-8B-v0.2_BT,Reward,0.
-Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.
-mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.
-gpt-4o-2024-11-20,gpt-4o-2024-11-20#0-100_verbalized-score_BT,Numeric,0.
-gpt-4o-2024-11-20,gpt-4o-2024-11-20#bad-good_textual-score_BT,Likert,0.
-llama-3-1-70b-instruct,llama-3-1-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.
-Llama-3-OffsetBias-RM-8B,Llama-3-OffsetBias-RM-8B_BT,Reward,0.
-ArmoRM-Llama3-8B-v0.1,ArmoRM-Llama3-8B-v0.1_BT,Reward,0.
-gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#good-yes-no_logprob-score_BT,TokenProbs,0.
-llama-3-70b-instruct,llama-3-70b-instruct#0-100_verbalized-score_BT,Numeric,0.
-Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.
-mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.
-llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-mistral-large-instruct-2407,mistral-large-instruct-2407#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#bad-good_textual-score_BT,Likert,0.
-llama-3-1-70b-instruct,llama-3-1-70b-instruct#bad-good_textual-score_BT,Likert,0.
-internlm2-20b-reward,internlm2-20b-reward_BT,Reward,0.
-internlm2-7b-reward,internlm2-7b-reward_BT,Reward,0.
-GRM-Llama3.2-3B-rewardmodel-ft,GRM-Llama3.2-3B-rewardmodel-ft_BT,Reward,0.
-mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.
-gpt-4o-2024-11-20,gpt-4o-2024-11-20#good-yes-no_logprob-score_BT,TokenProbs,0.
-llama-3-70b-instruct,llama-3-70b-instruct#bad-good_textual-score_BT,Likert,0.
-llama-3-1-70b-instruct,llama-3-1-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#good-yes-no_logprob-score_BT,TokenProbs,0.
-Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#0-100_verbalized-score_BT,Numeric,0.
-llama-3-70b-instruct,llama-3-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.
-gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.
-mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-llama-3-70b-instruct,llama-3-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.
-Eurus-RM-7b,Eurus-RM-7b_BT,Reward,0.
-mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.
-mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.
-mistral-large-instruct-2407,mistral-large-instruct-2407#good-yes-no_logprob-score_BT,TokenProbs,0.
+Judge Model,full_name,Realization,Ranking Agreement,Decisiveness,Bias,ci_low,ci_high
+URM-LLaMa-3.1-8B,URM-LLaMa-3.1-8B_BT,Reward,0.818819404,1.836856612,0.084857794,,
+Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#bad-good_textual-score_BT,Likert,0.817304917,4.755366194,0.079246328,,
+Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#0-100_verbalized-score_BT,Numeric,0.814143776,4.087812685,0.079252047,,
+mistral-large-instruct-2407,mistral-large-instruct-2407#bad-good_textual-score_BT,Likert,0.810637054,5.471086171,0.085717614,,
+gpt-4o-2024-11-20,gpt-4o-2024-11-20#comparative-anchor-gpt-4-0314_BT,Anchor,0.809468147,3.073700094,0.084683633,,
+mistral-large-instruct-2407,mistral-large-instruct-2407#0-100_verbalized-score_BT,Numeric,0.809468147,3.010016115,0.082093515,,
+llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#0-100_verbalized-score_BT,Numeric,0.804792519,4.330580225,0.087135759,,
+gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#0-100_verbalized-score_BT,Numeric,0.803623612,2.911340337,0.076904561,,
+gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#bad-good_textual-score_BT,Likert,0.797779077,4.610807214,0.087159333,,
+llama-3-1-70b-instruct,llama-3-1-70b-instruct#0-100_verbalized-score_BT,Numeric,0.797779077,2.693966881,0.086833957,,
+Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.794272355,2.929554168,0.089619978,,
+llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#bad-good_textual-score_BT,Likert,0.787258913,5.218423477,0.097263668,,
+Skywork-Reward-Llama-3.1-8B-v0.2,Skywork-Reward-Llama-3.1-8B-v0.2_BT,Reward,0.777907656,2.461196439,0.099684483,,
+Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.776738749,2.689252148,0.081655614,,
+mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.775569842,2.123702381,0.088781499,,
+gpt-4o-2024-11-20,gpt-4o-2024-11-20#0-100_verbalized-score_BT,Numeric,0.774400935,2.147368211,0.07704892,,
+gpt-4o-2024-11-20,gpt-4o-2024-11-20#bad-good_textual-score_BT,Likert,0.772873462,5.485635896,0.089067918,,
+llama-3-1-70b-instruct,llama-3-1-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.765049679,1.259994089,0.069748002,,
+Llama-3-OffsetBias-RM-8B,Llama-3-OffsetBias-RM-8B_BT,Reward,0.765049679,1.386859931,0.075669848,,
+ArmoRM-Llama3-8B-v0.1,ArmoRM-Llama3-8B-v0.1_BT,Reward,0.762711864,1.839870032,0.092372835,,
+gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#good-yes-no_logprob-score_BT,TokenProbs,0.752191701,2.102594937,0.08401741,,
+llama-3-70b-instruct,llama-3-70b-instruct#0-100_verbalized-score_BT,Numeric,0.74868498,1.273829005,0.08433286,,
+Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.74050263,0.598380841,0.0612823,,
+mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.738164816,2.534301905,0.107585602,,
+llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#comparative-anchor-gpt-4-0314_BT,Anchor,0.729982466,3.577096075,0.112359855,,
+mistral-large-instruct-2407,mistral-large-instruct-2407#comparative-anchor-gpt-4-0314_BT,Anchor,0.725306838,2.129762371,0.111101469,,
+Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#bad-good_textual-score_BT,Likert,0.722969024,0.934857221,0.090203472,,
+llama-3-1-70b-instruct,llama-3-1-70b-instruct#bad-good_textual-score_BT,Likert,0.721800117,3.901943148,0.120093327,,
+internlm2-20b-reward,internlm2-20b-reward_BT,Reward,0.717124489,1.900369161,0.098389178,,
+internlm2-7b-reward,internlm2-7b-reward_BT,Reward,0.71244886,2.3536645,0.113364304,,
+GRM-Llama3.2-3B-rewardmodel-ft,GRM-Llama3.2-3B-rewardmodel-ft_BT,Reward,0.711279953,2.302320479,0.113801314,,
+mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.701928697,1.84957128,0.088445538,,
+gpt-4o-2024-11-20,gpt-4o-2024-11-20#good-yes-no_logprob-score_BT,TokenProbs,0.700380036,2.224158523,0.093196512,,
+llama-3-70b-instruct,llama-3-70b-instruct#bad-good_textual-score_BT,Likert,0.698421975,2.400124125,0.122002475,,
+llama-3-1-70b-instruct,llama-3-1-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.687901812,2.711477731,0.12621163,,
+Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.677381648,0.868408018,0.085109852,,
+llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#good-yes-no_logprob-score_BT,TokenProbs,0.671537113,1.549707158,0.09227088,,
+Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#0-100_verbalized-score_BT,Numeric,0.668030392,1.204521517,0.104312797,,
+llama-3-70b-instruct,llama-3-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.663354763,0.774852442,0.07120683,,
+gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#comparative-anchor-gpt-4-0314_BT,Anchor,0.658679135,1.412320144,0.110822533,,
+mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.656341321,1.270349906,0.102331076,,
+mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.655172414,1.167971623,0.101686017,,
+mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.641145529,1.497179025,0.13983279,,
+llama-3-70b-instruct,llama-3-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.632963179,1.820241227,0.131968221,,
+Eurus-RM-7b,Eurus-RM-7b_BT,Reward,0.628287551,2.492726583,0.138112675,,
+mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.589713618,0.838122343,0.110173865,,
+mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.427235535,0.738981983,0.107075052,,
+mistral-large-instruct-2407,mistral-large-instruct-2407#good-yes-no_logprob-score_BT,TokenProbs,0.368790181,1.165216882,0.122587285,,
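
For inspecting the updated data outside the app, a minimal pandas sketch (assumes the CSV sits in the working directory; ci_low and ci_high exist in the header but are left empty in this commit):

import pandas as pd

# Load the committed CSV and look at the top judges by Ranking Agreement.
data = pd.read_csv("best_judges_single_agg.csv")
print(data.columns.tolist())
# ['Judge Model', 'full_name', 'Realization', 'Ranking Agreement', 'Decisiveness', 'Bias', 'ci_low', 'ci_high']
print(data.nlargest(3, "Ranking Agreement")[["Judge Model", "Realization", "Ranking Agreement"]])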