arielgera committed
Commit 9b2d9e3
1 Parent(s): 09f141d

Initial design

Files changed (2)
  1. app.py +38 -3
  2. best_judges_single_agg.csv +49 -49
app.py CHANGED
@@ -1,6 +1,41 @@
  import pandas as pd
  import streamlit as st

- x = st.slider('Select a value')
- df = pd.read_csv("./best_judges_single_agg.csv")
- st.dataframe(df)
+ st.set_page_config(
+     page_title="JuStRank",
+     page_icon="🏋️‍♂️",
+     layout="wide",
+     initial_sidebar_state="auto",
+     menu_items=None,
+ )
+
+ st.title("JuStRank")
+ st.subheader("Judges for Ranking Candidate Systems")
+
+
+ def prettify_judge_name(judge_name):
+     pretty_judge = (judge_name[0].upper()+judge_name[1:]).replace("Gpt", "GPT")
+     return pretty_judge
+
+
+ def format_digits(flt, num_digits=3):
+     format_str = "{:."+str(num_digits-1)+"f}"
+     format_str_zeroes = "{:."+str(num_digits)+"f}"
+     return format_str_zeroes.format(flt)[1:] if (0 < flt < 1) else format_str.format(flt)
+
+
+ df = pd.read_csv("./best_judges_single_agg.csv")[["Judge Model", "Realization", "Ranking Agreement", "Decisiveness", "Bias"]]
+ df["Judge Model"] = df["Judge Model"].apply(prettify_judge_name)
+
+ styled_data = (
+     df.style.background_gradient(subset=["Ranking Agreement"])
+     .background_gradient(
+         subset=["Ranking Agreement"],
+         cmap="RdYlGn",
+         vmin=0.5,
+         vmax=df["Ranking Agreement"].max(),
+     )
+     .format(subset=["Ranking Agreement", "Decisiveness", "Bias"], formatter=format_digits)
+     .set_properties(**{"text-align": "center"})
+ )
+ st.dataframe(styled_data, hide_index=True, use_container_width=True)
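
Note: a minimal sketch of the number formatting the new format_digits helper produces (the function is copied verbatim from the diff above; sample values are taken from the CSV below). With the default num_digits=3, scores in (0, 1) are shown to three decimals with the leading zero stripped, and other values get two decimals:

    def format_digits(flt, num_digits=3):
        format_str = "{:."+str(num_digits-1)+"f}"
        format_str_zeroes = "{:."+str(num_digits)+"f}"
        return format_str_zeroes.format(flt)[1:] if (0 < flt < 1) else format_str.format(flt)

    print(format_digits(0.818819404))  # ".819" -- leading zero dropped for values in (0, 1)
    print(format_digits(1.836856612))  # "1.84" -- num_digits-1 decimals otherwise

The background_gradient call colors the Ranking Agreement column on an RdYlGn scale anchored between vmin=0.5 and the column maximum.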
best_judges_single_agg.csv CHANGED
@@ -1,49 +1,49 @@
- Judge_Model,full_name,Realization,kt_with_elo,beta_fit,bias_std,ci_low,ci_high
- URM-LLaMa-3.1-8B,URM-LLaMa-3.1-8B_BT,Reward,0.8188194038573934,1.8368566117313365,0.08485779417852375,,
- Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#bad-good_textual-score_BT,Likert,0.8173049165314519,4.755366194422462,0.07924632786346093,,
- Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#0-100_verbalized-score_BT,Numeric,0.8141437755698421,4.0878126854835,0.07925204683413378,,
- mistral-large-instruct-2407,mistral-large-instruct-2407#bad-good_textual-score_BT,Likert,0.8106370543541787,5.471086170786369,0.08571761438483068,,
- gpt-4o-2024-11-20,gpt-4o-2024-11-20#comparative-anchor-gpt-4-0314_BT,Anchor,0.8094681472822909,3.0737000941858494,0.08468363327726611,,
- mistral-large-instruct-2407,mistral-large-instruct-2407#0-100_verbalized-score_BT,Numeric,0.8094681472822909,3.0100161147388955,0.0820935147925871,,
- llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#0-100_verbalized-score_BT,Numeric,0.8047925189947399,4.330580224795526,0.08713575870563035,,
- gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#0-100_verbalized-score_BT,Numeric,0.8036236119228521,2.911340336840001,0.07690456122004496,,
- gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#bad-good_textual-score_BT,Likert,0.7977790765634132,4.610807214374205,0.08715933303084132,,
- llama-3-1-70b-instruct,llama-3-1-70b-instruct#0-100_verbalized-score_BT,Numeric,0.7977790765634132,2.6939668808867303,0.08683395704068392,,
- Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.7942723553477498,2.9295541683113755,0.08961997841795598,,
- llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#bad-good_textual-score_BT,Likert,0.7872589129164231,5.218423477156838,0.09726366816471475,,
- Skywork-Reward-Llama-3.1-8B-v0.2,Skywork-Reward-Llama-3.1-8B-v0.2_BT,Reward,0.7779076563413208,2.461196439206365,0.09968448349021375,,
- Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.776738749269433,2.689252148396911,0.08165561425906073,,
- mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.7755698421975452,2.123702380621778,0.08878149867714963,,
- gpt-4o-2024-11-20,gpt-4o-2024-11-20#0-100_verbalized-score_BT,Numeric,0.7744009351256574,2.147368211099292,0.07704891970422703,,
- gpt-4o-2024-11-20,gpt-4o-2024-11-20#bad-good_textual-score_BT,Likert,0.7728734618416162,5.485635896159162,0.08906791777991026,,
- llama-3-1-70b-instruct,llama-3-1-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.7650496785505552,1.2599940887461256,0.06974800184734668,,
- Llama-3-OffsetBias-RM-8B,Llama-3-OffsetBias-RM-8B_BT,Reward,0.7650496785505552,1.386859930640412,0.07566984800414184,,
- ArmoRM-Llama3-8B-v0.1,ArmoRM-Llama3-8B-v0.1_BT,Reward,0.7627118644067796,1.8398700318743302,0.09237283513647337,,
- gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#good-yes-no_logprob-score_BT,TokenProbs,0.7521917007597895,2.10259493695348,0.08401740959016844,,
- llama-3-70b-instruct,llama-3-70b-instruct#0-100_verbalized-score_BT,Numeric,0.7486849795441262,1.2738290052706196,0.0843328604031163,,
- Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.7405026300409117,0.5983808410581415,0.06128229980875954,,
- mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.7381648158971361,2.534301904660848,0.10758560218545998,,
- llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#comparative-anchor-gpt-4-0314_BT,Anchor,0.7299824663939216,3.577096074866618,0.11235985485344185,,
- mistral-large-instruct-2407,mistral-large-instruct-2407#comparative-anchor-gpt-4-0314_BT,Anchor,0.7253068381063704,2.1297623705935567,0.11110146913759139,,
- Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#bad-good_textual-score_BT,Likert,0.7229690239625949,0.9348572206360583,0.09020347195052465,,
- llama-3-1-70b-instruct,llama-3-1-70b-instruct#bad-good_textual-score_BT,Likert,0.7218001168907071,3.901943147622847,0.12009332653938945,,
- internlm2-20b-reward,internlm2-20b-reward_BT,Reward,0.7171244886031559,1.9003691605695128,0.09838917792721068,,
- internlm2-7b-reward,internlm2-7b-reward_BT,Reward,0.7124488603156048,2.353664499796978,0.11336430399297745,,
- GRM-Llama3.2-3B-rewardmodel-ft,GRM-Llama3.2-3B-rewardmodel-ft_BT,Reward,0.7112799532437171,2.302320479143431,0.11380131384129795,,
- mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.7019286966686147,1.849571280249153,0.08844553785560817,,
- gpt-4o-2024-11-20,gpt-4o-2024-11-20#good-yes-no_logprob-score_BT,TokenProbs,0.7003800357687263,2.2241585234952765,0.09319651198050675,,
- llama-3-70b-instruct,llama-3-70b-instruct#bad-good_textual-score_BT,Likert,0.6984219754529515,2.4001241250045453,0.12200247480571864,,
- llama-3-1-70b-instruct,llama-3-1-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.6879018118059613,2.711477731374198,0.1262116303875313,,
- Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.6773816481589713,0.8684080176854654,0.08510985248268717,,
- llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#good-yes-no_logprob-score_BT,TokenProbs,0.6715371127995324,1.5497071579540496,0.09227087986533976,,
- Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#0-100_verbalized-score_BT,Numeric,0.668030391583869,1.2045215166190555,0.10431279729383011,,
- llama-3-70b-instruct,llama-3-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.6633547632963179,0.774852442203426,0.0712068298228739,,
- gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#comparative-anchor-gpt-4-0314_BT,Anchor,0.6586791350087667,1.4123201435751862,0.1108225332258117,,
- mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.6563413208649912,1.2703499058409227,0.10233107571803857,,
- mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.6551724137931034,1.1679716230736195,0.10168601661101805,,
- mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.64114552893045,1.4971790245204495,0.1398327898420888,,
- llama-3-70b-instruct,llama-3-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.6329631794272355,1.8202412271839277,0.13196822136679537,,
- Eurus-RM-7b,Eurus-RM-7b_BT,Reward,0.6282875511396844,2.492726583183693,0.13811267469299746,,
- mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.5897136177673874,0.8381223432288695,0.11017386542259801,,
- mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.4272355347749853,0.7389819834751149,0.1070750522871957,,
- mistral-large-instruct-2407,mistral-large-instruct-2407#good-yes-no_logprob-score_BT,TokenProbs,0.3687901811805961,1.1652168815452648,0.12258728493955592,,
+ Judge Model,full_name,Realization,Ranking Agreement,Decisiveness,Bias,ci_low,ci_high
+ URM-LLaMa-3.1-8B,URM-LLaMa-3.1-8B_BT,Reward,0.818819404,1.836856612,0.084857794,,
+ Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#bad-good_textual-score_BT,Likert,0.817304917,4.755366194,0.079246328,,
+ Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#0-100_verbalized-score_BT,Numeric,0.814143776,4.087812685,0.079252047,,
+ mistral-large-instruct-2407,mistral-large-instruct-2407#bad-good_textual-score_BT,Likert,0.810637054,5.471086171,0.085717614,,
+ gpt-4o-2024-11-20,gpt-4o-2024-11-20#comparative-anchor-gpt-4-0314_BT,Anchor,0.809468147,3.073700094,0.084683633,,
+ mistral-large-instruct-2407,mistral-large-instruct-2407#0-100_verbalized-score_BT,Numeric,0.809468147,3.010016115,0.082093515,,
+ llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#0-100_verbalized-score_BT,Numeric,0.804792519,4.330580225,0.087135759,,
+ gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#0-100_verbalized-score_BT,Numeric,0.803623612,2.911340337,0.076904561,,
+ gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#bad-good_textual-score_BT,Likert,0.797779077,4.610807214,0.087159333,,
+ llama-3-1-70b-instruct,llama-3-1-70b-instruct#0-100_verbalized-score_BT,Numeric,0.797779077,2.693966881,0.086833957,,
+ Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.794272355,2.929554168,0.089619978,,
+ llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#bad-good_textual-score_BT,Likert,0.787258913,5.218423477,0.097263668,,
+ Skywork-Reward-Llama-3.1-8B-v0.2,Skywork-Reward-Llama-3.1-8B-v0.2_BT,Reward,0.777907656,2.461196439,0.099684483,,
+ Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.776738749,2.689252148,0.081655614,,
+ mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.775569842,2.123702381,0.088781499,,
+ gpt-4o-2024-11-20,gpt-4o-2024-11-20#0-100_verbalized-score_BT,Numeric,0.774400935,2.147368211,0.07704892,,
+ gpt-4o-2024-11-20,gpt-4o-2024-11-20#bad-good_textual-score_BT,Likert,0.772873462,5.485635896,0.089067918,,
+ llama-3-1-70b-instruct,llama-3-1-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.765049679,1.259994089,0.069748002,,
+ Llama-3-OffsetBias-RM-8B,Llama-3-OffsetBias-RM-8B_BT,Reward,0.765049679,1.386859931,0.075669848,,
+ ArmoRM-Llama3-8B-v0.1,ArmoRM-Llama3-8B-v0.1_BT,Reward,0.762711864,1.839870032,0.092372835,,
+ gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#good-yes-no_logprob-score_BT,TokenProbs,0.752191701,2.102594937,0.08401741,,
+ llama-3-70b-instruct,llama-3-70b-instruct#0-100_verbalized-score_BT,Numeric,0.74868498,1.273829005,0.08433286,,
+ Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.74050263,0.598380841,0.0612823,,
+ mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.738164816,2.534301905,0.107585602,,
+ llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#comparative-anchor-gpt-4-0314_BT,Anchor,0.729982466,3.577096075,0.112359855,,
+ mistral-large-instruct-2407,mistral-large-instruct-2407#comparative-anchor-gpt-4-0314_BT,Anchor,0.725306838,2.129762371,0.111101469,,
+ Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#bad-good_textual-score_BT,Likert,0.722969024,0.934857221,0.090203472,,
+ llama-3-1-70b-instruct,llama-3-1-70b-instruct#bad-good_textual-score_BT,Likert,0.721800117,3.901943148,0.120093327,,
+ internlm2-20b-reward,internlm2-20b-reward_BT,Reward,0.717124489,1.900369161,0.098389178,,
+ internlm2-7b-reward,internlm2-7b-reward_BT,Reward,0.71244886,2.3536645,0.113364304,,
+ GRM-Llama3.2-3B-rewardmodel-ft,GRM-Llama3.2-3B-rewardmodel-ft_BT,Reward,0.711279953,2.302320479,0.113801314,,
+ mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.701928697,1.84957128,0.088445538,,
+ gpt-4o-2024-11-20,gpt-4o-2024-11-20#good-yes-no_logprob-score_BT,TokenProbs,0.700380036,2.224158523,0.093196512,,
+ llama-3-70b-instruct,llama-3-70b-instruct#bad-good_textual-score_BT,Likert,0.698421975,2.400124125,0.122002475,,
+ llama-3-1-70b-instruct,llama-3-1-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.687901812,2.711477731,0.12621163,,
+ Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.677381648,0.868408018,0.085109852,,
+ llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#good-yes-no_logprob-score_BT,TokenProbs,0.671537113,1.549707158,0.09227088,,
+ Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#0-100_verbalized-score_BT,Numeric,0.668030392,1.204521517,0.104312797,,
+ llama-3-70b-instruct,llama-3-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.663354763,0.774852442,0.07120683,,
+ gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#comparative-anchor-gpt-4-0314_BT,Anchor,0.658679135,1.412320144,0.110822533,,
+ mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.656341321,1.270349906,0.102331076,,
+ mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.655172414,1.167971623,0.101686017,,
+ mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.641145529,1.497179025,0.13983279,,
+ llama-3-70b-instruct,llama-3-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.632963179,1.820241227,0.131968221,,
+ Eurus-RM-7b,Eurus-RM-7b_BT,Reward,0.628287551,2.492726583,0.138112675,,
+ mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.589713618,0.838122343,0.110173865,,
+ mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.427235535,0.738981983,0.107075052,,
+ mistral-large-instruct-2407,mistral-large-instruct-2407#good-yes-no_logprob-score_BT,TokenProbs,0.368790181,1.165216882,0.122587285,,
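
Note: the CSV change renames the header columns (Judge_Model to "Judge Model", kt_with_elo to "Ranking Agreement", beta_fit to "Decisiveness", bias_std to "Bias", matching the columns app.py now selects) and rounds the metric values to nine decimal places. A minimal pandas sketch of an equivalent transformation, assuming the file path; the exact script used is not part of this commit:

    import pandas as pd

    df = pd.read_csv("best_judges_single_agg.csv")
    # Rename metric columns to the display names used by app.py
    df = df.rename(columns={
        "Judge_Model": "Judge Model",
        "kt_with_elo": "Ranking Agreement",
        "beta_fit": "Decisiveness",
        "bias_std": "Bias",
    })
    # Round metric values to nine decimal places, as in the committed file
    metrics = ["Ranking Agreement", "Decisiveness", "Bias"]
    df[metrics] = df[metrics].round(9)
    df.to_csv("best_judges_single_agg.csv", index=False)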