File size: 4,420 Bytes
5472531 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
"""
Compute agreement among judges.
Usage:
python compute_agreement.py --judges gpt4-pair human --votefiles human_judgments.json gpt4_pair_judgments.json
python compute_agreement.py --judges human human --votefiles human_judgments.json
"""
import argparse
import json
import os
import numpy as np
def get_judge_name(judge):
    """Map the raw ``judge`` field of a vote record to a canonical judge name.

    Args:
        judge: Either a list (its first item names the model, its second the
            judging mode) or a string identifier.

    Returns:
        ``"gpt4-pair"`` for a list whose first item is ``"gpt-4"`` and whose
        second item starts with ``"pair"``; ``"human"`` for a string starting
        with ``"expert"``; ``"author"`` for a string starting with
        ``"author"``; ``None`` for any other judge.
    """
    if isinstance(judge, list):
        # Only the GPT-4 pairwise judge is recognised among list-shaped
        # identifiers. Any other list previously fell through to
        # ``judge.startswith`` and crashed with AttributeError; it is now
        # treated like every other unrecognised judge.
        if (
            len(judge) >= 2
            and judge[0] == "gpt-4"
            and isinstance(judge[1], str)
            and judge[1].startswith("pair")
        ):
            return "gpt4-pair"
        return None
    if not isinstance(judge, str):
        return None
    if judge.startswith("expert"):
        return "human"
    if judge.startswith("author"):
        return "author"
    return None
def revert(vote):
    """Swap the winning side of a vote.

    ``"model_a"`` becomes ``"model_b"`` and vice versa; every other value
    is returned unchanged.
    """
    for side, opposite in (("model_a", "model_b"), ("model_b", "model_a")):
        if vote == side:
            return opposite
    return vote
def get_mt_bench_votes_data(raw_votes):
    """Group raw votes by turn, then by question/model pair, then by judge.

    Args:
        raw_votes: An iterable of vote collections. Each vote is a mapping
            with the keys ``"turn"``, ``"question_id"``, ``"model_a"``,
            ``"model_b"``, ``"winner"`` and ``"judge"``.

    Returns:
        A two-element list indexed by ``turn - 1``. Each element maps
        ``(question_id, model_x, model_y)`` with ``model_x < model_y`` to a
        dict from judge name to the list of winners recorded by that judge.
    """
    per_turn = [{}, {}]
    for batch in raw_votes:
        for record in batch:
            turn_idx = record["turn"] - 1
            first, second = record["model_a"], record["model_b"]
            # Store the pair in a canonical order so that (A, B) and (B, A)
            # comparisons of the same question share one key.
            swapped = not first < second
            if swapped:
                first, second = second, first
            pair_key = (record["question_id"], first, second)
            # When the pair was swapped, the winner label must be swapped too.
            outcome = revert(record["winner"]) if swapped else record["winner"]
            judge_name = get_judge_name(record["judge"])
            by_judge = per_turn[turn_idx].setdefault(pair_key, {})
            by_judge.setdefault(judge_name, []).append(outcome)
    return per_turn
def convertvote(vote):
    """Collapse every vote that contains ``"tie"`` to plain ``"tie"``.

    Votes that do not contain ``"tie"`` are returned unchanged.
    """
    return "tie" if "tie" in vote else vote
def equalvote(vote1, vote2):
    """Return whether two votes agree, treating all tie variants as equal.

    Two votes that both contain ``"tie"`` agree regardless of their exact
    text; otherwise the votes must compare equal.
    """
    both_ties = "tie" in vote1 and "tie" in vote2
    return True if both_ties else vote1 == vote2
# data: Dict[(question_id, model_x, model_y) -> Dict[judge -> List[vote]]]
def get_mt_bench_agreement(data, judge1, judge2, ban):
    """Count agreeing vote pairs between two judges for one turn.

    Args:
        data: Mapping from a question/model-pair key to a dict of
            judge name -> list of votes.
        judge1: Name of the first judge (a ``"gpt4..."`` judge or ``"human"``).
        judge2: Name of the second judge (must be ``"human"``).
        ban: Collection of normalised votes to exclude; a pair is skipped
            when either vote, after ``convertvote``, is in ``ban``.

    Returns:
        A tuple ``(agree, total)``: the number of agreeing vote pairs and
        the number of vote pairs compared.

    Raises:
        Exception: If the judge combination is neither gpt4-vs-human nor
            human-vs-human.
    """
    agree = 0
    total = 0

    if judge1.startswith("gpt4") and judge2 == "human":
        for votes in data.values():
            if not (judge1 in votes and judge2 in votes):
                continue
            # The model judge is expected to have voted exactly once per key.
            assert len(votes[judge1]) == 1
            model_vote = votes[judge1][0]
            if convertvote(model_vote) in ban:
                continue
            # Compare the single model vote against every human vote.
            for human_vote in votes[judge2]:
                if convertvote(human_vote) not in ban:
                    total += 1
                    agree += equalvote(model_vote, human_vote)
        return agree, total

    if judge1 == "human" and judge2 == "human":
        for votes in data.values():
            if "human" not in votes:
                continue
            human_votes = votes["human"]
            # Compare every unordered pair of human votes on the same key.
            for i, left in enumerate(human_votes):
                for right in human_votes[i + 1:]:
                    if convertvote(left) in ban or convertvote(right) in ban:
                        continue
                    total += 1
                    agree += equalvote(left, right)
        return agree, total

    raise Exception("Unsupported judges.")
def run_mt_bench_agreement(judges, votefiles):
    """Load vote files and print agreement statistics for both turns.

    For each of turn 1 and turn 2, prints one line counting ties and one
    line excluding ties, each with the number of compared vote pairs, the
    number of agreeing pairs and their ratio.

    Args:
        judges: Sequence whose first two items are the judge names to compare.
        votefiles: Paths of JSON files, each containing a collection of votes.
    """
    # raw_votes[i]: parsed content of votefiles[i]
    raw_votes = []
    for filename in votefiles:
        with open(filename, "r", encoding="utf-8") as f:
            raw_votes.append(json.load(f))
    votes_by_turn = get_mt_bench_votes_data(raw_votes)

    for turn_idx in (0, 1):
        for label, ban in (("with tie", []), ("without tie", ["tie"])):
            agree, total = get_mt_bench_agreement(
                votes_by_turn[turn_idx], judges[0], judges[1], ban=ban
            )
            # A turn may have no comparable votes at all; report the ratio as
            # "nan" instead of crashing with ZeroDivisionError.
            ratio = agree / total if total else float("nan")
            print(
                f"turn {turn_idx + 1} {label}. #total: {total}, #agree: {agree}, ratio: {ratio:.2f}"
            )
if __name__ == "__main__":
    # Command-line entry point: choose the two judges to compare and the
    # JSON vote files to read, then print the agreement report.
    cli = argparse.ArgumentParser()
    cli.add_argument("--judges", type=str, nargs=2, default=["gpt4-pair", "human"])
    cli.add_argument(
        "--votefiles",
        type=str,
        nargs="+",
        default=["gpt4_judgments.json", "human_judgments.json"],
    )
    options = cli.parse_args()
    run_mt_bench_agreement(judges=options.judges, votefiles=options.votefiles)
|