Spaces:
Building
Building
update README.md
Browse files- app.py +2 -2
- constants.py +6 -3
- file/results.xlsx +0 -0
- src/utils_display.py +136 -114
app.py
CHANGED
@@ -83,8 +83,8 @@ def build_leaderboard(
|
|
83 |
|
84 |
gr.Markdown(TABLE_INTRODUCTION, elem_classes="markdown-text")
|
85 |
data_spilt_radio = gr.Radio(
|
86 |
-
choices=["
|
87 |
-
value="
|
88 |
label=SELECT_SET_INTRO,
|
89 |
)
|
90 |
|
|
|
83 |
|
84 |
gr.Markdown(TABLE_INTRODUCTION, elem_classes="markdown-text")
|
85 |
data_spilt_radio = gr.Radio(
|
86 |
+
choices=["Chinese", "English"],
|
87 |
+
value="Chinese",
|
88 |
label=SELECT_SET_INTRO,
|
89 |
)
|
90 |
|
constants.py
CHANGED
@@ -32,11 +32,13 @@ XLSX_DIR = "./file//results.xlsx"
|
|
32 |
|
33 |
LEADERBOARD_INTRODUCTION = """# 🏆 S-Eval Leaderboard
|
34 |
## 🔔 Updates
|
|
|
|
|
35 |
📣 [2024/05/23]: We publish our [paper](https://arxiv.org/abs/2405.14191) and first release 2,000 base risk prompts.
|
36 |
|
37 |
### ❗️ Note
|
38 |
Due to the limited machine resource, please refresh the page if a connection timeout error occurs.
|
39 |
-
|
40 |
You can get more detailed information from our [Project](https://github.com/IS2Lab/S-Eval) and [Paper](https://arxiv.org/abs/2405.14191).
|
41 |
"""
|
42 |
|
@@ -45,11 +47,12 @@ SELECT_SET_INTRO = (
|
|
45 |
)
|
46 |
|
47 |
TABLE_INTRODUCTION_1 = """In the table below, we summarize the safety scores (%) of differnet models on Base Risk Prompt Set."""
|
48 |
-
TABLE_INTRODUCTION_2 = """In the table below, we summarize the attack success
|
49 |
|
50 |
|
51 |
LEADERBORAD_INFO = """
|
52 |
-
|
|
|
53 |
"""
|
54 |
|
55 |
|
|
|
32 |
|
33 |
LEADERBOARD_INTRODUCTION = """# 🏆 S-Eval Leaderboard
|
34 |
## 🔔 Updates
|
35 |
+
📣 [2024/05/31]: We release 20,000 corresponding attack prompts.
|
36 |
+
|
37 |
📣 [2024/05/23]: We publish our [paper](https://arxiv.org/abs/2405.14191) and first release 2,000 base risk prompts.
|
38 |
|
39 |
### ❗️ Note
|
40 |
Due to the limited machine resource, please refresh the page if a connection timeout error occurs.
|
41 |
+
|
42 |
You can get more detailed information from our [Project](https://github.com/IS2Lab/S-Eval) and [Paper](https://arxiv.org/abs/2405.14191).
|
43 |
"""
|
44 |
|
|
|
47 |
)
|
48 |
|
49 |
TABLE_INTRODUCTION_1 = """In the table below, we summarize the safety scores (%) of differnet models on Base Risk Prompt Set."""
|
50 |
+
TABLE_INTRODUCTION_2 = """In the table below, we summarize the attack success rates (%) of the instruction attacks in Attack Prompt Set on different models"""
|
51 |
|
52 |
|
53 |
LEADERBORAD_INFO = """
|
54 |
+
S-Eval is designed to be a new comprehensive, multi-dimensional and open-ended safety evaluation benchmark. So far, S-Eval has 220,000 evaluation prompts in total (and is still in active expansion), including 20,000 base risk prompts (10,000 in Chinese and 10,000 in English) and 200,000 *corresponding* attack prompts derived from 10 popular adversarial instruction attacks. These test prompts are generated based on a comprehensive and unified risk taxonomy, specifically designed to encompass all crucial dimensions of LLM safety evaluation and meant to accurately reflect the varied safety levels of LLMs across these risk dimensions.
|
55 |
+
More details on the construction of the test suite including model-based test generation, selection and the expert critique LLM can be found in our [paper](https://arxiv.org/abs/2405.14191).
|
56 |
"""
|
57 |
|
58 |
|
file/results.xlsx
CHANGED
Binary files a/file/results.xlsx and b/file/results.xlsx differ
|
|
src/utils_display.py
CHANGED
@@ -1,121 +1,143 @@
|
|
1 |
-
from dataclasses import dataclass
|
2 |
|
3 |
|
4 |
# These classes are for user facing column names, to avoid having to change them
|
5 |
# all around the code when a modification is needed
|
6 |
-
@dataclass
|
7 |
-
class ColumnContent:
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
def fields(raw_class):
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
@dataclass(frozen=True)
|
23 |
-
class AutoEvalColumn: # Auto evals column
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
@dataclass(frozen=True)
|
44 |
-
class EloEvalColumn: # Elo evals column
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
@dataclass(frozen=True)
|
53 |
-
class EvalQueueColumn: # Queue column
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
LLAMAS = [
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
]
|
68 |
-
|
69 |
-
|
70 |
-
KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
|
71 |
-
VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
|
72 |
-
OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
|
73 |
-
DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
|
74 |
-
MODEL_PAGE = "https://huggingface.co/models"
|
75 |
-
LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
|
76 |
-
VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
|
77 |
-
ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
|
78 |
-
|
79 |
-
|
80 |
-
def model_hyperlink(link, model_name):
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
def make_clickable_model(model_name):
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
# else:
|
105 |
# link = MODEL_PAGE
|
106 |
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
def styled_error(error):
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
def styled_warning(warn):
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
def styled_message(message):
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# from dataclasses import dataclass
|
2 |
|
3 |
|
4 |
# These classes are for user facing column names, to avoid having to change them
|
5 |
# all around the code when a modification is needed
|
6 |
+
# @dataclass
|
7 |
+
# class ColumnContent:
|
8 |
+
# name: str
|
9 |
+
# type: str
|
10 |
+
# displayed_by_default: bool
|
11 |
+
# hidden: bool = False
|
12 |
+
# never_hidden: bool = False
|
13 |
+
# dummy: bool = False
|
14 |
+
|
15 |
+
|
16 |
+
# def fields(raw_class):
|
17 |
+
# return [
|
18 |
+
# v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
|
19 |
+
# ]
|
20 |
+
|
21 |
+
|
22 |
+
# @dataclass(frozen=True)
|
23 |
+
# class AutoEvalColumn: # Auto evals column
|
24 |
+
|
25 |
+
# model_type_symbol = ColumnContent("T", "str", True)
|
26 |
+
# model = ColumnContent("Model", "markdown", True, never_hidden=True)
|
27 |
+
# average = ColumnContent("Average ⬆️", "number", True)
|
28 |
+
# arc = ColumnContent("ARC", "number", True)
|
29 |
+
# hellaswag = ColumnContent("HellaSwag", "number", True)
|
30 |
+
# mmlu = ColumnContent("MMLU", "number", True)
|
31 |
+
# truthfulqa = ColumnContent("TruthfulQA", "number", True)
|
32 |
+
# model_type = ColumnContent("Type", "str", False)
|
33 |
+
# precision = ColumnContent("Precision", "str", False, True)
|
34 |
+
# license = ColumnContent("Hub License", "str", False)
|
35 |
+
# params = ColumnContent("#Params (B)", "number", False)
|
36 |
+
# likes = ColumnContent("Hub ❤️", "number", False)
|
37 |
+
# revision = ColumnContent("Model sha", "str", False, False)
|
38 |
+
# dummy = ColumnContent(
|
39 |
+
# "model_name_for_query", "str", True
|
40 |
+
# ) # dummy col to implement search bar (hidden by custom CSS)
|
41 |
+
|
42 |
+
|
43 |
+
# @dataclass(frozen=True)
|
44 |
+
# class EloEvalColumn: # Elo evals column
|
45 |
+
# model = ColumnContent("Model", "markdown", True)
|
46 |
+
# gpt4 = ColumnContent("GPT-4 (all)", "number", True)
|
47 |
+
# human_all = ColumnContent("Human (all)", "number", True)
|
48 |
+
# human_instruct = ColumnContent("Human (instruct)", "number", True)
|
49 |
+
# human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
|
50 |
+
|
51 |
+
|
52 |
+
# @dataclass(frozen=True)
|
53 |
+
# class EvalQueueColumn: # Queue column
|
54 |
+
# model = ColumnContent("model", "markdown", True)
|
55 |
+
# revision = ColumnContent("revision", "str", True)
|
56 |
+
# private = ColumnContent("private", "bool", True)
|
57 |
+
# precision = ColumnContent("precision", "bool", True)
|
58 |
+
# weight_type = ColumnContent("weight_type", "str", "Original")
|
59 |
+
# status = ColumnContent("status", "str", True)
|
60 |
+
|
61 |
+
|
62 |
+
# LLAMAS = [
|
63 |
+
# "huggingface/llama-7b",
|
64 |
+
# "huggingface/llama-13b",
|
65 |
+
# "huggingface/llama-30b",
|
66 |
+
# "huggingface/llama-65b",
|
67 |
+
# ]
|
68 |
+
|
69 |
+
|
70 |
+
# KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
|
71 |
+
# VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
|
72 |
+
# OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
|
73 |
+
# DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
|
74 |
+
# MODEL_PAGE = "https://huggingface.co/models"
|
75 |
+
# LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
|
76 |
+
# VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
|
77 |
+
# ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
|
78 |
+
|
79 |
+
|
80 |
+
# def model_hyperlink(link, model_name):
|
81 |
+
# return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
82 |
+
|
83 |
+
|
84 |
+
# def make_clickable_model(model_name):
|
85 |
+
# link = f"https://huggingface.co/{model_name}"
|
86 |
+
|
87 |
+
# if model_name in LLAMAS:
|
88 |
+
# link = LLAMA_LINK
|
89 |
+
# model_name = model_name.split("/")[1]
|
90 |
+
# elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
|
91 |
+
# link = VICUNA_LINK
|
92 |
+
# model_name = "stable-vicuna-13b"
|
93 |
+
# elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
|
94 |
+
# link = ALPACA_LINK
|
95 |
+
# model_name = "alpaca-13b"
|
96 |
+
# if model_name == "dolly-12b":
|
97 |
+
# link = DOLLY_LINK
|
98 |
+
# elif model_name == "vicuna-13b":
|
99 |
+
# link = VICUNA_LINK
|
100 |
+
# elif model_name == "koala-13b":
|
101 |
+
# link = KOALA_LINK
|
102 |
+
# elif model_name == "oasst-12b":
|
103 |
+
# link = OASST_LINK
|
104 |
# else:
|
105 |
# link = MODEL_PAGE
|
106 |
|
107 |
+
# return model_hyperlink(link, model_name)
|
108 |
+
|
109 |
+
|
110 |
+
# def styled_error(error):
|
111 |
+
# return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
|
112 |
+
|
113 |
+
|
114 |
+
# def styled_warning(warn):
|
115 |
+
# return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
|
116 |
+
|
117 |
+
|
118 |
+
# def styled_message(message):
|
119 |
+
# return (
|
120 |
+
# f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
|
121 |
+
# )
|
122 |
+
|
123 |
+
Qwen_1_8B_Chat_Link = "https://huggingface.co/Qwen/Qwen-1_8B-Chat"
|
124 |
+
Qwen_7B_Chat_Link = "https://huggingface.co/Qwen/Qwen-7B-Chat"
|
125 |
+
Qwen_14B_Chat_Link = "https://huggingface.co/Qwen/Qwen-14B-Chat"
|
126 |
+
Qwen_72B_Chat_Link = "https://huggingface.co/Qwen/Qwen-72B-Chat"
|
127 |
+
Gemma_2B_it_Link = "https://huggingface.co/google/gemma-2b-it"
|
128 |
+
Gemma_7B_it__Link = "https://huggingface.co/google/gemma-7b-it"
|
129 |
+
ChatGLM3_6B_Link = "https://huggingface.co/THUDM/chatglm3-6b"
|
130 |
+
Mistral_7B_Instruct_v0_2_Link = "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2"
|
131 |
+
LLaMA_2_7B_Chat_Link = "https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"
|
132 |
+
LLaMA_2_13B_Chat_Link = "https://huggingface.co/meta-llama/Llama-2-13b-chat-hf"
|
133 |
+
LLaMA_2_70B_Chat_Link = "https://huggingface.co/meta-llama/Llama-2-70b-chat-hf"
|
134 |
+
LLaMA_3_8B_Instruct_Link = "https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"
|
135 |
+
LLaMA_3_70B_Instruct_Link = "https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct"
|
136 |
+
Vicuna_7B_v1_3_Link = "https://huggingface.co/lmsys/vicuna-7b-v1.3"
|
137 |
+
Vicuna_13B_v1_3_Link = "https://huggingface.co/lmsys/vicuna-13b-v1.3"
|
138 |
+
Vicuna_33B_v1_3_Link = "https://huggingface.co/lmsys/vicuna-33b-v1.3"
|
139 |
+
Baichuan2_13B_Chat_Link = "https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat"
|
140 |
+
Yi_34B_Chat_Link = "https://huggingface.co/01-ai/Yi-34B-Chat"
|
141 |
+
GPT_4_Turbo_Link = "https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4"
|
142 |
+
ErnieBot_4_0_Link = "https://cloud.baidu.com/doc/WENXINWORKSHOP/s/clntwmv7t"
|
143 |
+
Gemini_1_0_Pro_Link = "https://ai.google.dev/gemini-api/docs/models/gemini"
|