Spaces (Running)

XufengDuan committed ad27ecb (parent: 86c17df): update scripts

Changed files:
- app.py: +3 -5
- src/backend/model_operations.py: +208 -180
- src/display/about.py: +7 -16
- src/display/formatting.py: +3 -1
- src/envs.py: +2 -6
- src/leaderboard/read_evals.py: +1 -13
- src/populate.py: +4 -4
app.py CHANGED

@@ -1,5 +1,4 @@
 import logging
-
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -9,7 +8,6 @@ from main_backend import PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED
 from src.backend import sort_queue
 from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO
 import src.backend.manage_requests as manage_requests
-
 import socket
 import src.display.about as about
 from src.display.css_html_js import custom_css
@@ -21,12 +19,11 @@ import os
 import datetime
 import spacy_transformers
 import pprint
+import src.backend.run_eval_suite as run_eval_suite

 pp = pprint.PrettyPrinter(width=80)
-
 TOKEN = os.environ.get("H4_TOKEN", None)
 print("TOKEN", TOKEN)
-import src.backend.run_eval_suite as run_eval_suite

 def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
 try:
@@ -45,7 +42,8 @@ def init_space():
 # sync model_type with open-llm-leaderboard
 ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
-
+
+original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)

 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, utils.EVAL_COLS)
 return original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
src/backend/model_operations.py CHANGED

@@ -2,7 +2,7 @@ import os
 import time
 from datetime import datetime
 import logging
-from pathlib import Path
+from pathlib import Path
 import requests
 import json

@@ -135,12 +135,12 @@ class SummaryGenerator:
 # prompt = {}
 # for index, row in tqdm(df_prompt.iterrows(), total=df_prompt.shape[0]):
 # prompt['E' + row['Item']] = row['Prompt']
-xls = pd.ExcelFile(dataset)
+xls = pd.ExcelFile(dataset)
 sheet_names = xls.sheet_names
 # sheet_names = df.sheetnames
-print(f"Total: {len(sheet_names)}")
-print(sheet_names)
-
+print(f"Total: {len(sheet_names)}")
+print(sheet_names)
+
 Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1 = [], [], [], [], [] ,[], [], []
 exit_outer_loop = False # bad model
 for i, sheet_name in enumerate(sheet_names, start=1):
@@ -150,17 +150,17 @@ class SummaryGenerator:
 # if i > 2 and i ==1:
 #     continue
 print(i, sheet_name)
-df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
-
+df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
+
 # Assume the first column is 'Prompt0', but use the column name here to avoid hard-coding
-if 'Prompt0' in df_sheet.columns:
-prompt_column = df_sheet['Prompt0']
-else:
+if 'Prompt0' in df_sheet.columns:
+prompt_column = df_sheet['Prompt0']
+else:
 # If the 'Prompt0' column does not exist, skip this sheet or handle it some other way
-continue
+continue
 if i == 3 :
-word1_list = df_sheet['Stimuli-2']
-word2_list = df_sheet['Stimuli-3']
+word1_list = df_sheet['Stimuli-2']
+word2_list = df_sheet['Stimuli-3']
 V2_column = []
 for jj in range(len(word1_list)):
 V2_column.append(word1_list[jj] + '_' + word2_list[jj])
@@ -175,17 +175,17 @@ class SummaryGenerator:
 Item_column = df_sheet["Item"]
 Condition_column = df_sheet["Condition"]
 Stimuli_1_column = df_sheet["Stimuli-1"]
-if 'Stimuli-2' in df_sheet.columns:
+if 'Stimuli-2' in df_sheet.columns:
 Stimuli_2_column = df_sheet["Stimuli-2"]

 for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=0):
 if exit_outer_loop:
 break
-ID = 'E' + str(i)
+ID = 'E' + str(i)
 # q_ID = ID + '_' + str(j)
-
+
 # print(ID, q_ID, prompt_value)
-system_prompt = envs.SYSTEM_PROMPT
+system_prompt = envs.SYSTEM_PROMPT
 _user_prompt = prompt_value
 for ii in range(10):
 # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
@@ -193,7 +193,7 @@ class SummaryGenerator:
 try:
 '''call'''
 print(ID,'-',ii)
-
+
 _response = self.generate_summary(system_prompt, _user_prompt)
 # print(f"Finish index {index}")
 break
@@ -212,17 +212,24 @@ class SummaryGenerator:
 print(f"Quota has reached, wait for {wait_time}")
 time.sleep(wait_time)
 else:
-
-
-
-
-
-break
-except Exception as e:
-exceptions.append(e)
+max_retries = 30
+retries = 0
+wait_time = 120
+
+while retries < max_retries:
 print(f"Error at index {i}: {e}")
-
-
+time.sleep(wait_time)
+try:
+_response = self.generate_summary(system_prompt, _user_prompt)
+break
+except Exception as e:
+exceptions.append(e)
+retries += 1
+print(f"Retry {retries}/{max_retries} failed at index {i}: {e}")
+if retries >= max_retries:
+exit_outer_loop = True
+break
+

 if exit_outer_loop:
 break
@@ -272,9 +279,9 @@ class SummaryGenerator:
 Experiment_ID.append(ID)
 Questions_ID.append(q_column[j])
 User_prompt.append(_user_prompt)
-
+
 Response.append(_response2)
-
+
 Factor_2.append(V2_column[j])
 Stimuli_1.append(Stimuli_2_column[j])
 Item_ID.append(Item_column[j])
@@ -286,18 +293,18 @@ class SummaryGenerator:
 User_prompt.append(_user_prompt)
 Response.append(_response1)

-
-
+
+
 Factor_2.append(V2_column[j])
 Stimuli_1.append(Stimuli_1_column[j])
 Item_ID.append(Item_column[j])
 Condition.append(Condition_column[j])
-
+
 else:
 Experiment_ID.append(ID)
 Questions_ID.append(q_column[j])
 User_prompt.append(_user_prompt)
-
+
 Response.append(_response)
 if i == 6:
 Factor_2.append(Condition_column[j])
@@ -309,7 +316,7 @@ class SummaryGenerator:
 Condition.append(Condition_column[j])
 print(_response)

-
+
 # exit()

 # Sleep to prevent hitting rate limits too frequently
@@ -322,14 +329,14 @@ class SummaryGenerator:
 print(f'Save summaries to {save_path}')
 fpath = Path(save_path)
 fpath.parent.mkdir(parents=True, exist_ok=True)
-self.summaries_df.to_csv(fpath)
+self.summaries_df.to_csv(fpath)

 self.exceptions = exceptions
 # self._compute_avg_length()
 # self._compute_answer_rate()

 return self.summaries_df
-
+
 def generate_summary(self, system_prompt: str, user_prompt: str):
 # Using Together AI API
 using_together_api = False
@@ -388,28 +395,115 @@ class SummaryGenerator:
 result = ''
 print(result)
 return result
+if self.local_model: # cannot call API. using local model
+messages=[
+{"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
+{"role": "user", "content": user_prompt}
+]
+try: # some models support pipeline
+pipe = pipeline(
+"text-generation",
+model=self.local_model,
+tokenizer=self.tokenizer,
+)

-
-
-
-
-
-
-
-
-
-
-
-
-
+generation_args = {
+"max_new_tokens": 50,
+"return_full_text": False,
+#"temperature": 0.0,
+"do_sample": False,
+}
+
+output = pipe(messages, **generation_args)
+result = output[0]['generated_text']
+print(result)
+except:
+prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
+print(prompt)
+input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
+with torch.no_grad():
+outputs = self.local_model.generate(**input_ids, max_new_tokens=50, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
+result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+result = result.replace(prompt[0], '')
+print(result)
 return result
-
-
+
+
+elif self.local_model is None:
+# print(self.model_id)
+# print(self.api_base)
+# mistralai/Mistral-7B-Instruct-v0.1
+# https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
+# Using HF API or download checkpoints
+try: # try use HuggingFace API
+from huggingface_hub import InferenceClient
+print("token_for_request:",envs.TOKEN)
+print(self.model_id)
+client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
+messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
+# outputs = client.chat_completion(messages, max_tokens=50)
+result = None
+while result is None:
+outputs = client.chat_completion(messages, max_tokens=50)
+result = outputs['choices'][0]['message']['content']
+
+if result is None:
+time.sleep(1) # Optional: Add a small delay before retrying
+
+return result
+
+except Exception as e:
+print(f"Error with TOKEN: {envs.TOKEN}, trying with TOKEN1")
+try:
+client = InferenceClient(self.model_id, api_key=envs.TOKEN1, headers={"X-use-cache": "false"})
+messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+result = None
+while result is None:
+outputs = client.chat_completion(messages, max_tokens=50)
+result = outputs['choices'][0]['message']['content']
+
+if result is None:
+time.sleep(1) # Optional: Add a small delay before retrying
+
+return result
+except Exception as e:
+print(f"Error with TOKEN1: {envs.TOKEN1}")
+raise e
+
+# except: # fail to call api. run it locally.
+# self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
+# print("Tokenizer loaded")
+# self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
+# print("Local model loaded")
+# response = litellm.completion(
+# model="huggingface/"+'command-r-plus' if 'command' in self.model_id else self.model_id,
+# messages=[{"role": "system", "content": system_prompt},
+# {"role": "user", "content": user_prompt}],
+# temperature=0.0,
+# max_tokens=1024,
+# api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
+# )
+# self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
+# response = litellm.completion(
+# model="huggingface/" + self.model_id,
+# # mistralai/Mistral-7B-Instruct-v0.1",
+# messages=[{"role": "system", "content": system_prompt},
+# {"role": "user", "content": user_prompt}],
+# #temperature=0.0,
+# max_tokens=1024,
+# api_base="https://api-inference.huggingface.co/models/" + self.model_id)
+# print("Model response:", response)
+# print("End of model response")
+# # exit()
+# result = response['choices'][0]['message']['content']
+# print(result)
+# exit()
+# Using Google AI API for Gemini models
 elif 'gemini' in self.model_id.lower():
 genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
 generation_config = {
 "temperature": 0,
-"top_p": 0.95,
+"top_p": 0.95, # cannot change
 "top_k": 0,
 "max_output_tokens": 50,
 # "response_mime_type": "application/json",
@@ -432,101 +526,35 @@ class SummaryGenerator:
 "threshold": "BLOCK_NONE"
 },
 ]
-model = genai.GenerativeModel(
-
-
-
+model = genai.GenerativeModel(
+model_name="gemini-1.5-pro-latest" if "gemini-1.5-pro" in self.model_id.lower() else
+self.model_id.lower().split('google/')[-1],
+generation_config=generation_config,
+system_instruction=system_prompt,
+safety_settings=safety_settings)
 convo = model.start_chat(history=[])
 convo.send_message(user_prompt)
 # print(convo.last)
 result = convo.last.text
 print(result)
 return result
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-# api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
-# )
-# self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
-# response = litellm.completion(
-# model="huggingface/" + self.model_id,
-# # mistralai/Mistral-7B-Instruct-v0.1",
-# messages=[{"role": "system", "content": system_prompt},
-# {"role": "user", "content": user_prompt}],
-# #temperature=0.0,
-# max_tokens=1024,
-# api_base="https://api-inference.huggingface.co/models/" + self.model_id)
-# print("Model response:", response)
-# print("End of model response")
-# # exit()
-# result = response['choices'][0]['message']['content']
-# print(result)
-from huggingface_hub import InferenceClient
-print("token_for_request:",envs.TOKEN)
-print(self.model_id)
-client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
-messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
-# outputs = client.chat_completion(messages, max_tokens=50)
-result = None
-while result is None:
-outputs = client.chat_completion(messages, max_tokens=50)
-result = outputs['choices'][0]['message']['content']
-
-if result is None:
-time.sleep(1) # Optional: Add a small delay before retrying
-
-return result
-# exit()
-except: # fail to call api. run it locally.
-self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
-print("Tokenizer loaded")
-self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
-print("Local model loaded")
+# Using OpenAI API
+elif 'gpt' in self.model_id.lower():
+response = litellm.completion(
+model=self.model_id.replace('openai/',''),
+messages=[{"role": "system", "content": system_prompt},
+{"role": "user", "content": user_prompt}],
+# temperature=0.0,
+max_tokens=50,
+api_key = os.getenv('OpenAI_key')
+)
+result = response['choices'][0]['message']['content']
+# print()
+print(result)
+return result
 # exit()
 # Using local model
-if self.local_model: # cannot call API. using local model
-messages=[
-{"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
-{"role": "user", "content": user_prompt}
-]
-try: # some models support pipeline
-pipe = pipeline(
-"text-generation",
-model=self.local_model,
-tokenizer=self.tokenizer,
-)
-
-generation_args = {
-"max_new_tokens": 50,
-"return_full_text": False,
-#"temperature": 0.0,
-"do_sample": False,
-}

-output = pipe(messages, **generation_args)
-result = output[0]['generated_text']
-print(result)
-except:
-prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
-print(prompt)
-input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
-with torch.no_grad():
-outputs = self.local_model.generate(**input_ids, max_new_tokens=50, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
-result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-result = result.replace(prompt[0], '')
-print(result)
-return result

 def _compute_avg_length(self):
 """
@@ -607,7 +635,7 @@ class EvaluationModel:
 for i in range(len(summaries_df["Experiment"])):
 # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
 # print()
-if pd.isna(summaries_df["Response"][i]):
+if pd.isna(summaries_df["Response"][i]):
 output.append("Other")
 continue
 rs = summaries_df["Response"][i].strip().lower()
@@ -627,7 +655,7 @@ class EvaluationModel:
 output.append("Spiky")
 else:
 output.append("Other")
-
+

 '''Exp2'''

@@ -647,12 +675,12 @@ class EvaluationModel:
 break
 if male == 0 and female == 0 :
 output.append("Other")
-
+
 '''Exp3'''
 elif summaries_df["Experiment"][i] == "E3":
 # rs = summaries_df["Response"][i].strip()
 print("E3", rs)
-if pd.isna(summaries_df["Factor 2"][i]):
+if pd.isna(summaries_df["Factor 2"][i]):
 output.append("Other")
 else:
 if summaries_df["Factor 2"][i].strip() == "LS":
@@ -668,9 +696,9 @@ class EvaluationModel:
 elif "3" in rs:
 output.append("Long")
 else:
-output.append("Other")
+output.append("Other")
 '''Exp4'''
-
+
 elif summaries_df["Experiment"][i] == "E4":
 # rs = summaries_df["Response"][i].strip()
 target = summaries_df["Factor 2"][i].strip().lower()
@@ -704,8 +732,8 @@ class EvaluationModel:
 verb = item2verb2[item_id].lower()
 sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
 print("E5", verb, sentence)
-
-
+
+
 doc = nlp1(sentence.replace(" "," "))
 # print(doc)
 # print()
@@ -745,8 +773,8 @@ class EvaluationModel:

 elif summaries_df["Experiment"][i] == "E6":
 sentence = summaries_df["Stimuli 1"][i].strip().lower()
-print("E6", sentence)
-doc = nlp1(sentence)
+print("E6", sentence)
+doc = nlp1(sentence)
 subject = "None"
 obj = "None"
 # Traverse the dependency relations to find the subject and object
@@ -767,9 +795,9 @@ class EvaluationModel:
 output.append("NP")
 else:
 print(rs, subject, obj, "Other")
-output.append("Other")
+output.append("Other")
+

-


 '''Exp7'''
@@ -786,7 +814,7 @@ class EvaluationModel:
 '''Exp8'''
 elif summaries_df["Experiment"][i] == "E8":
 # rs = summaries_df["Response"][i].strip()
-
+
 if "something is wrong with the question" in rs:
 output.append("1")
 else:
@@ -795,7 +823,7 @@ class EvaluationModel:
 '''Exp9'''
 elif summaries_df["Experiment"][i] == "E9":
 male, female = 0, 0
-
+
 # rs = summaries_df["Response"][i].strip()
 if "because" in rs:
 rs = rs.replace("because because","because").split("because")[1]
@@ -847,8 +875,8 @@ class EvaluationModel:
 # '''LLM'''
 # self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
 # columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
-print(self.data.head())
-
+print(self.data.head())
+
 return self.data
 def code_results_llm(self, summaries_df):
 '''code results from LLM's response'''
@@ -878,7 +906,7 @@ class EvaluationModel:
 for i in range(len(summaries_df["Experiment"])):
 # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
 # print()
-if pd.isna(summaries_df["Response"][i]):
+if pd.isna(summaries_df["Response"][i]):
 output.append("Other")
 continue
 rs = summaries_df["Response"][i].strip().lower()
@@ -893,7 +921,7 @@ class EvaluationModel:
 output.append("Spiky")
 else:
 output.append("Other")
-
+

 '''Exp2'''

@@ -913,13 +941,13 @@ class EvaluationModel:
 break
 if male == 0 and female == 0 :
 output.append("Other")
-
+
 '''Exp3'''
 elif summaries_df["Experiment"][i] == "E3":
 # rs = summaries_df["Response"][i].strip()
 print("E3", rs)
 rs = rs.replace('"', '')
-pair = summaries_df["Factor 2"][i]
+pair = summaries_df["Factor 2"][i]
 word1, word2 = pair.split('_')

 if rs == word1:
@@ -980,8 +1008,8 @@ class EvaluationModel:
 verb = item2verb2[item_id].lower()
 sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
 print("E5", verb, sentence)
-
-
+
+
 doc = nlp1(sentence.replace(" "," "))
 # print(doc)
 # print()
@@ -1021,8 +1049,8 @@ class EvaluationModel:

 elif summaries_df["Experiment"][i] == "E6":
 sentence = summaries_df["Stimuli 1"][i].strip().lower()
-print("E6", sentence)
-doc = nlp1(sentence)
+print("E6", sentence)
+doc = nlp1(sentence)
 subject = "None"
 obj = "None"
 # Traverse the dependency relations to find the subject and object
@@ -1043,9 +1071,9 @@ class EvaluationModel:
 output.append("NP")
 else:
 print(rs, subject, obj, "Other")
-output.append("Other")
+output.append("Other")
+

-


 '''Exp7'''
@@ -1072,7 +1100,7 @@ class EvaluationModel:
 '''Exp9'''
 elif summaries_df["Experiment"][i] == "E9":
 male, female = 0, 0
-
+
 # rs = summaries_df["Response"][i].strip()
 if "because" in rs:
 rs = rs.replace("because because","because").split("because")[1]
@@ -1125,14 +1153,14 @@ class EvaluationModel:
 '''LLM'''
 self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
 columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
-print(self.data.head())
-
+print(self.data.head())
+
 return self.data
-

-
-
-
+
+
+
+


 def calculate_js_divergence(self, file_path_1, file_path_2):
@@ -1225,7 +1253,7 @@ class EvaluationModel:
 print("avg_js_divergence:", avg_js_divergence)

 return avg_js_divergence
-
+

 def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:
 '''
@@ -1272,19 +1300,19 @@ class EvaluationModel:



-




-
-
-
-


-
-
+
+
+
+
+
+
+



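The bulk of the model_operations.py change above is the new API path in generate_summary: when no local model is loaded, the code first queries the Hugging Face Inference API with the primary token and, if that call fails, retries with the secondary token TOKEN1 that this commit introduces in src/envs.py. The sketch below condenses that fallback pattern; chat_with_fallback is a hypothetical helper (not a function in the repo) and it assumes two valid tokens and a chat-capable hosted model.

```python
import time
from huggingface_hub import InferenceClient


def chat_with_fallback(model_id, system_prompt, user_prompt,
                       primary_token, secondary_token, max_tokens=50):
    """Query the HF Inference API, falling back to a second token if the first call fails."""
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}]
    last_error = None
    for token in (primary_token, secondary_token):
        try:
            client = InferenceClient(model_id, api_key=token, headers={"X-use-cache": "false"})
            output = client.chat_completion(messages, max_tokens=max_tokens)
            return output.choices[0].message.content
        except Exception as err:  # auth, rate-limit, or network error: try the next token
            last_error = err
            time.sleep(1)
    raise RuntimeError(f"Both tokens failed: {last_error}")
```

Functionally this mirrors the nested try/except added in the diff, just flattened into a loop over the two tokens.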
src/display/about.py CHANGED

@@ -33,15 +33,13 @@ An improved version (HHEM v2) is integrated into the [Vectara platform](https://
 LLM_BENCHMARKS_TEXT = """
 ## Introduction

-
-
-Hallucinations refer to instances where a model introduces factually incorrect or unrelated content in its summaries.
+This study aims to compare the similarities between human and model responses in language use by employing ten psycholinguistic tasks. Each task consists of multiple stimuli, with each stimulus having both expected and unexpected responses.
+To quantify the similarity, we collected responses from 2000 human participants, creating a binomial distribution for each stimulus within each task. The same stimuli were then presented to a language model, generating another binomial distribution for comparison.

 ## How it works

-
-
-The model card for HHEM can be found [here](https://huggingface.co/vectara/hallucination_evaluation_model).
+To measure the similarity between human and model responses, we utilize the Jensen-Shannon (JS) divergence. This method allows us to compare the two binomial distributions (one from human responses and one from model responses) for each stimulus.
+The similarity is quantified by calculating 1 minus the JS divergence, where a value closer to 1 indicates higher similarity.

 ## Evaluation Dataset

@@ -49,10 +47,8 @@ Our evaluation dataset consists of 1006 documents from multiple public datasets,
 We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))

 ## Metrics Explained
--
--
-- Answer Rate: Percentage of summaries that are non-empty. This is either the model refuses to generate a response or throws an error due to various reasons. (e.g. the model believes that the document includes inappropriate content)
-- Average Summary Length: The average word count of generated summaries
+- Individual Task Similarity: For each psycholinguistic task, we calculate the humanlike score for each stimulus, providing a measure of how closely the model’s responses resemble those of humans.
+- Average Similarity: The average of the humanlike scores across all stimuli and tasks, giving an overall indication of the model’s performance in mimicking human language use.

 ## Note on non-Hugging Face models
 On HHEM leaderboard, There are currently models such as GPT variants that are not available on the Hugging Face model hub. We ran the evaluations for these models on our own and uploaded the results to the leaderboard.

@@ -61,18 +57,13 @@ If you would like to submit your model that is not available on the Hugging Face
 ## Model Submissions and Reproducibility
 You can submit your model for evaluation, whether it's hosted on the Hugging Face model hub or not. (Though it is recommended to host your model on the Hugging Face)

-### For models not available on the Hugging Face model hub:
-1) Access generated summaries used for evaluation [here](https://github.com/vectara/hallucination-leaderboard) in "leaderboard_summaries.csv".
-2) The text generation prompt is available under "Prompt Used" section in the repository's README.
-3) Details on API Integration for evaluations are under "API Integration Details".
-
 ### For models available on the Hugging Face model hub:
 To replicate the evaluation result for a Hugging Face model:

 1) Clone the Repository
 ```python
 git lfs install
-git clone https://huggingface.co/spaces/
+git clone https://huggingface.co/spaces/Simondon/HumanLikeness
 ```
 2) Install the Requirements
 ```python
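The rewritten "How it works" text above defines the humanlike score for a stimulus as 1 minus the Jensen-Shannon divergence between the human and model response distributions. A minimal sketch of that computation (an illustration, not the repository's calculate_js_divergence implementation) might look like this, using base-2 JS divergence so the score falls in [0, 1]:

```python
import numpy as np
from scipy.stats import entropy  # entropy(p, q) computes the KL divergence KL(p || q)


def humanlike_score(p_human, p_model):
    """1 - JS divergence (base 2) between two response distributions over the same options."""
    p = np.asarray(p_human, dtype=float)
    q = np.asarray(p_model, dtype=float)
    p, q = p / p.sum(), q / q.sum()   # normalise to probability distributions
    m = 0.5 * (p + q)
    jsd = 0.5 * entropy(p, m, base=2) + 0.5 * entropy(q, m, base=2)
    return 1.0 - jsd


# A binomial stimulus: 70% of humans vs. 60% of model samples chose the expected response.
print(humanlike_score([0.7, 0.3], [0.6, 0.4]))  # roughly 0.99, i.e. highly similar
```

Averaging these per-stimulus scores across all stimuli and tasks gives the "Average Similarity" described under "Metrics Explained".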
src/display/formatting.py CHANGED

@@ -1,5 +1,6 @@
 import os
 from datetime import datetime, timezone
+import numpy as np

 from huggingface_hub import HfApi
 from huggingface_hub.hf_api import ModelInfo
@@ -29,7 +30,8 @@ def styled_message(message):


 def has_no_nan_values(df, columns):
-return df[
+return df.iloc[:, 2].apply(lambda x: not any(np.isnan(val) for val in x))
+


 def has_nan_values(df, columns):
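The new has_no_nan_values no longer checks the named benchmark columns; it builds a row mask from the DataFrame's third column, which in this leaderboard apparently holds each model's list of per-task scores (the columns argument is no longer used in the body). A small usage sketch with hypothetical data:

```python
import numpy as np
import pandas as pd

# Hypothetical frame mirroring the leaderboard layout: the third column holds per-task score lists.
df = pd.DataFrame({
    "model": ["model-a", "model-b"],
    "overall": [0.91, 0.84],
    "task_scores": [[0.90, 0.95], [0.80, np.nan]],
})

mask = df.iloc[:, 2].apply(lambda scores: not any(np.isnan(v) for v in scores))
print(df[mask])  # keeps model-a only; model-b is dropped for its missing task score
```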
src/envs.py CHANGED

@@ -6,12 +6,8 @@ from huggingface_hub import HfApi
 # replace this with our token
 # TOKEN = os.environ.get("HF_TOKEN", None)
 TOKEN = os.getenv("H4_TOKEN")
-
-# print(TOKEN)
-# OWNER = "vectara"
-# REPO_ID = f"{OWNER}/Humanlike"
-# QUEUE_REPO = f"{OWNER}/requests"
-# RESULTS_REPO = f"{OWNER}/results"
+TOKEN1 = os.getenv("H4_TOKEN1")
+# print("H4_token:", TOKEN)


 OWNER = "Simondon" # Change to your org - don't forget to create a results and request dataset, with the correct format!
src/leaderboard/read_evals.py CHANGED

@@ -155,23 +155,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 model_result_filepaths = []
 print("results_path", results_path)
 for root, _, files in os.walk(results_path):
-# We should only have json files in model results
 print("file",files)
-
-# if not files or any([not f.endswith(".json") for f in files]):
-
-#     continue
 for f in files:
 if f.endswith(".json"):
-
-# Sort the files by date
-# try:
-#     files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-# except dateutil.parser._parser.ParserError:
-#     files = [files[-1]]
-
 model_result_filepaths.extend([os.path.join(root, f)])
-print("model_result_filepaths", model_result_filepaths)
+print("model_result_filepaths:", model_result_filepaths)
 # exit()
 eval_results = {}
 for model_result_filepath in model_result_filepaths:
src/populate.py CHANGED

@@ -11,19 +11,19 @@ import src.leaderboard.read_evals as read_evals
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
 print(results_path, requests_path)
 raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
-print("raw_data:",raw_data)
+#print("raw_data:",raw_data)
 all_data_json = [v.to_dict() for v in raw_data]

-print(all_data_json)
+#print(all_data_json)
 df = pd.DataFrame.from_records(all_data_json)
-print(df)
+print("all results:",df)
 # exit()
 df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
 df = df[cols].round(decimals=2)

 # filter out if any of the benchmarks have not been produced
 df = df[formatting.has_no_nan_values(df, benchmark_cols)]
-return
+return df


 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: