pminervini committed on
Commit
3d44a49
·
1 Parent(s): 21eac98
cli/eval-cli.py CHANGED
@@ -35,7 +35,8 @@ def main():
35
  # my_task = Task("memo-trap", "acc", "memo-trap", 0)
36
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
37
  # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
38
- my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
 
39
 
40
  eval_logger = utils.eval_logger
41
  import logging
 
35
  # my_task = Task("memo-trap", "acc", "memo-trap", 0)
36
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
37
  # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
38
+ # my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
39
+ my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
40
 
41
  eval_logger = utils.eval_logger
42
  import logging
src/backend/tasks/faithdial/faithdial.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ group: faithdial
2
+ task: faithdial_hallu
3
+ dataset_path: McGill-NLP/FaithDial
4
+ training_split: train
5
+ validation_split: validation
6
+ test_split: test
7
+ output_type: multiple_choice
8
+ doc_to_text: !function utils.doc_to_text
9
+ doc_to_target: !function utils.doc_to_target
10
+ # process_results: !function utils.process_results
11
+ doc_to_choice: ["false", "true"]
12
+ metric_list:
13
+ - metric: acc
14
+ higher_is_better: True
15
+ metadata:
16
+ version: 0.0
src/backend/tasks/faithdial/utils.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+ ValueType = Union[str, List[str]]
3
+
4
+
5
def doc_to_text(doc: dict[str, ValueType]) -> str:
    """Render one record as a hallucination-judgement prompt.

    The turns in ``doc['history']`` are tagged alternately, ``[Human]`` for
    even positions and ``[Assistant]`` for odd ones, and joined with single
    spaces. The prompt then lists the knowledge, the tagged dialogue history
    and the response on separate lines, ending with the ``#Hallucinated#:``
    cue.

    Args:
        doc: Record providing the ``history``, ``knowledge`` and ``response``
            keys.

    Returns:
        The assembled prompt string.
    """
    speakers = ("Human", "Assistant")
    tagged_turns = []
    for position, utterance in enumerate(doc["history"]):
        # Even positions map to the first speaker, odd ones to the second.
        tagged_turns.append(f"[{speakers[position % 2]}] {utterance}")
    dialogue = " ".join(tagged_turns)

    sections = (
        f'#Knowledge#: {doc["knowledge"]}',
        f"#Dialogue History#: {dialogue}",
        f'#Response#: {doc["response"]}',
        "#Hallucinated#:",
    )
    return "\n".join(sections)
10
+
11
+
12
def doc_to_target(doc: dict[str, ValueType]) -> str:
    """Return the gold label for a record as ``"true"`` or ``"false"``.

    The label is ``"true"`` when ``"Hallucination"`` is found in
    ``doc["BEGIN"]`` (an element match for a list, a substring match for a
    string) and ``"false"`` otherwise.

    Args:
        doc: Record providing the ``BEGIN`` key.

    Returns:
        ``"true"`` if the record is marked as a hallucination, else
        ``"false"``.
    """
    if "Hallucination" in doc["BEGIN"]:
        return "true"
    return "false"
16
+
17
+
18
def process_results(doc: dict[str, ValueType], results: List[str]) -> dict[str, float]:
    """Placeholder scorer that always reports an ``acc`` of 0.0.

    Neither ``doc`` nor ``results`` is inspected; the same mapping is
    produced for every call.

    Args:
        doc: The evaluated record (unused).
        results: The model outputs for the record (unused).

    Returns:
        ``{"acc": 0.0}``.
    """
    placeholder_scores = {"acc": 0.0}
    return placeholder_scores
src/backend/tasks/halueval/utils.py CHANGED
@@ -83,13 +83,13 @@ You should try your best to determine if the summary contains non-factual or hal
83
 
84
  def doc_to_text_qa(doc: dict[str, str]) -> str:
85
  # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
86
- doc_text = QA_INSTURCTIONS + "\n\n#Knowledge: " + doc["knowledge"] + "\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
87
  return doc_text
88
 
89
 
90
  def doc_to_text_dialogue(doc: dict[str, str]) -> str:
91
  # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
92
- doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Knowledge: " + doc["knowledge"] + "\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
93
  return doc_text
94
 
95
 
@@ -127,7 +127,7 @@ def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
127
  return res
128
 
129
 
130
- def process_results(doc: dict[str, str], results: list[str]):
131
  # results is e.g., ['Yes']
132
  gold_list = doc_to_target(doc)
133
  # gold_list is e.g., 'yes'
 
83
 
84
  def doc_to_text_qa(doc: dict[str, str]) -> str:
85
  # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
86
+ doc_text = QA_INSTURCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
87
  return doc_text
88
 
89
 
90
  def doc_to_text_dialogue(doc: dict[str, str]) -> str:
91
  # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
92
+ doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
93
  return doc_text
94
 
95
 
 
127
  return res
128
 
129
 
130
+ def process_results(doc: dict[str, str], results: list[str]) -> dict[str, float]:
131
  # results is e.g., ['Yes']
132
  gold_list = doc_to_target(doc)
133
  # gold_list is e.g., 'yes'