Blair Yang committed on
Commit
91143ec
·
1 Parent(s): de1d92a

now able to record responses

Browse files
Sample.py CHANGED
@@ -52,9 +52,10 @@ def sample_random_entry(dataset='', topic='', model='', n=1):
52
 
53
  # print(f"Sampling {n} random entries from {dataset} - {topic} - {model}")
54
  card_lst = sample_card(dataset, topic, model)
55
- qa = sample_QA_entry(dataset, topic, model)
56
 
57
  display_dict, info_dict = process_for_display(card_lst, qa)
 
58
 
59
  return display_dict, info_dict
60
 
@@ -108,8 +109,9 @@ def sample_QA_entry(dataset='', topic='', model='', n=1):
108
  df = df[df['model'] == model]
109
  sample = df.sample(1)
110
  # Convert to dictionary
 
111
  sample = sample.to_dict(orient='records')[0]
112
- return (sample)
113
 
114
  if __name__ == '__main__':
115
  sample_random_entry(n=5)
 
52
 
53
  # print(f"Sampling {n} random entries from {dataset} - {topic} - {model}")
54
  card_lst = sample_card(dataset, topic, model)
55
+ qa, index = sample_QA_entry(dataset, topic, model)
56
 
57
  display_dict, info_dict = process_for_display(card_lst, qa)
58
+ info_dict['index'] = index
59
 
60
  return display_dict, info_dict
61
 
 
109
  df = df[df['model'] == model]
110
  sample = df.sample(1)
111
  # Convert to dictionary
112
+ sample_idx = sample.index[0]
113
  sample = sample.to_dict(orient='records')[0]
114
+ return sample, sample_idx
115
 
116
  if __name__ == '__main__':
117
  sample_random_entry(n=5)
__pycache__/Sample.cpython-311.pyc CHANGED
Binary files a/__pycache__/Sample.cpython-311.pyc and b/__pycache__/Sample.cpython-311.pyc differ
 
app.py CHANGED
@@ -1,9 +1,26 @@
1
  import gradio as gr
2
  from Sample import sample_random_entry
3
  from Config import TOPICS
 
 
 
4
 
 
5
  info_dict = {}
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def sample_and_display(topic):
8
  # If a topic is selected, use it to sample a new entry
9
  global info_dict
@@ -18,7 +35,7 @@ def evaluate_guess(reasoning, correctness, confidence, topic):
18
  global info_dict
19
  # Here your logic will go to evaluate the guess
20
  # Placeholder for the correct logic to determine the correct answer
21
- correct_answer = "Correctly"
22
  evaluation_response = "Correct" if correctness == correct_answer else "Incorrect"
23
 
24
  # Assuming info_dict is updated by sample_and_display function
@@ -27,6 +44,23 @@ def evaluate_guess(reasoning, correctness, confidence, topic):
27
 
28
  # Update the completion text
29
  completion_text = f"Completion: {actual_completion}\n\nChoice: {chr(info_dict.get('verdict', 0) + 65)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  return evaluation_response, actual_model, completion_text
31
 
32
  # Initial sampling
@@ -43,7 +77,7 @@ with gr.Blocks() as app:
43
  with gr.Column(scale=1):
44
  question = gr.Textbox(value=question_text, label="Question", interactive=False)
45
  reasoning = gr.Textbox(lines=5, placeholder="Your reasoning (optional)")
46
- correctness = gr.Radio(choices=["Correct", "Incorrect"], label="I believe the model will answer this question")
47
  confidence = gr.Slider(minimum=0, maximum=10, step=1, label="Confidence")
48
  output_text = gr.Text(label="Evaluation Output")
49
  submit_button = gr.Button("Submit")
 
1
  import gradio as gr
2
  from Sample import sample_random_entry
3
  from Config import TOPICS
4
+ import pandas as pd
5
+ import os
6
+ from threading import Lock
7
 
8
+ lock = Lock()
9
  info_dict = {}
10
 
11
def append_to_csv(output_path, row_data, header_names):
    """Append a single row to a CSV file, creating the file if necessary.

    The header row is written only when the file does not exist yet or is
    empty; otherwise the row is appended without a header.

    Args:
        output_path: Path of the CSV file to write to. Missing parent
            directories are created.
        row_data: Mapping of column name to value for the row.
        header_names: Column names, in the order they appear in the file.
            Keys of ``row_data`` not listed here are dropped; listed names
            missing from ``row_data`` are written as empty cells.
    """
    # Always project the row onto header_names so that appended rows line up
    # with the header written on first use, whatever the dict's key order is.
    df = pd.DataFrame([row_data], columns=header_names)

    # Hold the module-level lock across the existence check and the write so
    # that two concurrent callers cannot both decide to write the header.
    with lock:
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        has_content = (
            os.path.exists(output_path) and os.path.getsize(output_path) > 0
        )
        df.to_csv(
            output_path,
            mode='a' if has_content else 'w',
            header=not has_content,
            index=False,
        )
23
+
24
  def sample_and_display(topic):
25
  # If a topic is selected, use it to sample a new entry
26
  global info_dict
 
35
  global info_dict
36
  # Here your logic will go to evaluate the guess
37
  # Placeholder for the correct logic to determine the correct answer
38
+ correct_answer = 'Correctly' if info_dict['correctness'] else 'Incorrectly'
39
  evaluation_response = "Correct" if correctness == correct_answer else "Incorrect"
40
 
41
  # Assuming info_dict is updated by sample_and_display function
 
44
 
45
  # Update the completion text
46
  completion_text = f"Completion: {actual_completion}\n\nChoice: {chr(info_dict.get('verdict', 0) + 65)}"
47
+
48
+ question_index = info_dict.get('index', -1)
49
+ question_topic = topic
50
+ output_path = f'responses/mmlu/{question_topic}/response.csv'
51
+ entry = dict()
52
+
53
+ entry['index'] = question_index
54
+ entry['model'] = actual_model
55
+ entry['reasoning'] = reasoning
56
+ entry['correctness'] = correctness == correct_answer
57
+ entry['confidence'] = confidence
58
+
59
+ header_names = ['index', 'model', 'reasoning', 'correctness', 'confidence'] # Add other headers as necessary
60
+
61
+ append_to_csv(output_path, entry, header_names)
62
+
63
+
64
  return evaluation_response, actual_model, completion_text
65
 
66
  # Initial sampling
 
77
  with gr.Column(scale=1):
78
  question = gr.Textbox(value=question_text, label="Question", interactive=False)
79
  reasoning = gr.Textbox(lines=5, placeholder="Your reasoning (optional)")
80
+ correctness = gr.Radio(choices=["Correct", "Incorrect"], label="I beplaceholderlieve the model will answer this question")
81
  confidence = gr.Slider(minimum=0, maximum=10, step=1, label="Confidence")
82
  output_text = gr.Text(label="Evaluation Output")
83
  submit_button = gr.Button("Submit")
responses/.DS_Store ADDED
Binary file (6.15 kB). View file
 
responses/mmlu/.DS_Store ADDED
Binary file (6.15 kB). View file
 
responses/mmlu/high_school_physics/response.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ index,model,reasoning,correctness,confidence