allenpark committed on
Commit
57db206
•
1 Parent(s): e3e882a

fix: examples

Browse files
Files changed (1) hide show
  1. app.py +22 -10
app.py CHANGED
@@ -53,7 +53,8 @@ EXAMPLES = [
53
  "user_input": "How does metformin work to treat diabetes?",
54
  "retrieved_context": "Metformin reduces hepatic glucose production, decreases intestinal glucose absorption, and improves insulin sensitivity by increasing peripheral glucose uptake.",
55
  "pass_criteria": "Does the MODEL OUTPUT explain the mechanism of action accurately and completely?",
56
- "rubric": "1. Incorrect mechanism\n2. Partially correct but missing key elements\n3. Mostly correct with minor omissions\n4. Fully correct and comprehensive"
 
57
  },
58
  {
59
  "emoji": "πŸ“ˆ",
@@ -61,7 +62,8 @@ EXAMPLES = [
61
  "user_input": "What is a bull market?",
62
  "gold_answer": "A bull market is a financial market condition where prices are rising or expected to rise, typically defined by a 20% rise from recent lows.",
63
  "pass_criteria": "Does the MODEL OUTPUT provide a complete and accurate definition?",
64
- "rubric": "1. Incorrect or misleading\n2. Basic but incomplete\n3. Accurate but missing technical details\n4. Complete with technical specifics\n5. Comprehensive with market context"
 
65
  },
66
  {
67
  "emoji": "πŸ«€",
@@ -69,14 +71,17 @@ EXAMPLES = [
69
  "user_input": "What are the diagnostic criteria for hypertension?",
70
  "retrieved_context": "Stage 1 hypertension: systolic 130-139 or diastolic 80-89 mmHg. Stage 2: systolic β‰₯140 or diastolic β‰₯90 mmHg.",
71
  "pass_criteria": "Does the MODEL OUTPUT accurately reflect current diagnostic guidelines?",
72
- "rubric": "1. Incorrect values\n2. Partially correct but imprecise\n3. Correct but missing staging\n4. Complete with staging information\n5. Comprehensive with risk factors"
 
73
  },
74
  {
75
  "emoji": "πŸ’°",
76
  "model_output": "ETFs are investment funds traded on stock exchanges, offering diversification and lower fees than mutual funds.",
77
  "user_input": "What are ETFs and their advantages?",
78
  "pass_criteria": "Does the MODEL OUTPUT explain both the concept and benefits accurately?",
79
- "rubric": "1. Incorrect explanation\n2. Basic definition only\n3. Correct with some advantages\n4. Complete with multiple advantages"
 
 
80
  },
81
  {
82
  "emoji": "πŸ₯",
@@ -84,7 +89,8 @@ EXAMPLES = [
84
  "user_input": "What is MRSA?",
85
  "retrieved_context": "MRSA (Methicillin-resistant Staphylococcus aureus) is a bacteria resistant to many antibiotics. It can cause skin infections, pneumonia, and bloodstream infections.",
86
  "pass_criteria": "Does the MODEL OUTPUT explain both resistance and clinical significance?",
87
- "rubric": "1. Incorrect information\n2. Only mentions resistance\n3. Correct but incomplete clinical picture\n4. Complete with resistance and clinical aspects\n5. Comprehensive with treatment options"
 
88
  },
89
  {
90
  "emoji": "πŸ“Š",
@@ -92,7 +98,8 @@ EXAMPLES = [
92
  "user_input": "What is diversification in investing?",
93
  "gold_answer": "Diversification is a risk management strategy that mixes various investments within a portfolio to reduce exposure to any single asset or risk.",
94
  "pass_criteria": "Does the MODEL OUTPUT explain both the concept and purpose of diversification?",
95
- "rubric": "1. Incorrect concept\n2. Basic definition only\n3. Explains concept with limited context\n4. Complete with risk management aspects\n5. Comprehensive with practical examples"
 
96
  },
97
  {
98
  "emoji": "🧬",
@@ -100,14 +107,17 @@ EXAMPLES = [
100
  "user_input": "What causes Type 2 diabetes?",
101
  "retrieved_context": "Type 2 diabetes develops when the body becomes resistant to insulin or the pancreas doesn't produce enough insulin. Risk factors include obesity, physical inactivity, and genetics.",
102
  "pass_criteria": "Does the MODEL OUTPUT explain both pathophysiology and risk factors?",
103
- "rubric": "1. Incorrect pathophysiology\n2. Basic mechanism only\n3. Correct mechanism with partial risk factors\n4. Complete with risk factors\n5. Comprehensive with prevention strategies"
 
104
  },
105
  {
106
  "emoji": "πŸ’΅",
107
  "model_output": "A mortgage amortization schedule shows monthly payments divided between principal and interest over the loan term.",
108
  "user_input": "What is mortgage amortization?",
109
  "pass_criteria": "Does the MODEL OUTPUT explain the concept and components clearly?",
110
- "rubric": "1. Incorrect explanation\n2. Basic definition only\n3. Explains components without context\n4. Complete with payment breakdown\n5. Comprehensive with practical implications"
 
 
111
  },
112
  {
113
  "emoji": "πŸ”¬",
@@ -115,7 +125,8 @@ EXAMPLES = [
115
  "user_input": "How do statins lower cholesterol?",
116
  "retrieved_context": "Statins block HMG-CoA reductase enzyme, reducing liver cholesterol production and increasing LDL receptor expression, leading to lower blood cholesterol.",
117
  "pass_criteria": "Does the MODEL OUTPUT explain the mechanism accurately?",
118
- "rubric": "1. Incorrect mechanism\n2. Partial mechanism only\n3. Correct mechanism without effects\n4. Complete with effects\n5. Comprehensive with clinical benefits"
 
119
  },
120
  {
121
  "emoji": "πŸ“‰",
@@ -123,7 +134,8 @@ EXAMPLES = [
123
  "user_input": "What defines a bear market?",
124
  "gold_answer": "A bear market is defined by a prolonged drop in investment prices, typically a 20% or more decline from recent highs, accompanied by widespread pessimism.",
125
  "pass_criteria": "Does the MODEL OUTPUT provide technical criteria and market sentiment?",
126
- "rubric": "1. Incorrect definition\n2. Technical criteria only\n3. Correct with partial context\n4. Complete with market sentiment\n5. Comprehensive with historical context"
 
127
  }
128
  ]
129
 
 
53
  "user_input": "How does metformin work to treat diabetes?",
54
  "retrieved_context": "Metformin reduces hepatic glucose production, decreases intestinal glucose absorption, and improves insulin sensitivity by increasing peripheral glucose uptake.",
55
  "pass_criteria": "Does the MODEL OUTPUT explain the mechanism of action accurately and completely?",
56
+ "rubric": "0. Incorrect or incomplete\n1. Fully correct and comprehensive",
57
+ "gold_answer": ""
58
  },
59
  {
60
  "emoji": "πŸ“ˆ",
 
62
  "user_input": "What is a bull market?",
63
  "gold_answer": "A bull market is a financial market condition where prices are rising or expected to rise, typically defined by a 20% rise from recent lows.",
64
  "pass_criteria": "Does the MODEL OUTPUT provide a complete and accurate definition?",
65
+ "rubric": "1. Incorrect or misleading\n2. Basic but incomplete\n3. Accurate but missing technical details\n4. Complete with technical specifics\n5. Comprehensive with market context",
66
+ "retrieved_context": ""
67
  },
68
  {
69
  "emoji": "πŸ«€",
 
71
  "user_input": "What are the diagnostic criteria for hypertension?",
72
  "retrieved_context": "Stage 1 hypertension: systolic 130-139 or diastolic 80-89 mmHg. Stage 2: systolic β‰₯140 or diastolic β‰₯90 mmHg.",
73
  "pass_criteria": "Does the MODEL OUTPUT accurately reflect current diagnostic guidelines?",
74
+ "rubric": "1. Incorrect values\n2. Partially correct but imprecise\n3. Correct but missing staging\n4. Complete with staging information\n5. Comprehensive with risk factors",
75
+ "gold_answer": ""
76
  },
77
  {
78
  "emoji": "πŸ’°",
79
  "model_output": "ETFs are investment funds traded on stock exchanges, offering diversification and lower fees than mutual funds.",
80
  "user_input": "What are ETFs and their advantages?",
81
  "pass_criteria": "Does the MODEL OUTPUT explain both the concept and benefits accurately?",
82
+ "rubric": "0. Incorrect or incomplete explanation\n1. Correct with complete benefits",
83
+ "retrieved_context": "",
84
+ "gold_answer": ""
85
  },
86
  {
87
  "emoji": "πŸ₯",
 
89
  "user_input": "What is MRSA?",
90
  "retrieved_context": "MRSA (Methicillin-resistant Staphylococcus aureus) is a bacteria resistant to many antibiotics. It can cause skin infections, pneumonia, and bloodstream infections.",
91
  "pass_criteria": "Does the MODEL OUTPUT explain both resistance and clinical significance?",
92
+ "rubric": "1. Incorrect information\n2. Only mentions resistance\n3. Correct but incomplete clinical picture\n4. Complete with resistance and clinical aspects\n5. Comprehensive with treatment options",
93
+ "gold_answer": ""
94
  },
95
  {
96
  "emoji": "πŸ“Š",
 
98
  "user_input": "What is diversification in investing?",
99
  "gold_answer": "Diversification is a risk management strategy that mixes various investments within a portfolio to reduce exposure to any single asset or risk.",
100
  "pass_criteria": "Does the MODEL OUTPUT explain both the concept and purpose of diversification?",
101
+ "rubric": "0. Incorrect or incomplete\n1. Correct and comprehensive",
102
+ "retrieved_context": ""
103
  },
104
  {
105
  "emoji": "🧬",
 
107
  "user_input": "What causes Type 2 diabetes?",
108
  "retrieved_context": "Type 2 diabetes develops when the body becomes resistant to insulin or the pancreas doesn't produce enough insulin. Risk factors include obesity, physical inactivity, and genetics.",
109
  "pass_criteria": "Does the MODEL OUTPUT explain both pathophysiology and risk factors?",
110
+ "rubric": "1. Incorrect pathophysiology\n2. Basic mechanism only\n3. Correct mechanism with partial risk factors\n4. Complete with risk factors\n5. Comprehensive with prevention strategies",
111
+ "gold_answer": ""
112
  },
113
  {
114
  "emoji": "πŸ’΅",
115
  "model_output": "A mortgage amortization schedule shows monthly payments divided between principal and interest over the loan term.",
116
  "user_input": "What is mortgage amortization?",
117
  "pass_criteria": "Does the MODEL OUTPUT explain the concept and components clearly?",
118
+ "rubric": "1. Incorrect explanation\n2. Basic definition only\n3. Explains components without context\n4. Complete with payment breakdown\n5. Comprehensive with practical implications",
119
+ "retrieved_context": "",
120
+ "gold_answer": ""
121
  },
122
  {
123
  "emoji": "πŸ”¬",
 
125
  "user_input": "How do statins lower cholesterol?",
126
  "retrieved_context": "Statins block HMG-CoA reductase enzyme, reducing liver cholesterol production and increasing LDL receptor expression, leading to lower blood cholesterol.",
127
  "pass_criteria": "Does the MODEL OUTPUT explain the mechanism accurately?",
128
+ "rubric": "0. Incorrect or incomplete mechanism\n1. Correct and complete explanation",
129
+ "gold_answer": ""
130
  },
131
  {
132
  "emoji": "πŸ“‰",
 
134
  "user_input": "What defines a bear market?",
135
  "gold_answer": "A bear market is defined by a prolonged drop in investment prices, typically a 20% or more decline from recent highs, accompanied by widespread pessimism.",
136
  "pass_criteria": "Does the MODEL OUTPUT provide technical criteria and market sentiment?",
137
+ "rubric": "1. Incorrect definition\n2. Technical criteria only\n3. Correct with partial context\n4. Complete with market sentiment\n5. Comprehensive with historical context",
138
+ "retrieved_context": ""
139
  }
140
  ]
141