File size: 15,580 Bytes
f9a609b
 
 
 
 
 
 
 
 
5e3dbdf
f9a609b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3e882a
 
 
 
 
57db206
 
e3e882a
 
 
 
 
 
 
57db206
 
e3e882a
 
 
 
 
 
 
57db206
 
e3e882a
 
 
 
 
 
57db206
 
 
e3e882a
 
 
 
 
 
 
57db206
 
e3e882a
 
 
 
 
 
 
57db206
 
e3e882a
 
 
 
 
 
 
57db206
 
e3e882a
 
 
 
 
 
57db206
 
 
e3e882a
 
 
 
 
 
 
57db206
 
e3e882a
 
 
 
 
 
 
57db206
 
f9a609b
 
 
 
3c9da4e
17a3c33
f9a609b
 
06d6973
f9a609b
 
 
 
 
 
 
 
 
 
 
 
 
 
76957ec
 
f9a609b
 
0cc013f
7feb127
 
5f886cd
45f3b77
 
0cc013f
f9a609b
 
 
76957ec
 
 
f9a609b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8798dd
 
 
 
5ae1b7f
e8798dd
9eca262
e8798dd
f9a609b
 
 
 
 
 
 
 
 
 
 
 
5e3dbdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9a609b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8798dd
f9a609b
 
 
aef8eda
45f3b77
6cef0a4
 
 
 
 
f9a609b
 
 
 
aef8eda
6cef0a4
 
 
f9a609b
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
import os
import re
import io
import json
from typing import List, Tuple, Union
from pathlib import Path
import gradio as gr
from leptonai import Client

# API credentials come from the environment; each is None when its variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
LEPTON_API_TOKEN = os.getenv("LEPTON_API_TOKEN")

# Module-wide client built from the endpoint URL, the "glider" deployment name,
# and the Lepton API token read above.
client = Client(
    "https://yb15a7dy-glider.tin.lepton.run",
    "glider",
    LEPTON_API_TOKEN,
)

# Prompt template sent to the evaluator model. It is filled with str.format()
# in model_call using three placeholders: {user_input} (the tagged data block
# built by format_string), {pass_criteria} and {rubric}. The template asks the
# model to answer inside <reasoning>, <highlight> and <score> tags, which is
# the layout extract_spans parses.
# NOTE(review): the wording is part of what the model receives at runtime; do
# not edit it for grammar without confirming the model tolerates the change.
PROMPT = """Analyze the following pass criteria carefully and score the text based on the rubric defined below.

To perform this evaluation, you must:

1. Understand the text tags, pass criteria and rubric thoroughly.
2. Review the finer details of the text and the rubric.
3. Compare the tags to be evaluated to the score descriptions in the rubric.
4. Pay close attention to small details that might impact the final score and form accurate associations between tags and pass criteria.
5. Write a detailed reasoning justifying your evaluation in a bullet point format. 
6. The reasoning must summarize the overall strengths and weaknesses of the output while quoting exact phrases from the output wherever required.
7. Output a list of words or phrases that you believe are the most important in determining the score.
8. Assign a final score based on the scoring rubric.

Data to evaluate:
{user_input}

Pass Criteria:
{pass_criteria}

Rubric:
{rubric}

Your output must in the following format:
<reasoning>
[Detailed reasoning justifying your evaluation in a bullet point format according to the specifics defined above]
</reasoning>
<highlight>
[List of words or phrases that you believe are the most important in determining the score]
</highlight>
<score>
[The final integer score assigned based on the scoring rubric]
</score>
"""

# Ready-made evaluation scenarios shown as one-click buttons in the UI.
# Every entry carries the same keys: "emoji" (button prefix) plus the six
# values that select_template copies into the input textboxes. Unused optional
# fields are kept as empty strings so every key is always present.
# The emoji and the "≥" signs were previously stored as mis-decoded UTF-8
# (mojibake); they are restored to the intended characters here.
EXAMPLES = [
    {
        "emoji": "💊",
        "model_output": "Metformin works by reducing glucose production in the liver and improving insulin sensitivity.",
        "user_input": "How does metformin work to treat diabetes?",
        "retrieved_context": "Metformin reduces hepatic glucose production, decreases intestinal glucose absorption, and improves insulin sensitivity by increasing peripheral glucose uptake.",
        "pass_criteria": "Does the MODEL OUTPUT explain the mechanism of action accurately and completely?",
        "rubric": "0. Incorrect or incomplete\n1. Fully correct and comprehensive",
        "gold_answer": ""
    },
    {
        "emoji": "📈",
        "model_output": "A bull market is characterized by rising stock prices over a sustained period.",
        "user_input": "What is a bull market?",
        "gold_answer": "A bull market is a financial market condition where prices are rising or expected to rise, typically defined by a 20% rise from recent lows.",
        "pass_criteria": "Does the MODEL OUTPUT provide a complete and accurate definition?",
        "rubric": "1. Incorrect or misleading\n2. Basic but incomplete\n3. Accurate but missing technical details\n4. Complete with technical specifics\n5. Comprehensive with market context",
        "retrieved_context": ""
    },
    {
        "emoji": "🫀",
        "model_output": "Hypertension is diagnosed when blood pressure consistently exceeds 130/80 mmHg.",
        "user_input": "What are the diagnostic criteria for hypertension?",
        "retrieved_context": "Stage 1 hypertension: systolic 130-139 or diastolic 80-89 mmHg. Stage 2: systolic ≥140 or diastolic ≥90 mmHg.",
        "pass_criteria": "Does the MODEL OUTPUT accurately reflect current diagnostic guidelines?",
        "rubric": "1. Incorrect values\n2. Partially correct but imprecise\n3. Correct but missing staging\n4. Complete with staging information\n5. Comprehensive with risk factors",
        "gold_answer": ""
    },
    {
        "emoji": "💰",
        "model_output": "ETFs are investment funds traded on stock exchanges, offering diversification and lower fees than mutual funds.",
        "user_input": "What are ETFs and their advantages?",
        "pass_criteria": "Does the MODEL OUTPUT explain both the concept and benefits accurately?",
        "rubric": "0. Incorrect or incomplete explanation\n1. Correct with complete benefits",
        "retrieved_context": "",
        "gold_answer": ""
    },
    {
        "emoji": "🏥",
        "model_output": "MRSA is resistant to methicillin and most beta-lactam antibiotics.",
        "user_input": "What is MRSA?",
        "retrieved_context": "MRSA (Methicillin-resistant Staphylococcus aureus) is a bacteria resistant to many antibiotics. It can cause skin infections, pneumonia, and bloodstream infections.",
        "pass_criteria": "Does the MODEL OUTPUT explain both resistance and clinical significance?",
        "rubric": "1. Incorrect information\n2. Only mentions resistance\n3. Correct but incomplete clinical picture\n4. Complete with resistance and clinical aspects\n5. Comprehensive with treatment options",
        "gold_answer": ""
    },
    {
        "emoji": "📊",
        "model_output": "Diversification reduces risk by spreading investments across different asset classes, sectors, and geographical regions.",
        "user_input": "What is diversification in investing?",
        "gold_answer": "Diversification is a risk management strategy that mixes various investments within a portfolio to reduce exposure to any single asset or risk.",
        "pass_criteria": "Does the MODEL OUTPUT explain both the concept and purpose of diversification?",
        "rubric": "0. Incorrect or incomplete\n1. Correct and comprehensive",
        "retrieved_context": ""
    },
    {
        "emoji": "🧬",
        "model_output": "Type 2 diabetes involves insulin resistance and decreased insulin production.",
        "user_input": "What causes Type 2 diabetes?",
        "retrieved_context": "Type 2 diabetes develops when the body becomes resistant to insulin or the pancreas doesn't produce enough insulin. Risk factors include obesity, physical inactivity, and genetics.",
        "pass_criteria": "Does the MODEL OUTPUT explain both pathophysiology and risk factors?",
        "rubric": "1. Incorrect pathophysiology\n2. Basic mechanism only\n3. Correct mechanism with partial risk factors\n4. Complete with risk factors\n5. Comprehensive with prevention strategies",
        "gold_answer": ""
    },
    {
        "emoji": "💵",
        "model_output": "A mortgage amortization schedule shows monthly payments divided between principal and interest over the loan term.",
        "user_input": "What is mortgage amortization?",
        "pass_criteria": "Does the MODEL OUTPUT explain the concept and components clearly?",
        "rubric": "1. Incorrect explanation\n2. Basic definition only\n3. Explains components without context\n4. Complete with payment breakdown\n5. Comprehensive with practical implications",
        "retrieved_context": "",
        "gold_answer": ""
    },
    {
        "emoji": "🔬",
        "model_output": "Statins work by inhibiting HMG-CoA reductase, reducing cholesterol synthesis in the liver.",
        "user_input": "How do statins lower cholesterol?",
        "retrieved_context": "Statins block HMG-CoA reductase enzyme, reducing liver cholesterol production and increasing LDL receptor expression, leading to lower blood cholesterol.",
        "pass_criteria": "Does the MODEL OUTPUT explain the mechanism accurately?",
        "rubric": "0. Incorrect or incomplete mechanism\n1. Correct and complete explanation",
        "gold_answer": ""
    },
    {
        "emoji": "📉",
        "model_output": "A bear market occurs when stock prices fall 20% or more from recent highs.",
        "user_input": "What defines a bear market?",
        "gold_answer": "A bear market is defined by a prolonged drop in investment prices, typically a 20% or more decline from recent highs, accompanied by widespread pessimism.",
        "pass_criteria": "Does the MODEL OUTPUT provide technical criteria and market sentiment?",
        "rubric": "1. Incorrect definition\n2. Technical criteria only\n3. Correct with partial context\n4. Complete with market sentiment\n5. Comprehensive with historical context",
        "retrieved_context": ""
    }
]

# HTML rendered at the top of the page through gr.Markdown(HEADER): logo and
# badge links (model card, GitHub, arXiv), the title, a short description of
# GLIDER, and "Getting Started" / "Note" usage instructions.
# Text colours are forced to white with !important, which pairs with the dark
# background gradient configured on the theme.
HEADER = """
<div style="width: 100%; display: flex; flex-direction: column; gap: 24px; padding-top: 24px">
    <img src="https://postimage.me/images/2024/12/19/ICONGLIDER.md.png" width="325" style="position: absolute; top: 0; right: -24px">
    <div style="display: flex; justify-content: space-between; z-index: 1;">
        <a href="https://www.patronus.ai">
            <img src="https://postimage.me/images/2024/12/19/patronuslogo-white.png" width="250">
        </a>
        <div style="display: flex; gap: 12px;">
            <a href="https://huggingface.co/PatronusAI/glider">
                <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange" height="20">
            </a>
            <a href="https://github.com/patronus-ai/glider">
                <img src="https://img.shields.io/badge/GitHub-Glider-indigo" height="20">
            </a>
            <a href="https://arxiv.org/abs/2412.14140">
                <img src="https://img.shields.io/badge/arXiv-2412.14140-b31b1b.svg" height="20">
            </a>
        </div>
    </div>
    <div>
        <h1 style="color: #fff !important">GLIDER: Grading LLM Interactions and Decisions using Explainable Ranking</h1>
        <h2 style="color: #fff !important">Patronus GLIDER Demo</h2>
    </div>
</div>
<br>
<div style="color: #fff !important; width: 70%"><span style="color: inherit; font-weight: 600">GLIDER</span> is a powerful 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria.</div>
<br>
<div style="color: #fff !important; width: 70%;"><span style="color: inherit; font-weight: 600">Getting Started</span>: First, provide a model output (text generated by your model) and user input (text used to prompt your model) and optionally a gold answer (label or gold answer to the prompt) and retrieved context (context used for text generated by your model). Next, provide a pass criteria (description of a passing evaluation). Finally, provide an optional but recommended rubric (scoring scales with explanations) and then click submit. The GLIDER Output panel will provide a score and reasoning which is a human readable explanation of the score. You can find our docs <a href="https://docs.patronus.ai/docs/evals-with-glider">here</a>.</div>
<br>
<div style="color: #fff !important; width: 70%;"><span style="color: inherit; font-weight: 600">Note</span>: In your Pass Criteria, use the attribute tags from above (i.e. MODEL OUTPUT, USER INPUT, GOLD ANSWER, RETRIEVED CONTEXT).</div>
<br>
"""

# Heading rendered (via gr.Markdown) directly above the example buttons.
EXAMPLES_HEADER = """
<h1 style="color: #fff !important">
    Try it Yourself!
</h1>
"""

# Custom stylesheet passed to gr.Blocks(css=...).
# .example-button is attached to the example buttons through elem_classes.
# NOTE(review): .fixed-height-button is not referenced anywhere in the visible
# code — confirm whether it is still needed.
css = """
.example-button {
    width: fit-content;
    font-size: 1rem;
    font-weight: 400 !important;
    padding: .5rem 1rem;
    text-align: start;
}
.fixed-height-button {
    height: fit-content;
    word-break: break-all;
    font-size: .85rem;
}
"""

# Gradio theme for the demo: the Default theme with compact spacing, an
# indigo/purple palette, the "Plus Jakarta Sans" Google font (falling back to
# Arial, then sans-serif), and a radial-gradient page background.
theme = gr.themes.Default(
    primary_hue="indigo",
    secondary_hue="purple",
    spacing_size="sm",
    font=[
        gr.themes.GoogleFont("Plus Jakarta Sans"),
        "Arial",
        "sans-serif",
    ],
).set(
    background_fill_primary="radial-gradient(circle at 90% 0%, rgba(255,255,255,0.4), #000000 25%)",
)

def format_string(retrieved_context, user_input, model_output, gold_answer):
    """Wrap each non-empty field in its tag pair and join the sections with newlines.

    Sections always appear in the order CONTEXT, USER INPUT, MODEL OUTPUT,
    GOLD ANSWER. A falsy field (empty string or None) is left out entirely.
    Returns an empty string when every field is falsy.
    """
    tagged_fields = (
        ("CONTEXT", retrieved_context),
        ("USER INPUT", user_input),
        ("MODEL OUTPUT", model_output),
        ("GOLD ANSWER", gold_answer),
    )
    return "\n".join(
        f"<{tag}>\n{text}\n</{tag}>"
        for tag, text in tagged_fields
        if text
    )

# Patterns for the three tagged sections of the model reply. DOTALL lets the
# reasoning and highlight bodies span several lines.
_REASONING_RE = re.compile(r"<reasoning>\s*(.*?)\s*</reasoning>", re.DOTALL)
_HIGHLIGHT_RE = re.compile(r"<highlight>\s*(.*?)\s*</highlight>", re.DOTALL)
# The final ">" of the closing tag is optional, so "</score" is also accepted.
_SCORE_RE = re.compile(r"<score>\s*(\d+)\s*</score>?")


def extract_spans(input_string):
    """Pull the score, reasoning and highlight sections out of a model reply.

    Args:
        input_string: Raw reply text expected to contain <reasoning>,
            <highlight> and <score> tagged sections. Anything that is not a
            string (for example None) is treated as an empty reply.

    Returns:
        A tuple ``(score, reasoning, highlight)``. ``score`` is an int;
        ``reasoning`` and ``highlight`` are strings with surrounding
        whitespace removed. Any section that cannot be found is None.
    """
    if not isinstance(input_string, str):
        return None, None, None

    reasoning_match = _REASONING_RE.search(input_string)
    highlight_match = _HIGHLIGHT_RE.search(input_string)
    score_match = _SCORE_RE.search(input_string)

    reasoning = reasoning_match.group(1) if reasoning_match else None
    highlight = highlight_match.group(1).strip() if highlight_match else None
    score = int(score_match.group(1)) if score_match else None
    return score, reasoning, highlight

def model_call(model_output, user_input, gold_answer, retrieved_context, pass_criteria, rubric):
    """Send one evaluation request to the GLIDER endpoint and parse its reply.

    Args:
        model_output: Text produced by the model under evaluation (required).
        user_input: Prompt that produced ``model_output`` (required).
        gold_answer: Optional reference answer.
        retrieved_context: Optional context supplied to the evaluated model.
        pass_criteria: Description of a passing evaluation (required).
        rubric: Optional scoring scale; a missing value is sent as empty text.

    Returns:
        ``(score, reasoning, highlights)`` as produced by ``extract_spans``,
        or three empty strings when a required field is missing or blank.

    Raises:
        gr.Error: If the endpoint reply does not have the expected
            ``choices[0].message.content`` structure.
    """
    # Blank, whitespace-only or missing required fields would only produce a
    # meaningless evaluation, so skip the remote call altogether.
    required_fields = (model_output, user_input, pass_criteria)
    if any(not (value and value.strip()) for value in required_fields):
        return "", "", ""

    combined_user_input = format_string(retrieved_context, user_input, model_output, gold_answer)
    prompt = PROMPT.format(
        user_input=combined_user_input,
        pass_criteria=pass_criteria,
        rubric=rubric or "",
    )
    response = client.api.v1.chat.completions(
        model="glider",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        top_p=0.999,
        max_tokens=2048,
        stream=False,
    )

    try:
        content = response["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        # Surface a readable message in the UI instead of a bare traceback.
        raise gr.Error("GLIDER returned a response in an unexpected format.") from exc

    score, reasoning, highlight_spans = extract_spans(content)
    return score, reasoning, highlight_spans

def select_template(template):
    """Return an example's field values as a tuple in input-textbox order.

    The order is model_output, user_input, gold_answer, retrieved_context,
    pass_criteria, rubric. Raises KeyError if ``template`` lacks any of them.
    """
    field_order = (
        "model_output",
        "user_input",
        "gold_answer",
        "retrieved_context",
        "pass_criteria",
        "rubric",
    )
    return tuple(template[field] for field in field_order)

# Assemble the page: header, an inputs column next to an outputs column, one
# button per example, and the submit wiring. The app is launched at the end.
with gr.Blocks(css=css, theme=theme) as demo:
    gr.Markdown(HEADER)

    with gr.Row(equal_height=True):
        # Left column: everything the user supplies for one evaluation.
        with gr.Column(scale=1):
            gr.Markdown("<div style='color: #fff !important; font-weight: 600'>Your Inputs</div>")
            model_output = gr.Textbox(label="MODEL OUTPUT (required)")
            user_input = gr.Textbox(label="USER INPUT (required)")
            gold_answer = gr.Textbox(label="GOLD ANSWER")
            retrieved_context = gr.Textbox(label="RETRIEVED CONTEXT")
            pass_criteria = gr.Textbox(label="Pass Criteria (required)")
            rubric = gr.Textbox(label="Rubric")
            # Same order as model_call's parameters and select_template's tuple.
            input_fields = [model_output, user_input, gold_answer, retrieved_context, pass_criteria, rubric]
            with gr.Row():
                clear_btn = gr.ClearButton(list(input_fields))
                submit_button = gr.Button("Submit", variant="primary")

        # Right column: the three values returned by model_call.
        with gr.Column(scale=1):
            gr.Markdown("<div style='color: #fff !important; font-weight: 600'>GLIDER Output</div>")
            score = gr.Textbox(label="Score")
            reasoning = gr.Textbox(label="Reasoning")
            highlights = gr.Textbox(label="Highlights")

    gr.Markdown("&nbsp;")
    gr.Markdown(EXAMPLES_HEADER)
    with gr.Row():
        with gr.Column():
            # Clicking an example button copies that example into the input textboxes.
            for example in EXAMPLES:
                button_label = f"{example['emoji']} {example['model_output']}"
                template_btn = gr.Button(button_label, elem_classes="example-button")
                template_btn.click(
                    fn=select_template,
                    inputs=[gr.State(example)],
                    outputs=list(input_fields),
                )

    submit_button.click(
        fn=model_call,
        inputs=list(input_fields),
        outputs=[score, reasoning, highlights],
    )

demo.launch()