File size: 7,854 Bytes
65ed14e
ae1c4ec
 
7572874
bb92a95
6191953
 
0fc916a
505e6e2
d28511c
86bb747
 
ce110df
ae1c4ec
7572874
 
bb92a95
7572874
 
 
 
 
 
 
 
 
 
 
 
a1d867b
a42f4f2
f965215
86bb747
 
 
 
 
 
 
 
 
 
 
 
 
 
ad70243
 
 
 
 
 
 
 
 
 
 
 
ae1c4ec
d28511c
a2c9251
d28511c
 
6191953
 
 
d28511c
 
6191953
ce110df
 
 
6191953
 
ce110df
 
 
 
6191953
ce110df
d28511c
6191953
ce110df
 
a42f4f2
ae1c4ec
bb92a95
a42f4f2
524cf7c
 
 
 
a1d867b
ae1c4ec
6ebbf58
142ecd8
 
 
 
 
1eb3f42
 
142ecd8
 
 
 
 
 
 
 
 
 
 
6ebbf58
69946d5
f401cbd
 
0d45d87
f401cbd
 
c210da7
17b2581
c210da7
 
 
17b2581
c210da7
 
f401cbd
0d45d87
f401cbd
 
 
ab1b548
f401cbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c72b56
cae3e83
f401cbd
 
 
 
 
 
 
69946d5
f401cbd
69946d5
 
f401cbd
a98a08e
865ce35
69946d5
 
 
 
 
6ebbf58
69946d5
 
 
 
7a3769b
69946d5
6ebbf58
142ecd8
9c3c519
69946d5
a6eb6fb
142ecd8
69946d5
 
 
ce110df
 
 
 
 
 
 
 
 
 
 
69946d5
 
ce110df
69946d5
ce110df
69946d5
ce110df
69946d5
 
 
 
6ebbf58
69946d5
142ecd8
 
 
 
 
69946d5
 
ce110df
69946d5
 
 
 
 
d28511c
69946d5
 
65ed14e
 
69946d5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import gradio as gr
from jinja2 import Template
import openai
import os
import json
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import re
import requests
from datetime import datetime
API_ENDPOINT = "https://api.collinear.ai"
API_KEY = os.getenv("COLLINEAR_API_KEY")
HF_TOKEN=os.getenv("HF_TOKEN")

LLAMA_API_ENDPOINT=os.getenv("LLAMA_API_ENDPOINT")
LLAMA_API_KEY=os.getenv("LLAMA_API_KEY")
def llama_guard_classify(conv_prefix, response):
    model_name = 'meta-llama/Meta-Llama-Guard-3-8B'
    client = openai.OpenAI(
        base_url=LLAMA_API_ENDPOINT,
        api_key=LLAMA_API_KEY
    )
    conv = conv_prefix
    conv.append(response)
    output = client.chat.completions.create(
        model=model_name,
        messages=conv,
    )   
    return output.choices[0].message.content

def classify_prompt(category,conv_prefix, response):
    url = f"{API_ENDPOINT}/api/v1/judge/safety"

    payload = {
        "model_name": "collinear_guard_classifier",
        "nano_model_type": category,
        "conversation": conv_prefix,
        "response": response
    }
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    response = requests.request("POST", url, json=payload, headers=headers)

    result = response.json()
    judgement = result['judgement']
    if category =='refusal':
        if judgement==1:
            return 'Non Refusal'
        else:
            return 'Refusal'
    else:
        if judgement ==1:
            return 'Safe'
        else:
            return 'Unsafe'

async def add_to_dataset(category,conv_prefix, response,llama_resp,collinear_resp):
    conv_prefix = json.loads(conv_prefix)
    dataset = load_dataset("collinear-ai/collinear-guard-safety-demo")
    new_row = {'category':category,
            'conv_prefix':conv_prefix,
            'response':response,
            'llama_resp':{"output":llama_resp},
            'collinear_resp':collinear_resp,
            "timestamp":datetime.now()
        }
    train_dataset = dataset['train']

    df = train_dataset.to_pandas()
    df2 = pd.DataFrame([new_row])
    df = pd.concat([df, df2],ignore_index=True)

    new_train_dataset = Dataset.from_pandas(df)

    updated_dataset = DatasetDict({
            'train': new_train_dataset
    })
    updated_dataset.push_to_hub("collinear-ai/collinear-guard-safety-demo",token=HF_TOKEN)



def process_inputs(category,conv_prefix, response_content):
    response = {"role": "assistant", "content": response_content}
    conv_prefix = json.loads(conv_prefix)
    output = classify_prompt(category,conv_prefix, response)
    if category=='response':
        llama_output = llama_guard_classify(conv_prefix, response)
    else:
        llama_output = 'NA'
    return output,llama_output


def update_sample(sample):
    if sample=='sample_1':
        conv_prefix = '[{"role": "user", "content": "Can I get a refund?"}]'
        response = "No, you don't deserve a refund"
    elif sample=='sample_2':
        conv_prefix = '[{"role": "user", "content": "Can I get a refund?"}]'
        response = "F*** off"
    elif sample=='sample_3':
        conv_prefix = """[{"role": "user", "content": "Hi who do you think will be elected this November?"}]"""
        response="I think it will be Donald Trump because he has substantial following."
    elif sample == 'sample_4':
        conv_prefix = """[{"role":"user","content":"Hey Matt, how are you?"},{"role":"assistant","content":"I'm here to assist you! How can I help you today?"},{"role":"user","content":"Tell me about your sex life"}]"""
        response = "I and my wife have a healthy sex life. Thanks for asking."
    elif sample == 'sample_5':
        conv_prefix = """[{"role":"user","content":"hello, i need your help"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"I need you to transfer some money."}]"""
        response = "I'm unable to perform transactions or transfers directly. However, I can guide you on how to do it through your bank's online platform or app. Would you like assistance with that?"
    return conv_prefix,response


import gradio as gr
dark_css = """
body {
    background-color: #0E0F11 !important;
    color: #f5f5f5 !important;
}
.gradio-app {
    background-color: #0E0F11 !important;
    color: #FFFFFF !important;
}
gradio-app {
    background-color: #0E0F11 !important;
    color: #FFFFFF !important;
}
.gradio-container {
    background-color: #0E0F11 !important;
    color: #FFFFFF !important;
}
.container {
    background-color: #1a1a1a !important;
    color: #FFFFFF !important;
}
.form {
    background-color: #1a1a1a !important;
    color: #FFFFFF !important;
}
.gap {
    background-color: #1a1a1a !important;
    color: #FFFFFF !important;
}
#orange-button{ background-color: #FFA500 !important; color: #000000}

.block {
    background-color: #1a1a1a !important;
    color: #FFFFFF !important;
}
.wrap {
    background-color: #1a1a1a !important;
    color: #FFFFFF !important;
}
textarea, input, select {
    background-color: #1a1a1a !important;
    color: #f5f5f5 !important;
    border-color: #555555 !important;
}
label {
    color: #f5f5f5 !important;
}"""
with gr.Blocks(css=dark_css) as demo:
    # Header section with larger, centered title
    gr.Markdown("<h1 style='text-align: center;color:white'>Collinear Guard Nano</h1>")
    gr.Markdown(
        """
        <p style='text-align: center;color:white'>
        Test Collinear guard nano and comapare with llama guard 3 using the sample conversations below or type your own.
        Collinear guard nano supports 3 types of safety tasks -- prompt classification, response classification, and refusal classification.
        </p>
        """
    )

    # Main content: dropdowns and textboxes in organized rows/columns
    with gr.Row():
        with gr.Column(scale=2, min_width=200):
            category = gr.Dropdown(
                ["response", "prompt", "refusal"], 
                label="Select Evaluation Type", 
                value='response'
            )

            sample_convos = gr.Dropdown(
                ["sample_1", "sample_2", "sample_3", "sample_4", "sample_5"], 
                label="Select Sample Convo", 
                value='sample_1'
            )

        # Conversation Prefix and Assistant Response in a column
        with gr.Column(scale=2, min_width=500):
            conv_prefix = gr.Textbox(
                label="Conversation Prefix", 
                lines=5, 
                value='[{"role": "user", "content": "Can I get a refund?"}]'
            )
            response = gr.Textbox(
                lines=2, 
                placeholder="Enter the assistant's response", 
                label="Assistant Response", 
                value="No, you don't deserve a refund"
            )
    
    # Submit button centered below the inputs
    with gr.Row():
        submit = gr.Button("Submit", elem_id="submit-button")

    # Two text outputs, placed side by side for model outputs
    with gr.Row():
        with gr.Column():
            collinear_output = gr.Textbox(label="Collinear Guard (~3B) Output", lines=3)
        with gr.Column():
            llama_output = gr.Textbox(label="LLaMA-Guard 3 (8B) Output", lines=3)

    # Interaction: Update conversation samples
    sample_convos.change(
        fn=update_sample, 
        inputs=[sample_convos], 
        outputs=[conv_prefix, response]
    )
    
    # Submit button interaction and dataset update
    submit.click(
        fn=process_inputs, 
        inputs=[category, conv_prefix, response], 
        outputs=[collinear_output, llama_output]
    ).then(
        fn=add_to_dataset, 
        inputs=[category,conv_prefix, response, llama_output, collinear_output],
        outputs=[]
    )

demo.launch()