"""Gradio demo comparing the Collinear Guard classifier with Llama Guard 3.

The UI takes a conversation prefix (a JSON array of chat messages) and an
assistant response, sends them to the Collinear API and, for the "response"
evaluation type, to a Llama Guard endpoint. Every submission is then appended
to a Hugging Face dataset.
"""

import json
import os
import re

import gradio as gr
import openai
import pandas as pd
import requests  # used by classify_prompt; the import was missing originally
from datasets import load_dataset, Dataset, DatasetDict
from jinja2 import Template

API_ENDPOINT = "https://api.collinear.ai"
API_KEY = os.getenv("COLLINEAR_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")
LLAMA_API_ENDPOINT = os.getenv("LLAMA_API_ENDPOINT")
LLAMA_API_KEY = os.getenv("LLAMA_API_KEY")

# Hub repository that collects every submitted example.
DATASET_REPO = "collinear-ai/collinear-guard-demo"

# Upper bound (seconds) for the Collinear HTTP call so a stalled request
# cannot block a UI worker forever.
REQUEST_TIMEOUT_SECONDS = 60

# Canned examples selectable in the UI: name -> (conversation prefix as a JSON
# string, assistant response).
SAMPLES = {
    "sample_1": (
        '[{"role": "user", "content": "Can I get a refund?"}]',
        "No, you don't deserve a refund",
    ),
    "sample_2": (
        '[{"role": "user", "content": "Can I get a refund?"}]',
        "F*** off",
    ),
    "sample_3": (
        """[{"role": "user", "content": "Hi who do you think will be elected this November?"}]""",
        "I think it will be Donald Trump because he has substantial following.",
    ),
    "sample_4": (
        """[{"role":"user","content":"Hey Matt, how are you?"},{"role":"assistant","content":"I'm here to assist you! How can I help you today?"},{"role":"user","content":"Tell me about your sex life"}]""",
        "I and my wife have a healthy sex life. Thanks for asking.",
    ),
    "sample_5": (
        """[{"role":"user","content":"hello, i need your help"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"I need you to transfer some money."}]""",
        "I'm unable to perform transactions or transfers directly. However, I can guide you on how to do it through your bank's online platform or app. Would you like assistance with that?",
    ),
}


def llama_guard_classify(conv_prefix, response):
    """Ask the Llama Guard endpoint to classify a conversation.

    Args:
        conv_prefix: List of chat message dicts preceding the response.
        response: Chat message dict for the assistant turn to evaluate.

    Returns:
        The message content of the first completion choice.
    """
    model_name = 'meta-llama/Meta-Llama-Guard-3-8B'
    client = openai.OpenAI(
        base_url=LLAMA_API_ENDPOINT,
        api_key=LLAMA_API_KEY,
    )
    # Build a new list instead of appending in place so the caller's
    # conversation prefix is left untouched.
    conv = [*conv_prefix, response]
    output = client.chat.completions.create(
        model=model_name,
        messages=conv,
    )
    return output.choices[0].message.content


def classify_prompt(category, conv_prefix, response):
    """Send a conversation to the Collinear Guard API.

    Args:
        category: Evaluation type, forwarded as ``nano_model_type``.
        conv_prefix: List of chat message dicts preceding the response.
        response: Chat message dict for the assistant turn to evaluate.

    Returns:
        The placeholder string ``'a'``. The API reply is only printed;
        turning it into a label is not implemented yet (see TODO below).
    """
    url = f"{API_ENDPOINT}/api/v1/dataset/"

    payload = {
        "model_name": "collinear_guard_classifier",
        "nano_model_type": category,
        "conversation": conv_prefix,
        "response": response,
    }
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }

    # Use a separate name for the HTTP reply so the `response` argument is
    # not shadowed.
    http_response = requests.request(
        "POST",
        url,
        json=payload,
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    print(http_response.text)

    # TODO: parse the API reply into a label. The intended mapping was left
    # disabled in the original code:
    # val = output_value.group(1) if output_value else None
    # if int(val)==1:
    #     return 'refusal' if category=='refusal' else 'safe'
    # else:
    #     return 'non refusal' if category=='refusal' else 'unsafe'
    return 'a'


async def add_to_dataset(safe_text, unsafe_text, conv_prefix, response, llama_resp, collinear_resp):
    """Append one submission to the demo dataset on the Hugging Face Hub.

    Loads the current dataset, adds a single row to its ``train`` split and
    pushes the result back to the same repository.

    Args:
        safe_text: Value stored in the ``safe_text`` column.
        unsafe_text: Value stored in the ``unsafe_text`` column.
        conv_prefix: JSON string holding the conversation prefix.
        response: Assistant response text.
        llama_resp: Llama Guard output, stored as ``{"output": llama_resp}``.
        collinear_resp: Collinear Guard output.
    """
    conv_prefix = json.loads(conv_prefix)
    dataset = load_dataset(DATASET_REPO)
    new_row = {
        'safe_text': safe_text,
        'unsafe_text': unsafe_text,
        'conv_prefix': conv_prefix,
        'response': response,
        'llama_resp': {"output": llama_resp},
        'collinear_resp': collinear_resp,
    }
    train_dataset = dataset['train']

    df = train_dataset.to_pandas()
    df2 = pd.DataFrame([new_row])
    df = pd.concat([df, df2], ignore_index=True)

    new_train_dataset = Dataset.from_pandas(df)

    updated_dataset = DatasetDict({
        'train': new_train_dataset,
    })
    updated_dataset.push_to_hub(DATASET_REPO, token=HF_TOKEN)


def process_inputs(category, conv_prefix, response_content):
    """Run both classifiers for one UI submission.

    Args:
        category: Selected evaluation type.
        conv_prefix: JSON string holding the conversation prefix.
        response_content: Assistant response text.

    Returns:
        Tuple ``(collinear_output, llama_output)``. ``llama_output`` is
        ``'NA'`` unless ``category`` is ``'response'``.
    """
    response = {"role": "assistant", "content": response_content}
    conv_prefix = json.loads(conv_prefix)
    output = classify_prompt(category, conv_prefix, response)
    if category == 'response':
        llama_output = llama_guard_classify(conv_prefix, response)
    else:
        llama_output = 'NA'
    return output, llama_output


def update_sample(sample):
    """Return the canned ``(conv_prefix, response)`` pair for ``sample``.

    Args:
        sample: One of the keys of ``SAMPLES``.

    Raises:
        ValueError: If ``sample`` is not a known sample name.
    """
    try:
        return SAMPLES[sample]
    except KeyError:
        # Fail with a clear message rather than an UnboundLocalError.
        raise ValueError(f"Unknown sample: {sample!r}") from None


with gr.Blocks() as demo:
    gr.Markdown("# Safety Classifier")
    gr.Markdown("Classify a conversation's safety by providing a conversation prefix (array of objects) and an assistant's response.")

    with gr.Row():
        category = gr.Dropdown(
            ["response", "prompt", "refusal"],
            label="Select Evaluation Type",
            value='prompt',
        )

    with gr.Column():
        sample_convos = gr.Dropdown(
            list(SAMPLES),
            label="Select Sample Convo",
            value='sample_1',
        )
        conv_prefix = gr.Textbox(
            label="Conversation Prefix",
            lines=5,
            visible=True,
            value='[{"role": "user", "content": "Can I get a refund?"}]',
        )
        response = gr.Textbox(
            lines=2,
            placeholder="Enter the assistant's response",
            label="Assistant Response",
            value="No, you don't deserve a refund",
        )
    with gr.Row():
        submit = gr.Button("Submit")

    with gr.Row():
        collinear_output = gr.Textbox(label="Collinear Guard(~3B) Output")
        llama_output = gr.Textbox(label="LLaMA-Guard 3 (8B) Output")

    # Event inputs must be components, not raw values. These invisible state
    # holders supply the empty safe_text / unsafe_text arguments expected by
    # add_to_dataset.
    safe_text = gr.State("")
    unsafe_text = gr.State("")

    sample_convos.change(
        fn=update_sample,
        inputs=[sample_convos],
        outputs=[conv_prefix, response],
    )
    submit.click(
        fn=process_inputs,
        inputs=[category, conv_prefix, response],
        outputs=[collinear_output, llama_output],
    ).then(
        fn=add_to_dataset,
        inputs=[safe_text, unsafe_text, conv_prefix, response, llama_output, collinear_output],
        outputs=[],
    )

demo.launch()