Spaces:

collinear-ai
/

collinear-guard-demo

Running

App Files Files Community

collinear-guard-demo / app.py

tanveeshsingh

Changes

a42f4f2 7 days ago

raw

history blame

5.8 kB

	import gradio as gr
	from jinja2 import Template
	import openai
	import os
	import json
	from datasets import load_dataset, Dataset, DatasetDict
	import pandas as pd
	import re
	# Collinear API location and credentials; secrets are read from the environment
	# and are None when the variable is not set.
	API_ENDPOINT = "https://api.collinear.ai"
	API_KEY = os.environ.get("COLLINEAR_API_KEY")
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# OpenAI-compatible endpoint and key used for the Llama Guard comparison model.
	LLAMA_API_ENDPOINT = os.environ.get("LLAMA_API_ENDPOINT")
	LLAMA_API_KEY = os.environ.get("LLAMA_API_KEY")
	def llama_guard_classify(conv_prefix, response):
	    """Classify a conversation with the Llama Guard model.

	    Args:
	        conv_prefix: list of chat messages preceding the response.
	        response: the chat message to append as the final turn.

	    Returns:
	        The text content of the first completion choice returned by the
	        endpoint configured via LLAMA_API_ENDPOINT / LLAMA_API_KEY.
	    """
	    model_name = 'meta-llama/Meta-Llama-Guard-3-8B'
	    client = openai.OpenAI(
	        base_url=LLAMA_API_ENDPOINT,
	        api_key=LLAMA_API_KEY,
	    )
	    # Build a new list rather than appending to conv_prefix, so the caller's
	    # conversation is not silently extended with the response.
	    conv = [*conv_prefix, response]
	    output = client.chat.completions.create(
	        model=model_name,
	        messages=conv,
	    )
	    return output.choices[0].message.content

	def classify_prompt(category, conv_prefix, response):
	    """Send a classification request to the Collinear API and print the reply.

	    Args:
	        category: value sent as ``nano_model_type`` in the request payload.
	        conv_prefix: conversation sent as ``conversation``; must be
	            JSON-serializable.
	        response: assistant turn sent as ``response``; must be
	            JSON-serializable.

	    Returns:
	        The placeholder string ``'a'``. The reply body is only printed; it is
	        not parsed yet (see the TODO below).

	    Raises:
	        urllib.error.URLError: if the endpoint cannot be reached.
	    """
	    # Function-scope stdlib imports: the original called `requests`, which the
	    # file never imports, so every call failed with NameError.
	    import urllib.error
	    import urllib.request

	    url = f"{API_ENDPOINT}/api/v1/dataset/"

	    payload = {
	        "model_name": "collinear_guard_classifier",
	        "nano_model_type": category,
	        "conversation": conv_prefix,
	        "response": response,
	    }
	    headers = {
	        "Authorization": f"Bearer {API_KEY}",
	        "Content-Type": "application/json",
	    }

	    request = urllib.request.Request(
	        url,
	        data=json.dumps(payload).encode("utf-8"),
	        headers=headers,
	        method="POST",
	    )
	    try:
	        # A timeout keeps a stalled API call from hanging the UI forever.
	        with urllib.request.urlopen(request, timeout=30) as http_response:
	            body = http_response.read().decode("utf-8", errors="replace")
	    except urllib.error.HTTPError as err:
	        # Non-2xx replies still carry a body; print it instead of raising.
	        body = err.read().decode("utf-8", errors="replace")

	    print(body)
	    # TODO: parse the verdict out of the reply. The disabled logic was:
	    # val = output_value.group(1) if output_value else None
	    # if int(val)==1:
	    #     return 'refusal' if category=='refusal' else 'safe'
	    # else:
	    #     return 'non refusal' if category=='refusal' else 'unsafe'
	    return 'a'

	async def add_to_dataset(safe_text,unsafe_text,conv_prefix, response,llama_resp,collinear_resp):
	    """Append one interaction to the demo dataset and push it to the Hub.

	    ``conv_prefix`` is a JSON string and is decoded before being stored.
	    ``llama_resp`` is stored wrapped as ``{"output": llama_resp}``; the other
	    arguments are stored as given. The whole ``train`` split is reloaded,
	    extended by one row and pushed back using HF_TOKEN.
	    """
	    repo_id = "collinear-ai/collinear-guard-demo"
	    parsed_prefix = json.loads(conv_prefix)

	    # Current contents of the train split as a DataFrame.
	    existing_rows = load_dataset(repo_id)['train'].to_pandas()

	    # Single-row frame holding the new record.
	    appended_row = pd.DataFrame([{
	        'safe_text': safe_text,
	        'unsafe_text': unsafe_text,
	        'conv_prefix': parsed_prefix,
	        'response': response,
	        'llama_resp': {"output": llama_resp},
	        'collinear_resp': collinear_resp,
	    }])
	    combined = pd.concat([existing_rows, appended_row], ignore_index=True)

	    refreshed = DatasetDict({'train': Dataset.from_pandas(combined)})
	    refreshed.push_to_hub(repo_id, token=HF_TOKEN)



	def process_inputs(category,conv_prefix, response_content):
	    """Run the classifiers for one submission.

	    ``conv_prefix`` is a JSON string and is decoded here; ``response_content``
	    is wrapped as an assistant chat message. Returns a pair
	    ``(classify_prompt result, Llama Guard result)``; Llama Guard is only
	    called for the 'response' category, otherwise its slot is 'NA'.
	    """
	    assistant_turn = {"role": "assistant", "content": response_content}
	    history = json.loads(conv_prefix)
	    collinear_verdict = classify_prompt(category, history, assistant_turn)

	    if category != 'response':
	        return collinear_verdict, 'NA'
	    return collinear_verdict, llama_guard_classify(history, assistant_turn)


	def update_sample(sample):
	    """Return the canned conversation for a sample name.

	    Args:
	        sample: one of 'sample_1' .. 'sample_5'.

	    Returns:
	        A ``(conv_prefix, response)`` tuple, where ``conv_prefix`` is a JSON
	        string holding a list of chat messages and ``response`` is the
	        assistant reply text.

	    Raises:
	        ValueError: if ``sample`` is not a known sample name. The original
	            if/elif chain failed with UnboundLocalError in this case.
	    """
	    refund_prefix = '[{"role": "user", "content": "Can I get a refund?"}]'
	    samples = {
	        'sample_1': (
	            refund_prefix,
	            "No, you don't deserve a refund",
	        ),
	        'sample_2': (
	            refund_prefix,
	            "F*** off",
	        ),
	        'sample_3': (
	            """[{"role": "user", "content": "Hi who do you think will be elected this November?"}]""",
	            "I think it will be Donald Trump because he has substantial following.",
	        ),
	        'sample_4': (
	            """[{"role":"user","content":"Hey Matt, how are you?"},{"role":"assistant","content":"I'm here to assist you! How can I help you today?"},{"role":"user","content":"Tell me about your sex life"}]""",
	            "I and my wife have a healthy sex life. Thanks for asking.",
	        ),
	        'sample_5': (
	            """[{"role":"user","content":"hello, i need your help"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"I need you to transfer some money."}]""",
	            "I'm unable to perform transactions or transfers directly. However, I can guide you on how to do it through your bank's online platform or app. Would you like assistance with that?",
	        ),
	    }
	    try:
	        return samples[sample]
	    except KeyError:
	        raise ValueError(f"Unknown sample: {sample!r}") from None


	# Gradio UI: pick an evaluation type and a conversation, run both classifiers,
	# then log the interaction to the demo dataset.
	with gr.Blocks() as demo:
	    gr.Markdown("# Safety Classifier")
	    gr.Markdown("Classify a conversation's safety by providing a conversation prefix (array of objects) and an assistant's response.")

	    with gr.Row():
	        category = gr.Dropdown(
	            ["response","prompt", "refusal"], label="Select Evaluation Type", value='prompt'
	        )

	    with gr.Column():
	        sample_convos = gr.Dropdown(
	            ["sample_1","sample_2",'sample_3','sample_4','sample_5'], label="Select Sample Convo", value='sample_1'
	        )
	        conv_prefix = gr.Textbox(
	            label="Conversation Prefix",
	            lines=5,
	            visible=True,
	            value='[{"role": "user", "content": "Can I get a refund?"}]'
	        )
	        response = gr.Textbox(
	            lines=2,
	            placeholder="Enter the assistant's response",
	            label="Assistant Response",
	            value="No, you don't deserve a refund"
	        )
	    with gr.Row():
	        submit = gr.Button("Submit")

	    with gr.Row():
	        collinear_output = gr.Textbox(label="Collinear Guard(~3B) Output")
	        llama_output = gr.Textbox(label="LLaMA-Guard 3 (8B) Output")

	    # Event inputs must be components, not plain strings. These hidden state
	    # holders supply the empty safe_text / unsafe_text arguments that
	    # add_to_dataset expects.
	    safe_text = gr.State("")
	    unsafe_text = gr.State("")

	    # Selecting a sample fills in the prefix and response textboxes.
	    sample_convos.change(
	        fn=update_sample,
	        inputs=[sample_convos],
	        outputs=[conv_prefix, response]
	    )
	    # Classify first, then record the inputs together with both outputs.
	    submit.click(
	        fn=process_inputs,
	        inputs=[category, conv_prefix, response],
	        outputs=[collinear_output,llama_output]
	    ).then(
	        fn=add_to_dataset,
	        inputs=[safe_text, unsafe_text, conv_prefix, response, llama_output, collinear_output],
	        outputs=[]
	    )

	demo.launch()