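# LLM Hallucination Detector (Gradio demo)
#
# Descriptive header added for orientation: the script below generates a passage
# with an Azure OpenAI chat deployment, draws three additional high-temperature
# samples for the same prompt, and scores the passage against those samples with
# three SelfCheckGPT variants (NLI, BERTScore, and unigram). Higher scores are
# reported as likely hallucination. Expected environment variables:
# OPENAI_API_KEY, OPENAI_API_VERSION, OPENAI_API_RESOURCEURL, and model_name
# (the Azure deployment name).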
from selfcheckgpt.modeling_selfcheck import SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram
import torch
import spacy
import os
import gradio as gr

# Load the English language model
nlp = spacy.load("en_core_web_sm")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
selfcheck_nli = SelfCheckNLI(device=device)  # set device to 'cuda' if GPU is available
selfcheck_bertscore = SelfCheckBERTScore(rescale_with_baseline=True)
selfcheck_ngram = SelfCheckNgram(n=1)  # n=1 means Unigram, n=2 means Bigram, etc.
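# Note on score direction: for SelfCheckNLI and SelfCheckBERTScore, higher
# sentence scores mean the sentence is less consistent with the sampled passages
# (more likely hallucinated). The thresholds used further below (0.35, 0.6, 6)
# are heuristic cut-offs chosen for this demo, not values prescribed by the library.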
# Azure OpenAI configuration, read from environment variables
openai_key = os.getenv("OPENAI_API_KEY")
api_version = os.getenv("OPENAI_API_VERSION")
api_url = os.getenv("OPENAI_API_RESOURCEURL")
from openai import AzureOpenAI

client = AzureOpenAI(
    api_key=openai_key,
    api_version=api_version,
    azure_endpoint=api_url,
)

# deployment_name corresponds to the custom name chosen when the model was
# deployed on Azure OpenAI; a chat-capable deployment (e.g. gpt-35-turbo) is
# expected here, since the chat completions API is used below.
deployment_name = os.getenv("model_name")
def generate_response(prompt):
    # Deterministic response (temperature=0.0), used as the passage to be checked.
    response = client.chat.completions.create(
        model=deployment_name,  # model = "deployment_name"
        temperature=0.0,
        messages=[
            {"role": "user", "content": prompt}
        ],
    )
    return response.choices[0].message.content
def generate_response_high_temp(prompt):
    # Stochastic response (temperature=1.0), used to draw the comparison samples.
    response = client.chat.completions.create(
        model=deployment_name,
        temperature=1.0,
        messages=[
            {"role": "user", "content": prompt}
        ],
    )
    return response.choices[0].message.content
def create_dataset(prompt):
    # Draw three independent high-temperature samples for the same prompt.
    s1 = generate_response_high_temp(prompt)
    s2 = generate_response_high_temp(prompt)
    s3 = generate_response_high_temp(prompt)
    return s1, s2, s3
def split_sent(sentence):
    # Split a passage into individual sentences with spaCy.
    return [sent.text.strip() for sent in nlp(sentence).sents]
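# Illustrative example (not from the original source):
# split_sent("Paris is the capital of France. It lies on the Seine.")
# would typically return ["Paris is the capital of France.", "It lies on the Seine."].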
def func_selfcheck_nli(sentence, s1, s2, s3):
    # The response textbox holds the stringified single-element list returned by
    # generating_samples, so the slice strips the surrounding "['" and "']".
    sentence1 = [sentence[2:-2]]
    sample_dataset = [s1, s2, s3]
    score = selfcheck_nli.predict(
        sentences=sentence1,              # list of sentences
        sampled_passages=sample_dataset,  # list of sampled passages
    )
    if score[0] > 0.35:
        return f"The LLM is hallucinating with selfcheck nli score of {score}"
    else:
        return f"The LLM is generating true information with selfcheck nli score of {score}"
def func_selfcheckbert(sentence, s1, s2, s3):
    sentence1 = [sentence[2:-2]]  # strip the "['" / "']" wrapper, as in func_selfcheck_nli
    sample_dataset = [s1, s2, s3]
    sent_scores_bertscore = selfcheck_bertscore.predict(
        sentences=sentence1,              # list of sentences
        sampled_passages=sample_dataset,  # list of sampled passages
    )
    if sent_scores_bertscore[0] > 0.6:
        return f"The LLM is hallucinating with selfcheck BERT score of {sent_scores_bertscore}"
    else:
        return f"The LLM is generating true information with selfcheck BERT score of {sent_scores_bertscore}"
def func_selfcheckngram(sentence, s1, s2, s3):
    sentence1 = [sentence[2:-2]]  # strip the "['" / "']" wrapper, as above
    sample_dataset = [s1, s2, s3]
    sentences_split = split_sent(sentence1[0])
    sent_scores_ngram = selfcheck_ngram.predict(
        sentences=sentences_split,
        passage=sentence1[0],
        sampled_passages=sample_dataset,
    )
    # Higher average max negative log-probability means the passage is poorly
    # supported by the unigram model fitted on the sampled passages.
    avg_max_neg_logprob = sent_scores_ngram['doc_level']['avg_max_neg_logprob']
    if avg_max_neg_logprob > 6:
        return f"The LLM is hallucinating with selfcheck ngram score of {avg_max_neg_logprob}"
    else:
        return f"The LLM is generating true information with selfcheck ngram score of {avg_max_neg_logprob}"
def generating_samples(prompt):
    # Build a Wikipedia-style prompt, generate the main (deterministic) response,
    # then three high-temperature samples for the self-consistency checks.
    prompt_template = f"This is a Wikipedia passage on the topic of '{prompt}' in 100 words"
    sample_response = generate_response(prompt_template)
    s1, s2, s3 = create_dataset(prompt_template)
    sentence = [sample_response]
    return sentence, s1, s2, s3
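# Gradio UI: a prompt box, a response box, three sample boxes, and an output box.
# "Generate response" fills the response and samples; each SelfCheck button then
# scores the response against the three samples and writes the verdict to "output".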
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <h1> LLM Hackathon : LLM Hallucination Detector </h1>
        """)
    with gr.Column():
        prompt = gr.Textbox(label="prompt")
    with gr.Column():
        sentence = gr.Textbox(label="response")
    with gr.Row():
        s1 = gr.Textbox(label="sample1")
        s2 = gr.Textbox(label="sample2")
        s3 = gr.Textbox(label="sample3")
    with gr.Column():
        score = gr.Textbox(label="output")
        output_response = gr.Button("Generate response")
        output_response.click(
            fn=generating_samples,
            inputs=prompt,
            outputs=[sentence, s1, s2, s3]
        )
    with gr.Row(equal_height=True):
        self_check_nli_button = gr.Button("self check nli")
        self_check_nli_button.click(
            fn=func_selfcheck_nli,
            inputs=[sentence, s1, s2, s3],
            outputs=score
        )
        selfcheckbert_button = gr.Button("self check Bert")
        selfcheckbert_button.click(
            fn=func_selfcheckbert,
            inputs=[sentence, s1, s2, s3],
            outputs=score
        )
        self_check_ngram_button = gr.Button("self check ngram")
        self_check_ngram_button.click(
            fn=func_selfcheckngram,
            inputs=[sentence, s1, s2, s3],
            outputs=score
        )

demo.launch()