# llmopscicd / test_llm.py
# dpaul93's picture
# Create test_llm.py
# 2c0321e verified
# raw
# history blame
# 994 Bytes
from transformers import pipeline
from evaluate import load as load_metric
# Shared GPT-2 text-generation pipeline, created once when the module is imported.
generator = pipeline(task="text-generation", model="gpt2")
# Test 1: Check that the generated text does not hallucinate a well-known fact
def test_hallucination():
    """Assert that the model completes the France prompt with "Paris".

    Raises:
        AssertionError: if "Paris" does not appear in the generated text.
    """
    prompt = "The capital of France is"
    # do_sample=False requests greedy decoding so the completion is
    # reproducible between runs rather than depending on whatever sampling
    # settings the model's default generation config enables.
    results = generator(
        prompt,
        max_length=20,
        num_return_sequences=1,
        do_sample=False,
    )
    output = results[0]["generated_text"]
    assert "Paris" in output, "Hallucination detected: Expected 'Paris' in the output."
# Test 2: Evaluate the LLM using BLEU score
def test_bleu_score():
    """Assert that the generated completion scores above 0.5 BLEU.

    The completion of "The capital of France is" is compared against the
    single reference sentence "The capital of France is Paris".

    Raises:
        AssertionError: if the BLEU score is 0.5 or lower.
    """
    bleu = load_metric("bleu")
    # The `evaluate` BLEU metric takes untokenized text: each prediction is a
    # plain string and each prediction gets a list of reference strings.
    # Passing pre-split token lists does not match that input format.
    references = [["The capital of France is Paris"]]
    results = generator(
        "The capital of France is",
        max_length=20,
        num_return_sequences=1,
        do_sample=False,  # greedy decoding keeps the score reproducible
    )
    hypothesis = results[0]["generated_text"]
    # tokenizer=str.split keeps the whitespace tokenization this test has
    # always used instead of switching to the metric's default tokenizer.
    bleu_score = bleu.compute(
        predictions=[hypothesis],
        references=references,
        tokenizer=str.split,
    )
    # NOTE(review): the completion may run well past the six-word reference,
    # which lowers n-gram precision; confirm 0.5 is an attainable threshold.
    assert bleu_score["bleu"] > 0.5, f"Low BLEU score: {bleu_score['bleu']}"
# When executed as a script, run each check in definition order; the first
# failing assertion stops the run.
if __name__ == "__main__":
    for run_check in (test_hallucination, test_bleu_score):
        run_check()