doberst committed on
Commit
22620c4
1 Parent(s): e6ed5f6

Upload dragon_rag_benchmark_tests_llmware.py

dragon_rag_benchmark_tests_llmware.py ADDED
@@ -0,0 +1,91 @@
+
+ """This example demonstrates running a benchmark set of tests against the llmware DRAGON models
+ https://huggingface.co/collections/llmware/dragon-models-65552d7648093c3f6e35d1bf
+ Model loading and interaction are handled with the llmware Prompt class, which provides additional
+ capabilities such as evidence checking.
+ """
+
+ import time
+ from llmware.prompts import Prompt
+ # The datasets package is not installed automatically by llmware
+ try:
+     from datasets import load_dataset
+ except ImportError:
+     raise ImportError("This example requires the 'datasets' Python package. You can install it with 'pip install datasets'")
+
+
+ # Pull the 200-question RAG benchmark test dataset from the llmware Hugging Face repo
+ def load_rag_benchmark_tester_dataset():
+
+     dataset_name = "llmware/rag_instruct_benchmark_tester"
+     print(f"\n > Loading RAG dataset '{dataset_name}'...")
+     dataset = load_dataset(dataset_name)
+
+     test_set = []
+     for samples in dataset["train"]:
+         test_set.append(samples)
+
+     return test_set
+
+ # Run the benchmark test
+ def run_test(model_name, prompt_list):
+
+     print(f"\n > Loading model '{model_name}'")
+     prompter = Prompt().load_model(model_name)
+
+     print(f"\n > Running RAG Benchmark Test against '{model_name}' - 200 questions")
+     for i, entry in enumerate(prompt_list):
+
+         start_time = time.time()
+
+         prompt = entry["query"]
+         context = entry["context"]
+         response = prompter.prompt_main(prompt, context=context, prompt_name="default_with_context", temperature=0.0, sample=False)
+
+         # Print results
+         time_taken = round(time.time() - start_time, 2)
+         print("\n")
+         print(f"{i+1}. llm_response - {response['llm_response']}")
+         print(f"{i+1}. gold_answer - {entry['answer']}")
+         print(f"{i+1}. time_taken - {time_taken}")
+
+         # Fact checking
+         fc = prompter.evidence_check_numbers(response)
+         sc = prompter.evidence_comparison_stats(response)
+         sr = prompter.evidence_check_sources(response)
+         for fc_entry in fc:
+             for f, facts in enumerate(fc_entry["fact_check"]):
+                 print(f"{i+1}. fact_check - {f} {facts}")
+
+         for sc_entry in sc:
+             print(f"{i+1}. comparison_stats - {sc_entry['comparison_stats']}")
+
+         for sr_entry in sr:
+             for s, source in enumerate(sr_entry["source_review"]):
+                 print(f"{i+1}. source - {s} {source}")
+
+     return 0
+
+
+ if __name__ == "__main__":
+
+     # Get the benchmark dataset
+     test_dataset = load_rag_benchmark_tester_dataset()
+
+     # BLING MODELS
+     bling_models = ["llmware/bling-1b-0.1", "llmware/bling-1.4b-0.1", "llmware/bling-falcon-1b-0.1",
+                     "llmware/bling-cerebras-1.3b-0.1", "llmware/bling-sheared-llama-1.3b-0.1",
+                     "llmware/bling-sheared-llama-2.7b-0.1", "llmware/bling-red-pajamas-3b-0.1",
+                     "llmware/bling-stable-lm-3b-4e1t-v0"]
+
+     # DRAGON MODELS
+     dragon_models = ["llmware/dragon-yi-6b-v0", "llmware/dragon-red-pajama-7b-v0", "llmware/dragon-stablelm-7b-v0",
+                      "llmware/dragon-deci-6b-v0", "llmware/dragon-mistral-7b-v0", "llmware/dragon-falcon-7b-v0",
+                      "llmware/dragon-llama-7b-v0"]
+
+     # Pick a model - note: if running on a laptop/CPU, select a bling model rather than a dragon model
+     # model_name = dragon_models[0]
+     model_name = "bling-phi-3-gguf"
+
+     output = run_test(model_name, test_dataset)
+
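
As a quick sanity check before launching the full 200-question run, the same calls can be exercised on a single dataset entry. The following is a minimal sketch, assuming the script is importable as a module named dragon_rag_benchmark_tests_llmware on the Python path; it reuses only functions and parameters that already appear in the file above.

    from llmware.prompts import Prompt
    from dragon_rag_benchmark_tests_llmware import load_rag_benchmark_tester_dataset

    # Load the benchmark set and take the first entry only
    test_set = load_rag_benchmark_tester_dataset()
    entry = test_set[0]

    # Small CPU-friendly model - same default chosen in the script's __main__ block
    prompter = Prompt().load_model("bling-phi-3-gguf")

    response = prompter.prompt_main(entry["query"], context=entry["context"],
                                    prompt_name="default_with_context", temperature=0.0, sample=False)

    print("llm_response:", response["llm_response"])
    print("gold_answer :", entry["answer"])

    # Numerical evidence check on the single response, mirroring the fact-checking step in run_test()
    print("fact_check  :", prompter.evidence_check_numbers(response))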