from pprint import pp, pprint
import random

import pandas as pd
import dspy
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate import Evaluate

from models import document
from opus.modules.kg2summary import KG2TextModule
# from utils.dataframe_utils import dataframe_to_documents
from opus.utils.dataframe_utils import dataframe_to_documents
# NOTE: the AssessPrediction class defined below shadows this import.
from opus.metric.base import AssessPrediction

turbo = dspy.OpenAI(
    model='gpt-3.5-turbo',
    temperature=0.1,
    max_tokens=1200,
    top_p=0.89,
    api_base='http://localhost:6000/v1/',
    api_key='asdf',
    timeout=200,
)
# turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024,
#                     api_base='http://localhost:6000/v1/', api_key='asdf')  # , timeout=200
dspy.settings.configure(lm=turbo)

NUM_THREADS = 10


class AssessPrediction(dspy.Signature):
    """Evaluate and compare two summaries to identify which one more accurately and
    effectively conveys the key points of the original content.

    As an expert reviewer, your role is to critically assess the clarity, coherence,
    and completeness of each summary. Consider the following aspects:

    - Clarity: Does the summary present the information in a clear and understandable manner?
    - Coherence: Is the summary logically organized, and do the ideas flow smoothly?
    - Completeness: Does the summary capture the essential points of the original content
      without omitting crucial information?

    Based on your analysis, determine which summary does a better job of distilling the
    essence of the original material, taking into account not just factual accuracy but
    also readability and the overall effectiveness of the summarization.
    """
    summary1 = dspy.InputField()
    summary2 = dspy.InputField()
    assessment_answer = dspy.OutputField(desc="summary1 or summary2")


# class AssessPrediction(dspy.Signature):
#     """Pick the better summary based on the example."""
#     summary1 = dspy.InputField()
#     summary2 = dspy.InputField()
#     assessment_answer = dspy.OutputField(desc="summary1 or summary2")


def factuality_metric(gold, pred, trace=None):
    # LLM judge compares the gold summary against the prediction (presentation order
    # randomized); returns True when the judge picks the gold summary.
    assess = dspy.ChainOfThought(AssessPrediction)

    # If pred.summary_rewrite is "N/A", fall back to pred.summary.
    _summary = pred.summary_rewrite if pred.summary_rewrite != "N/A" else pred.summary

    # Label the summaries before shuffling so the winner can be mapped back later.
    summaries = [('gold', gold.summary), ('pred', _summary)]

    # Randomize the presentation order to avoid position bias in the judge.
    random.shuffle(summaries)
    assessment_args = {
        'summary1': summaries[0][1],  # first summary after the shuffle
        'summary2': summaries[1][1],  # second summary after the shuffle
    }

    # Keep track of which summary is which.
    summary1_label = summaries[0][0]
    summary2_label = summaries[1][0]

    # Assess using the randomized summaries.
    _winner = assess(**assessment_args)
    winner_label = _winner.assessment_answer.split()[0].lower()

    # Map the judge's answer back to the original labels.
    if winner_label == 'summary1':
        winner_is_gold = summary1_label == 'gold'
    else:
        winner_is_gold = summary2_label == 'gold'

    return winner_is_gold


def train():
    df = pd.read_parquet('./data/kg_datasetK17.parquet')
    print(f"Number of records: {len(df)}")
    random_sample = df.sample(n=5)
    print(f"Random sample: {random_sample}")

    # Set up the bootstrap few-shot teleprompter.
    teleprompter = BootstrapFewShot(
        metric=factuality_metric,
        max_bootstrapped_demos=4,
        max_labeled_demos=16,
    )

    # Get the documents from the parquet file.
    _documents = dataframe_to_documents(df)
    pp(_documents[0].text)
    # exit()

    # Wrap each document as a dspy.Example with 'kg' as the input field.
    documents = []
    for doc in _documents:
        # doc.with_inputs('kg')
        doc = dspy.Example(**doc).with_inputs('kg')
        documents.append(doc)

    # random.shuffle(documents)
    # pprint(documents)

    # Split documents into train, validation, and test sets.
    split1, split2 = len(documents) // 3, 2 * len(documents) // 3
    train_set, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]
    train_set = train_set[:20]
    validation = validation[:10]
    test_set = test_set[:15]

    optimized_KG2Text = teleprompter.compile(KG2TextModule(), trainset=train_set, valset=validation)
    # print(f"optimized_KG2Text: {optimized_KG2Text}")

    evaluate = Evaluate(
        devset=test_set,
        metric=factuality_metric,
        num_threads=NUM_THREADS,
        display_progress=True,
        display_table=0,
    )
    score, results = evaluate(optimized_KG2Text, return_all_scores=True, return_outputs=True)
    # turbo.inspect_history(n=99)

    print(f"Optimized KG2Text Scores: {score}")
    print(f"Optimized KG2Text Results: {results[0]}")

    optimized_KG2Text.save('optimized_KG2Text')


if __name__ == "__main__":
    train()
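
# Sketch (not executed): reloading the saved program later for inference. This assumes
# the standard dspy Module.save/Module.load round-trip, that KG2TextModule is called with
# the same 'kg' input field used above, and that its prediction exposes a 'summary' field
# as factuality_metric expects; 'kg_text' is a hypothetical placeholder.
#
#   reloaded = KG2TextModule()
#   reloaded.load('optimized_KG2Text')
#   prediction = reloaded(kg=kg_text)
#   print(prediction.summary)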