# instruct-evolve-xml-3b/triplets2/triples-kg2summary.py
from pprint import pp
from models import document
import pandas as pd
import dspy
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate import Evaluate
from opus.modules.kg2summary import KG2TextModule
# from utils.dataframe_utils import dataframe_to_documents
from opus.utils.dataframe_utils import dataframe_to_documents
from opus.metric.base import AssessPrediction
turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, max_tokens=1200, top_p=0.89, api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
# turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024, api_base='http://localhost:6000/v1/', api_key='asdf')#, timeout=200)
dspy.settings.configure(lm=turbo)
NUM_THREADS = 10
from pprint import pprint
# Note: this local signature shadows the AssessPrediction imported from opus.metric.base above.
class AssessPrediction(dspy.Signature):
    """Evaluate and compare two summaries to identify which one more accurately and effectively conveys the key points of the original content.
    As an expert reviewer, your role is to critically assess the clarity, coherence, and completeness of each summary. Consider the following aspects in your evaluation:
    - Clarity: Does the summary present the information in a clear and understandable manner?
    - Coherence: Is the summary logically organized and do the ideas flow smoothly?
    - Completeness: Does the summary capture the essential points of the original content without omitting crucial information?
    Based on your analysis, determine which summary does a better job at distilling the essence of the original material, making a sophisticated decision that takes into account not just the factual accuracy but also the readability and overall effectiveness of the summarization.
    """
    summary1 = dspy.InputField()
    summary2 = dspy.InputField()
    assessment_answer = dspy.OutputField(desc="summary1 or summary2")
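
# Illustrative sketch (hypothetical, never called): exercising the AssessPrediction signature
# directly with dspy.ChainOfThought. The summary strings are made up, and running this assumes
# the local LM configured above is reachable.
def _assess_prediction_example():
    judge = dspy.ChainOfThought(AssessPrediction)
    verdict = judge(
        summary1="Paris is the capital of France.",
        summary2="The capital of France is Paris, a city on the Seine with over two million residents.",
    )
    # The signature asks the judge to name one of the two inputs.
    print(verdict.assessment_answer)  # expected: "summary1" or "summary2"
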
# class AssessPrediction(dspy.Signature):
# """Pick the better summary based on the example."""
# summary1 = dspy.InputField()
# summary2 = dspy.InputField()
# assessment_answer = dspy.OutputField(desc="summary1 or summary2")
import random
def factuality_metric(gold, pred, trace=None):
    assess = dspy.ChainOfThought(AssessPrediction)
    # If pred.summary_rewrite is "N/A", fall back to pred.summary
    _summary = pred.summary_rewrite if pred.summary_rewrite != "N/A" else pred.summary
    # Initialize summaries with labels before shuffling
    summaries = [('gold', gold.summary), ('pred', _summary)]
    # Randomize summaries order and prepare for assessment
    random.shuffle(summaries)
    assessment_args = {
        'summary1': summaries[0][1],  # First summary after shuffle
        'summary2': summaries[1][1]   # Second summary after shuffle
    }
    # Keep track of which summary is which
    summary1_label = summaries[0][0]
    summary2_label = summaries[1][0]
    # Assess using the randomized summaries
    _winner = assess(**assessment_args)
    winner_label = _winner.assessment_answer.split()[0].lower()
    # Determine the winner based on original labels
    if winner_label == 'summary1':
        winner_is_gold = summary1_label == 'gold'
    else:
        winner_is_gold = summary2_label == 'gold'
    # Reward the prediction: the metric passes when the judge prefers the predicted summary.
    return not winner_is_gold
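
# Illustrative sketch (hypothetical, never called): a quick smoke test for factuality_metric
# using hand-built dspy.Example objects. The field values are invented; real records from
# kg_datasetK17.parquet appear to carry a 'kg' input and a reference 'summary' (see train()
# below). Running this requires the local LM configured above.
def _factuality_metric_example():
    gold = dspy.Example(summary="The graph describes Marie Curie, who won Nobel Prizes in physics and chemistry.")
    pred = dspy.Example(
        summary="Marie Curie was awarded two Nobel Prizes, in physics and in chemistry.",
        summary_rewrite="N/A",  # "N/A" makes the metric fall back to pred.summary
    )
    print(factuality_metric(gold, pred))  # True when the judge prefers the predicted summary
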
def train():
    df = pd.read_parquet('./data/kg_datasetK17.parquet')
    print(f"Number of records: {len(df)}")
    random_sample = df.sample(n=5)
    print(f"Random sample: {random_sample}")
    # Set up our bootstrap
    teleprompter = BootstrapFewShot(metric=factuality_metric, max_bootstrapped_demos=4, max_labeled_demos=16)
    # Get the documents from the parquet file
    _documents = dataframe_to_documents(df)
    pp(_documents[0].text)
    # exit()
    documents = []
    for doc in _documents:
        # doc.with_inputs('kg')
        doc = dspy.Example(**doc).with_inputs('kg')
        documents.append(doc)
    # random.shuffle(documents)
    # from pprint import pprint
    # pprint(documents)
    # Split documents into train, validation, and test sets
    split1, split2 = len(documents) // 3, 2 * len(documents) // 3
    train, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]
    train = train[:20]
    validation = validation[:10]
    test_set = test_set[:15]
    optimized_KG2Text = teleprompter.compile(KG2TextModule(), trainset=train, valset=validation)
    # print(f"optimized_KG2Text: {optimized_KG2Text}")
    evaluate = Evaluate(devset=test_set, metric=factuality_metric, num_threads=NUM_THREADS, display_progress=True, display_table=0)
    score, results = evaluate(optimized_KG2Text, return_all_scores=True, return_outputs=True)
    # turbo.inspect_history(n=99)
    print(f"Optimized KG2Text Scores: {score}")
    print(f"Optimized KG2Text Results: {results[0]}")
    optimized_KG2Text.save('optimized_KG2Text')


if __name__ == "__main__":
    train()
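
# Illustrative sketch (hypothetical, commented out): reloading the optimized program later via
# dspy's Module.save/Module.load round-trip. The call signature and output field are assumed
# from how the examples and metric above are wired ('kg' input, 'summary' output).
# reloaded = KG2TextModule()
# reloaded.load('optimized_KG2Text')
# print(reloaded(kg="<knowledge-graph triples>").summary)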