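"""Compile and evaluate a knowledge-graph-to-text summarization module with DSPy.

The script loads a KG dataset from parquet, wraps each record as a dspy.Example,
bootstraps few-shot demonstrations for KG2TextModule using an LLM-as-judge
metric, evaluates the compiled program on a held-out split, and saves it.
"""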
import random
from pprint import pp

import pandas as pd

import dspy
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate import Evaluate

from opus.modules.kg2summary import KG2TextModule
from opus.utils.dataframe_utils import dataframe_to_documents
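# Language model served from a local OpenAI-compatible endpoint; the API key is
# a dummy value, assuming the local server does not validate it.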
turbo = dspy.OpenAI(
    model='gpt-3.5-turbo',
    temperature=0.1,
    max_tokens=1200,
    top_p=0.89,
    api_base='http://localhost:6000/v1/',
    api_key='asdf',
    timeout=200,
)
dspy.settings.configure(lm=turbo)

NUM_THREADS = 10
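# Signature for the LLM judge used by factuality_metric to compare two summaries.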
class AssessPrediction(dspy.Signature):
    """Evaluate and compare two summaries to identify which one more accurately and effectively conveys the key points of the original content.

    As an expert reviewer, your role is to critically assess the clarity, coherence, and completeness of each summary. Consider the following aspects in your evaluation:

    - Clarity: Does the summary present the information in a clear and understandable manner?
    - Coherence: Is the summary logically organized, and do the ideas flow smoothly?
    - Completeness: Does the summary capture the essential points of the original content without omitting crucial information?

    Based on your analysis, determine which summary does a better job of distilling the essence of the original material, making a sophisticated decision that takes into account not just the factual accuracy but also the readability and overall effectiveness of the summarization.
    """

    summary1 = dspy.InputField()
    summary2 = dspy.InputField()

    assessment_answer = dspy.OutputField(desc="summary1 or summary2")
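# LLM-as-judge metric: the gold and predicted summaries are shuffled into a
# random order, an AssessPrediction judge picks the better one, and the metric
# returns True only when the prediction wins the blind comparison.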
def factuality_metric(gold, pred, trace=None):
    assess = dspy.ChainOfThought(AssessPrediction)

    # Prefer the rewritten summary when the module produced one.
    _summary = pred.summary_rewrite if pred.summary_rewrite != "N/A" else pred.summary

    # Shuffle so the judge cannot rely on the gold summary always being summary1.
    summaries = [('gold', gold.summary), ('pred', _summary)]
    random.shuffle(summaries)

    assessment_args = {
        'summary1': summaries[0][1],
        'summary2': summaries[1][1],
    }

    summary1_label = summaries[0][0]
    summary2_label = summaries[1][0]

    _winner = assess(**assessment_args)
    winner_label = _winner.assessment_answer.split()[0].lower()
    # The judge answers "summary1" or "summary2"; map that back to gold/pred and
    # score the prediction as correct only when it beat the gold summary.
    if winner_label == 'summary1':
        winner_is_pred = summary1_label == 'pred'
    else:
        winner_is_pred = summary2_label == 'pred'

    return winner_is_pred
def train():
    """Load the KG dataset, compile KG2TextModule with bootstrapped demos, evaluate it on a held-out split, and save the optimized program."""
    df = pd.read_parquet('./data/kg_datasetK17.parquet')
    print(f"Number of records: {len(df)}")
    random_sample = df.sample(n=5)
    print(f"Random sample: {random_sample}")
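    # BootstrapFewShot compiles the module by collecting up to 4 bootstrapped
    # demonstrations that pass factuality_metric, plus up to 16 labeled demos.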
    teleprompter = BootstrapFewShot(metric=factuality_metric, max_bootstrapped_demos=4, max_labeled_demos=16)

    _documents = dataframe_to_documents(df)
    pp(_documents[0].text)
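    # Wrap each record as a dspy.Example and mark 'kg' as its input field.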
    documents = []
    for doc in _documents:
        doc = dspy.Example(**doc).with_inputs('kg')
        documents.append(doc)
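    # Split the examples roughly into thirds, then cap each split to keep
    # compilation and evaluation inexpensive.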
    split1, split2 = len(documents) // 3, 2 * len(documents) // 3
    train_set, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]

    train_set = train_set[:20]
    validation = validation[:10]
    test_set = test_set[:15]

    optimized_KG2Text = teleprompter.compile(KG2TextModule(), trainset=train_set, valset=validation)
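    # Evaluate the compiled program on the held-out test split with the same
    # LLM-as-judge metric, running NUM_THREADS examples in parallel.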
    evaluate = Evaluate(devset=test_set, metric=factuality_metric, num_threads=NUM_THREADS, display_progress=True, display_table=0)
    score, results = evaluate(optimized_KG2Text, return_all_scores=True, return_outputs=True)

    print(f"Optimized KG2Text Scores: {score}")
    print(f"Optimized KG2Text Results: {results[0]}")
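    # Persist the compiled program; it can be restored in a fresh process with
    # KG2TextModule().load('optimized_KG2Text').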
    optimized_KG2Text.save('optimized_KG2Text')
if __name__ == "__main__":
    train()