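"""Compile and evaluate a knowledge-graph-to-text summarization module with DSPy.

The script loads a KG dataset from parquet, wraps each record as a dspy.Example,
bootstraps few-shot demonstrations for KG2TextModule using an LLM-as-judge
metric, evaluates the compiled program on a held-out split, and saves it.
"""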
import random
from pprint import pp

import pandas as pd

import dspy
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate import Evaluate

from opus.modules.kg2summary import KG2TextModule
from opus.utils.dataframe_utils import dataframe_to_documents
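# Language model served from a local OpenAI-compatible endpoint; the API key is
# a dummy value, assuming the local server does not validate it.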
turbo = dspy.OpenAI(
    model='gpt-3.5-turbo',
    temperature=0.1,
    max_tokens=1200,
    top_p=0.89,
    api_base='http://localhost:6000/v1/',
    api_key='asdf',
    timeout=200,
)
dspy.settings.configure(lm=turbo)

NUM_THREADS = 10
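# Signature for the LLM judge used by factuality_metric to compare two summaries.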
class AssessPrediction(dspy.Signature):
    """Evaluate and compare two summaries to identify which one more accurately and effectively conveys the key points of the original content.

    As an expert reviewer, your role is to critically assess the clarity, coherence, and completeness of each summary. Consider the following aspects in your evaluation:

    - Clarity: Does the summary present the information in a clear and understandable manner?
    - Coherence: Is the summary logically organized, and do the ideas flow smoothly?
    - Completeness: Does the summary capture the essential points of the original content without omitting crucial information?

    Based on your analysis, determine which summary does a better job of distilling the essence of the original material, making a sophisticated decision that takes into account not just the factual accuracy but also the readability and overall effectiveness of the summarization.
    """

    summary1 = dspy.InputField()
    summary2 = dspy.InputField()

    assessment_answer = dspy.OutputField(desc="summary1 or summary2")
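# LLM-as-judge metric: the gold and predicted summaries are shuffled into a
# random order, an AssessPrediction judge picks the better one, and the metric
# returns True only when the prediction wins the blind comparison.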
def factuality_metric(gold, pred, trace=None):
    assess = dspy.ChainOfThought(AssessPrediction)

    # Prefer the rewritten summary when the module produced one.
    _summary = pred.summary_rewrite if pred.summary_rewrite != "N/A" else pred.summary

    # Shuffle so the judge cannot rely on the gold summary always being summary1.
    summaries = [('gold', gold.summary), ('pred', _summary)]
    random.shuffle(summaries)

    assessment_args = {
        'summary1': summaries[0][1],
        'summary2': summaries[1][1],
    }

    summary1_label = summaries[0][0]
    summary2_label = summaries[1][0]

    _winner = assess(**assessment_args)
    winner_label = _winner.assessment_answer.split()[0].lower()
    # The judge answers "summary1" or "summary2"; map that back to gold/pred and
    # score the prediction as correct only when it beat the gold summary.
    if winner_label == 'summary1':
        winner_is_pred = summary1_label == 'pred'
    else:
        winner_is_pred = summary2_label == 'pred'

    return winner_is_pred
def train():
    """Load the KG dataset, compile KG2TextModule with bootstrapped demos, evaluate it on a held-out split, and save the optimized program."""
    df = pd.read_parquet('./data/kg_datasetK17.parquet')
    print(f"Number of records: {len(df)}")
    random_sample = df.sample(n=5)
    print(f"Random sample: {random_sample}")
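    # BootstrapFewShot compiles the module by collecting up to 4 bootstrapped
    # demonstrations that pass factuality_metric, plus up to 16 labeled demos.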
    teleprompter = BootstrapFewShot(metric=factuality_metric, max_bootstrapped_demos=4, max_labeled_demos=16)

    _documents = dataframe_to_documents(df)
    pp(_documents[0].text)
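    # Wrap each record as a dspy.Example and mark 'kg' as its input field.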
    documents = []
    for doc in _documents:
        doc = dspy.Example(**doc).with_inputs('kg')
        documents.append(doc)
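    # Split the examples roughly into thirds, then cap each split to keep
    # compilation and evaluation inexpensive.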
    split1, split2 = len(documents) // 3, 2 * len(documents) // 3
    train_set, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]

    train_set = train_set[:20]
    validation = validation[:10]
    test_set = test_set[:15]

    optimized_KG2Text = teleprompter.compile(KG2TextModule(), trainset=train_set, valset=validation)
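    # Evaluate the compiled program on the held-out test split with the same
    # LLM-as-judge metric, running NUM_THREADS examples in parallel.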
    evaluate = Evaluate(devset=test_set, metric=factuality_metric, num_threads=NUM_THREADS, display_progress=True, display_table=0)
    score, results = evaluate(optimized_KG2Text, return_all_scores=True, return_outputs=True)

    print(f"Optimized KG2Text Scores: {score}")
    print(f"Optimized KG2Text Results: {results[0]}")
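    # Persist the compiled program; it can be restored in a fresh process with
    # KG2TextModule().load('optimized_KG2Text').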
    optimized_KG2Text.save('optimized_KG2Text')
if __name__ == "__main__":
    train()