from pprint import pp, pprint
import random

import pandas as pd
import dspy
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate import Evaluate

from models import document
from opus.modules.kg2summary import KG2TextModule
# from utils.dataframe_utils import dataframe_to_documents
from opus.utils.dataframe_utils import dataframe_to_documents
# NOTE: the AssessPrediction class defined below shadows this import.
from opus.metric.base import AssessPrediction

turbo = dspy.OpenAI(
    model='gpt-3.5-turbo',
    temperature=0.1,
    max_tokens=1200,
    top_p=0.89,
    api_base='http://localhost:6000/v1/',
    api_key='asdf',
    timeout=200,
)
# turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024,
#                     api_base='http://localhost:6000/v1/', api_key='asdf')  # , timeout=200
dspy.settings.configure(lm=turbo)

NUM_THREADS = 10


class AssessPrediction(dspy.Signature):
    """Evaluate and compare two summaries to identify which one more accurately and
    effectively conveys the key points of the original content.

    As an expert reviewer, your role is to critically assess the clarity, coherence,
    and completeness of each summary. Consider the following aspects:

    - Clarity: Does the summary present the information in a clear and understandable manner?
    - Coherence: Is the summary logically organized, and do the ideas flow smoothly?
    - Completeness: Does the summary capture the essential points of the original content
      without omitting crucial information?

    Based on your analysis, determine which summary does a better job of distilling the
    essence of the original material, taking into account not just factual accuracy but
    also readability and the overall effectiveness of the summarization.
    """
    summary1 = dspy.InputField()
    summary2 = dspy.InputField()
    assessment_answer = dspy.OutputField(desc="summary1 or summary2")


# class AssessPrediction(dspy.Signature):
#     """Pick the better summary based on the example."""
#     summary1 = dspy.InputField()
#     summary2 = dspy.InputField()
#     assessment_answer = dspy.OutputField(desc="summary1 or summary2")


def factuality_metric(gold, pred, trace=None):
    # LLM judge compares the gold summary against the prediction (presentation order
    # randomized); returns True when the judge picks the gold summary.
    assess = dspy.ChainOfThought(AssessPrediction)

    # If pred.summary_rewrite is "N/A", fall back to pred.summary.
    _summary = pred.summary_rewrite if pred.summary_rewrite != "N/A" else pred.summary

    # Label the summaries before shuffling so the winner can be mapped back later.
    summaries = [('gold', gold.summary), ('pred', _summary)]

    # Randomize the presentation order to avoid position bias in the judge.
    random.shuffle(summaries)
    assessment_args = {
        'summary1': summaries[0][1],  # first summary after the shuffle
        'summary2': summaries[1][1],  # second summary after the shuffle
    }

    # Keep track of which summary is which.
    summary1_label = summaries[0][0]
    summary2_label = summaries[1][0]

    # Assess using the randomized summaries.
    _winner = assess(**assessment_args)
    winner_label = _winner.assessment_answer.split()[0].lower()

    # Map the judge's answer back to the original labels.
    if winner_label == 'summary1':
        winner_is_gold = summary1_label == 'gold'
    else:
        winner_is_gold = summary2_label == 'gold'

    return winner_is_gold


def train():
    df = pd.read_parquet('./data/kg_datasetK17.parquet')
    print(f"Number of records: {len(df)}")
    random_sample = df.sample(n=5)
    print(f"Random sample: {random_sample}")

    # Set up the bootstrap few-shot teleprompter.
    teleprompter = BootstrapFewShot(
        metric=factuality_metric,
        max_bootstrapped_demos=4,
        max_labeled_demos=16,
    )

    # Get the documents from the parquet file.
    _documents = dataframe_to_documents(df)
    pp(_documents[0].text)
    # exit()

    # Wrap each document as a dspy.Example with 'kg' as the input field.
    documents = []
    for doc in _documents:
        # doc.with_inputs('kg')
        doc = dspy.Example(**doc).with_inputs('kg')
        documents.append(doc)

    # random.shuffle(documents)
    # pprint(documents)

    # Split documents into train, validation, and test sets.
    split1, split2 = len(documents) // 3, 2 * len(documents) // 3
    train_set, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]
    train_set = train_set[:20]
    validation = validation[:10]
    test_set = test_set[:15]

    optimized_KG2Text = teleprompter.compile(KG2TextModule(), trainset=train_set, valset=validation)
    # print(f"optimized_KG2Text: {optimized_KG2Text}")

    evaluate = Evaluate(
        devset=test_set,
        metric=factuality_metric,
        num_threads=NUM_THREADS,
        display_progress=True,
        display_table=0,
    )
    score, results = evaluate(optimized_KG2Text, return_all_scores=True, return_outputs=True)
    # turbo.inspect_history(n=99)

    print(f"Optimized KG2Text Scores: {score}")
    print(f"Optimized KG2Text Results: {results[0]}")

    optimized_KG2Text.save('optimized_KG2Text')


if __name__ == "__main__":
    train()
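
# Sketch (not executed): reloading the saved program later for inference. This assumes
# the standard dspy Module.save/Module.load round-trip, that KG2TextModule is called with
# the same 'kg' input field used above, and that its prediction exposes a 'summary' field
# as factuality_metric expects; 'kg_text' is a hypothetical placeholder.
#
#   reloaded = KG2TextModule()
#   reloaded.load('optimized_KG2Text')
#   prediction = reloaded(kg=kg_text)
#   print(prediction.summary)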