"""Batch evaluation page.

Runs a prompt template over every sample in the test data file, scores each
model response against the reference summary with ROUGE, and renders the
results as HTML inside a Gradio interface.
"""
import html
import json

from dotenv import load_dotenv
import gradio as gr
import numpy as np

from utils.model import Model
from utils.metric import metric_rouge_score
from pages.summarization_playground import generate_answer

load_dotenv()

# Path is relative to the working directory the app is launched from.
_TEST_DATA_PATH = "test_samples/test_data.json"


def display_results(response_list):
    """Render evaluation results as an HTML string.

    Args:
        response_list: List of dicts, each with the keys ``'dialogue'``,
            ``'summary'``, ``'response'`` and ``'metric_score'`` (itself a
            dict holding ``'rouge_score'``).

    Returns:
        An HTML string with the mean ROUGE score followed by one collapsible
        section per response. When ``response_list`` is empty, a short
        "N/A" header is returned instead of computing a mean over nothing.
    """
    if not response_list:
        # np.mean([]) would yield nan and emit a RuntimeWarning.
        return "<h2>Overall Score: N/A (no responses to evaluate)</h2>"

    overall_score = np.mean(
        [r['metric_score']['rouge_score'] for r in response_list]
    )
    sections = [f"<h2>Overall Score: {overall_score:.2f}</h2>"]

    for i, item in enumerate(response_list, 1):
        # Dataset text and model output are not trusted markup: escape them
        # so stray '<' / '&' characters cannot break or inject into the page.
        dialogue = html.escape(str(item['dialogue']))
        summary = html.escape(str(item['summary']))
        response = html.escape(str(item['response']))
        rouge_score = item['metric_score']['rouge_score']

        sections.append(f"""
<details>
<summary>Response {i} (Rouge Score: {rouge_score:.2f})</summary>
<div>
<h3>Dialogue</h3>
<pre style="white-space: pre-wrap;">{dialogue}</pre>
<h3>Summary</h3>
<pre style="white-space: pre-wrap;">{summary}</pre>
<h3>Response</h3>
<pre style="white-space: pre-wrap;">{response}</pre>
</div>
</details>
""")

    return "".join(sections)


def process(model_selection, prompt, num=10):
    """Generate and score a summary for every sample in the test data file.

    Args:
        model_selection: Model identifier forwarded to ``generate_answer``.
        prompt: Prompt template; a sentence naming the sample's expected
            output format is appended before generation.
        num: Currently unused; kept so existing callers remain valid.

    Returns:
        The HTML produced by ``display_results`` for all scored samples.

    Raises:
        OSError: If the test data file cannot be opened.
        json.JSONDecodeError: If the test data file is not valid JSON.
    """
    # json.load reads from the file; json.loads on the path string would try
    # to parse the path itself as JSON and always fail.
    with open(_TEST_DATA_PATH, encoding="utf-8") as data_file:
        dataset = json.load(data_file)

    response_list = []
    for data in dataset:
        dialogue = data['dialogue']
        output_format = data['format']
        summary = data['summary']

        response = generate_answer(
            dialogue,
            model_selection,
            prompt + f' Output following {output_format} format.',
        )
        rouge_score = metric_rouge_score(response, summary)

        response_list.append(
            {
                'dialogue': dialogue,
                'summary': summary,
                'response': response,
                'metric_score': {
                    'rouge_score': rouge_score
                }
            }
        )

    return display_results(response_list)


def create_batch_evaluation_interface():
    """Build the Gradio Blocks UI for batch evaluation.

    The UI offers a model dropdown and a prompt template textbox; pressing
    the submit button calls ``process`` with both values and shows the
    returned HTML.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("## Here are evaluation setups. It will run through datapoints in test_data.json to generate and evaluate. Show results once finished.")

        model_dropdown = gr.Dropdown(
            choices=Model.__model_list__,
            label="Choose a model",
            value=Model.__model_list__[0],
        )

        template_text = gr.Textbox(
            value="""Summarize the following dialogue""",
            label='Input Prompting Template',
            lines=8,
            placeholder='Input your prompts',
        )
        submit_button = gr.Button("✨ Submit ✨")
        output = gr.HTML(label="Results")

        submit_button.click(
            process,
            inputs=[model_dropdown, template_text],
            outputs=output
        )

    return demo


if __name__ == "__main__":
    demo = create_batch_evaluation_interface()
    demo.launch()