File size: 2,961 Bytes
143b62d
 
 
 
4fb58cc
87bb867
143b62d
 
 
 
d3d48e1
143b62d
 
 
87bb867
4fb58cc
 
 
87bb867
 
 
 
 
 
 
 
 
 
 
 
 
e302f12
87bb867
 
 
e302f12
87bb867
 
 
e302f12
87bb867
 
 
 
 
 
 
143b62d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87bb867
 
143b62d
 
aa733b6
87bb867
aa733b6
143b62d
af1a6de
143b62d
f253a0d
143b62d
f253a0d
143b62d
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Batch-evaluation page: samples dialogues from the dataset, generates
# summaries with the selected model, and reports ROUGE scores in a Gradio UI.
from dotenv import load_dotenv
import gradio as gr
import random

import numpy as np

from utils.model import Model
from utils.data import dataset
from utils.metric import metric_rouge_score

# Reuse the model instance and generation helper from the playground page.
from pages.summarization_playground import model, generate_answer

# Load API keys / config (e.g. model credentials) from a local .env file.
load_dotenv()

def display_results(response_list):
    """Render a list of evaluation results as an HTML report.

    Args:
        response_list: list of dicts, each with keys 'dialogue', 'summary',
            'response', and 'metric_score' (a dict containing 'rouge_score').

    Returns:
        An HTML string with the mean ROUGE score as a header followed by one
        collapsible <details> section per result.
    """
    # Guard against an empty list: np.mean([]) would emit a RuntimeWarning
    # and render "nan" in the header.
    if not response_list:
        return "<h2>Overall Score: N/A</h2>"

    overall_score = np.mean([r['metric_score']['rouge_score'] for r in response_list])

    # Collect fragments and join once instead of repeated string concatenation.
    parts = [f"<h2>Overall Score: {overall_score:.2f}</h2>"]

    for i, item in enumerate(response_list, 1):
        dialogue = item['dialogue']
        summary = item['summary']
        response = item['response']
        rouge_score = item['metric_score']['rouge_score']

        parts.append(f"""
        <details>
        <summary>Response {i} (Rouge Score: {rouge_score:.2f})</summary>
        <div style="display: flex; justify-content: space-between;">
            <div style="width: 30%;">
                <h3>Dialogue</h3>
                {dialogue}
            </div>
            <div style="width: 30%;">
                <h3>Summary</h3>
                {summary}
            </div>
            <div style="width: 30%;">
                <h3>Response</h3>
                {response}
            </div>
        </div>
        </details>
        """)

    return "".join(parts)

def process(seed, model_selection, prompt, num=10):
    """Sample `num` dialogues, generate summaries, score them, and return HTML.

    Args:
        seed: random seed controlling which data points are sampled.
        model_selection: model identifier passed through to generate_answer.
        prompt: prompting template used for generation.
        num: number of data points to sample (with replacement). Default 10.

    Returns:
        The HTML report produced by display_results.
    """
    random.seed(seed)

    # Note: random.choices samples WITH replacement, so duplicates are possible.
    sampled = random.choices(dataset, k=num)

    results = []
    for sample in sampled:
        reference = sample['summary']
        generated = generate_answer(sample['dialogue'], model, model_selection, prompt)
        results.append({
            'dialogue': sample['dialogue'],
            'summary': reference,
            'response': generated,
            'metric_score': {'rouge_score': metric_rouge_score(generated, reference)},
        })

    return display_results(results)


def create_batch_evaluation_interface():
    """Build the Gradio Blocks UI for batch evaluation.

    The interface lets the user pick a random seed, a model, and a prompting
    template; on submit it runs `process` over 10 sampled data points and
    shows the resulting HTML report.

    Returns:
        The constructed gr.Blocks demo (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("## Here are evaluation setups. It will randomly sample 10 data points to generate and evaluate. Show results once finished.")

        with gr.Row():
            # Typo fix: "favoriate" -> "favorite" in the user-facing hint.
            seed = gr.Number(value=8, info="pick your favorite random seed", precision=0)
            model_dropdown = gr.Dropdown(choices=Model.__model_list__, label="Choose a model", value=Model.__model_list__[0])
        template_text = gr.Textbox(value="""Summarize the following dialogue""", label='Input Prompting Template', lines=8, placeholder='Input your prompts')
        submit_button = gr.Button("✨ Submit ✨")
        output = gr.HTML(label="Results")

        # `process` receives (seed, model_selection, prompt); num keeps its default of 10.
        submit_button.click(
            process,
            inputs=[seed, model_dropdown, template_text],
            outputs=output
        )

    return demo

if __name__ == "__main__":
    # Allow running this page standalone for local debugging.
    app = create_batch_evaluation_interface()
    app.launch()