File size: 6,066 Bytes
aac6e8d
 
 
 
 
 
 
 
 
 
 
 
55e6f4a
aac6e8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# gradio app for the LLM model --> use the retr environment
# Run the script and open the link in the browser.

import os
import pandas as pd
import datasets
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Model trained from scratch with the LatBERT tokenizer.
# NOTE: this constant used to be assigned twice in a row; the first value (the
# directory-style path 'scratch_2-nodes_tokenizer_latbert-original_packing_fcocchi/')
# was overwritten immediately, so only the identifier below ever took effect.
CHECKPOINT_PATH = 'itserr/scratch_2-nodes_tokenizer_latbert-original_packing_fcocchi'

# Read the access token once; a missing variable still fails fast with KeyError.
_HF_READ_TOKEN = os.environ['HF_TOKEN_READ']

print(f"Loading model from: {CHECKPOINT_PATH}")
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH, token=_HF_READ_TOKEN)
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH, token=_HF_READ_TOKEN)

# Preference log: loaded at start-up (forcing a fresh download), kept in memory
# as a DataFrame, and rebound by handle_preference() on every vote.
# (A module-level `global` statement has no effect, so it is not needed here.)
preference_dataset_name = "itserr/latin_gpt_preferences"
dataset_hf = datasets.load_dataset(
    preference_dataset_name,
    token=_HF_READ_TOKEN,
    download_mode='force_redownload',
)
dataset_hf = dataset_hf['train'].to_pandas()
print(dataset_hf.shape)

# Markdown rendered below the page title. The explicit "\n" escape followed by
# the physical line break leaves an empty line before "Demo instructions:".
description = """
This is a Latin Language Model (LLM) based on GPT-2 and it was trained on a large corpus of Latin texts and can generate text in Latin. \n
Demo instructions:
- Enter a prompt in Latin in the Input Text box.
- Select the temperature value to control the randomness of the generated text (higher value produce a more creative and unstable answer).
- Click the 'Generate Text' button to trigger model generation.
- (Optional) insert a Feedback text in the box.
- Click the 'Like' or 'Dislike' button to judge the generation correctness. 
"""

# Page heading; interpolated into an <h1> element by the layout code.
title = "(L<sup>2</sup>) - Latin Language Model"

# Placeholder text; its only use in this file is a commented-out gr.Markdown call.
article = "hello world ..."

# Sample prompts offered through gr.Examples.
examples = [
    'Accidere ex una scintilla',
    'Audacter calumniare,',
    'Consolatium misero comites',
]

# Path of the logo image shown at the top of the page.
logo_image = '/work/pnrr_itserr/latin_model/demo_gpt/ITSERR_row_logo.png'

# Lazily-built text-generation pipeline, shared by every request.
_TEXT_GENERATOR = None


def _get_text_generator():
    """Return the shared text-generation pipeline, building it on first use.

    Building the pipeline on every request repeats the device selection and
    pipeline construction for no benefit, so the instance is created once
    and cached at module level.
    """
    global _TEXT_GENERATOR
    if _TEXT_GENERATOR is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
            print("No GPU available")
        _TEXT_GENERATOR = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
    return _TEXT_GENERATOR


def generate_text(prompt, slider):
    """Generate text for ``prompt`` with the module-level model.

    Args:
        prompt: Text taken from the "Input Text" box.
        slider: Sampling temperature taken from the "Temperature" slider.

    Returns:
        The ``'generated_text'`` field of the first pipeline result.
        NOTE(review): handle_preference() strips the prompt from this value,
        which suggests the prompt is echoed in it -- confirm against the
        installed transformers version.
    """
    text_generator = _get_text_generator()
    print("***** Generate *****")
    # Sampling is enabled so the temperature has an effect; max_length,
    # repetition_penalty and truncation are forwarded to the pipeline as-is.
    generated_text = text_generator(
        prompt,
        max_length=50,
        do_sample=True,
        temperature=slider,
        repetition_penalty=2.0,
        truncation=True,
    )
    return generated_text[0]['generated_text']

_VALID_PREFERENCES = ("like", "dislike")


def _split_generation(prompt_text, generated):
    """Return the ``(input_text, generated_text)`` pair to store for a vote.

    When the output is just the prompt, both fields are stored empty (the vote
    is then not counted). Otherwise the prompt is removed from the output by
    keeping everything after its FIRST occurrence; if the prompt is empty or
    does not occur in the output, the whole output is kept.
    """
    if prompt_text == generated:
        return "", ""
    if not prompt_text:
        # str.split/partition reject an empty separator; nothing to strip.
        return prompt_text, generated
    _, found, continuation = generated.partition(prompt_text)
    return prompt_text, (continuation if found else generated)


def _latest_counts(frame):
    """Return ``(count_like, count_dislike)`` from the last row, or ``(0, 0)`` if empty."""
    if len(frame) == 0:
        return 0, 0
    last_row = frame.iloc[-1]
    return last_row['count_like'], last_row['count_dislike']


# Function to handle user preferences
def handle_preference(preference, input, output, feedback, temp_value):
    """Record a like/dislike vote and push the updated log to the Hub.

    A row with the columns ``input_text, generated_text, feedback,
    temperature, like, dislike, count_like, count_dislike`` is appended to the
    module-level ``dataset_hf`` DataFrame, which is then uploaded as the
    ``train`` split of ``preference_dataset_name``.

    Args:
        preference: Either ``"like"`` or ``"dislike"``.
        input: Prompt text from the input box.
        output: Text from the output box.
        feedback: Free-text user feedback.
        temp_value: Temperature used for the generation; stored as ``float``.

    Returns:
        A confirmation message for the UI.

    Raises:
        ValueError: If ``preference`` is not ``"like"`` or ``"dislike"``.
        KeyError: If the ``HF_TOKEN_WRITE`` environment variable is missing.
    """
    global dataset_hf
    if preference not in _VALID_PREFERENCES:
        raise ValueError(f"preference must be 'like' or 'dislike', got {preference!r}")

    inp_text, out_text = _split_generation(input, output)

    like = 1 if preference == "like" else 0
    dislike = 1 - like

    # Running totals are carried forward from the last stored row and only
    # advance when there is generated text to judge.
    count_like, count_dislike = _latest_counts(dataset_hf)
    if out_text != "":
        count_like = count_like + like
        count_dislike = count_dislike + dislike

    new_data = pd.DataFrame({'input_text': inp_text, 'generated_text': out_text, 'feedback': feedback,
                             'temperature': float(temp_value), 'like': like, 'dislike': dislike,
                             'count_like': count_like, 'count_dislike': count_dislike}, index=[0])
    dataset_hf = pd.concat([dataset_hf, new_data], ignore_index=True)
    hf_dataset = datasets.Dataset.from_pandas(dataset_hf)
    dataset_dict = datasets.DatasetDict({"train": hf_dataset})
    dataset_dict.push_to_hub(preference_dataset_name, token=os.environ['HF_TOKEN_WRITE'])

    # print dataset statistics
    print(f"Admin log: like: {count_like} and dislike: {count_dislike}")
    return f"You select '{preference}' as answer of the model generation. Thank you for your time!"

# Stylesheet passed to gr.Blocks; its single rule targets the element id "logo".
custom_css = """
#logo {
    display: block;
    margin-left: auto;
    margin-right: auto;
    width: 280px;
    height: 140px;
}
"""

with gr.Blocks(css=custom_css) as demo:
    # Header: logo, centred title, usage instructions.
    gr.Image(logo_image, elem_id="logo")
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    gr.Markdown(description)

    # Prompt (left) and generated text (right), side by side.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                lines=5,
                placeholder="Enter latin text here...",
                label="Input Text",
            )
        with gr.Column():
            output_text = gr.Textbox(
                lines=5,
                placeholder="Output text will appear here...",
                label="Output Text",
            )

    gr.Examples(examples=examples, inputs=input_text)
    temperature_slider = gr.Slider(
        minimum=0.1,
        maximum=5.0,
        step=0.1,
        value=1.0,
        label="Temperature",
    )

    # Generation trigger: prompt + temperature in, generated text out.
    generate_button = gr.Button("Generate Text")
    generate_button.click(
        fn=generate_text,
        inputs=[input_text, temperature_slider],
        outputs=output_text,
    )
    feedback_output = gr.Textbox(
        lines=1,
        placeholder="If you want to provide a feedback, please fill this box ...",
        label="Feedback",
    )

    with gr.Row():
        like_button = gr.Button("Like")
        dislike_button = gr.Button("Dislike")

    button_output = gr.Textbox(
        lines=1,
        placeholder="Please submit your choice",
        label="Latin Language Model Demo",
    )

    # Both vote buttons read the same four components and differ only in the
    # preference label handed to handle_preference.
    preference_inputs = [input_text, output_text, feedback_output, temperature_slider]
    like_button.click(
        fn=lambda x, y, z, v: handle_preference("like", x, y, z, v),
        inputs=preference_inputs,
        outputs=button_output,
    )
    dislike_button.click(
        fn=lambda x, y, z, v: handle_preference("dislike", x, y, z, v),
        inputs=preference_inputs,
        outputs=button_output,
    )
    # Disabled: gr.Markdown(article)

demo.launch(share=True, debug=True)