from transformers import pipeline
import gradio as gr
import time

# Initialize the text-generation pipeline with the ICN Savant v2 instruct model
pipe = pipeline("text-generation", model="Blexus/icn_savant_v2_instruct")

def format_prompt(message, history):
    prompt = ""

    # Conversation history is currently ignored; the commented block below is
    # one possible multi-turn format, kept for reference.
    # for entry in history:
    #     if len(entry) == 2:
    #         user_prompt, bot_response = entry
    #         prompt += f"USER: {user_prompt} <|endofuser|>\nASSISTANT: {bot_response}<|endoftext|>\n"

    prompt += f"<|in|> {message} <|out|>"
    return prompt

def generate(prompt, history, temperature=0.9, max_new_tokens=4096, top_p=0.9, repetition_penalty=1.2):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    formatted_prompt = format_prompt(prompt, history)
    response_text = "We are sorry but ICN savant doesn't know how to answer."

    # Generate the full response in one call, then simulate streaming below.
    try:
        response = pipe(
            formatted_prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )[0]["generated_text"]
        # The pipeline returns the prompt plus the completion; keep only the
        # text after the last <|end|> marker.
        response_text = response.split("<|end|>")[-1].strip()

        # Simulate streaming by yielding the accumulated response one
        # character at a time.
        accumulated_response = ""
        for char in response_text:
            accumulated_response += char
            yield accumulated_response
            time.sleep(0.02)  # slight delay to imitate typing
    except Exception as e:
        print(f"Error generating response: {e}")
        yield response_text  # fall back to the default apology message

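# Custom CSS that stretches the chat area; passed to gr.Blocks below.
# Note: "#component-7" is an auto-generated Gradio element id, so it may need
# updating if the layout changes.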
customCSS = """
#component-7 {
  height: 1600px;
  flex-grow: 4;
}
"""

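# Sliders exposed in the ChatInterface "Additional Inputs" panel; Gradio passes
# their values to generate() as extra positional arguments, in this order.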
additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=1024,
        minimum=64,
        maximum=4096,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )
]

with gr.Blocks(theme=gr.themes.Soft(), css=customCSS) as demo:
    gr.ChatInterface(
        generate,
        additional_inputs=additional_inputs,
    )

demo.queue().launch(debug=True)