from transformers import pipeline
import gradio as gr
import json
import time

# Initialize the text-generation pipeline with the Quble instruct model
pipe = pipeline("text-generation", model="Blexus/Quble_test_model_v1_INSTRUCT_v2")

def format_prompt(message, system, history):
    prompt = f"SYSTEM: {system} <|endofsystem|>"
    
    for entry in history:
        if len(entry) == 2:
            user_prompt, bot_response = entry
            prompt += f"USER: {user_prompt} <|endofuser|>\nASSISTANT: {bot_response}<|endoftext|>\n"
    
    prompt += f"USER: {message}<|endofuser|>\nASSISTANT:"
    return prompt
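
# Illustrative sketch of the string format_prompt builds for a one-turn history
# (hypothetical values, shown only to document the expected prompt layout):
#   format_prompt("How are you?", "You are Quble.", [["Hello", "Hi there!"]])
#   -> "SYSTEM: You are Quble. <|endofsystem|>USER: Hello <|endofuser|>\n"
#      "ASSISTANT: Hi there!<|endoftext|>\nUSER: How are you?<|endofuser|>\nASSISTANT:"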

def generate(prompt, history, system, temperature=0.9, max_new_tokens=4096, top_p=0.9, repetition_penalty=1.2):
    # gr.ChatInterface calls fn(message, history, *additional_inputs),
    # so history comes before the system prompt and the slider values.
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)
    max_new_tokens = int(max_new_tokens)

    formatted_prompt = format_prompt(prompt, system, history)
    response_text = "We are sorry but Quble doesn't know how to answer."

    # Generate the full response in one call, then simulate streaming below
    try:
        response = pipe(
            formatted_prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,  # temperature/top_p only take effect when sampling is enabled
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )[0]["generated_text"]
        response_text = response.split("ASSISTANT:")[-1].strip()

        # Simulate streaming by yielding the response one character at a time
        accumulated_response = ""
        for char in response_text:
            accumulated_response += char
            yield accumulated_response
            time.sleep(0.02)  # slight delay to mimic typing
    except Exception as e:
        print(f"Error generating response: {e}")
        yield response_text  # surface the fallback message in the chat

customCSS = """
#component-7 {
  height: 1600px;
  flex-grow: 4;
}
"""

additional_inputs = [
    gr.Textbox(
        label="System prompt",
        value="You are a helpful intelligent assistant. Your name is Quble.",
        info="System prompt",
        interactive=True,
    ),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=1024,
        minimum=64,
        maximum=4096,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )
]

# Blocks takes the custom CSS directly; there is no set_css method
with gr.Blocks(css=customCSS, theme=gr.themes.Soft()) as demo:
    gr.ChatInterface(
        generate,
        additional_inputs=additional_inputs,
    )

demo.queue().launch(debug=True)