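"""Gradio chat demo for Blexus/Quble_test_model_v1_INSTRUCT_v2.

Wraps a transformers text-generation pipeline in a gr.ChatInterface and
simulates streaming by yielding the response one character at a time.
"""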
from transformers import pipeline
import gradio as gr
import time

# Initialize the text-generation pipeline with the Quble instruct model
pipe = pipeline("text-generation", model="Blexus/Quble_test_model_v1_INSTRUCT_v2")

def format_prompt(message, system, history):
    prompt = f"SYSTEM: {system} <|endofsystem|>"
    for entry in history:
        if len(entry) == 2:
            user_prompt, bot_response = entry
            prompt += f"USER: {user_prompt} <|endofuser|>\nASSISTANT: {bot_response}<|endoftext|>\n"
    prompt += f"USER: {message}<|endofuser|>\nASSISTANT:"
    return prompt

def generate(prompt, history, system, temperature=0.9, max_new_tokens=4096, top_p=0.9, repetition_penalty=1.2):
    # gr.ChatInterface calls this as fn(message, history, *additional_inputs),
    # so history must come directly after the message.
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    formatted_prompt = format_prompt(prompt, system, history)
    response_text = "We are sorry but Quble doesn't know how to answer."

    # Generate the full response in one call (no streaming from the model)
    try:
        response = pipe(
            formatted_prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,  # sampling must be enabled for temperature/top_p to take effect
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )[0]["generated_text"]
        response_text = response.split("ASSISTANT:")[-1].strip()

        # Simulate streaming by yielding the accumulated response character by character
        accumulated_response = ""
        for char in response_text:
            accumulated_response += char
            yield accumulated_response
            time.sleep(0.02)  # slight delay to simulate typing
    except Exception as e:
        print(f"Error generating response: {e}")
        yield response_text  # fall back to the default apology message
customCSS = """
#component-7 {
height: 1600px;
flex-grow: 4;
}
"""
additional_inputs = [
    gr.Textbox(
        label="System prompt",
        value="You are a helpful intelligent assistant. Your name is Quble.",
        info="System prompt",
        interactive=True,
    ),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=1024,
        minimum=64,
        maximum=4096,
        step=64,
        interactive=True,
        info="The maximum number of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]

# gr.Blocks has no set_css method; custom CSS is passed to the constructor instead.
with gr.Blocks(theme=gr.themes.Soft(), css=customCSS) as demo:
    gr.ChatInterface(
        generate,
        additional_inputs=additional_inputs,
    )

demo.queue().launch(debug=True)