from transformers import pipeline
import gradio as gr
import json
import time
# Initialize the pipeline with the new model
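# (Note: the model weights are downloaded on first launch, and the pipeline runs on CPU
# unless a device or device_map is specified.)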
pipe = pipeline("text-generation", model="Blexus/icn_savant_v2_instruct")
def format_prompt(message, history):
    prompt = ""
    # for entry in history:
    #     if len(entry) == 2:
    #         user_prompt, bot_response = entry
    #         prompt += f"USER: {user_prompt} <|endofuser|>\nASSISTANT: {bot_response}<|endoftext|>\n"
    prompt += f"<|in|> {message} <|out|>"
    return prompt
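
# Example: format_prompt("Hello", []) returns "<|in|> Hello <|out|>"
# (the chat history is currently ignored; the loop above is commented out)
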
def generate(prompt, history, temperature=0.9, max_new_tokens=4096, top_p=0.9, repetition_penalty=1.2):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    formatted_prompt = format_prompt(prompt, history)
    response_text = "We are sorry, but ICN savant doesn't know how to answer."

    # Generate the full response without streaming
    try:
        response = pipe(
            formatted_prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,  # enable sampling so temperature/top_p actually take effect
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )[0]["generated_text"]
        # generated_text contains the prompt followed by the completion;
        # keep only the text after the <|out|> marker added in format_prompt.
        response_text = response.split("<|out|>")[-1].strip()

        # Simulate streaming by yielding the response one character at a time
        accumulated_response = ""
        for char in response_text:
            accumulated_response += char
            yield accumulated_response
            time.sleep(0.02)  # slight delay to simulate typing
    except Exception as e:
        print(f"Error generating response: {e}")
        yield response_text  # surface the fallback message instead of failing silently
customCSS = """
#component-7 {
height: 1600px;
flex-grow: 4;
}
"""
additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=1024,
        minimum=64,
        maximum=4096,
        step=64,
        interactive=True,
        info="The maximum number of new tokens to generate",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]
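
# These sliders appear under "Additional Inputs" in the ChatInterface and are passed,
# in order, as the temperature, max_new_tokens, top_p and repetition_penalty
# arguments of generate().
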
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.ChatInterface(
        generate,
        additional_inputs=additional_inputs,
    )

# gr.Blocks has no set_css method; to enable the custom CSS above, pass it at
# construction time instead: gr.Blocks(css=customCSS, theme=gr.themes.Soft())
demo.queue().launch(debug=True)