import os

import gradio as gr
from llama_cpp import Llama

# Fetch the quantized Qwen GGUF weights from the Hugging Face Hub at startup.
# (On Spaces, pinning huggingface_hub in requirements.txt is the cleaner route,
# but a runtime install works too.)
os.system("pip install -U huggingface_hub")
os.system(
    "huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct-GGUF "
    "qwen2.5-0.5b-instruct-q2_k.gguf --local-dir . --local-dir-use-symlinks False"
)
# Load the Qwen GGUF model
MODEL_PATH = "./qwen2.5-0.5b-instruct-q2_k.gguf"  # ensure the file exists at this path
model = Llama(model_path=MODEL_PATH)
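# Note: llama-cpp-python's default context window is small (n_ctx=512), so long
# chat histories will be truncated; if needed, pass a larger value, e.g.
# Llama(model_path=MODEL_PATH, n_ctx=2048).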
# Define the chat function: build a plain-text prompt from the system message
# and the chat history, then generate a completion with llama-cpp.
def respond(message, history, system_message, max_tokens, temperature, top_p):
    prompt = f"{system_message}\n"
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    # Generate a response, stopping before the model starts writing the next
    # "User:" turn itself.
    response = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["User:"],
    )

    # The completion payload is a dict; the generated text is in choices[0].
    return response["choices"][0]["text"].strip()
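# A sketch of an alternative `respond` using llama-cpp-python's chat-completion
# API, which applies the model's own chat template instead of the hand-rolled
# "User:/Assistant:" prompt above (assumes the same `model` object):
#
#   def respond_chat(message, history, system_message, max_tokens, temperature, top_p):
#       messages = [{"role": "system", "content": system_message}]
#       for user_msg, assistant_msg in history:
#           messages.append({"role": "user", "content": user_msg})
#           messages.append({"role": "assistant", "content": assistant_msg})
#       messages.append({"role": "user", "content": message})
#       out = model.create_chat_completion(
#           messages=messages,
#           max_tokens=max_tokens,
#           temperature=temperature,
#           top_p=top_p,
#       )
#       return out["choices"][0]["message"]["content"].strip()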
# Define the Gradio chat interface; the sliders feed the sampling parameters
# used in respond() above.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful AI assistant.", label="System message"),
        gr.Slider(minimum=10, maximum=1024, value=256, step=10, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
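# On Hugging Face Spaces the app is served automatically on launch; when
# running locally, demo.launch(share=True) also creates a temporary public URL.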