import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the quantized Qwen GGUF model from the Hugging Face Hub
MODEL_PATH = hf_hub_download(
    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
    filename="qwen2.5-0.5b-instruct-q2_k.gguf",
    local_dir=".",
)

# Load the model (n_ctx raises the context window above llama-cpp's small default)
model = Llama(model_path=MODEL_PATH, n_ctx=2048)

# Define the chat function: build a plain-text prompt from the system
# message and chat history, then complete it with llama-cpp
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Prepare the full prompt
    prompt = f"{system_message}\n"
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    # Generate the response; stop at "User:" so the model does not
    # keep writing both sides of the conversation
    response = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["User:"],
    )

    # Extract the generated text
    return response["choices"][0]["text"].strip()

# Define Gradio chat interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful AI assistant.", label="System message"),
        gr.Slider(minimum=10, maximum=1024, value=256, step=10, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
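
# Alternative (a minimal sketch, not wired into the demo above): llama-cpp-python
# also exposes create_chat_completion(), which applies the chat template embedded
# in the GGUF file (ChatML for Qwen) instead of the hand-rolled "User:/Assistant:"
# prompt. The respond_chatml name and the message mapping below are illustrative;
# pass respond_chatml to gr.ChatInterface instead of respond to try it.
def respond_chatml(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    result = model.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return result["choices"][0]["message"]["content"].strip()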

# Launch Gradio app
if __name__ == "__main__":
    demo.launch()