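# Vespa Companion: CPU-only Gradio chat app serving a GPTQ-quantized
# Wizard-Vicuna-7B model loaded with auto_gptq.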
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import torch

# Ensure Torch uses CPU
torch.set_default_device("cpu")

# Model details
model_name_or_path = "TheBloke/Wizard-Vicuna-7B-Uncensored-SuperHOT-8K-GPTQ"
model_basename = "model"  # Match uploaded file basename

# Load the tokenizer and the GPTQ-quantized model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    trust_remote_code=True,
    device_map="auto",          # "auto" falls back to CPU when no GPU is available
    use_safetensors=True,
    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
    quantize_config=None,
)

# Core personality prompt
core_personality = """You are Vespa Companion, a witty and knowledgeable AI designed to engage in thoughtful and helpful conversations about life, technology, and Vespa scooters."""

# Generate a single model response for one user message
def generate_response(input_text):
    try:
        # Prepare the input prompt
        prompt_template = f"{core_personality}\n\nUSER: {input_text}\nASSISTANT:"
        input_ids = tokenizer(prompt_template, return_tensors="pt").input_ids  # Tensors stay on CPU

        # Define generation configuration
        generation_config = {
            "max_new_tokens": 1024,   # Allow longer responses if memory permits
            "do_sample": True,        # Sampling must be enabled for temperature/top_p to take effect
            "temperature": 0.7,
            "top_p": 0.95,
            "repetition_penalty": 1.2,
        }

        # Generate the response
        output_ids = model.generate(input_ids=input_ids, **generation_config)

        # Decode and clean the output
        raw_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Extract only the assistant's response
        if "ASSISTANT:" in raw_output:
            response = raw_output.split("ASSISTANT:")[-1].strip()
        else:
            response = raw_output.strip()
        return response
    except Exception as e:
        return f"Response Generation Error: {e}"


# Chat function for Gradio: keeps the whole conversation in `history`
def chat_with_memory(history, user_input):
    # Add user input to history
    history.append({"role": "user", "content": user_input})

    # Generate the assistant's reply and store it
    response = generate_response(user_input)
    history.append({"role": "assistant", "content": response})

    # Format history as (user, assistant) pairs for the Chatbot component
    display_history = [
        (history[i]["content"], history[i + 1]["content"])
        for i in range(0, len(history) - 1, 2)
    ]
    return display_history, history


# Gradio app setup
with gr.Blocks() as demo:
    gr.Markdown("## Vespa Companion - Intelligent Chatbot")
    chatbot = gr.Chatbot(label="Chat with Vespa Companion")
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", label="Your Message")
        clear = gr.Button("Clear Conversation")
    history = gr.State([])  # Initialize conversation history

    msg.submit(chat_with_memory, [history, msg], [chatbot, history])
    clear.click(lambda: ([], []), None, [chatbot, history])

# Launch the app
demo.launch()