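# Vespa Companion: CPU-only Gradio chat app serving a GPTQ-quantized
# Wizard-Vicuna-7B model loaded with auto_gptq.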
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import torch

# Ensure Torch uses CPU
torch.set_default_device("cpu")

# Model details
model_name_or_path = "TheBloke/Wizard-Vicuna-7B-Uncensored-SuperHOT-8K-GPTQ"
model_basename = "model"  # Match uploaded file basename

# Load the tokenizer and the GPTQ-quantized model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    trust_remote_code=True,
    device_map="auto",          # "auto" falls back to CPU when no GPU is available
    use_safetensors=True,
    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
    quantize_config=None,
)

# Core personality prompt
core_personality = """You are Vespa Companion, a witty and knowledgeable AI designed to engage in thoughtful and helpful conversations about life, technology, and Vespa scooters."""

# Generate a single model response for one user message
def generate_response(input_text):
    try:
        # Prepare the input prompt
        prompt_template = f"{core_personality}\n\nUSER: {input_text}\nASSISTANT:"
        input_ids = tokenizer(prompt_template, return_tensors="pt").input_ids  # Tensors stay on CPU

        # Define generation configuration
        generation_config = {
            "max_new_tokens": 1024,   # Allow longer responses if memory permits
            "do_sample": True,        # Sampling must be enabled for temperature/top_p to take effect
            "temperature": 0.7,
            "top_p": 0.95,
            "repetition_penalty": 1.2,
        }

        # Generate the response
        output_ids = model.generate(input_ids=input_ids, **generation_config)

        # Decode and clean the output
        raw_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Extract only the assistant's response
        if "ASSISTANT:" in raw_output:
            response = raw_output.split("ASSISTANT:")[-1].strip()
        else:
            response = raw_output.strip()
        return response
    except Exception as e:
        return f"Response Generation Error: {e}"


# Chat function for Gradio: keeps the whole conversation in `history`
def chat_with_memory(history, user_input):
    # Add user input to history
    history.append({"role": "user", "content": user_input})

    # Generate the assistant's reply and store it
    response = generate_response(user_input)
    history.append({"role": "assistant", "content": response})

    # Format history as (user, assistant) pairs for the Chatbot component
    display_history = [
        (history[i]["content"], history[i + 1]["content"])
        for i in range(0, len(history) - 1, 2)
    ]
    return display_history, history


# Gradio app setup
with gr.Blocks() as demo:
    gr.Markdown("## Vespa Companion - Intelligent Chatbot")
    chatbot = gr.Chatbot(label="Chat with Vespa Companion")
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", label="Your Message")
        clear = gr.Button("Clear Conversation")
    history = gr.State([])  # Initialize conversation history

    msg.submit(chat_with_memory, [history, msg], [chatbot, history])
    clear.click(lambda: ([], []), None, [chatbot, history])

# Launch the app
demo.launch()