import os

import cohere
import gradio as gr
import torch
from dotenv import load_dotenv
from peft import PeftModel
from pinecone import Pinecone
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load API keys and tokens from a local .env file.
load_dotenv()

# Maximum number of tokens fed to the model when building a prompt.
max_seq_length = 2048

# Credentials pulled from the environment.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
hf_token = os.getenv("HF_TOKEN")

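# Optional: to reduce GPU memory, the base model could instead be loaded in
# 4-bit. A minimal sketch, assuming the bitsandbytes package is installed and
# using transformers' BitsAndBytesConfig (not enabled in this script):
#
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.float16,
#   )
#   base_model = AutoModelForCausalLM.from_pretrained(
#       base_model_name, token=hf_token, quantization_config=bnb_config
#   )
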
class ModelInterface:
    def __init__(self):
        # Fine-tuned LoRA adapter and the base model it was trained on.
        model_name = "Aradhya15/Mistral7b_hypertuned"
        base_model_name = "mistralai/Mistral-7B-v0.1"

        # Load the base model and tokenizer from the Hugging Face Hub.
        base_model = AutoModelForCausalLM.from_pretrained(base_model_name, token=hf_token)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Attach the fine-tuned PEFT adapter to the base model.
        self.model = PeftModel.from_pretrained(base_model, model_name)

        # Pick the device; half precision is only used on the GPU.
        if torch.cuda.is_available():
            print("GPU is available and ready to use:", torch.cuda.get_device_name(0))
            self.device = torch.device("cuda")
            self.model = self.model.half()
        else:
            print("GPU not detected; using CPU.")
            self.device = torch.device("cpu")

        self.model = self.model.to(self.device)
        print(f"Model moved to {self.device}")

        # Cohere generates the query embeddings; Pinecone serves the vector search.
        self.cohere_client = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))
        pc = Pinecone(api_key=PINECONE_API_KEY)
        self.index = pc.Index("cohere-pinecone-tree")

    def generate_response(self, query):
        try:
            # Embed the query with Cohere so it can be matched against the index.
            response = self.cohere_client.embed(
                texts=[query],
                model="embed-english-light-v2.0"
            )
            query_embedding = response.embeddings[0]

            # Retrieve the most relevant passages from Pinecone.
            results = self.index.query(
                vector=query_embedding,
                top_k=5,
                include_metadata=True
            )
            retrieved_context = "\n".join(
                match["metadata"]["text"] for match in results["matches"]
            )

            # Build the prompt from the retrieved context and the user's question.
            inputs = self.tokenizer(
                f"Context: {retrieved_context}\nUser: {query}",
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=max_seq_length
            ).to(self.device)

            # Sampling must be enabled for temperature/top_p to take effect.
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=64,
                do_sample=True,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.2
            )

            # Decode only the newly generated tokens, not the echoed prompt.
            generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            response_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
            return response_text

        except Exception as e:
            return f"Error generating response: {str(e)}"

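# The Pinecone index "cohere-pinecone-tree" is assumed to already exist and to
# be populated with Cohere embeddings. If it still needs to be created, a rough
# sketch with the serverless API might look like this (the dimension is an
# assumption and must match the embedding model's output size):
#
#   from pinecone import ServerlessSpec
#   pc = Pinecone(api_key=PINECONE_API_KEY)
#   if "cohere-pinecone-tree" not in pc.list_indexes().names():
#       pc.create_index(
#           name="cohere-pinecone-tree",
#           dimension=1024,  # assumed size of embed-english-light-v2.0 vectors
#           metric="cosine",
#           spec=ServerlessSpec(cloud="aws", region="us-east-1"),
#       )
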
def create_interface():
    interface = ModelInterface()

    # Gradio calls this for every submitted question.
    def predict(message):
        return interface.generate_response(message)

    iface = gr.Interface(
        fn=predict,
        inputs=gr.Textbox(label="Enter your question"),
        outputs=gr.Textbox(label="Response"),
        title="RAG-Enhanced LLM Assistant",
        description="Ask a question and get a response enhanced with retrieved context.",
        examples=[
            ["What are the best practices for tree planting?"],
            ["How can I improve soil quality in my garden?"]
        ]
    )

    return iface

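# launch() serves the app locally by default; it also accepts optional
# arguments such as share=True (temporary public link) or
# server_name="0.0.0.0" (listen on all interfaces) if the app needs to be
# reachable from other machines.
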
if __name__ == "__main__":
    iface = create_interface()
    iface.launch()