# Mistral7b / app.py
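# Gradio app for a retrieval-augmented (RAG) assistant: the user's question is
# embedded with Cohere, related passages are retrieved from a Pinecone index, and
# the retrieved context plus the question are passed to a fine-tuned Mistral-7B
# model (base model + PEFT adapter) to generate the answer.
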
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from dotenv import load_dotenv
import torch
import cohere
from pinecone import Pinecone
import os
# Load the environment variables
load_dotenv()
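# The script expects these variables (e.g. from the .env file loaded above or
# the Space's secrets):
#   PINECONE_API_KEY  - Pinecone API key for the retrieval index
#   HF_TOKEN          - Hugging Face access token used when downloading the model
#   COHERE_API_KEY    - Cohere API key used to embed queries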
# Model configuration
max_seq_length = 2048
dtype = None         # currently unused in this script
load_in_4bit = True  # currently unused; the model is loaded in full precision and cast to half on GPU
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
hf_token = os.getenv("HF_TOKEN")
class ModelInterface:
    def __init__(self):
        # Model names and paths
        model_name = "Aradhya15/Mistral7b_hypertuned"  # Replace with your Hugging Face model path
        base_model_name = "mistralai/Mistral-7B-v0.1"
        base_model = AutoModelForCausalLM.from_pretrained(base_model_name, token=hf_token)

        # Load the tokenizer and reuse the EOS token for padding
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load the PEFT adapter on top of the base model
        self.model = PeftModel.from_pretrained(base_model, model_name)
        # Check for GPU availability and pick the device
        if torch.cuda.is_available():
            print("GPU is available and ready to use:", torch.cuda.get_device_name(0))
            self.device = torch.device("cuda")
            # Half precision only pays off on GPU
            self.model = self.model.half()
        else:
            print("GPU not detected; using CPU.")
            self.device = torch.device("cpu")

        # Move the model to the chosen device (GPU or CPU)
        self.model = self.model.to(self.device)
        print(f"Model moved to {self.device}")
        # Initialize Cohere
        self.cohere_client = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))

        # Initialize Pinecone; the index must already exist and contain vectors
        # produced with the same Cohere embedding model used in generate_response
        pc = Pinecone(api_key=PINECONE_API_KEY)
        self.index = pc.Index("cohere-pinecone-tree")
    def generate_response(self, query):
        try:
            # Generate the query embedding with Cohere
            response = self.cohere_client.embed(
                texts=[query],
                model="embed-english-light-v2.0"
            )
            query_embedding = response.embeddings[0]
            # Retrieve the most relevant documents from Pinecone
            results = self.index.query(
                vector=query_embedding,
                top_k=5,
                include_metadata=True
            )
            retrieved_context = "\n".join(
                match["metadata"]["text"] for match in results["matches"]
            )
            # Build and tokenize the prompt (retrieved context + user question)
            inputs = self.tokenizer(
                f"Context: {retrieved_context}\nUser: {query}",
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=max_seq_length
            ).to(self.device)
            # Generate a response from the model
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=64,
                do_sample=True,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.2,
                pad_token_id=self.tokenizer.eos_token_id
            )
            response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return response_text
        except Exception as e:
            return f"Error generating response: {str(e)}"
# Create the Gradio interface
def create_interface():
    interface = ModelInterface()

    def predict(message):
        return interface.generate_response(message)

    iface = gr.Interface(
        fn=predict,
        inputs=gr.Textbox(label="Enter your question"),
        outputs=gr.Textbox(label="Response"),
        title="RAG-Enhanced LLM Assistant",
        description="Ask a question and get a response enhanced with retrieved context.",
        examples=[
            ["What are the best practices for tree planting?"],
            ["How can I improve soil quality in my garden?"]
        ],
    )
    return iface
# Launch the interface
if __name__ == "__main__":
    iface = create_interface()
    iface.launch()
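# When running locally, iface.launch(share=True) would also create a temporary
# public URL; on Hugging Face Spaces the plain launch() above is enough.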
# Debug helpers: uncomment to verify the local PyTorch / CUDA setup.
# import torch
# print(torch.__version__)
# print(torch.cuda.is_available())
# print(torch.version.cuda)  # should show "12.6" or a compatible CUDA version