import os

import gradio as gr
import torch
import cohere
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from dotenv import load_dotenv
from pinecone import Pinecone

# Load the environment variables
load_dotenv()

# Model configuration
max_seq_length = 2048

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
hf_token = os.getenv("HF_TOKEN")


class ModelInterface:
    def __init__(self):
        # Model names and paths
        model_name = "Aradhya15/Mistral7b_hypertuned"  # Replace with your Hugging Face model path
        base_model_name = "mistralai/Mistral-7B-v0.1"

        # Load the base model and tokenizer
        base_model = AutoModelForCausalLM.from_pretrained(base_model_name, token=hf_token)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load the PEFT adapter on top of the base model
        self.model = PeftModel.from_pretrained(base_model, model_name)

        # Check for GPU availability and pick the device
        if torch.cuda.is_available():
            print("GPU is available and ready to use:", torch.cuda.get_device_name(0))
            self.device = torch.device("cuda")  # Use the first available GPU
        else:
            print("GPU not detected; using CPU.")
            self.device = torch.device("cpu")  # Fall back to CPU

        # Convert the model to half-precision (optional) and move it to the chosen device
        self.model = self.model.half()
        self.model = self.model.to(self.device)
        print(f"Model is moved to {self.device}")

        # Initialize Cohere
        self.cohere_client = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))

        # Initialize Pinecone with your API key
        pc = Pinecone(api_key=PINECONE_API_KEY)
        self.index = pc.Index("cohere-pinecone-tree")

    def generate_response(self, query):
        try:
            # Generate the query embedding with Cohere
            response = self.cohere_client.embed(
                texts=[query],
                model="embed-english-light-v2.0"
            )
            query_embedding = response.embeddings[0]

            # Retrieve the most relevant documents from Pinecone
            results = self.index.query(
                vector=query_embedding,
                top_k=5,
                include_metadata=True
            )
            retrieved_context = "\n".join(
                [result["metadata"]["text"] for result in results["matches"]]
            )

            # Build a plain-text prompt from the retrieved context and the user question
            prompt = f"Context: {retrieved_context}\nUser: {query}"

            # Tokenize the prompt and move it to the same device as the model
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=max_seq_length
            ).to(self.device)

            # Generate a response from the model
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=64,
                do_sample=True,  # sampling must be enabled for temperature/top_p to take effect
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.2
            )

            # Decode only the newly generated tokens, not the echoed prompt
            new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            response_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            return response_text

        except Exception as e:
            return f"Error generating response: {str(e)}"


# Create the Gradio interface
def create_interface():
    interface = ModelInterface()

    def predict(message):
        return interface.generate_response(message)

    iface = gr.Interface(
        fn=predict,
        inputs=gr.Textbox(label="Enter your question"),
        outputs=gr.Textbox(label="Response"),
        title="RAG-Enhanced LLM Assistant",
        description="Ask a question and get a response enhanced with retrieved context.",
        examples=[["What are the best practices for tree planting?"],
                  ["How can I improve soil quality in my garden?"]]
    )
    return iface


# Launch the interface
if __name__ == "__main__":
    iface = create_interface()
    iface.launch()

# Quick CUDA sanity check (uncomment to run):
# import torch
# print(torch.__version__)
# print(torch.cuda.is_available())
# print(torch.version.cuda)  # This should show "12.6" or a compatible version
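# ---------------------------------------------------------------------------
# Setup notes (a minimal sketch; the .env values below are placeholders):
#
#   .env
#     PINECONE_API_KEY=<your Pinecone API key>
#     HF_TOKEN=<your Hugging Face access token>
#     COHERE_API_KEY=<your Cohere API key>
#
# The Pinecone index "cohere-pinecone-tree" is assumed to already contain
# vectors embedded with the same Cohere model ("embed-english-light-v2.0")
# and to store the source passage under a "text" metadata field, since
# generate_response reads result["metadata"]["text"] from each match.
#
# Run with `python app.py` (assumed filename) and open the local Gradio URL
# printed to the console.
# ---------------------------------------------------------------------------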