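"""
Gradio demo app: a PEFT-fine-tuned Mistral-7B model answers questions with
retrieval-augmented context (Cohere embeddings + a Pinecone index).

Expected environment variables (e.g. in a .env file): HF_TOKEN, COHERE_API_KEY,
PINECONE_API_KEY.
"""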
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from dotenv import load_dotenv
import torch
import cohere
from pinecone import Pinecone, ServerlessSpec
import os
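
# Assumed dependencies (pip package names are a best guess from the imports above):
#   pip install gradio transformers peft python-dotenv torch cohere pinecone-client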
# Load the environment variables
load_dotenv()
# Model configuration
max_seq_length = 2048
dtype = None          # placeholder; not used directly (the model is cast with .half() below)
load_in_4bit = True   # placeholder; 4-bit loading is not wired into from_pretrained below
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
hf_token = os.getenv("HF_TOKEN")
class ModelInterface:
    def __init__(self):
        # Model names and paths
        model_name = "Aradhya15/Mistral7b_hypertuned"  # Replace with your Hugging Face model path
        base_model_name = "mistralai/Mistral-7B-v0.1"

        # Load the base model (token= replaces the deprecated use_auth_token=)
        base_model = AutoModelForCausalLM.from_pretrained(base_model_name, token=hf_token)

        # Load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load the PEFT adapter on top of the base model
        self.model = PeftModel.from_pretrained(base_model, model_name)

        # Check for GPU availability and pick the device once
        if torch.cuda.is_available():
            print("GPU is available and ready to use:", torch.cuda.get_device_name(0))
            self.device = torch.device("cuda")  # Use the first available GPU
        else:
            print("GPU not detected; using CPU.")
            self.device = torch.device("cpu")   # Fall back to CPU

        # Convert the model to half-precision (optional; mainly useful on GPU)
        self.model = self.model.half()
        # Move the model to the chosen device (GPU or CPU)
        self.model = self.model.to(self.device)
        print(f"Model is moved to {self.device}")

        # Initialize Cohere
        self.cohere_client = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))

        # Initialize Pinecone with your API key and connect to the index
        pc = Pinecone(api_key=PINECONE_API_KEY)
        self.index = pc.Index("cohere-pinecone-tree")
    def generate_response(self, query):
        try:
            # Generate the query embedding with Cohere
            response = self.cohere_client.embed(
                texts=[query],
                model="embed-english-light-v2.0"
            )
            query_embedding = response.embeddings[0]

            # Retrieve the most relevant documents from Pinecone
            results = self.index.query(
                vector=query_embedding,
                top_k=5,
                include_metadata=True
            )
            retrieved_context = "\n".join(
                [result["metadata"]["text"] for result in results["matches"]]
            )

            # Prepare and tokenize the prompt (plain text; the base model has no chat template)
            inputs = self.tokenizer(
                f"Context: {retrieved_context}\nUser: {query}",
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=max_seq_length
            ).to(self.device)

            # Generate a response (do_sample=True so temperature/top_p actually take effect)
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=64,
                do_sample=True,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.2
            )

            # Decode only the newly generated tokens, not the echoed prompt
            new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            response_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            return response_text
        except Exception as e:
            return f"Error generating response: {str(e)}"
# Create Gradio interface
def create_interface():
    interface = ModelInterface()

    def predict(message):
        return interface.generate_response(message)

    iface = gr.Interface(
        fn=predict,
        inputs=gr.Textbox(label="Enter your question"),
        outputs=gr.Textbox(label="Response"),
        title="RAG-Enhanced LLM Assistant",
        description="Ask a question and get a response enhanced with retrieved context.",
        examples=[["What are the best practices for tree planting?"], ["How can I improve soil quality in my garden?"]]
    )
    return iface

# Launch the interface
if __name__ == "__main__":
    iface = create_interface()
    iface.launch()
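# Note: launch() serves locally by default; launch(share=True) would also create a
# temporary public Gradio URL (optional; the default is fine for local testing).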
# import torch
# print(torch.__version__)
# print(torch.cuda.is_available())
# print(torch.version.cuda)  # This should show "12.6" or compatible