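"""Gradio demo that answers questions with a retrieval-augmented (RAG) pipeline.

The app embeds the user's query with Cohere, retrieves matching context from the
Pinecone index "cohere-pinecone-tree", and feeds the retrieved context plus the
question to a Mistral-7B base model with a PEFT adapter loaded from the Hugging
Face Hub.
"""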
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from dotenv import load_dotenv
import torch
import cohere
from pinecone import Pinecone
import os

# Load the environment variables
load_dotenv()
# Model configuration
max_seq_length = 2048  # maximum token length passed to the tokenizer
dtype = None           # unused in this script; likely left over from the fine-tuning setup
load_in_4bit = True    # unused in this script; likely left over from the fine-tuning setup
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
hf_token = os.getenv("HF_TOKEN")
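# Keys expected in the local .env file read by load_dotenv():
#   PINECONE_API_KEY - Pinecone index access
#   HF_TOKEN         - Hugging Face Hub access for the model downloads
#   COHERE_API_KEY   - Cohere embeddings (read in ModelInterface below)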

class ModelInterface:
    def __init__(self):
        # Model names and paths
        model_name = "Aradhya15/Mistral7b_hypertuned"  # Replace with your Hugging Face model path
        base_model_name = "mistralai/Mistral-7B-v0.1"

        # Load the base model (`token=` replaces the deprecated `use_auth_token=`)
        base_model = AutoModelForCausalLM.from_pretrained(base_model_name, token=hf_token)

        # Load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)
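        # Mistral's tokenizer has no dedicated pad token, so reuse EOS for padding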
        self.tokenizer.pad_token = self.tokenizer.eos_token
        
        # Load the PEFT adapter
        self.model = PeftModel.from_pretrained(base_model, model_name)
        # Pick the device once: first available GPU, otherwise fall back to CPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            print("GPU is available and ready to use:", torch.cuda.get_device_name(0))
        else:
            self.device = torch.device("cpu")
            print("GPU not detected; using CPU.")

        # Convert to half precision only on GPU (float16 inference is poorly supported on CPU)
        if self.device.type == "cuda":
            self.model = self.model.half()

        # Move the model to the chosen device
        self.model = self.model.to(self.device)
        print(f"Model moved to {self.device}")
        
        # Initialize Cohere for query embeddings
        self.cohere_client = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))

        # Initialize Pinecone and connect to the existing index
        pc = Pinecone(api_key=PINECONE_API_KEY)
        self.index = pc.Index("cohere-pinecone-tree")

    def generate_response(self, query):
        try:
            # Generate query embedding
            response = self.cohere_client.embed(
                texts=[query], 
                model="embed-english-light-v2.0"
            )
            query_embedding = response.embeddings[0]

            # Retrieve documents
            results = self.index.query(
                vector=query_embedding, 
                top_k=5, 
                include_metadata=True
            )
            retrieved_context = "\n".join(
                [result["metadata"]["text"] for result in results["matches"]]
            )

            # Build the prompt from the retrieved context plus the user query
            prompt = f"Context: {retrieved_context}\nUser: {query}"

            # Tokenize and move the inputs to the same device as the model
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=max_seq_length
            ).to(self.device)

            # Generate a response; do_sample=True so temperature/top_p actually take effect
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=64,
                do_sample=True,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.2,
                pad_token_id=self.tokenizer.eos_token_id
            )
            
            # Decode only the newly generated tokens, not the echoed prompt
            generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            response_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
            return response_text

        except Exception as e:
            return f"Error generating response: {str(e)}"

# Create Gradio interface
def create_interface():
    interface = ModelInterface()
    
    def predict(message):
        return interface.generate_response(message)
    
    iface = gr.Interface(
        fn=predict,
        inputs=gr.Textbox(label="Enter your question"),
        outputs=gr.Textbox(label="Response"),
        title="RAG-Enhanced LLM Assistant",
        description="Ask a question and get a response enhanced with retrieved context.",
        examples=[["What are the best practices for tree planting?"], ["How can I improve soil quality in my garden?"]]
    )
    
    return iface

# Launch the interface
if __name__ == "__main__":
    iface = create_interface()
    iface.launch()

# import torch
# print(torch.__version__)
# print(torch.cuda.is_available())
# print(torch.version.cuda)  # This should show "12.6" or compatible
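
# Optional quick check without the Gradio UI (hypothetical usage; loading
# Mistral-7B in half precision assumes a GPU with enough memory):
#   interface = ModelInterface()
#   print(interface.generate_response("What are the best practices for tree planting?"))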