ak0601 commited on
Commit
4732c3f
·
verified ·
1 Parent(s): fb4434c

Upload 7 files

Browse files
Files changed (7) hide show
  1. .env +1 -0
  2. Precollege.py +153 -0
  3. Updated_structred_aman.docx +0 -0
  4. chat_1.py +336 -0
  5. chatbot_gemini.py +168 -0
  6. chatbot_openai.py +502 -0
  7. requirements.txt +91 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GOOGLE_API_KEY = "AIzaSyCOVPvbV9NEg2dYAsP5i98bQnsGQW_qWMc"
Precollege.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import pathlib
4
+ import getpass
5
+ import streamlit as st
6
+ import google.generativeai as genai
7
+ from langchain_community.vectorstores import Chroma
8
+ from langchain_community.document_loaders import Docx2txtLoader
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
11
+ from langchain_core.messages import HumanMessage, SystemMessage
12
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
13
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
14
+ from langchain.chains.combine_documents import create_stuff_documents_chain
15
+
16
# The Gemini credential must be supplied through the environment; it is
# deliberately NOT embedded in source control. The langchain_google_genai
# clients below read GOOGLE_API_KEY from os.environ.
if not os.environ.get("GOOGLE_API_KEY"):
    raise ValueError(
        "Gemini API key not found. Please set the GOOGLE_API_KEY environment variable."
    )

from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used both for query reformulation and for answering.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-latest",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Embedding model used to index the uploaded document chunks.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
29
+
30
def get_vectorstore_from_docx(docx_file):
    """Build a Chroma vector store from an uploaded .docx file.

    The upload is spooled to a temporary ``.docx`` on disk (``Docx2txtLoader``
    is given a file path), split into chunks, embedded with the module-level
    ``embeddings`` and written to ``./data``.

    Args:
        docx_file: A readable file-like object (its ``read()`` result is
            written to the temporary file).

    Returns:
        The Chroma vector store, or ``None`` if any step failed (the error is
        shown through ``st.error``).
    """
    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
            # Record the path before writing so cleanup still happens if
            # the write itself fails.
            temp_file_path = temp_file.name
            temp_file.write(docx_file.read())

        loader = Docx2txtLoader(temp_file_path)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=0)
        document_chunks = text_splitter.split_documents(documents)

        # NOTE(review): every call writes into the same "./data" directory;
        # confirm whether chunks from earlier uploads should be kept there.
        return Chroma.from_documents(
            embedding=embeddings,
            documents=document_chunks,
            persist_directory="./data"
        )
    except Exception as e:
        st.error(f"Error creating vector store: {e}")
        return None
    finally:
        # delete=False means the temp file outlives the ``with`` block, so it
        # must be removed on both the success and the failure path.
        if temp_file_path is not None:
            try:
                os.remove(temp_file_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # real result or error.
                pass
52
+
53
def get_context_retriever_chain(vector_store):
    """Wrap the vector store's retriever in a history-aware retriever chain.

    The prompt is: chat history, then the user's input, then a system
    instruction describing the PreCollege assistant persona.
    """
    system_instruction = """Act as a PreCollege AI assistant dedicated to guiding students through their JEE Mains journey. Your goal is to provide personalized, accurate, and interactive advice for students seeking college admissions guidance. Tailor your responses to address students' individual needs, including:

1. College Selection and Counseling: Help students identify colleges they qualify for based on their JEE Mains rank and preferences, including NITs, IIITs, GFTIs, and private institutions. Consider factors like location, course offerings, placement records, and fees.

2. Admission Process Guidance: Clarify the college admission procedures, including JoSAA counseling, spot rounds, document verification, and category-specific quotas (if applicable).

3. Career and Branch Selection Advice: Assist students in making informed decisions about their preferred engineering branches based on interest, market trends, and scope of opportunities.

Interactive Sessions: Engage students in Q&A sessions to answer their doubts related to preparation, counseling, and career choices.

Maintain a professional and friendly tone. Use your expertise to ensure students receive relevant and clear information. Provide examples, stats, and other insights to support your advice wherever needed"""

    message_templates = [
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        ("system", system_instruction),
    ]
    retrieval_prompt = ChatPromptTemplate.from_messages(message_templates)

    return create_history_aware_retriever(
        llm, vector_store.as_retriever(), retrieval_prompt
    )
75
+
76
def get_conversational_chain(retriever_chain):
    """Combine ``retriever_chain`` with a document-stuffing answer chain.

    The answer prompt places the retrieved ``{context}`` in a system message,
    followed by the chat history and the user's ``{input}``.
    """
    answer_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "Answer the user's questions based on the context below:\n\n{context}"),
            MessagesPlaceholder(variable_name="chat_history"),
            ("user", "{input}"),
        ]
    )
    answer_chain = create_stuff_documents_chain(llm, answer_prompt)
    return create_retrieval_chain(retriever_chain, answer_chain)
85
+
86
def get_response(user_query):
    """Answer ``user_query`` with the retrieval chain, using the stored chat history.

    ``st.session_state.chat_history`` holds ``{"author": ..., "content": ...}``
    dicts. These are converted to LangChain message objects so the
    ``chat_history`` placeholder of both prompts actually receives the earlier
    turns (previously the history passed to the chain was always empty because
    only ``HumanMessage``/``SystemMessage`` instances were matched).

    Args:
        user_query: The text the user just submitted.

    Returns:
        The ``'answer'`` entry of the chain's output.
    """
    # Local import: the module header only brings in HumanMessage/SystemMessage.
    from langchain_core.messages import AIMessage

    retriever_chain = get_context_retriever_chain(st.session_state.vector_store)
    conversation_rag_chain = get_conversational_chain(retriever_chain)

    formatted_chat_history = []
    for message in st.session_state.chat_history:
        if isinstance(message, dict):
            author = message.get("author")
            content = message.get("content", "")
            if author == "user":
                formatted_chat_history.append(HumanMessage(content=content))
            elif author == "assistant":
                formatted_chat_history.append(AIMessage(content=content))
        elif isinstance(message, (HumanMessage, AIMessage, SystemMessage)):
            # Already a LangChain message; pass it through unchanged.
            formatted_chat_history.append(message)

    response = conversation_rag_chain.invoke({
        "chat_history": formatted_chat_history,
        "input": user_query
    })

    return response['answer']
103
+
104
# ---- Streamlit UI -----------------------------------------------------------
# NOTE(review): the nesting below is reconstructed from statement order (the
# "Preprocess" button is referred to as being "in the sidebar"); confirm it
# matches the intended layout.
st.set_page_config(page_title="College Data Chatbot")
st.title("College Data Chatbot")

with st.sidebar:
    st.header("Settings")
    # Only .docx is accepted: the file is handed to Docx2txtLoader, and the
    # hint below asks for a .docx file.
    docx_files = st.file_uploader(
        "Upload College Data Document",
        type=["docx"],
        accept_multiple_files=True,
    )

    if not docx_files:
        st.info("Please upload a .docx file")
    else:
        # Only the first uploaded document is used.
        docx_file = docx_files[0]

        # A different document invalidates the previous index and conversation.
        if "docx_name" in st.session_state and st.session_state.docx_name != docx_file.name:
            st.session_state.pop("vector_store", None)
            st.session_state.pop("chat_history", None)

        if st.button("Preprocess"):
            st.session_state.vector_store = get_vectorstore_from_docx(docx_file)
            if st.session_state.vector_store:
                st.session_state.docx_name = docx_file.name
                st.success("Document processed successfully!")

if "chat_history" not in st.session_state:
    st.session_state.chat_history = [
        {"author": "assistant", "content": "Hello, I am a bot. How can I help you?"}
    ]

if st.session_state.get("vector_store") is None:
    st.info("Please preprocess the document by clicking the 'Preprocess' button in the sidebar.")
else:
    # Replay the conversation so far.
    for message in st.session_state.chat_history:
        if message["author"] == "assistant":
            with st.chat_message("system"):
                st.write(message["content"])
        elif message["author"] == "user":
            with st.chat_message("human"):
                st.write(message["content"])

    with st.form(key="chat_form", clear_on_submit=True):
        user_query = st.text_input("Type your message here...", key="user_input")
        submit_button = st.form_submit_button("Send")

        if submit_button and user_query:
            # Get bot response
            response = get_response(user_query)
            st.session_state.chat_history.append({"author": "user", "content": user_query})
            st.session_state.chat_history.append({"author": "assistant", "content": response})

            # Rerun the app to refresh the chat display
            st.rerun()
Updated_structred_aman.docx ADDED
Binary file (77.9 kB). View file
 
chat_1.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # import streamlit as st
3
+ # import google.generativeai as genai
4
+ # from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
5
+ # from langchain_community.document_loaders import Docx2txtLoader
6
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ # from langchain_community.vectorstores import Chroma
8
+ # from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
9
+ # from langchain_core.messages import HumanMessage, SystemMessage
10
+ # from langchain.chains import create_history_aware_retriever, create_retrieval_chain
11
+ # from langchain.chains.combine_documents import create_stuff_documents_chain
12
+ # from langchain.embeddings import HuggingFaceEmbeddings
13
+ # from bert_score import score
14
+ # from sklearn.metrics import f1_score
15
+ # import pysqlite3
16
+ # import sys
17
+ # sys.modules['sqlite3'] = pysqlite3
18
+
19
+ # # Retrieve Google API Key
20
+ # GOOGLE_API_KEY = "AIzaSyAytkzRS0Xp0pCyo6WqKJ4m1o330bF-gPk"
21
+ # if not GOOGLE_API_KEY:
22
+ # raise ValueError("Gemini API key not found. Please set it in the .env file.")
23
+ # os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
24
+
25
+ # # Streamlit configuration
26
+ # st.set_page_config(page_title="College Data Chatbot", layout="centered")
27
+ # st.title("PreCollege Chatbot GEMINI+ HuggingFace Embeddings")
28
+
29
+ # # Initialize LLM and embeddings
30
+ # llm = ChatGoogleGenerativeAI(
31
+ # model="gemini-1.5-pro-latest",
32
+ # temperature=0.2,
33
+ # max_tokens=None,
34
+ # timeout=None,
35
+ # max_retries=2,
36
+ # )
37
+ # embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
38
+
39
+ # # Load vector store
40
+ # def load_preprocessed_vectorstore():
41
+ # try:
42
+ # loader = Docx2txtLoader("./Updated_structred_aman.docx")
43
+ # documents = loader.load()
44
+
45
+ # text_splitter = RecursiveCharacterTextSplitter(
46
+ # separators=["\n\n", "\n", ". ", " ", ""],
47
+ # chunk_size=3000,
48
+ # chunk_overlap=1000
49
+ # )
50
+ # document_chunks = text_splitter.split_documents(documents)
51
+
52
+ # vector_store = Chroma.from_documents(
53
+ # embedding=embeddings,
54
+ # documents=document_chunks,
55
+ # persist_directory="./data32"
56
+ # )
57
+ # return vector_store
58
+ # except Exception as e:
59
+ # st.error(f"Error creating vector store: {e}")
60
+ # return None
61
+
62
+ # # Evaluation Metrics
63
+ # def calculate_recall_at_k(retrieved_docs, relevant_docs, k=5):
64
+ # retrieved_top_k = retrieved_docs[:k]
65
+ # relevant_in_top_k = len(set(retrieved_top_k).intersection(set(relevant_docs)))
66
+ # total_relevant = len(relevant_docs)
67
+ # return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0
68
+
69
+ # def calculate_bertscore(generated_responses, reference_responses):
70
+ # P, R, F1 = score(generated_responses, reference_responses, lang="en", rescale_with_baseline=True)
71
+ # return {"precision": P.mean().item(), "recall": R.mean().item(), "f1": F1.mean().item()}
72
+
73
+ # def calculate_f1_score(generated_response, relevant_text):
74
+ # generated_tokens = set(generated_response.split())
75
+ # relevant_tokens = set(relevant_text.split())
76
+ # intersection = generated_tokens.intersection(relevant_tokens)
77
+
78
+ # precision = len(intersection) / len(generated_tokens) if len(generated_tokens) > 0 else 0
79
+ # recall = len(intersection) / len(relevant_tokens) if len(relevant_tokens) > 0 else 0
80
+
81
+ # if precision + recall > 0:
82
+ # f1 = 2 * (precision * recall) / (precision + recall)
83
+ # else:
84
+ # f1 = 0.0
85
+ # return f1
86
+
87
+ # # Context Retriever Chain
88
+ # def get_context_retriever_chain(vector_store):
89
+ # retriever = vector_store.as_retriever()
90
+ # prompt = ChatPromptTemplate.from_messages([
91
+ # MessagesPlaceholder(variable_name="chat_history"),
92
+ # ("human", "{input}"),
93
+ # ("system", """Given a chat history and the latest user question,
94
+ # reformulate it as a standalone question without using chat history.
95
+ # Do NOT answer it, just reformulate.""")
96
+ # ])
97
+ # return create_history_aware_retriever(llm, retriever, prompt)
98
+
99
+ # def get_conversational_chain(retriever_chain):
100
+ # prompt = ChatPromptTemplate.from_messages([
101
+ # ("system", """Hello! I'm your PreCollege AI assistant. I'll guide you through your JEE Mains journey.
102
+ # To get started, share your JEE Mains rank and preferred engineering branches or colleges."""),
103
+ # MessagesPlaceholder(variable_name="chat_history"),
104
+ # ("human", "{input}")
105
+ # ])
106
+ # stuff_documents_chain = create_stuff_documents_chain(llm, prompt)
107
+ # return create_retrieval_chain(retriever_chain, stuff_documents_chain)
108
+
109
+ # def get_response(user_query):
110
+ # retriever_chain = get_context_retriever_chain(st.session_state.vector_store)
111
+ # conversation_rag_chain = get_conversational_chain(retriever_chain)
112
+
113
+ # formatted_chat_history = []
114
+ # for message in st.session_state.chat_history:
115
+ # if isinstance(message, HumanMessage):
116
+ # formatted_chat_history.append({"author": "user", "content": message.content})
117
+ # elif isinstance(message, SystemMessage):
118
+ # formatted_chat_history.append({"author": "assistant", "content": message.content})
119
+
120
+ # response = conversation_rag_chain.invoke({
121
+ # "chat_history": formatted_chat_history,
122
+ # "input": user_query
123
+ # })
124
+
125
+ # return response['answer']
126
+
127
+ # # Initialize vector store and metrics
128
+ # st.session_state.vector_store = load_preprocessed_vectorstore()
129
+ # if "metrics" not in st.session_state:
130
+ # st.session_state.metrics = {"recall_at_5": [], "bert_scores": [], "f1_scores": []}
131
+
132
+ # # Initialize chat history
133
+ # if "chat_history" not in st.session_state:
134
+ # st.session_state.chat_history = [
135
+ # {"author": "assistant", "content": "Hello, I am Precollege. How can I help you?"}
136
+ # ]
137
+
138
+ # # Main app logic
139
+ # if st.session_state.get("vector_store") is None:
140
+ # st.error("Failed to load preprocessed data. Ensure the data exists in './data' directory.")
141
+ # else:
142
+ # with st.container():
143
+ # for message in st.session_state.chat_history:
144
+ # if message["author"] == "assistant":
145
+ # with st.chat_message("system"):
146
+ # st.write(message["content"])
147
+ # elif message["author"] == "user":
148
+ # with st.chat_message("human"):
149
+ # st.write(message["content"])
150
+
151
+ # with st.container():
152
+ # with st.form(key="chat_form", clear_on_submit=True):
153
+ # user_query = st.text_input("Type your message here...", key="user_input")
154
+ # submit_button = st.form_submit_button("Send")
155
+
156
+ # if submit_button and user_query:
157
+ # # Get response
158
+ # response = get_response(user_query)
159
+ # st.session_state.chat_history.append({"author": "user", "content": user_query})
160
+ # st.session_state.chat_history.append({"author": "assistant", "content": response})
161
+
162
+ # # Dummy relevant docs for metrics demonstration
163
+ # retrieved_docs = ["doc1", "doc2", "doc3"] # Replace with actual IDs from retriever
164
+ # relevant_docs = ["doc1", "doc4"] # Replace with ground truth IDs
165
+ # recall_at_5 = calculate_recall_at_k(retrieved_docs, relevant_docs)
166
+ # st.session_state.metrics["recall_at_5"].append(recall_at_5)
167
+
168
+ # # Dummy reference and relevant text
169
+ # reference_response = "Gold-standard answer here."
170
+ # bert_scores = calculate_bertscore([response], [reference_response])
171
+ # st.session_state.metrics["bert_scores"].append(bert_scores["f1"])
172
+
173
+ # f1_score_value = calculate_f1_score(response, "Relevant text here")
174
+ # st.session_state.metrics["f1_scores"].append(f1_score_value)
175
+
176
+ # # Display evaluation metrics
177
+ # st.write("Evaluation Metrics:")
178
+ # st.write(f"Recall@5: {recall_at_5:.2f}")
179
+ # st.write(f"BERTScore F1: {bert_scores['f1']:.2f}")
180
+ # st.write(f"Faithfulness F1: {f1_score_value:.2f}")
181
+
182
+ # st.rerun()
183
+
184
+
185
+
186
+ import os
187
+ import streamlit as st
188
+ import google.generativeai as genai
189
+ from langchain_google_genai import ChatGoogleGenerativeAI
190
+ from langchain_community.document_loaders import Docx2txtLoader
191
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
192
+ from langchain_community.vectorstores import Chroma
193
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
194
+ from langchain_core.messages import HumanMessage, SystemMessage
195
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
196
+ from langchain.chains.combine_documents import create_stuff_documents_chain
197
+ from langchain.embeddings import HuggingFaceEmbeddings
198
+ import pysqlite3
199
+ import sys
200
+ sys.modules['sqlite3'] = pysqlite3
201
+
202
# Read the Google API key from the environment; credentials must not be
# committed to source control.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError(
        "Gemini API key not found. Please set the GOOGLE_API_KEY environment variable."
    )

# Streamlit app configuration
st.set_page_config(page_title="College Data Chatbot", layout="centered")
st.title("PreCollege Chatbot GEMINI+ HuggingFace Embeddings")

# Initialize the Google Gemini LLM
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-latest",
    temperature=0.2,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Initialize embeddings using HuggingFace
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
224
+
225
def load_preprocessed_vectorstore():
    """Index ``./Updated_structred_aman.docx`` into a Chroma store under ``./data32``.

    The document is loaded, split into overlapping chunks and embedded with
    the module-level ``embeddings`` on every call.

    Returns:
        The Chroma vector store, or ``None`` when any step raises (the error
        is reported through ``st.error``).
    """
    try:
        source_docs = Docx2txtLoader("./Updated_structred_aman.docx").load()

        splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", ". ", " ", ""],
            chunk_size=3000,
            chunk_overlap=1000
        )
        chunks = splitter.split_documents(source_docs)

        return Chroma.from_documents(
            embedding=embeddings,
            documents=chunks,
            persist_directory="./data32"
        )
    except Exception as e:
        st.error(f"Error creating vector store: {e}")
        return None
248
+
249
def get_context_retriever_chain(vector_store):
    """Build a history-aware retriever chain on top of ``vector_store``.

    Prompt order: chat history, the human input, then a system instruction
    asking for a standalone question (with a fixed out-of-scope reply).
    """
    system_instruction = """Given the chat history, context, and the latest user question, formulate a standalone question
that can be understood without the chat history. Use the context to provide a relevant answer if possible.
If the question is beyond the scope of the context, return:
'This question is beyond the scope of the available information. Please contact your mentor for further assistance.'
"""

    retrieval_prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
        ("system", system_instruction),
    ])

    return create_history_aware_retriever(
        llm, vector_store.as_retriever(), retrieval_prompt
    )
266
+
267
def get_conversational_chain(retriever_chain):
    """Attach a document-stuffing answer chain to ``retriever_chain``.

    The system message carries the assistant's introduction followed by the
    retrieved ``{context}``; chat history and the human input come after it.
    """
    system_instruction = """Hello! I'm your PreCollege AI assistant, here to help you with your JEE Mains journey.
Please provide your JEE Mains rank and preferred engineering branches or colleges,
and I'll give you tailored advice based on our verified database.
Note: I will only provide information that is available within our database to ensure accuracy. Let's get started!
\n\n{context}"""

    answer_prompt = ChatPromptTemplate.from_messages([
        ("system", system_instruction),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ])

    answer_chain = create_stuff_documents_chain(llm, answer_prompt)
    return create_retrieval_chain(retriever_chain, answer_chain)
281
+
282
def get_response(user_query):
    """Gets a response from the conversational RAG chain.

    ``st.session_state.chat_history`` holds ``{"author": ..., "content": ...}``
    dicts; they are converted to LangChain messages so the prompts'
    ``chat_history`` placeholder receives the earlier turns (previously only
    ``HumanMessage``/``SystemMessage`` instances were matched, so the history
    passed to the chain was always empty).

    Args:
        user_query: The text the user just submitted.

    Returns:
        The ``'answer'`` entry of the chain's output.
    """
    # Local import: the module header only brings in HumanMessage/SystemMessage.
    from langchain_core.messages import AIMessage

    retriever_chain = get_context_retriever_chain(st.session_state.vector_store)
    conversation_rag_chain = get_conversational_chain(retriever_chain)

    history = list(st.session_state.chat_history)
    # The caller may already have appended the current query to the history;
    # drop it so it is not sent twice (once as history, once as "input").
    if history and isinstance(history[-1], dict):
        last = history[-1]
        if last.get("author") == "user" and last.get("content") == user_query:
            history.pop()

    formatted_chat_history = []
    for message in history:
        if isinstance(message, dict):
            author = message.get("author")
            content = message.get("content", "")
            if author == "user":
                formatted_chat_history.append(HumanMessage(content=content))
            elif author == "assistant":
                formatted_chat_history.append(AIMessage(content=content))
        elif isinstance(message, (HumanMessage, AIMessage, SystemMessage)):
            # Already a LangChain message; pass it through unchanged.
            formatted_chat_history.append(message)

    response = conversation_rag_chain.invoke({
        "chat_history": formatted_chat_history,
        "input": user_query
    })

    return response['answer']
300
+
301
# Build the vector store once per session (Streamlit reruns this script on
# every interaction).
if "vector_store" not in st.session_state:
    st.session_state.vector_store = load_preprocessed_vectorstore()

# Initialize chat history if not present
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

# Main app logic
if st.session_state.vector_store is None:
    st.error("Failed to load preprocessed data. Please ensure the data exists in './data32' directory.")
else:
    # Display chat history
    with st.container():
        for message in st.session_state.chat_history:
            if message.get("author") == "assistant":
                with st.chat_message("assistant"):
                    st.write(message.get("content"))
            elif message.get("author") == "user":
                with st.chat_message("user"):
                    st.write(message.get("content"))

    # Add user input box below the chat
    if user_query := st.chat_input("Type your message here..."):
        # Append user query to chat history
        st.session_state.chat_history.append({"author": "user", "content": user_query})

        # Echo the new message right away: the history loop above already ran
        # for this script run, so without this the user's own message would
        # only show up after the next interaction.
        with st.chat_message("user"):
            st.write(user_query)

        # Get bot response
        response = get_response(user_query)

        # Append response to chat history
        st.session_state.chat_history.append({"author": "assistant", "content": response})

        # Display response
        with st.chat_message("assistant"):
            st.write(response)
chatbot_gemini.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import google.generativeai as genai
4
+ # from langchain_openai import OpenAI /
5
+ from langchain_openai import OpenAIEmbeddings
6
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+ # from langchain_openai import OpenAIEmbeddings
9
+ from langchain_community.document_loaders import Docx2txtLoader
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain_community.vectorstores import Chroma
12
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
13
+ from langchain_core.messages import HumanMessage, SystemMessage
14
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
15
+ from langchain.chains.combine_documents import create_stuff_documents_chain
16
+ from dotenv import load_dotenv
17
+ from langchain.embeddings import HuggingFaceEmbeddings
18
+ import pysqlite3
19
+ import sys
20
+ sys.modules['sqlite3'] = pysqlite3
21
+
22
+ import os
23
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Retrieve the Google (Gemini) API key from the environment / .env file.
# Credentials must not be committed to source control.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

if not GOOGLE_API_KEY:
    raise ValueError("Gemini API key not found. Please set it in the .env file.")

# Make sure the key is visible to the Google client libraries.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# Streamlit app configuration
st.set_page_config(page_title="College Data Chatbot", layout="centered")
st.title("PreCollege Chatbot GEMINI+ HuggingFace Embeddings")

# Initialize the Gemini chat model
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-latest",
    temperature=0.2,  # Slightly higher for varied responses
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Initialize embeddings using HuggingFace sentence-transformers
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
51
+
52
def load_preprocessed_vectorstore():
    """Index ``./Updated_structred_aman.docx`` into a Chroma store under ``./data32``.

    The document is loaded, split into overlapping chunks and embedded with
    the module-level ``embeddings`` on every call.

    Returns:
        The Chroma vector store, or ``None`` when any step raises (the error
        is reported through ``st.error``).
    """
    try:
        source_docs = Docx2txtLoader("./Updated_structred_aman.docx").load()

        splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", ". ", " ", ""],
            chunk_size=3000,
            chunk_overlap=1000,
        )
        chunks = splitter.split_documents(source_docs)

        return Chroma.from_documents(
            embedding=embeddings,
            documents=chunks,
            persist_directory="./data32"
        )
    except Exception as e:
        st.error(f"Error creating vector store: {e}")
        return None
74
+
75
def get_context_retriever_chain(vector_store):
    """Build a history-aware retriever chain on top of ``vector_store``.

    Prompt order: chat history, the human input, then a system instruction
    asking the model to reformulate the question as a standalone one.
    """
    system_instruction = """Given the chat history and the latest user question, which might reference context in the chat history,
formulate a standalone question that can be understood without the chat history.
If the question is directly addressed within the provided document, provide a relevant answer.
If the question is not explicitly addressed in the document, return the following message:
'This question is beyond the scope of the available information. Please contact your mentor for further assistance.'
Do NOT answer the question directly, just reformulate it if needed and otherwise return it as is."""

    retrieval_prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
        ("system", system_instruction),
    ])

    return create_history_aware_retriever(
        llm, vector_store.as_retriever(), retrieval_prompt
    )
93
+
94
def get_conversational_chain(retriever_chain):
    """Attach a document-stuffing answer chain to ``retriever_chain``.

    The system message carries the assistant's introduction followed by the
    retrieved ``{context}``; chat history and the human input come after it.
    """
    introduction = """Hello! I'm your PreCollege AI assistant, here to help you with your JEE Mains journey.
Please provide your JEE Mains rank and preferred engineering branches or colleges,
and I'll give you tailored advice based on our verified database.
Note: I will only provide information that is available within our database to ensure accuracy. Let's get started!
"""
    system_instruction = introduction + "\n\n" + "{context}"

    answer_prompt = ChatPromptTemplate.from_messages([
        ("system", system_instruction),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ])

    answer_chain = create_stuff_documents_chain(llm, answer_prompt)
    return create_retrieval_chain(retriever_chain, answer_chain)
110
+
111
def get_response(user_query):
    """Answer ``user_query`` with the retrieval chain, using the stored chat history.

    ``st.session_state.chat_history`` holds ``{"author": ..., "content": ...}``
    dicts; they are converted to LangChain messages so the prompts'
    ``chat_history`` placeholder receives the earlier turns (previously only
    ``HumanMessage``/``SystemMessage`` instances were matched, so the history
    passed to the chain was always empty).

    Args:
        user_query: The text the user just submitted.

    Returns:
        The ``'answer'`` entry of the chain's output.
    """
    # Local import: the module header only brings in HumanMessage/SystemMessage.
    from langchain_core.messages import AIMessage

    retriever_chain = get_context_retriever_chain(st.session_state.vector_store)
    conversation_rag_chain = get_conversational_chain(retriever_chain)

    formatted_chat_history = []
    for message in st.session_state.chat_history:
        if isinstance(message, dict):
            author = message.get("author")
            content = message.get("content", "")
            if author == "user":
                formatted_chat_history.append(HumanMessage(content=content))
            elif author == "assistant":
                formatted_chat_history.append(AIMessage(content=content))
        elif isinstance(message, (HumanMessage, AIMessage, SystemMessage)):
            # Already a LangChain message; pass it through unchanged.
            formatted_chat_history.append(message)

    response = conversation_rag_chain.invoke({
        "chat_history": formatted_chat_history,
        "input": user_query
    })

    return response['answer']
128
+
129
# Build the vector store once per session. Streamlit reruns this script on
# every interaction, so an unconditional call would re-read and re-embed the
# document each time. A failed load (None) is retried on the next run.
if st.session_state.get("vector_store") is None:
    st.session_state.vector_store = load_preprocessed_vectorstore()

# Initialize chat history if not present
if "chat_history" not in st.session_state:
    st.session_state.chat_history = [
        {"author": "assistant", "content": "Hello, I am Precollege. How can I help you?"}
    ]

# Main app logic
if st.session_state.get("vector_store") is None:
    # The store is persisted under ./data32 (see load_preprocessed_vectorstore).
    st.error("Failed to load preprocessed data. Please ensure the data exists in './data32' directory.")
else:
    # Display chat history
    with st.container():
        for message in st.session_state.chat_history:
            if message["author"] == "assistant":
                with st.chat_message("system"):
                    st.write(message["content"])
            elif message["author"] == "user":
                with st.chat_message("human"):
                    st.write(message["content"])

    # Add user input box below the chat
    with st.container():
        with st.form(key="chat_form", clear_on_submit=True):
            user_query = st.text_input("Type your message here...", key="user_input")
            submit_button = st.form_submit_button("Send")

            if submit_button and user_query:
                # Get bot response
                response = get_response(user_query)
                st.session_state.chat_history.append({"author": "user", "content": user_query})
                st.session_state.chat_history.append({"author": "assistant", "content": response})

                # Rerun the app to refresh the chat display
                st.rerun()
chatbot_openai.py ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from langchain_openai import OpenAI
4
+ from langchain_openai import OpenAIEmbeddings
5
+ from langchain_community.document_loaders import Docx2txtLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_community.vectorstores import Chroma
8
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
9
+ from langchain_core.messages import HumanMessage, SystemMessage
10
+ from langchain.retrievers.self_query.base import SelfQueryRetriever
11
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
12
+ from langchain.chains.combine_documents import create_stuff_documents_chain
13
+ from langchain_core.output_parsers import StrOutputParser
14
+ from dotenv import load_dotenv
15
+
16
# Pull variables from the local .env file into the process environment, then
# fail fast if the OpenAI credential is missing.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OpenAI API key not found. Please set it in the .env file.")

# Write the key back to the environment so the OpenAI clients created below
# read it from there.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Page chrome: browser tab title, centered layout, and the visible heading.
st.set_page_config(page_title="College Data Chatbot", layout="centered")
st.title("PreCollege Chatbot")

# Completion model used to answer questions (temperature 0).
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)

# Embedding model used when building the vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
38
+
39
def load_preprocessed_vectorstore(
    docx_path="./Updated_structred_aman.docx",
    persist_directory="./data11",
):
    """Return the Chroma vector store for the college-data document.

    If ``persist_directory`` already holds a populated collection it is
    reopened as-is. Otherwise the .docx file is loaded, split into chunks,
    embedded, and persisted. Reusing the persisted collection avoids
    re-embedding the document and appending duplicate chunks on every call.

    NOTE: a populated store is never refreshed automatically; delete
    ``persist_directory`` to force a rebuild after the document changes.

    Args:
        docx_path: Path of the .docx file to index.
        persist_directory: Directory where Chroma persists the collection.

    Returns:
        The vector store, or ``None`` if loading or building failed (the
        error is also shown in the Streamlit UI).
    """
    try:
        # Reopen an existing, non-empty collection instead of rebuilding it.
        if os.path.isdir(persist_directory) and os.listdir(persist_directory):
            persisted_store = Chroma(
                persist_directory=persist_directory,
                embedding_function=embeddings,
            )
            if persisted_store.get(limit=1)["ids"]:
                return persisted_store

        documents = Docx2txtLoader(docx_path).load()

        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", ". ", " ", ""],
            chunk_size=3000,
            chunk_overlap=200,
        )
        document_chunks = text_splitter.split_documents(documents)

        return Chroma.from_documents(
            embedding=embeddings,
            documents=document_chunks,
            persist_directory=persist_directory,
        )
    except Exception as e:
        # Top-level boundary for the UI: report the failure and let the caller
        # handle the missing store.
        st.error(f"Error creating vector store: {e}")
        return None
60
+
61
+ import logging
62
+
63
+ # Function to create the retriever and prompt chain
64
def get_context_retriever_chain(vector_store):
    """Build the retriever and the prompt chain used to answer a query.

    Args:
        vector_store: Vector store exposing ``as_retriever``.

    Returns:
        A ``(retriever, rag_prompt_chain)`` tuple. The retriever returns the
        three most similar chunks for a query; the chain expects the keys
        ``"question"`` and ``"context"`` and yields the model's answer as a
        string.
    """
    # The result count must be passed through ``search_kwargs``; a bare
    # ``k=3`` keyword is not a retriever field and does not limit results.
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    rag_prompt = PromptTemplate(
        template="""
    Act as a PreCollege AI assistant dedicated to guiding students through their JEE Mains journey. Your goal is to provide personalized, accurate, and interactive advice for students seeking college admissions guidance. Tailor your responses to address students' individual needs, including:

    1. College Selection and Counseling: Help students identify colleges they qualify for based on their JEE Mains rank and preferences, including IIITs institutions. Consider factors like location, course offerings, placement records, and fees.

    2. Admission Process Guidance: Clarify the college admission procedures, including JoSAA counseling, spot rounds, document verification, and category-specific quotas (if applicable).

    3. Career and Branch Selection Advice: Assist students in making informed decisions about their preferred engineering branches based on interest, market trends, and scope of opportunities.

    Interactive Sessions: Engage students in Q&A sessions to answer their doubts related to preparation, counseling, and career choices.

    Maintain a professional and friendly tone. Use your expertise to ensure students receive relevant and clear information. Provide examples, stats, and other insights to support your advice wherever needed.

    QUESTION: {question}
    CONTEXT: {context}
    Answer in a detailed yet concise manner, also highlight relevant information and do not give unnecessary information or negative responses:
    """,
        input_variables=["question", "context"],
    )

    # prompt -> completion model -> plain string
    rag_prompt_chain = rag_prompt | llm | StrOutputParser()

    return retriever, rag_prompt_chain
92
+
93
+
94
def _format_context(documents):
    """Join the text of the retrieved documents into one prompt-ready string."""
    return "\n\n".join(
        getattr(document, "page_content", str(document)) for document in documents
    )


def get_response(user_query):
    """Answer a user message with a canned greeting or a retrieval-backed reply.

    Args:
        user_query: Raw text typed by the user.

    Returns:
        The reply to display. Failures are logged and reported as a
        user-facing message rather than raised.
    """
    # Plain greetings are answered directly, without retrieval or a model call.
    greetings = {"hi", "hello", "hey", "greetings", "hi there"}
    if user_query.lower().strip() in greetings:
        return "Hello! How can I assist you with your college search today?"

    # The store may be absent or explicitly None (a failed load stores None).
    vector_store = st.session_state.get("vector_store")
    if vector_store is None:
        logging.error("Vector store is not initialized in session state.")
        return "Vector store is not initialized. Please preprocess the document first."

    retriever, rag_prompt_chain = get_context_retriever_chain(vector_store)

    # Keep only user/assistant turns, copied so the session state is not shared.
    formatted_chat_history = [
        {"author": message["author"], "content": message["content"]}
        for message in st.session_state.chat_history
        if message["author"] in ("user", "assistant")
    ]

    try:
        documents = retriever.invoke(user_query)
        logging.info("Retrieved context: %s", documents)

        if not documents:
            logging.error("No relevant context retrieved.")
            return "I couldn't retrieve relevant information. Please try a different query."

        # Pass the documents' text, not the Document objects, so the prompt
        # contains readable passages instead of a Python repr of a list.
        response = rag_prompt_chain.invoke({
            "chat_history": formatted_chat_history,
            "question": user_query,
            "context": _format_context(documents),
        })
        logging.info("Generated response: %s", response)

        if isinstance(response, dict) and "answer" in response:
            return response["answer"]
        if isinstance(response, str):  # raw string output from the parser
            return response

        logging.error("Unexpected response format: %s", response)
        return "Unexpected error occurred. Please try again later."
    except Exception as e:
        # UI boundary: log with traceback and degrade to a friendly message.
        logging.exception("Error generating response: %s", e)
        return "Sorry, I encountered an issue while processing your request. Please try again later."
146
+
147
+
148
+
149
# Build the vector store once per session. Streamlit re-executes this script on
# every interaction, so an unconditional load would redo the work (and the
# embedding calls) for every message. A failed load (None) is retried on the
# next run.
if st.session_state.get("vector_store") is None:
    st.session_state.vector_store = load_preprocessed_vectorstore()

# Seed the conversation with the assistant's greeting on first run.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = [
        {"author": "assistant", "content": "Hello, I am Precollege. How can I help you?"}
    ]

# Main app logic
if st.session_state.get("vector_store") is None:
    st.error(
        "Failed to load preprocessed data. Please ensure "
        "'./Updated_structred_aman.docx' exists and the './data11' directory is writable."
    )
else:
    # Chat bubble name used for each stored author; other authors are skipped.
    role_by_author = {"assistant": "system", "user": "human"}

    # Display chat history
    with st.container():
        for message in st.session_state.chat_history:
            role = role_by_author.get(message["author"])
            if role is None:
                continue
            with st.chat_message(role):
                st.write(message["content"])

    # Add user input box below the chat
    with st.container():
        with st.form(key="chat_form", clear_on_submit=True):
            user_query = st.text_input("Type your message here...", key="user_input")
            submit_button = st.form_submit_button("Send")

        if submit_button and user_query:
            # Get the reply first, then record both turns in order.
            response = get_response(user_query)
            st.session_state.chat_history.append({"author": "user", "content": user_query})
            st.session_state.chat_history.append({"author": "assistant", "content": response})

            # Rerun the app to refresh the chat display
            st.rerun()
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+
198
+
199
+
200
+
201
+
202
+ # import os
203
+ # import streamlit as st
204
+ # from langchain_openai import OpenAI
205
+ # from langchain_openai import OpenAIEmbeddings
206
+ # from langchain_community.document_loaders import Docx2txtLoader
207
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
208
+ # from langchain_community.vectorstores import Chroma
209
+ # from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
210
+ # from langchain_core.messages import HumanMessage, SystemMessage
211
+ # from langchain.chains import create_history_aware_retriever, create_retrieval_chain
212
+ # from langchain.chains.combine_documents import create_stuff_documents_chain
213
+ # from dotenv import load_dotenv
214
+
215
+
216
+ # # Retrieve OpenAI API key from the .env file
217
+ # OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
218
+
219
+ # if not OPENAI_API_KEY:
220
+ # raise ValueError("OpenAI API key not found. Please set it in the .env file.")
221
+
222
+ # # Set OpenAI API key
223
+ # os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
224
+ # # Streamlit app configuration
225
+ # st.set_page_config(page_title="College Data Chatbot", layout="centered")
226
+ # st.title("PreCollege Chatbot")
227
+
228
+ # # Initialize OpenAI LLM
229
+ # llm = OpenAI(
230
+ # model="gpt-3.5-turbo-instruct",
231
+ # temperature=0,
232
+ # )
233
+
234
+ # # Initialize embeddings using OpenAI
235
+ # embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
236
+
237
+ # def load_preprocessed_vectorstore():
238
+ # try:
239
+ # loader = Docx2txtLoader("./Updated_structred_aman.docx")
240
+ # documents = loader.load()
241
+
242
+ # text_splitter = RecursiveCharacterTextSplitter(
243
+ # separators=["\n\n", "\n", ". ", " ", ""],
244
+ # chunk_size=3000,
245
+ # chunk_overlap=200)
246
+
247
+ # document_chunks = text_splitter.split_documents(documents)
248
+
249
+ # vector_store = Chroma.from_documents(
250
+
251
+ # embedding=embeddings,
252
+ # documents=document_chunks,
253
+ # persist_directory="./data11"
254
+ # )
255
+ # return vector_store
256
+ # except Exception as e:
257
+ # st.error(f"Error creating vector store: {e}")
258
+ # return None
259
+
260
+ # def get_context_retriever_chain(vector_store):
261
+ # """Creates a history-aware retriever chain."""
262
+ # retriever = vector_store.as_retriever()
263
+
264
+ # # Define the prompt for the retriever chain
265
+ # prompt = ChatPromptTemplate.from_messages([
266
+ # MessagesPlaceholder(variable_name="chat_history"),
267
+ # ("user", "{input}"),
268
+ # ("system", "You are a PreCollege AI assistant helping students with JEE Mains college guidance. Answer interactively and provide relevant, accurate information.")
269
+ # ])
270
+
271
+ # retriever_chain = create_history_aware_retriever(llm, retriever, prompt)
272
+ # return retriever_chain
273
+
274
+ # def get_conversational_chain(retriever_chain):
275
+ # """Creates a conversational chain using the retriever chain."""
276
+ # prompt = ChatPromptTemplate.from_messages([
277
+ # ("system", "Answer the user's questions based on the context below:\n\n{context}"),
278
+ # MessagesPlaceholder(variable_name="chat_history"),
279
+ # ("user", "{input}")
280
+ # ])
281
+
282
+ # stuff_documents_chain = create_stuff_documents_chain(llm, prompt)
283
+ # return create_retrieval_chain(retriever_chain, stuff_documents_chain)
284
+
285
+ # def get_response(user_query):
286
+ # retriever_chain = get_context_retriever_chain(st.session_state.vector_store)
287
+ # conversation_rag_chain = get_conversational_chain(retriever_chain)
288
+
289
+ # formatted_chat_history = []
290
+ # for message in st.session_state.chat_history:
291
+ # if isinstance(message, HumanMessage):
292
+ # formatted_chat_history.append({"author": "user", "content": message.content})
293
+ # elif isinstance(message, SystemMessage):
294
+ # formatted_chat_history.append({"author": "assistant", "content": message.content})
295
+
296
+ # response = conversation_rag_chain.invoke({
297
+ # "chat_history": formatted_chat_history,
298
+ # "input": user_query
299
+ # })
300
+
301
+ # return response['answer']
302
+
303
+ # # Load the preprocessed vector store from the local directory
304
+ # st.session_state.vector_store = load_preprocessed_vectorstore()
305
+
306
+ # # Initialize chat history if not present
307
+ # if "chat_history" not in st.session_state:
308
+ # st.session_state.chat_history = [
309
+ # {"author": "assistant", "content": "Hello, I am Precollege. How can I help you?"}
310
+ # ]
311
+
312
+ # # Main app logic
313
+ # if st.session_state.get("vector_store") is None:
314
+ # st.error("Failed to load preprocessed data. Please ensure the data exists in './data' directory.")
315
+ # else:
316
+ # # Display chat history
317
+ # with st.container():
318
+ # for message in st.session_state.chat_history:
319
+ # if message["author"] == "assistant":
320
+ # with st.chat_message("system"):
321
+ # st.write(message["content"])
322
+ # elif message["author"] == "user":
323
+ # with st.chat_message("human"):
324
+ # st.write(message["content"])
325
+
326
+ # # Add user input box below the chat
327
+ # with st.container():
328
+ # with st.form(key="chat_form", clear_on_submit=True):
329
+ # user_query = st.text_input("Type your message here...", key="user_input")
330
+ # submit_button = st.form_submit_button("Send")
331
+
332
+ # if submit_button and user_query:
333
+ # # Get bot response
334
+ # response = get_response(user_query)
335
+ # st.session_state.chat_history.append({"author": "user", "content": user_query})
336
+ # st.session_state.chat_history.append({"author": "assistant", "content": response})
337
+
338
+ # # Rerun the app to refresh the chat display
339
+ # st.rerun()
340
+
341
+
342
+
343
+
344
+
345
+
346
+
347
+
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
360
+ # import os
361
+ # import tempfile
362
+ # import streamlit as st
363
+ # from langchain_openai import OpenAI
364
+ # from langchain_openai import OpenAIEmbeddings
365
+ # from langchain_community.vectorstores import Chroma
366
+ # from langchain_community.document_loaders import Docx2txtLoader
367
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
368
+ # from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
369
+ # from langchain_core.messages import HumanMessage, SystemMessage
370
+ # from langchain.chains import create_history_aware_retriever, create_retrieval_chain
371
+ # from langchain.chains.combine_documents import create_stuff_documents_chain
372
+
373
+ # # Load environment variables for API keys
374
+ # # load_dotenv()
375
+ # import os
376
+ # os.environ["OPENAI_API_KEY"]="<REDACTED>"  # secret removed: never commit API keys; load them from the environment and revoke any key that was published
377
+
378
+ # # Initialize OpenAI LLM
379
+ # llm = OpenAI(
380
+ # model="gpt-3.5-turbo-instruct",
381
+ # temperature=0,
382
+ # )
383
+
384
+ # # Initialize embeddings using OpenAI
385
+ # embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
386
+
387
+ # def get_vectorstore_from_docx(docx_file):
388
+ # """Processes a .docx file to create a vector store."""
389
+ # try:
390
+ # with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
391
+ # temp_file.write(docx_file.read())
392
+ # temp_file_path = temp_file.name
393
+
394
+ # loader = Docx2txtLoader(temp_file_path)
395
+ # documents = loader.load()
396
+
397
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)
398
+ # document_chunks = text_splitter.split_documents(documents)
399
+
400
+ # vector_store = Chroma.from_documents(
401
+ # embedding=embeddings,
402
+ # documents=document_chunks,
403
+ # persist_directory="./data1"
404
+ # )
405
+ # os.remove(temp_file_path)
406
+ # return vector_store
407
+ # except Exception as e:
408
+ # st.error(f"Error creating vector store: {e}")
409
+ # return None
410
+
411
+ # def get_context_retriever_chain(vector_store):
412
+ # """Creates a history-aware retriever chain."""
413
+ # retriever = vector_store.as_retriever()
414
+
415
+ # prompt = ChatPromptTemplate.from_messages([
416
+ # MessagesPlaceholder(variable_name="chat_history"),
417
+ # ("user", "{input}"),
418
+ # ("system", "You are a PreCollege AI assistant helping students with JEE Mains college guidance. Answer interactively and provide relevant, accurate information.")
419
+ # ])
420
+
421
+ # retriever_chain = create_history_aware_retriever(llm, retriever, prompt)
422
+ # return retriever_chain
423
+
424
+ # def get_conversational_chain(retriever_chain):
425
+ # """Creates a conversational chain using the retriever chain."""
426
+ # prompt = ChatPromptTemplate.from_messages([
427
+ # ("system", "Answer the user's questions based on the context below:\n\n{context}"),
428
+ # MessagesPlaceholder(variable_name="chat_history"),
429
+ # ("user", "{input}")
430
+ # ])
431
+
432
+ # stuff_documents_chain = create_stuff_documents_chain(llm, prompt)
433
+ # return create_retrieval_chain(retriever_chain, stuff_documents_chain)
434
+
435
+ # def get_response(user_query):
436
+ # retriever_chain = get_context_retriever_chain(st.session_state.vector_store)
437
+ # conversation_rag_chain = get_conversational_chain(retriever_chain)
438
+
439
+ # formatted_chat_history = []
440
+ # for message in st.session_state.chat_history:
441
+ # if isinstance(message, HumanMessage):
442
+ # formatted_chat_history.append({"author": "user", "content": message.content})
443
+ # elif isinstance(message, SystemMessage):
444
+ # formatted_chat_history.append({"author": "assistant", "content": message.content})
445
+
446
+ # response = conversation_rag_chain.invoke({
447
+ # "chat_history": formatted_chat_history,
448
+ # "input": user_query
449
+ # })
450
+
451
+ # return response['answer']
452
+
453
+ # # Streamlit app configuration
454
+ # st.set_page_config(page_title="College Data Chatbot")
455
+ # st.title("College Data Chatbot")
456
+
457
+ # # Sidebar for document upload and automatic processing
458
+ # with st.sidebar:
459
+ # st.header("Upload College Data Document")
460
+ # docx_file = st.file_uploader("Upload a .docx file")
461
+
462
+ # if docx_file:
463
+ # # Automatically process the uploaded file
464
+ # st.session_state.vector_store = get_vectorstore_from_docx(docx_file)
465
+ # if st.session_state.vector_store:
466
+ # st.session_state.docx_name = docx_file.name
467
+ # st.success("Document processed successfully!")
468
+
469
+ # # Initialize chat history if not present
470
+ # if "chat_history" not in st.session_state:
471
+ # st.session_state.chat_history = [
472
+ # {"author": "assistant", "content": "Hello, I am precollege. How can I help you?"}
473
+ # ]
474
+
475
+ # # Main chat section
476
+ # if st.session_state.get("vector_store") is None:
477
+ # st.info("Please upload and process a .docx file to get started.")
478
+ # else:
479
+ # # Display the chat history first
480
+ # with st.container():
481
+ # for message in st.session_state.chat_history:
482
+ # if message["author"] == "assistant":
483
+ # with st.chat_message("system"):
484
+ # st.write(message["content"])
485
+ # elif message["author"] == "user":
486
+ # with st.chat_message("human"):
487
+ # st.write(message["content"])
488
+
489
+ # # User input at the bottom of the chat
490
+ # with st.container():
491
+ # with st.form(key="chat_form", clear_on_submit=True):
492
+ # user_query = st.text_input("Type your message here...", key="user_input")
493
+ # submit_button = st.form_submit_button("Send")
494
+
495
+ # if submit_button and user_query:
496
+ # # Process the user query and get the bot's response
497
+ # response = get_response(user_query)
498
+ # st.session_state.chat_history.append({"author": "user", "content": user_query})
499
+ # st.session_state.chat_history.append({"author": "assistant", "content": response})
500
+
501
+ # # Scroll to the bottom of the chat
502
+ # # st.experimental_rerun()
requirements.txt ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.4.1
2
+ langchain-community==0.3.7
3
+ annotated-types==0.7.0
4
+ anyio==4.6.2.post1
5
+ attrs==24.2.0
6
+ blinker==1.9.0
7
+ cachetools==5.5.0
8
+ bert-score
9
+ certifi==2024.8.30
10
+ charset-normalizer==3.4.0
11
+ db-sqlite3
12
+ pysqlite3-binary
13
+ pydantic
14
+ langchain-core
15
+ click==8.1.7
16
+ colorama==0.4.6
17
+ distro==1.9.0
18
+ gitdb==4.0.11
19
+ GitPython==3.1.43
20
+ google-ai-generativelanguage==0.6.10
21
+ google-api-core==2.23.0
22
+ google-api-python-client==2.154.0
23
+ google-auth==2.36.0
24
+ google-auth-httplib2==0.2.0
25
+ google-generativeai==0.8.3
26
+ googleapis-common-protos==1.66.0
27
+ grpcio==1.68.0
28
+ grpcio-status==1.68.0
29
+ h11==0.14.0
30
+ huggingface-hub
31
+ httpcore==1.0.7
32
+ httplib2==0.22.0
33
+ httpx==0.27.2
34
+ idna==3.10
35
+ Jinja2==3.1.4
36
+ jiter==0.7.1
37
+ jsonpatch==1.33
38
+ jsonpointer==3.0.0
39
+ jsonschema==4.23.0
40
+ jsonschema-specifications==2024.10.1
41
+ langchain-core==0.3.19
42
+ langchain-google-genai==2.0.5
43
+ langchain-openai==0.2.9
44
+ langsmith==0.1.144
45
+ markdown-it-py==3.0.0
46
+ MarkupSafe==3.0.2
47
+ mdurl==0.1.2
48
+ narwhals==1.14.1
49
+ numpy==1.26.4
50
+ openai==1.55.0
51
+ orjson==3.10.11
52
+ packaging==24.2
53
+ pandas==2.2.3
54
+ pillow==11.0.0
55
+ proto-plus==1.25.0
56
+ protobuf==5.28.3
57
+ pyarrow==18.0.0
58
+ pyasn1==0.6.1
59
+ pyasn1_modules==0.4.1
60
+ pydantic==2.10.1
61
+ pydantic_core==2.27.1
62
+ pydeck==0.9.1
63
+ Pygments==2.18.0
64
+ pyparsing==3.2.0
65
+ python-dateutil==2.9.0.post0
66
+ pytz==2024.2
67
+ PyYAML==6.0.2
68
+ referencing==0.35.1
69
+ regex==2024.11.6
70
+ requests==2.32.3
71
+ requests-toolbelt==1.0.0
72
+ rich==13.9.4
73
+ rpds-py==0.21.0
74
+ rsa==4.9
75
+ six==1.16.0
76
+ smmap==5.0.1
77
+ sniffio==1.3.1
78
+ streamlit==1.40.1
79
+ tenacity==9.0.0
80
+ tiktoken==0.8.0
81
+ toml==0.10.2
82
+ tornado==6.4.2
83
+ tqdm==4.67.0
84
+ typing_extensions==4.12.2
85
+ tzdata==2024.2
86
+ uritemplate==4.1.1
87
+ urllib3==2.2.3
88
+ watchdog==6.0.0
89
+ docx2txt
90
+ sentence-transformers==3.2.1
91
+ chromadb