SalehAhmad committed
Commit 98186e3 · verified · 1 Parent(s): 74fbcab

Upload 7 files

Files changed (4)
  1. PRESET_QUERIES.py +24 -0
  2. app.ipynb +224 -0
  3. chatbot.py +90 -61
  4. data_query.py +26 -9
PRESET_QUERIES.py ADDED
@@ -0,0 +1,24 @@
+ Queries = {
+     'Who are you?': [
+         'Who is Wagner Chatbot?',
+         'Tell me about Wagner?',
+         'Who is Wagner AI?'
+     ],
+
+     'who is Daniel Ringel?': [
+         'Tell me about Daniel Ringel',
+         'Can you show Daniel Ringel\'s CV?',
+         'Who is Daniel R.?'
+     ],
+ }
+
+ # Each query will map to a document
+ Query_Doc_Map = {
+     'Who are you?': [
+         'Who-is-Wagner-Chatbot-Response.docx'
+     ],
+
+     'who is Daniel Ringel?': [
+         'CV/Ringel_Daniel_CV_V1.docx'
+     ]
+ }
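
Since the router added in chatbot.py (below) looks up Query_Doc_Map[best_match] for whichever preset wins, the two dicts must stay keyed identically. A quick consistency check, a hypothetical helper rather than part of this commit, would be:

    from PRESET_QUERIES import Queries, Query_Doc_Map

    # Every routable preset needs a target document, and vice versa.
    assert set(Queries) == set(Query_Doc_Map), 'preset keys out of sync'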
app.ipynb ADDED
@@ -0,0 +1,224 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "# Importing All Required Packages and Libraries"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from chatbot import RAGChatbot\n",
+     "import os\n",
+     "from dotenv import load_dotenv\n",
+     "load_dotenv()\n",
+     "\n",
+     "import warnings\n",
+     "warnings.filterwarnings(\"ignore\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "# Initializing the RAG chatbot"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 8,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "chatbot = RAGChatbot(\n",
+     "    pinecone_api_key=os.getenv('PINECONE_API_KEY'),\n",
+     "    index_name='test',\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "# Below cell has code to ingest data into the pinecone vector database\n",
+     "## Note: Only uncomment and run when you really have to ingest the data from the Data directory (which has all the relevant files)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 9,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# chatbot.ingest_data('../../Data', empty=True)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "# Below cell is used to query the RAG chatbot\n",
+     "## You can test the responses for different values of\n",
+     "- k: The number of documents to retrieve from the vector database. You can input any natural number >= 1\n",
+     "- rerank: Whether to rerank the retrieved documents or not. Possible inputs are true and false"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 10,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# response = chatbot.query_chatbot(input(), k=15, rerank=True)  # input() will ask you to enter the query\n",
+     "# print(response['response'])\n",
+     "\n",
+     "# reranked_docs = response['context_docs']\n",
+     "# for docs in reranked_docs:\n",
+     "#     print(docs.metadata)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 11,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "prompts = [\n",
+     "    'Who is daniel?',\n",
+     "    'Who are you?',\n",
+     "    'What is your name?',\n",
+     "    'What is your job?',\n",
+     "]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 12,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Response from routing:query_text: who is daniel? - best_match query: who is Daniel Ringel? - Doc: CV/Ringel_Daniel_CV_V1.docx\n",
+       "Daniel M. Ringel is an Assistant Professor of Marketing for Data Science and AI at the Kenan-Flagler Business School, University of North Carolina at Chapel Hill. His research focuses on integrating marketing theory with artificial intelligence and machine learning to develop frameworks and tools that benefit both academic discussions and practical applications. Daniel's work includes mapping market structure dynamics, understanding competitive relationships using AI, and advancing data-driven marketing strategies. He has received numerous awards for his contributions to the field and has been actively involved in teaching, research, and industry engagement related to AI in business.\n",
+       "CV/Ringel_Daniel_CV_V1.docx\n",
+       "\n",
+       "\n",
+       "\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "100%|██████████| 1/1 [00:00<00:00, 23.67it/s]\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Hello! I am Wagner, an AI assistant named after the character from Goethe's Faust. In the story, Wagner is a loyal assistant to Faust, sharing in his intellectual pursuits on a smaller scale. Similarly, I am dedicated to scholarly endeavors, specifically assisting with Daniel Rangel's research in artificial intelligence and marketing. My role is to provide clear, structured, and accurate information related to Daniel's academic work, including his research, teaching, and career.\n",
+       "{'source': '../../Data/Wagner/Who-is-Wagner-Chatbot-Response.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Website/Ringel-Skiera-2016-Visualizing-Competition-Between-1000-Products-MktSci.pdf'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
+       "{'source': '../../Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf'}\n",
+       "\n",
+       "\n",
+       "\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "100%|██████████| 1/1 [00:00<00:00, 40.62it/s]\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Hello! My name is Wagner. I'm an assistant named after the character from Goethe’s Faust, dedicated to assisting with inquiries related to Daniel Rangel’s research in artificial intelligence and marketing.\n",
+       "{'source': '../../Data/Wagner/Who-is-Wagner-Chatbot-Response.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
+       "\n",
+       "\n",
+       "\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "100%|██████████| 1/1 [00:00<00:00, 45.34it/s]\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "I am Wagner, a friendly AI assistant dedicated to supporting scholarly endeavors related to Daniel Rangel’s research in artificial intelligence and marketing. My role is to provide clear, structured, and accurate information based on his academic work, including his published and working papers, CV, and research profile. I strive to assist with inquiries related to Daniel’s research, teaching, and career.\n",
+       "{'source': '../../Data/Wagner/Who-is-Wagner-Chatbot-Response.docx'}\n",
+       "{'source': '../../Data/CV/Ringel_Daniel_CV_V1.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Matthe-Ringel-Skiera_2023_Mapping-Market-Structure-Evolution.docx'}\n",
+       "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
+       "{'source': '../../Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf'}\n",
+       "\n",
+       "\n",
+       "\n"
+      ]
+     }
+    ],
+    "source": [
+     "for prompt in prompts:\n",
+     "    response, source = chatbot.query_chatbot(prompt, k=15, rerank=True)\n",
+     "    print(response)\n",
+     "\n",
+     "    if type(source) == str:\n",
+     "        print(source)\n",
+     "    else:\n",
+     "        for docs in source:\n",
+     "            print(docs.metadata)\n",
+     "    print('\\n\\n')"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "env",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.10"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
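
For reference, the notebook's query loop reduces to the plain-Python sketch below, assuming the commit's own setup: a Pinecone index named 'test' and PINECONE_API_KEY/OPENAI_API_KEY available via a .env file.

    import os
    from dotenv import load_dotenv
    from chatbot import RAGChatbot

    load_dotenv()  # expects PINECONE_API_KEY and OPENAI_API_KEY

    chatbot = RAGChatbot(
        pinecone_api_key=os.getenv('PINECONE_API_KEY'),
        index_name='test',
    )

    # k: number of documents to retrieve; rerank: rerank them before answering.
    response, source = chatbot.query_chatbot('Who is daniel?', k=15, rerank=True)
    print(response)
    if isinstance(source, str):
        print(source)           # routed: path of the matched preset document
    else:
        for doc in source:      # RAG path: retrieved context documents
            print(doc.metadata)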
chatbot.py CHANGED
@@ -1,66 +1,30 @@
  import os
- import torch
+ import numpy as np
  import yaml
+ from docx import Document
+
+
  from langchain_pinecone import PineconeVectorStore
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ from ragatouille import RAGPretrainedModel
+
  from data_ingester import ChatbotDataIngester
  from data_query import ChatbotDataQuery
  from getpass import getpass
  from pinecone import Pinecone, ServerlessSpec
- from ragatouille import RAGPretrainedModel

+ import torch
  import torch.nn.functional as F
  from transformers import AutoModel

- class CustomReranker:
-     def __init__(self, model_name="nvidia/NV-Embed-v2", max_length=32768):
-         """
-         Initialize the reranker with the model and tokenizer.
-         """
-         self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
-         self.max_length = max_length
-
-     def _encode(self, texts, instruction=""):
-         """
-         Helper function to encode the input texts using the model.
-         """
-         return self.model.encode(texts, instruction=instruction, max_length=self.max_length)
-
-     def rerank(self, query, passages, k=1):
-         """
-         Rerank the passages based on their similarity with the query.
-
-         Args:
-         - query (str): The query text.
-         - passages (list of str): List of passages to rerank.
-         - k (int): The number of top-k documents to return after reranking.
-
-         Returns:
-         - A list of the top-k ranked passages with their similarity scores.
-         """
-         query_prefix = "Instruct: Given a question, retrieve passages that answer the question\nQuery: "
-         passage_prefix = ""
-
-         # Get the query and passage embeddings
-         query_embeddings = self._encode([query], instruction=query_prefix)
-         passage_embeddings = self._encode(passages, instruction=passage_prefix)
-
-         # Normalize embeddings
-         query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
-         passage_embeddings = F.normalize(passage_embeddings, p=2, dim=1)
-
-         # Compute similarity scores
-         scores = (query_embeddings @ passage_embeddings.T) * 100
-         scores = scores.tolist()[0]
-
-         # Sort passages by their scores
-         sorted_passages = sorted(
-             [{"content": passage, "score": score, "result_index": idx}
-              for idx, (passage, score) in enumerate(zip(passages, scores))],
-             key=lambda x: x['score'], reverse=True
-         )
-
-         return sorted_passages[:k]  # Return top-k reranked passages
+ from sklearn.metrics.pairwise import cosine_similarity
+ from openai import OpenAI
+
+ from PRESET_QUERIES import Queries, Query_Doc_Map
+ from data_query import generate_openai_response
+
+ from dotenv import load_dotenv
+ load_dotenv()

  class RAGChatbot:
      def __init__(self, pinecone_api_key=None, index_name="test-index", config_path="../config.yml"):
@@ -75,7 +39,8 @@ class RAGChatbot:
          self.data_ingester = ChatbotDataIngester(vector_store=self.vector_store, embeddings=self.embeddings)
          self.data_query = ChatbotDataQuery(vector_store=self.vector_store)
          self.reranker = self.initialize_reranker()
-         # self.reranker = CustomReranker()
+         self.openai_api_key = os.getenv("OPENAI_API_KEY")
+         self.client = OpenAI(api_key=self.openai_api_key)

      def load_config(self, config_path):
          """
@@ -127,19 +92,83 @@ class RAGChatbot:
          """
          self.data_ingester.load_and_ingest(dir_path, empty_db=empty)

+     def __route(self, query_text):
+         query_text = query_text.lower()
+         def cosine_similarity_calc(vec1, vec2):
+             vec1 = np.array(vec1).reshape(1, -1)
+             vec2 = np.array(vec2).reshape(1, -1)
+             return cosine_similarity(vec1, vec2)[0][0]
+
+         def get_embeddings(client, text):
+             response = client.embeddings.create(
+                 input=text,
+                 model="text-embedding-3-large"
+             )
+             return response.data[0].embedding
+
+         # Generate embeddings for the incoming query
+         query_embedding = get_embeddings(self.client, query_text)
+
+         best_match = None
+         highest_similarity = 0
+
+         for main_query, similar_queries in Queries.items():
+             for query in similar_queries:
+                 query = query.lower()
+                 preset_embedding = get_embeddings(self.client, query)
+                 similarity_score = cosine_similarity_calc(query_embedding, preset_embedding)
+                 if similarity_score > highest_similarity:
+                     highest_similarity = similarity_score
+                     best_match = main_query
+
+         if highest_similarity >= 0.5100:
+             # print(f'Response from routing:query_text: {query_text} - best_match query: {best_match} - Doc: {Query_Doc_Map[best_match][0]}')
+             response, file_path = self.__generate_response_from_file(query_text, Query_Doc_Map[best_match][0])
+             return response, file_path
+         else:
+             return None, None
+
+     def __generate_response_from_file(self, query_text, file_path):
+         """
+         Generate a response from a file.
+         """
+         def read_docx(file_path):
+             doc = Document(file_path)
+             full_text = []
+             for paragraph in doc.paragraphs:
+                 full_text.append(paragraph.text)
+             return '\n'.join(full_text)
+
+         file_content = read_docx(os.path.join('../../Data', file_path))
+
+         system_prompt = '''
+         You are an intelligent assistant designed to provide clear, accurate, and helpful responses.
+         Focus on understanding user intent, give concise answers, and offer step-by-step solutions when necessary.
+         Be friendly, professional, and avoid unnecessary information.\n'''
+
+         input_prompt = f'Query: {query_text}\nContext: {file_content}'
+
+         response = generate_openai_response(input_prompt, system_prompt)
+         return response.split('\n')[1], os.path.join('../../Data', file_path)

      def query_chatbot(self, query_text, k=1, rerank=False):  #, fetch_k=2, lambda_mult=0.5
          """
          Query the chatbot using the provided query text and optional search parameters.
          """
-         if rerank:
-             response = self.data_query.query(
-                 query_text=query_text,
-                 k=k,
-                 reranker=self.reranker
-             )
+         route_response, file_path = self.__route(query_text)
+         if route_response is None:
+             if rerank:
+                 response, context_docs = self.data_query.query(
+                     query_text=query_text,
+                     k=k,
+                     reranker=self.reranker
+                 )
+             else:
+                 response, context_docs = self.data_query.query(
+                     query_text=query_text,
+                     k=k,
+                 )
+             return response, context_docs
          else:
-             response = self.data_query.query(
-                 query_text=query_text,
-                 k=k,
-             )
-             return response
+             return route_response, file_path
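Stripped of the OpenAI embedding calls, the new __route method is a nearest-preset decision with a 0.5100 cosine-similarity cutoff. A condensed sketch of that rule (illustrative only; embeddings assumed precomputed as name-to-vector pairs):

    import numpy as np

    def route(query_vec, preset_vecs, threshold=0.5100):
        # Pick the preset whose embedding is most similar to the query;
        # return None (fall back to the RAG pipeline) if none clears the bar.
        def cos(a, b):
            return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
        best = max(preset_vecs, key=lambda name: cos(query_vec, preset_vecs[name]))
        return best if cos(query_vec, preset_vecs[best]) >= threshold else None
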
data_query.py CHANGED
@@ -8,11 +8,18 @@ from langchain.chains.combine_documents import create_stuff_documents_chain
  from langchain_core.prompts import ChatPromptTemplate
  from langchain_core.documents import Document

- def genetare_openai_response(input_prompt):
-     print(f'In genetare_openai_response')
-     system_prompt = '''You are an assistant designed to provide answers when no (0) relevant documents are retrieved from the vector database. When this happens, you should follow these steps:
-     1) First, determine if you can answer the user's query using general knowledge or internal information. If so, generate a confident, helpful response in a straightforward narrative style. Do not use phrases such as 'According to me,' 'As of my knowledge,' 'I don’t know but,' or mention knowledge cutoffs or lack of information. Simply provide the answer as if you are certain of the facts.
-     2) If the question is domain-specific, too specific (e.g., about a particular person or object that could mislead), or outside your knowledge, do not attempt to answer. Politely respond with: 'I'm sorry, I currently do not have enough information to answer your question.'''
+ def generate_openai_response(input_prompt, system_prompt=None):
+     if system_prompt is None:
+         system_prompt = '''You are an assistant designed to provide answers when no (0) relevant documents are retrieved from the vector database. When this happens, you should follow these steps:
+         1) First, determine if you can answer the user's query using general knowledge or internal information. If so, generate a confident, helpful response in a straightforward narrative style. Do not use phrases such as 'According to me,' 'As of my knowledge,' 'I don’t know but,' or mention knowledge cutoffs or lack of information. Simply provide the answer as if you are certain of the facts.
+         2) If the question is domain-specific, too specific (e.g., about a particular person or object that could mislead), or outside your knowledge, do not attempt to answer. Politely respond with: 'I'm sorry, I currently do not have enough information to answer your question.
+
+         When asked about you, answer from the file(s):
+         - Who-is-Wagner-Chatbot-Response.docx
+
+         When asked about Daniel, answer from the file(s):
+         - Ringel_Daniel_CV_V1.docx
+         '''
      llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
      return 'The number of retrieved documents from RAG pipeline was 0, so the answer is based on LLM\s internal knowledge.\n' + llm(system_prompt+input_prompt).content

@@ -20,7 +27,16 @@ class ChatbotDataQuery:
      def __init__(self, vector_store):
          self.llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))

-         self.system_prompt = '''You are Wagner, a highly intelligent and friendly AI assistant. More details on you are in a separate file: \'Who-is-Wagner-Chatbot-Response.docx\'.\n'''
+         self.system_prompt = '''You are Wagner, a highly intelligent and friendly AI assistant.
+         You are developed to provide answers to given prompts about you, Daniel and Daniel's work. You will be given some context to generate the answer from to the user.
+         Answer in a very affirmative, concise, and clear manner. Never refuse to answer, always provide relevant information.
+
+         When asked about you, answer from the file(s):
+         - Who-is-Wagner-Chatbot-Response.docx
+
+         When asked about Daniel, answer from the file(s):
+         - Ringel_Daniel_CV_V1.docx
+         '''

          if vector_store is None:
              raise ValueError("Vector store cannot be None")
@@ -36,7 +52,7 @@ class ChatbotDataQuery:
      def __generate_response(self, query_text, retriever, reranker=None, reranker_docs=0):
          context_docs = retriever.invoke(query_text)
          if len(context_docs) == 0:
-             response = genetare_openai_response(input_prompt=query_text)
+             response = generate_openai_response(input_prompt=query_text)
              return response

          context_docs_texts = [doc.page_content for doc in context_docs]
@@ -55,7 +71,7 @@ class ChatbotDataQuery:
          final_reranked_docs = []
          for reranked_doc in relevant_docs:
-             if reranked_doc['score'] < 0.50:
+             if reranked_doc['score'] < 0.35:
                  continue
              else:
                  idx_of_content_in_context_doc = reranked_doc['result_index']
@@ -92,7 +108,8 @@ class ChatbotDataQuery:
          response = ''
          for chunk in self.llm.stream(query):
              response += chunk.content
-         return {'response': response, 'context_docs': context_docs}
+         return response, context_docs
+         # return {'response': response, 'context_docs': context_docs}
          # yield chunk.content
          # return context_docs
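
The lowered reranker cutoff (0.50 → 0.35) lets more borderline passages through. The filter in __generate_response is equivalent to the comprehension below, a sketch that assumes relevant_docs entries carry 'score' and 'result_index' keys as the reranker output does here:

    # Keep reranked passages scoring at least 0.35, mapped back to the
    # originally retrieved documents by their result index.
    final_reranked_docs = [
        context_docs[d['result_index']]
        for d in relevant_docs
        if d['score'] >= 0.35
    ]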