SalehAhmad commited on
Commit
212894b
·
verified ·
1 Parent(s): 939d2ad

Upload 22 files

Browse files
Files changed (23) hide show
  1. .gitattributes +7 -0
  2. Code/Chatbot/PRESET_QUERIES.py +24 -0
  3. Code/Chatbot/__pycache__/PRESET_QUERIES.cpython-311.pyc +0 -0
  4. Code/Chatbot/__pycache__/chatbot.cpython-311.pyc +0 -0
  5. Code/Chatbot/__pycache__/chatbot.cpython-312.pyc +0 -0
  6. Code/Chatbot/__pycache__/data_ingester.cpython-311.pyc +0 -0
  7. Code/Chatbot/__pycache__/data_loader.cpython-311.pyc +0 -0
  8. Code/Chatbot/__pycache__/data_query.cpython-311.pyc +0 -0
  9. Code/Chatbot/app.ipynb +224 -0
  10. Code/Chatbot/app.py +46 -0
  11. Code/Chatbot/chatbot.py +174 -0
  12. Code/Chatbot/data_ingester.py +94 -0
  13. Code/Chatbot/data_loader.py +53 -0
  14. Code/Chatbot/data_query.py +125 -0
  15. Data/3 Published Papers/for_Embedding/Matthe-Ringel-Skiera_2023_Mapping-Market-Structure-Evolution.docx +3 -0
  16. Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx +3 -0
  17. Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx +3 -0
  18. Data/3 Published Papers/for_Website/Matthe-Ringel-Skiera-2023-Mapping-Market-Structure-Evolution-MktSci.pdf +3 -0
  19. Data/3 Published Papers/for_Website/Ringel-2023-Multimarket-Membership-Mapping-JMR.pdf +3 -0
  20. Data/3 Published Papers/for_Website/Ringel-Skiera-2016-Visualizing-Competition-Between-1000-Products-MktSci.pdf +3 -0
  21. Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf +3 -0
  22. Data/CV/Ringel_Daniel_CV_V1.docx +0 -0
  23. Data/Wagner/Who-is-Wagner-Chatbot-Response.docx +0 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Embedding/Matthe-Ringel-Skiera_2023_Mapping-Market-Structure-Evolution.docx filter=lfs diff=lfs merge=lfs -text
37
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx filter=lfs diff=lfs merge=lfs -text
38
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx filter=lfs diff=lfs merge=lfs -text
39
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Website/Matthe-Ringel-Skiera-2023-Mapping-Market-Structure-Evolution-MktSci.pdf filter=lfs diff=lfs merge=lfs -text
40
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Website/Ringel-2023-Multimarket-Membership-Mapping-JMR.pdf filter=lfs diff=lfs merge=lfs -text
41
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Website/Ringel-Skiera-2016-Visualizing-Competition-Between-1000-Products-MktSci.pdf filter=lfs diff=lfs merge=lfs -text
42
+ Data/5[[:space:]]Working[[:space:]]Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf filter=lfs diff=lfs merge=lfs -text
Code/Chatbot/PRESET_QUERIES.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Queries = {
2
+ 'Who are you?': [
3
+ 'Who is Wagner Chatbot?',
4
+ 'Tell me about Wagner?',
5
+ 'Who is Wagner AI?'
6
+ ],
7
+
8
+ 'who is Daniel Ringel?': [
9
+ 'Tell me about Daniel Ringel',
10
+ 'Can you show Daniel Ringel\'s CV?',
11
+ 'Who is Daniel R.?'
12
+ ],
13
+ }
14
+
15
+ # Each query will map to a document
16
+ Query_Doc_Map = {
17
+ 'Who are you?': [
18
+ 'Who-is-Wagner-Chatbot-Response.docx'
19
+ ],
20
+
21
+ 'who is Daniel Ringel?': [
22
+ 'CV/Ringel_Daniel_CV_V1.docx'
23
+ ]
24
+ }
Code/Chatbot/__pycache__/PRESET_QUERIES.cpython-311.pyc ADDED
Binary file (566 Bytes). View file
 
Code/Chatbot/__pycache__/chatbot.cpython-311.pyc ADDED
Binary file (10.2 kB). View file
 
Code/Chatbot/__pycache__/chatbot.cpython-312.pyc ADDED
Binary file (7.79 kB). View file
 
Code/Chatbot/__pycache__/data_ingester.cpython-311.pyc ADDED
Binary file (5.75 kB). View file
 
Code/Chatbot/__pycache__/data_loader.cpython-311.pyc ADDED
Binary file (3.65 kB). View file
 
Code/Chatbot/__pycache__/data_query.cpython-311.pyc ADDED
Binary file (8.03 kB). View file
 
Code/Chatbot/app.ipynb ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Importing All Required Packages and Libraries"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 7,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from chatbot import RAGChatbot\n",
17
+ "import os\n",
18
+ "from dotenv import load_dotenv\n",
19
+ "load_dotenv()\n",
20
+ "\n",
21
+ "import warnings\n",
22
+ "warnings.filterwarnings(\"ignore\")"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "markdown",
27
+ "metadata": {},
28
+ "source": [
29
+ "# Initializing the RAG chatbot"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 8,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "chatbot = RAGChatbot(\n",
39
+ " pinecone_api_key=os.getenv('PINECONE_API_KEY'),\n",
40
+ " index_name='test',\n",
41
+ ")"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "markdown",
46
+ "metadata": {},
47
+ "source": [
48
+ "# Below cell has code to ingest data into the pinecone vector database\n",
49
+ "## Note: Only uncomment and run when you have to really ingest the data from the Data directory (which has all the relevant files)"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 9,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "# chatbot.ingest_data('../../Data', empty=True)"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "markdown",
63
+ "metadata": {},
64
+ "source": [
65
+ "# Below cell is used to query the RAG chatbot\n",
66
+ "## You can test the responses for different values of \n",
67
+ "- k: The number of documents to retrieve from the vector database. You can input any natural number >= 1\n",
68
+ "- rerank: Whether to rerank the retrieved documents or not. Possible inputs are true and false"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 10,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "# response = chatbot.query_chatbot(input(), k=15, rerank=True) #the input() will ask you to enter the query\n",
78
+ "# print(response['response'])\n",
79
+ "\n",
80
+ "# reranked_docs = response['context_docs']\n",
81
+ "# for docs in reranked_docs:\n",
82
+ "# print(docs.metadata)"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 11,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": [
91
+ "prompts = [\n",
92
+ " 'Who is daniel?',\n",
93
+ " 'Who are you?',\n",
94
+ " 'What is your name?',\n",
95
+ " 'What is your job?',\n",
96
+ "]"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 12,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "name": "stdout",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "Response from routing:query_text: who is daniel? - best_match query: who is Daniel Ringel? - Doc: CV/Ringel_Daniel_CV_V1.docx\n",
109
+ "Daniel M. Ringel is an Assistant Professor of Marketing for Data Science and AI at the Kenan-Flagler Business School, University of North Carolina at Chapel Hill. His research focuses on integrating marketing theory with artificial intelligence and machine learning to develop frameworks and tools that benefit both academic discussions and practical applications. Daniel's work includes mapping market structure dynamics, understanding competitive relationships using AI, and advancing data-driven marketing strategies. He has received numerous awards for his contributions to the field and has been actively involved in teaching, research, and industry engagement related to AI in business.\n",
110
+ "CV/Ringel_Daniel_CV_V1.docx\n",
111
+ "\n",
112
+ "\n",
113
+ "\n"
114
+ ]
115
+ },
116
+ {
117
+ "name": "stderr",
118
+ "output_type": "stream",
119
+ "text": [
120
+ "100%|██████████| 1/1 [00:00<00:00, 23.67it/s]\n"
121
+ ]
122
+ },
123
+ {
124
+ "name": "stdout",
125
+ "output_type": "stream",
126
+ "text": [
127
+ "Hello! I am Wagner, an AI assistant named after the character from Goethe's Faust. In the story, Wagner is a loyal assistant to Faust, sharing in his intellectual pursuits on a smaller scale. Similarly, I am dedicated to scholarly endeavors, specifically assisting with Daniel Rangel's research in artificial intelligence and marketing. My role is to provide clear, structured, and accurate information related to Daniel's academic work, including his research, teaching, and career.\n",
128
+ "{'source': '../../Data/Wagner/Who-is-Wagner-Chatbot-Response.docx'}\n",
129
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
130
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
131
+ "{'source': '../../Data/3 Published Papers/for_Website/Ringel-Skiera-2016-Visualizing-Competition-Between-1000-Products-MktSci.pdf'}\n",
132
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
133
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
134
+ "{'source': '../../Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf'}\n",
135
+ "\n",
136
+ "\n",
137
+ "\n"
138
+ ]
139
+ },
140
+ {
141
+ "name": "stderr",
142
+ "output_type": "stream",
143
+ "text": [
144
+ "100%|██████████| 1/1 [00:00<00:00, 40.62it/s]\n"
145
+ ]
146
+ },
147
+ {
148
+ "name": "stdout",
149
+ "output_type": "stream",
150
+ "text": [
151
+ "Hello! My name is Wagner. I'm an assistant named after the character from Goethe’s Faust, dedicated to assisting with inquiries related to Daniel Rangel’s research in artificial intelligence and marketing.\n",
152
+ "{'source': '../../Data/Wagner/Who-is-Wagner-Chatbot-Response.docx'}\n",
153
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
154
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
155
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
156
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
157
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
158
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
159
+ "\n",
160
+ "\n",
161
+ "\n"
162
+ ]
163
+ },
164
+ {
165
+ "name": "stderr",
166
+ "output_type": "stream",
167
+ "text": [
168
+ "100%|██████████| 1/1 [00:00<00:00, 45.34it/s]\n"
169
+ ]
170
+ },
171
+ {
172
+ "name": "stdout",
173
+ "output_type": "stream",
174
+ "text": [
175
+ "I am Wagner, a friendly AI assistant dedicated to supporting scholarly endeavors related to Daniel Rangel’s research in artificial intelligence and marketing. My role is to provide clear, structured, and accurate information based on his academic work, including his published and working papers, CV, and research profile. I strive to assist with inquiries related to Daniel’s research, teaching, and career.\n",
176
+ "{'source': '../../Data/Wagner/Who-is-Wagner-Chatbot-Response.docx'}\n",
177
+ "{'source': '../../Data/CV/Ringel_Daniel_CV_V1.docx'}\n",
178
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
179
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
180
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Matthe-Ringel-Skiera_2023_Mapping-Market-Structure-Evolution.docx'}\n",
181
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
182
+ "{'source': '../../Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf'}\n",
183
+ "\n",
184
+ "\n",
185
+ "\n"
186
+ ]
187
+ }
188
+ ],
189
+ "source": [
190
+ "for prompt in prompts:\n",
191
+ " response, source = chatbot.query_chatbot(prompt, k=15, rerank=True)\n",
192
+ " print(response)\n",
193
+ "\n",
194
+ " if type(source) == str:\n",
195
+ " print(source)\n",
196
+ " else:\n",
197
+ " for docs in source:\n",
198
+ " print(docs.metadata)\n",
199
+ " print('\\n\\n')"
200
+ ]
201
+ }
202
+ ],
203
+ "metadata": {
204
+ "kernelspec": {
205
+ "display_name": "env",
206
+ "language": "python",
207
+ "name": "python3"
208
+ },
209
+ "language_info": {
210
+ "codemirror_mode": {
211
+ "name": "ipython",
212
+ "version": 3
213
+ },
214
+ "file_extension": ".py",
215
+ "mimetype": "text/x-python",
216
+ "name": "python",
217
+ "nbconvert_exporter": "python",
218
+ "pygments_lexer": "ipython3",
219
+ "version": "3.11.10"
220
+ }
221
+ },
222
+ "nbformat": 4,
223
+ "nbformat_minor": 2
224
+ }
Code/Chatbot/app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from chatbot import RAGChatbot
3
+ import os
4
+ from dotenv import load_dotenv
5
+ import warnings
6
+ warnings.filterwarnings("ignore")
7
+
8
+ # Load environment variables from .env file
9
+ load_dotenv()
10
+
11
+ @st.cache_resource
12
+ def initialize_chatbot():
13
+ # Initialize the chatbot with necessary API keys and settings
14
+ chatbot = RAGChatbot(
15
+ pinecone_api_key=os.getenv('PINECONE_API_KEY'),
16
+ index_name='test',
17
+ )
18
+ return chatbot
19
+
20
+ chatbot = initialize_chatbot()
21
+
22
+ # Streamlit app layout
23
+ st.title("RAG Chatbot")
24
+ st.write("Ask the chatbot anything and get real-time responses.")
25
+
26
+ # Input prompt from the user
27
+ prompt = st.text_input("Enter your prompt:", "")
28
+
29
+ if prompt:
30
+ # Query the chatbot and get the response
31
+ response, sources = chatbot.query_chatbot(prompt, k=15, rerank=True)
32
+
33
+ # Display LLM response
34
+ st.subheader("LLM Response")
35
+ st.write(response)
36
+
37
+ # Display reranked relevant documents with metadata
38
+ st.subheader("Relevant Documents")
39
+ if type(sources) != str:
40
+ docs = sources
41
+ for i, doc in enumerate(docs):
42
+ st.write(f"**Document {i+1} Metadata:**")
43
+ st.json(doc.metadata) # Display metadata in JSON format for better structure
44
+ elif type(sources) == str:
45
+ st.write(f"**Document {1} Metadata:**")
46
+ st.json({"source": sources})
Code/Chatbot/chatbot.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import yaml
4
+ from docx import Document
5
+
6
+
7
+ from langchain_pinecone import PineconeVectorStore
8
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
9
+ from ragatouille import RAGPretrainedModel
10
+
11
+ from data_ingester import ChatbotDataIngester
12
+ from data_query import ChatbotDataQuery
13
+ from getpass import getpass
14
+ from pinecone import Pinecone, ServerlessSpec
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from transformers import AutoModel
19
+
20
+ from sklearn.metrics.pairwise import cosine_similarity
21
+ from openai import OpenAI
22
+
23
+ from PRESET_QUERIES import Queries, Query_Doc_Map
24
+ from data_query import generate_openai_response
25
+
26
+ from dotenv import load_dotenv
27
+ load_dotenv()
28
+
29
+ class RAGChatbot:
30
+ def __init__(self, pinecone_api_key=None, index_name="test-index", config_path="../config.yml"):
31
+ """
32
+ Initialize the RAGChatbot. Handles embeddings, vector store, data ingestion, and query.
33
+ """
34
+ self.pinecone_api_key = pinecone_api_key or os.getenv("PINECONE_API_KEY")# or getpass("Enter your Pinecone API key: ")
35
+ self.index_name = index_name
36
+ self.embeddings = self.initialize_embeddings()
37
+ self.dimensions = len(self.embeddings.embed_query("Hello World!"))
38
+ self.vector_store = self.initialize_vector_store()
39
+ self.data_ingester = ChatbotDataIngester(vector_store=self.vector_store, embeddings=self.embeddings)
40
+ self.data_query = ChatbotDataQuery(vector_store=self.vector_store)
41
+ self.reranker = self.initialize_reranker()
42
+ self.openai_api_key = os.getenv("OPENAI_API_KEY")
43
+ self.client = OpenAI(api_key=self.openai_api_key)
44
+
45
+ def load_config(self, config_path):
46
+ """
47
+ Load the configuration file (config.yml).
48
+ """
49
+ with open(config_path, 'r') as file:
50
+ return yaml.safe_load(file)
51
+
52
+ def initialize_embeddings(self):
53
+ """
54
+ Initialize the embedding model based on the config file.
55
+ """
56
+ model_name = "BAAI/bge-large-en-v1.5"
57
+ model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
58
+ encode_kwargs = {"normalize_embeddings": True}
59
+ hf = HuggingFaceBgeEmbeddings(
60
+ model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
61
+ return hf
62
+
63
+ def initialize_reranker(self):
64
+ """
65
+ Initialize the reranker
66
+ """
67
+ return RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
68
+
69
+ def initialize_vector_store(self):
70
+ """
71
+ Initialize Pinecone vector store.
72
+ """
73
+ pc = Pinecone(api_key=self.pinecone_api_key)
74
+ existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
75
+
76
+ if self.index_name not in existing_indexes:
77
+ pc.create_index(
78
+ name=self.index_name,
79
+ dimension=self.dimensions,
80
+ metric="cosine",
81
+ spec=ServerlessSpec(cloud="aws", region="us-east-1"),
82
+ )
83
+ while not pc.describe_index(self.index_name).status["ready"]:
84
+ import time
85
+ time.sleep(1)
86
+
87
+ return PineconeVectorStore(index=pc.Index(self.index_name), embedding=self.embeddings)
88
+
89
+ def ingest_data(self, dir_path, empty=False):
90
+ """
91
+ Ingest data from a directory using the ChatbotDataIngester.
92
+ """
93
+ self.data_ingester.load_and_ingest(dir_path, empty_db=empty)
94
+
95
+ def __route(self, query_text):
96
+ query_text = query_text.lower()
97
+ def cosine_similarity_calc(vec1, vec2):
98
+ vec1 = np.array(vec1).reshape(1, -1)
99
+ vec2 = np.array(vec2).reshape(1, -1)
100
+ return cosine_similarity(vec1, vec2)[0][0]
101
+
102
+ def get_embeddings(client, text):
103
+ response = client.embeddings.create(
104
+ input=text,
105
+ model="text-embedding-3-large"
106
+ )
107
+ return response.data[0].embedding
108
+
109
+ # Generate embeddings for the incoming query
110
+ query_embedding = get_embeddings(self.client, query_text)
111
+
112
+ best_match = None
113
+ highest_similarity = 0
114
+
115
+ for main_query, similar_queries in Queries.items():
116
+ for query in similar_queries:
117
+ query = query.lower()
118
+ preset_embedding = get_embeddings(self.client, query)
119
+ similarity_score = cosine_similarity_calc(query_embedding, preset_embedding)
120
+ if similarity_score > highest_similarity:
121
+ highest_similarity = similarity_score
122
+ best_match = main_query
123
+
124
+ if highest_similarity >= 0.5100:
125
+ # print(f'Response from routing:query_text: {query_text} - best_match query: {best_match} - Doc: {Query_Doc_Map[best_match][0]}')
126
+ response, file_path = self.__generate_response_from_file(query_text, Query_Doc_Map[best_match][0])
127
+ return response, file_path
128
+ else:
129
+ return None, None
130
+
131
+ def __generate_response_from_file(self, query_text, file_path):
132
+ """
133
+ Generate response from a file.
134
+ """
135
+ def read_docx(file_path):
136
+ doc = Document(file_path)
137
+ full_text = []
138
+ for paragraph in doc.paragraphs:
139
+ full_text.append(paragraph.text)
140
+ return '\n'.join(full_text)
141
+
142
+ file_content = read_docx(os.path.join('../../Data', file_path))
143
+
144
+ system_prompt = '''
145
+ You are an intelligent assistant designed to provide clear, accurate, and helpful responses.
146
+ Focus on understanding user intent, give concise answers, and offer step-by-step solutions when necessary.
147
+ Be friendly, professional, and avoid unnecessary information.\n'''
148
+
149
+ input_prompt = f'Query: {query_text}\nContext: {file_content}'
150
+
151
+ response = generate_openai_response(input_prompt, system_prompt)
152
+ return response.split('\n')[1], os.path.join('../../Data', file_path)
153
+
154
+ def query_chatbot(self, query_text, k=1, rerank=False): #, fetch_k=2, lambda_mult=0.5
155
+ """
156
+ Query the chatbot using the provided query text and optional search parameters.
157
+ """
158
+
159
+ route_response, file_path = self.__route(query_text)
160
+ if route_response == None:
161
+ if rerank:
162
+ response, context_docs = self.data_query.query(
163
+ query_text=query_text,
164
+ k=k,
165
+ reranker=self.reranker
166
+ )
167
+ else:
168
+ response = self.data_query.query(
169
+ query_text=query_text,
170
+ k=k,
171
+ )
172
+ return response, context_docs
173
+ else:
174
+ return route_response, file_path
Code/Chatbot/data_ingester.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from uuid import uuid4
3
+ from langchain_core.documents import Document
4
+ from data_loader import ChatbotDataLoader
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from langchain_text_splitters import SpacyTextSplitter
7
+
8
+
9
+ class ChatbotDataIngester:
10
+ def __init__(self, vector_store, embeddings):
11
+ """
12
+ Initialize the ChatbotDataIngester with an external vector store and embeddings model.
13
+ Raise an exception if either of them is None.
14
+ """
15
+ if vector_store in [None, '']:
16
+ raise ValueError("Vector store cannot be None/empty")
17
+ if embeddings in [None, '']:
18
+ raise ValueError("Embeddings model cannot be None/empty")
19
+
20
+ self.loader = ChatbotDataLoader()
21
+ self.vector_store = vector_store
22
+ self.embeddings = embeddings
23
+ self.text_splitter = RecursiveCharacterTextSplitter(
24
+ chunk_size=1000,
25
+ chunk_overlap=200,
26
+ length_function=len,
27
+ )
28
+
29
+ def embed_content(self, content):
30
+ """
31
+ Embed the text content using the provided embedding model.
32
+ """
33
+ return self.embeddings.embed_query(content)
34
+
35
+ def load_and_ingest(self, dir_path, empty_db=False):
36
+ """
37
+ Load documents from the directory, generate embeddings, and ingest them into the vector store.
38
+
39
+ :param dir_path: Directory path to load the documents from.
40
+ :param empty_db: If True, the vector store will be emptied before adding new documents.
41
+ """
42
+ # Optionally clear the vector store
43
+ if empty_db:
44
+ self.clear_vector_store()
45
+
46
+ # Load files from the directory
47
+ file_contents = self.loader.load_directory(dir_path)
48
+
49
+ # Create documents from the file contents
50
+ documents = [
51
+ Document(page_content=content, metadata={"source": file_path})
52
+ for file_path, content in file_contents.items()
53
+ ]
54
+
55
+ print(f'{len(documents)} documents loaded from the database')
56
+
57
+ split_docs = self.text_splitter.split_documents(documents)
58
+
59
+ # Generate UUIDs for documents
60
+ uuids = [str(uuid4()) for _ in range(len(split_docs))]
61
+
62
+ print(f'{len(documents)} documents splitted into {len(split_docs)} chunks')
63
+
64
+ # Ingest documents into the vector store
65
+ self.ingest_to_vector_store(split_docs, uuids)
66
+
67
+ def clear_vector_store(self):
68
+ """
69
+ Clear all documents in the vector store.
70
+ """
71
+ try:
72
+ current_index = self.vector_store.get_pinecone_index('test')
73
+ check = False
74
+ for ids in current_index.list(namespace='default'):
75
+ check = True
76
+ break
77
+ if not check:
78
+ print("The vector store is already empty.")
79
+ return
80
+ else:
81
+ self.vector_store.delete(delete_all=True)
82
+ print("Cleared the vector store.")
83
+ except Exception as e:
84
+ print(f"Failed to clear the vector store: {str(e)}")
85
+
86
+ def ingest_to_vector_store(self, documents, uuids):
87
+ """
88
+ Ingest the documents into the vector store.
89
+ """
90
+ try:
91
+ self.vector_store.add_documents(documents, ids=uuids)
92
+ print(f'Ingested {len(documents)} chunks to the vector store')
93
+ except Exception as e:
94
+ print(f'Failed to ingest documents: {str(e)}')
Code/Chatbot/data_loader.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from docx import Document
3
+ import PyPDF2
4
+
5
+ class ChatbotDataLoader:
6
+ def __init__(self):
7
+ pass
8
+
9
+ def read_docx(self, file_path):
10
+ """
11
+ Reads content from a .docx file.
12
+ """
13
+ doc = Document(file_path)
14
+ content = "\n".join([para.text for para in doc.paragraphs])
15
+ return content
16
+
17
+ def read_pdf(self, file_path):
18
+ """
19
+ Reads content from a .pdf file.
20
+ """
21
+ with open(file_path, "rb") as file:
22
+ reader = PyPDF2.PdfReader(file)
23
+ content = ""
24
+ for page in range(len(reader.pages)):
25
+ content += reader.pages[page].extract_text()
26
+ return content
27
+
28
+ def load_file(self, file_path):
29
+ """
30
+ Reads content from a .docx or .pdf file based on the file extension.
31
+ """
32
+ if file_path.endswith(".docx"):
33
+ return self.read_docx(file_path)
34
+ elif file_path.endswith(".pdf"):
35
+ return self.read_pdf(file_path)
36
+ else:
37
+ raise ValueError(f"Unsupported file type: {file_path}")
38
+
39
+ def load_directory(self, dir_path):
40
+ """
41
+ Iterates through the directory, loads all .docx and .pdf files, and returns their content.
42
+ """
43
+ file_contents = {}
44
+ for root, _, files in os.walk(dir_path):
45
+ for file in files:
46
+ file_path = os.path.join(root, file)
47
+ if file.endswith((".docx", ".pdf")):
48
+ try:
49
+ content = self.load_file(file_path)
50
+ file_contents[file_path] = content
51
+ except Exception as e:
52
+ print(f"Failed to load {file_path}: {str(e)}")
53
+ return file_contents
Code/Chatbot/data_query.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import getpass
2
+ import os
3
+ from langchain_core.runnables import RunnablePassthrough
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain.chains import create_retrieval_chain
7
+ from langchain.chains.combine_documents import create_stuff_documents_chain
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+ from langchain_core.documents import Document
10
+
11
+ def generate_openai_response(input_prompt, system_prompt=None):
12
+ if system_prompt is None:
13
+ system_prompt = '''You are an assistant designed to provide answers when no (0) relevant documents are retrieved from the vector database. When this happens, you should follow these steps:
14
+ 1) First, determine if you can answer the user's query using general knowledge or internal information. If so, generate a confident, helpful response in a straightforward narrative style. Do not use phrases such as 'According to me,' 'As of my knowledge,' 'I don’t know but,' or mention knowledge cutoffs or lack of information. Simply provide the answer as if you are certain of the facts.
15
+ 2) If the question is domain-specific, too specific (e.g., about a particular person or object that could mislead), or outside your knowledge, do not attempt to answer. Politely respond with: 'I'm sorry, I currently do not have enough information to answer your question.
16
+
17
+ When asked about you, answer from the file(s):
18
+ - Who-is-Wagner-Chatbot-Response.docx
19
+
20
+ When asked about Daniel, answer from the file(s):
21
+ - Ringel_Daniel_CV_V1.docx
22
+ '''
23
+ llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
24
+ return 'The number of retrieved documents from RAG pipeline was 0, so the answer is based on LLM\s internal knowledge.\n' + llm(system_prompt+input_prompt).content
25
+
26
+ class ChatbotDataQuery:
27
+ def __init__(self, vector_store):
28
+ self.llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
29
+
30
+ self.system_prompt = '''You are Wagner, a highly intelligent and friendly AI assistant.
31
+ You are developed to provide answers to given prompts about you, Daniel and Daniel's work. You will be given some context to generate the answer from to the user.
32
+ Answer in a very affirmative, concise, and clear manner. Never refuse to answer, always provide relevant information.
33
+
34
+ When asked about you, answer from the file(s):
35
+ - Who-is-Wagner-Chatbot-Response.docx
36
+
37
+ When asked about Daniel, answer from the file(s):
38
+ - Ringel_Daniel_CV_V1.docx
39
+ '''
40
+
41
+ if vector_store is None:
42
+ raise ValueError("Vector store cannot be None")
43
+ else:
44
+ self.vector_store = vector_store
45
+
46
+ def initialize_reranker(self):
47
+ """
48
+ Initialize the custom reranker.
49
+ """
50
+ return CustomReranker()
51
+
52
+ def __generate_response(self, query_text, retriever, reranker=None, reranker_docs=0):
53
+ context_docs = retriever.invoke(query_text)
54
+ if len(context_docs) == 0:
55
+ response = generate_openai_response(input_prompt=query_text)
56
+ return response
57
+
58
+ context_docs_texts = [doc.page_content for doc in context_docs]
59
+
60
+ if reranker is not None and reranker_docs > 0:
61
+ # Use the custom reranker to rerank the context_docs
62
+ relevant_docs = reranker.rerank(query_text, context_docs_texts, k=reranker_docs)
63
+
64
+ All_Scores = [doc['score'] for doc in relevant_docs]
65
+ Min = min(All_Scores)
66
+ Max = max(All_Scores)
67
+ Normalized_Scores = [(doc['score'] - Min) / (Max - Min) for doc in relevant_docs]
68
+
69
+ for idx,doc in enumerate(relevant_docs):
70
+ doc['score'] = Normalized_Scores[idx]
71
+
72
+ final_reranked_docs = []
73
+ for reranked_doc in relevant_docs:
74
+ if reranked_doc['score'] < 0.35:
75
+ continue
76
+ else:
77
+ idx_of_content_in_context_doc = reranked_doc['result_index']
78
+ meta_data = context_docs[idx_of_content_in_context_doc].metadata
79
+ final_reranked_docs.append(Document(page_content=reranked_doc['content'], metadata=meta_data))
80
+
81
+ context_docs = final_reranked_docs
82
+
83
+ prompt = ChatPromptTemplate.from_template(
84
+ "You are a helpful assistant that only answers questions about the context. "
85
+ "You try your best to extract the relevant answers from the context. "
86
+ "The context is:\n\n{context}\n\n"
87
+ "Question: {question}\n"
88
+ "Helpful Answer:"
89
+ )
90
+
91
+ # print(f'---\nThe Retrieved Documents are:')
92
+ # for idx, doc in enumerate(context_docs):
93
+ # print(idx, '-', doc.metadata)
94
+ # print('---\n\n')
95
+
96
+ chain = create_stuff_documents_chain(
97
+ llm=self.llm,
98
+ prompt=prompt,
99
+ document_variable_name="context",
100
+ )
101
+
102
+ context = '\n\n'.join([doc.page_content for doc in context_docs])
103
+ query = [
104
+ ("system", f"{self.system_prompt}"),
105
+ ("human", f"context: {context}\nInput: {query_text}"),
106
+ ]
107
+
108
+ response = ''
109
+ for chunk in self.llm.stream(query):
110
+ response += chunk.content
111
+ return response, context_docs
112
+ # return {'response': response, 'context_docs': context_docs}
113
+ # yield chunk.content
114
+ # return context_docs
115
+
116
+ def query(self, query_text, k=1, reranker=None):
117
+ retriever = self.vector_store.as_retriever(
118
+ search_kwargs={"k": k},
119
+ search_type="similarity",
120
+ )
121
+ try:
122
+ return self.__generate_response(query_text=query_text, retriever=retriever, reranker=reranker, reranker_docs=k//2)
123
+ except Exception as e:
124
+ print(f"Failed to retrieve documents: {str(e)}")
125
+ return None
Data/3 Published Papers/for_Embedding/Matthe-Ringel-Skiera_2023_Mapping-Market-Structure-Evolution.docx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c878e169ad075e3a128d174ddad66e08ac1ec9078977b6373aff1ecc1e4313a
3
+ size 2852959
Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1b86cb993925516abb35c5d145b654be6fa86c46c3e6da88283b20f0960e8ff
3
+ size 18283400
Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfb8bef2648e45dc910c2c50d0d57e44af1146c16040b21f967bb40b696670c9
3
+ size 4679800
Data/3 Published Papers/for_Website/Matthe-Ringel-Skiera-2023-Mapping-Market-Structure-Evolution-MktSci.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce62b17d1c7173289c4b7fba884be8b98cb2bf184a8ce9d62cc22cf6aea4482f
3
+ size 2941458
Data/3 Published Papers/for_Website/Ringel-2023-Multimarket-Membership-Mapping-JMR.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2d9a0ca2e8a61ed9c11f84ae14ffbb3e3d2eb4bf120bf1dafc8501bd830fd23
3
+ size 3394926
Data/3 Published Papers/for_Website/Ringel-Skiera-2016-Visualizing-Competition-Between-1000-Products-MktSci.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6efcc41eebe77b0dfa86d3f5bf6dab466a89804a74569d184e9e8618cc1039f
3
+ size 1815358
Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb49ba0a7e2a8b04238543b492a21664f315b18df9dbc58da4f06cdb029f29f0
3
+ size 2479623
Data/CV/Ringel_Daniel_CV_V1.docx ADDED
Binary file (59.3 kB). View file
 
Data/Wagner/Who-is-Wagner-Chatbot-Response.docx ADDED
Binary file (13.4 kB). View file