SalehAhmad commited on
Commit
212894b
·
verified ·
1 Parent(s): 939d2ad

Upload 22 files

Browse files
Files changed (23) hide show
  1. .gitattributes +7 -0
  2. Code/Chatbot/PRESET_QUERIES.py +24 -0
  3. Code/Chatbot/__pycache__/PRESET_QUERIES.cpython-311.pyc +0 -0
  4. Code/Chatbot/__pycache__/chatbot.cpython-311.pyc +0 -0
  5. Code/Chatbot/__pycache__/chatbot.cpython-312.pyc +0 -0
  6. Code/Chatbot/__pycache__/data_ingester.cpython-311.pyc +0 -0
  7. Code/Chatbot/__pycache__/data_loader.cpython-311.pyc +0 -0
  8. Code/Chatbot/__pycache__/data_query.cpython-311.pyc +0 -0
  9. Code/Chatbot/app.ipynb +224 -0
  10. Code/Chatbot/app.py +46 -0
  11. Code/Chatbot/chatbot.py +174 -0
  12. Code/Chatbot/data_ingester.py +94 -0
  13. Code/Chatbot/data_loader.py +53 -0
  14. Code/Chatbot/data_query.py +125 -0
  15. Data/3 Published Papers/for_Embedding/Matthe-Ringel-Skiera_2023_Mapping-Market-Structure-Evolution.docx +3 -0
  16. Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx +3 -0
  17. Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx +3 -0
  18. Data/3 Published Papers/for_Website/Matthe-Ringel-Skiera-2023-Mapping-Market-Structure-Evolution-MktSci.pdf +3 -0
  19. Data/3 Published Papers/for_Website/Ringel-2023-Multimarket-Membership-Mapping-JMR.pdf +3 -0
  20. Data/3 Published Papers/for_Website/Ringel-Skiera-2016-Visualizing-Competition-Between-1000-Products-MktSci.pdf +3 -0
  21. Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf +3 -0
  22. Data/CV/Ringel_Daniel_CV_V1.docx +0 -0
  23. Data/Wagner/Who-is-Wagner-Chatbot-Response.docx +0 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Embedding/Matthe-Ringel-Skiera_2023_Mapping-Market-Structure-Evolution.docx filter=lfs diff=lfs merge=lfs -text
37
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx filter=lfs diff=lfs merge=lfs -text
38
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx filter=lfs diff=lfs merge=lfs -text
39
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Website/Matthe-Ringel-Skiera-2023-Mapping-Market-Structure-Evolution-MktSci.pdf filter=lfs diff=lfs merge=lfs -text
40
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Website/Ringel-2023-Multimarket-Membership-Mapping-JMR.pdf filter=lfs diff=lfs merge=lfs -text
41
+ Data/3[[:space:]]Published[[:space:]]Papers/for_Website/Ringel-Skiera-2016-Visualizing-Competition-Between-1000-Products-MktSci.pdf filter=lfs diff=lfs merge=lfs -text
42
+ Data/5[[:space:]]Working[[:space:]]Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf filter=lfs diff=lfs merge=lfs -text
Code/Chatbot/PRESET_QUERIES.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Queries = {
2
+ 'Who are you?': [
3
+ 'Who is Wagner Chatbot?',
4
+ 'Tell me about Wagner?',
5
+ 'Who is Wagner AI?'
6
+ ],
7
+
8
+ 'who is Daniel Ringel?': [
9
+ 'Tell me about Daniel Ringel',
10
+ 'Can you show Daniel Ringel\'s CV?',
11
+ 'Who is Daniel R.?'
12
+ ],
13
+ }
14
+
15
+ # Each query will map to a document
16
+ Query_Doc_Map = {
17
+ 'Who are you?': [
18
+ 'Who-is-Wagner-Chatbot-Response.docx'
19
+ ],
20
+
21
+ 'who is Daniel Ringel?': [
22
+ 'CV/Ringel_Daniel_CV_V1.docx'
23
+ ]
24
+ }
Code/Chatbot/__pycache__/PRESET_QUERIES.cpython-311.pyc ADDED
Binary file (566 Bytes). View file
 
Code/Chatbot/__pycache__/chatbot.cpython-311.pyc ADDED
Binary file (10.2 kB). View file
 
Code/Chatbot/__pycache__/chatbot.cpython-312.pyc ADDED
Binary file (7.79 kB). View file
 
Code/Chatbot/__pycache__/data_ingester.cpython-311.pyc ADDED
Binary file (5.75 kB). View file
 
Code/Chatbot/__pycache__/data_loader.cpython-311.pyc ADDED
Binary file (3.65 kB). View file
 
Code/Chatbot/__pycache__/data_query.cpython-311.pyc ADDED
Binary file (8.03 kB). View file
 
Code/Chatbot/app.ipynb ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Importing All Required Packages and Libraries"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 7,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from chatbot import RAGChatbot\n",
17
+ "import os\n",
18
+ "from dotenv import load_dotenv\n",
19
+ "load_dotenv()\n",
20
+ "\n",
21
+ "import warnings\n",
22
+ "warnings.filterwarnings(\"ignore\")"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "markdown",
27
+ "metadata": {},
28
+ "source": [
29
+ "# Initializing the RAG chatbot"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 8,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "chatbot = RAGChatbot(\n",
39
+ " pinecone_api_key=os.getenv('PINECONE_API_KEY'),\n",
40
+ " index_name='test',\n",
41
+ ")"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "markdown",
46
+ "metadata": {},
47
+ "source": [
48
+ "# Below cell has code to ingest data into the pinecone vector database\n",
49
+ "## Note: Only uncomment and run when you have to really ingest the data from the Data directory (which has all the relevant files)"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 9,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "# chatbot.ingest_data('../../Data', empty=True)"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "markdown",
63
+ "metadata": {},
64
+ "source": [
65
+ "# Below cell is used to query the RAG chatbot\n",
66
+ "## You can test the responses for different values of \n",
67
+ "- k: The number of documents to retrieve from the vector database. You can input any natural number >= 1\n",
68
+ "- rerank: Whether to rerank the retrieved documents or not. Possible inputs are true and false"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 10,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "# response = chatbot.query_chatbot(input(), k=15, rerank=True) #the input() will ask you to enter the query\n",
78
+ "# print(response['response'])\n",
79
+ "\n",
80
+ "# reranked_docs = response['context_docs']\n",
81
+ "# for docs in reranked_docs:\n",
82
+ "# print(docs.metadata)"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 11,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": [
91
+ "prompts = [\n",
92
+ " 'Who is daniel?',\n",
93
+ " 'Who are you?',\n",
94
+ " 'What is your name?',\n",
95
+ " 'What is your job?',\n",
96
+ "]"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 12,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "name": "stdout",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "Response from routing:query_text: who is daniel? - best_match query: who is Daniel Ringel? - Doc: CV/Ringel_Daniel_CV_V1.docx\n",
109
+ "Daniel M. Ringel is an Assistant Professor of Marketing for Data Science and AI at the Kenan-Flagler Business School, University of North Carolina at Chapel Hill. His research focuses on integrating marketing theory with artificial intelligence and machine learning to develop frameworks and tools that benefit both academic discussions and practical applications. Daniel's work includes mapping market structure dynamics, understanding competitive relationships using AI, and advancing data-driven marketing strategies. He has received numerous awards for his contributions to the field and has been actively involved in teaching, research, and industry engagement related to AI in business.\n",
110
+ "CV/Ringel_Daniel_CV_V1.docx\n",
111
+ "\n",
112
+ "\n",
113
+ "\n"
114
+ ]
115
+ },
116
+ {
117
+ "name": "stderr",
118
+ "output_type": "stream",
119
+ "text": [
120
+ "100%|██████████| 1/1 [00:00<00:00, 23.67it/s]\n"
121
+ ]
122
+ },
123
+ {
124
+ "name": "stdout",
125
+ "output_type": "stream",
126
+ "text": [
127
+ "Hello! I am Wagner, an AI assistant named after the character from Goethe's Faust. In the story, Wagner is a loyal assistant to Faust, sharing in his intellectual pursuits on a smaller scale. Similarly, I am dedicated to scholarly endeavors, specifically assisting with Daniel Rangel's research in artificial intelligence and marketing. My role is to provide clear, structured, and accurate information related to Daniel's academic work, including his research, teaching, and career.\n",
128
+ "{'source': '../../Data/Wagner/Who-is-Wagner-Chatbot-Response.docx'}\n",
129
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
130
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
131
+ "{'source': '../../Data/3 Published Papers/for_Website/Ringel-Skiera-2016-Visualizing-Competition-Between-1000-Products-MktSci.pdf'}\n",
132
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
133
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
134
+ "{'source': '../../Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf'}\n",
135
+ "\n",
136
+ "\n",
137
+ "\n"
138
+ ]
139
+ },
140
+ {
141
+ "name": "stderr",
142
+ "output_type": "stream",
143
+ "text": [
144
+ "100%|██████████| 1/1 [00:00<00:00, 40.62it/s]\n"
145
+ ]
146
+ },
147
+ {
148
+ "name": "stdout",
149
+ "output_type": "stream",
150
+ "text": [
151
+ "Hello! My name is Wagner. I'm an assistant named after the character from Goethe’s Faust, dedicated to assisting with inquiries related to Daniel Rangel’s research in artificial intelligence and marketing.\n",
152
+ "{'source': '../../Data/Wagner/Who-is-Wagner-Chatbot-Response.docx'}\n",
153
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
154
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
155
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
156
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
157
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
158
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
159
+ "\n",
160
+ "\n",
161
+ "\n"
162
+ ]
163
+ },
164
+ {
165
+ "name": "stderr",
166
+ "output_type": "stream",
167
+ "text": [
168
+ "100%|██████████| 1/1 [00:00<00:00, 45.34it/s]\n"
169
+ ]
170
+ },
171
+ {
172
+ "name": "stdout",
173
+ "output_type": "stream",
174
+ "text": [
175
+ "I am Wagner, a friendly AI assistant dedicated to supporting scholarly endeavors related to Daniel Rangel’s research in artificial intelligence and marketing. My role is to provide clear, structured, and accurate information based on his academic work, including his published and working papers, CV, and research profile. I strive to assist with inquiries related to Daniel’s research, teaching, and career.\n",
176
+ "{'source': '../../Data/Wagner/Who-is-Wagner-Chatbot-Response.docx'}\n",
177
+ "{'source': '../../Data/CV/Ringel_Daniel_CV_V1.docx'}\n",
178
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx'}\n",
179
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
180
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Matthe-Ringel-Skiera_2023_Mapping-Market-Structure-Evolution.docx'}\n",
181
+ "{'source': '../../Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx'}\n",
182
+ "{'source': '../../Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf'}\n",
183
+ "\n",
184
+ "\n",
185
+ "\n"
186
+ ]
187
+ }
188
+ ],
189
+ "source": [
190
+ "for prompt in prompts:\n",
191
+ " response, source = chatbot.query_chatbot(prompt, k=15, rerank=True)\n",
192
+ " print(response)\n",
193
+ "\n",
194
+ " if type(source) == str:\n",
195
+ " print(source)\n",
196
+ " else:\n",
197
+ " for docs in source:\n",
198
+ " print(docs.metadata)\n",
199
+ " print('\\n\\n')"
200
+ ]
201
+ }
202
+ ],
203
+ "metadata": {
204
+ "kernelspec": {
205
+ "display_name": "env",
206
+ "language": "python",
207
+ "name": "python3"
208
+ },
209
+ "language_info": {
210
+ "codemirror_mode": {
211
+ "name": "ipython",
212
+ "version": 3
213
+ },
214
+ "file_extension": ".py",
215
+ "mimetype": "text/x-python",
216
+ "name": "python",
217
+ "nbconvert_exporter": "python",
218
+ "pygments_lexer": "ipython3",
219
+ "version": "3.11.10"
220
+ }
221
+ },
222
+ "nbformat": 4,
223
+ "nbformat_minor": 2
224
+ }
Code/Chatbot/app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from chatbot import RAGChatbot
3
+ import os
4
+ from dotenv import load_dotenv
5
+ import warnings
6
+ warnings.filterwarnings("ignore")
7
+
8
+ # Load environment variables from .env file
9
+ load_dotenv()
10
+
11
+ @st.cache_resource
12
+ def initialize_chatbot():
13
+ # Initialize the chatbot with necessary API keys and settings
14
+ chatbot = RAGChatbot(
15
+ pinecone_api_key=os.getenv('PINECONE_API_KEY'),
16
+ index_name='test',
17
+ )
18
+ return chatbot
19
+
20
+ chatbot = initialize_chatbot()
21
+
22
+ # Streamlit app layout
23
+ st.title("RAG Chatbot")
24
+ st.write("Ask the chatbot anything and get real-time responses.")
25
+
26
+ # Input prompt from the user
27
+ prompt = st.text_input("Enter your prompt:", "")
28
+
29
+ if prompt:
30
+ # Query the chatbot and get the response
31
+ response, sources = chatbot.query_chatbot(prompt, k=15, rerank=True)
32
+
33
+ # Display LLM response
34
+ st.subheader("LLM Response")
35
+ st.write(response)
36
+
37
+ # Display reranked relevant documents with metadata
38
+ st.subheader("Relevant Documents")
39
+ if type(sources) != str:
40
+ docs = sources
41
+ for i, doc in enumerate(docs):
42
+ st.write(f"**Document {i+1} Metadata:**")
43
+ st.json(doc.metadata) # Display metadata in JSON format for better structure
44
+ elif type(sources) == str:
45
+ st.write(f"**Document {1} Metadata:**")
46
+ st.json({"source": sources})
Code/Chatbot/chatbot.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import yaml
4
+ from docx import Document
5
+
6
+
7
+ from langchain_pinecone import PineconeVectorStore
8
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
9
+ from ragatouille import RAGPretrainedModel
10
+
11
+ from data_ingester import ChatbotDataIngester
12
+ from data_query import ChatbotDataQuery
13
+ from getpass import getpass
14
+ from pinecone import Pinecone, ServerlessSpec
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from transformers import AutoModel
19
+
20
+ from sklearn.metrics.pairwise import cosine_similarity
21
+ from openai import OpenAI
22
+
23
+ from PRESET_QUERIES import Queries, Query_Doc_Map
24
+ from data_query import generate_openai_response
25
+
26
+ from dotenv import load_dotenv
27
+ load_dotenv()
28
+
29
+ class RAGChatbot:
30
+ def __init__(self, pinecone_api_key=None, index_name="test-index", config_path="../config.yml"):
31
+ """
32
+ Initialize the RAGChatbot. Handles embeddings, vector store, data ingestion, and query.
33
+ """
34
+ self.pinecone_api_key = pinecone_api_key or os.getenv("PINECONE_API_KEY")# or getpass("Enter your Pinecone API key: ")
35
+ self.index_name = index_name
36
+ self.embeddings = self.initialize_embeddings()
37
+ self.dimensions = len(self.embeddings.embed_query("Hello World!"))
38
+ self.vector_store = self.initialize_vector_store()
39
+ self.data_ingester = ChatbotDataIngester(vector_store=self.vector_store, embeddings=self.embeddings)
40
+ self.data_query = ChatbotDataQuery(vector_store=self.vector_store)
41
+ self.reranker = self.initialize_reranker()
42
+ self.openai_api_key = os.getenv("OPENAI_API_KEY")
43
+ self.client = OpenAI(api_key=self.openai_api_key)
44
+
45
+ def load_config(self, config_path):
46
+ """
47
+ Load the configuration file (config.yml).
48
+ """
49
+ with open(config_path, 'r') as file:
50
+ return yaml.safe_load(file)
51
+
52
+ def initialize_embeddings(self):
53
+ """
54
+ Initialize the embedding model based on the config file.
55
+ """
56
+ model_name = "BAAI/bge-large-en-v1.5"
57
+ model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
58
+ encode_kwargs = {"normalize_embeddings": True}
59
+ hf = HuggingFaceBgeEmbeddings(
60
+ model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
61
+ return hf
62
+
63
+ def initialize_reranker(self):
64
+ """
65
+ Initialize the reranker
66
+ """
67
+ return RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
68
+
69
+ def initialize_vector_store(self):
70
+ """
71
+ Initialize Pinecone vector store.
72
+ """
73
+ pc = Pinecone(api_key=self.pinecone_api_key)
74
+ existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
75
+
76
+ if self.index_name not in existing_indexes:
77
+ pc.create_index(
78
+ name=self.index_name,
79
+ dimension=self.dimensions,
80
+ metric="cosine",
81
+ spec=ServerlessSpec(cloud="aws", region="us-east-1"),
82
+ )
83
+ while not pc.describe_index(self.index_name).status["ready"]:
84
+ import time
85
+ time.sleep(1)
86
+
87
+ return PineconeVectorStore(index=pc.Index(self.index_name), embedding=self.embeddings)
88
+
89
+ def ingest_data(self, dir_path, empty=False):
90
+ """
91
+ Ingest data from a directory using the ChatbotDataIngester.
92
+ """
93
+ self.data_ingester.load_and_ingest(dir_path, empty_db=empty)
94
+
95
+ def __route(self, query_text):
96
+ query_text = query_text.lower()
97
+ def cosine_similarity_calc(vec1, vec2):
98
+ vec1 = np.array(vec1).reshape(1, -1)
99
+ vec2 = np.array(vec2).reshape(1, -1)
100
+ return cosine_similarity(vec1, vec2)[0][0]
101
+
102
+ def get_embeddings(client, text):
103
+ response = client.embeddings.create(
104
+ input=text,
105
+ model="text-embedding-3-large"
106
+ )
107
+ return response.data[0].embedding
108
+
109
+ # Generate embeddings for the incoming query
110
+ query_embedding = get_embeddings(self.client, query_text)
111
+
112
+ best_match = None
113
+ highest_similarity = 0
114
+
115
+ for main_query, similar_queries in Queries.items():
116
+ for query in similar_queries:
117
+ query = query.lower()
118
+ preset_embedding = get_embeddings(self.client, query)
119
+ similarity_score = cosine_similarity_calc(query_embedding, preset_embedding)
120
+ if similarity_score > highest_similarity:
121
+ highest_similarity = similarity_score
122
+ best_match = main_query
123
+
124
+ if highest_similarity >= 0.5100:
125
+ # print(f'Response from routing:query_text: {query_text} - best_match query: {best_match} - Doc: {Query_Doc_Map[best_match][0]}')
126
+ response, file_path = self.__generate_response_from_file(query_text, Query_Doc_Map[best_match][0])
127
+ return response, file_path
128
+ else:
129
+ return None, None
130
+
131
+ def __generate_response_from_file(self, query_text, file_path):
132
+ """
133
+ Generate response from a file.
134
+ """
135
+ def read_docx(file_path):
136
+ doc = Document(file_path)
137
+ full_text = []
138
+ for paragraph in doc.paragraphs:
139
+ full_text.append(paragraph.text)
140
+ return '\n'.join(full_text)
141
+
142
+ file_content = read_docx(os.path.join('../../Data', file_path))
143
+
144
+ system_prompt = '''
145
+ You are an intelligent assistant designed to provide clear, accurate, and helpful responses.
146
+ Focus on understanding user intent, give concise answers, and offer step-by-step solutions when necessary.
147
+ Be friendly, professional, and avoid unnecessary information.\n'''
148
+
149
+ input_prompt = f'Query: {query_text}\nContext: {file_content}'
150
+
151
+ response = generate_openai_response(input_prompt, system_prompt)
152
+ return response.split('\n')[1], os.path.join('../../Data', file_path)
153
+
154
+ def query_chatbot(self, query_text, k=1, rerank=False): #, fetch_k=2, lambda_mult=0.5
155
+ """
156
+ Query the chatbot using the provided query text and optional search parameters.
157
+ """
158
+
159
+ route_response, file_path = self.__route(query_text)
160
+ if route_response == None:
161
+ if rerank:
162
+ response, context_docs = self.data_query.query(
163
+ query_text=query_text,
164
+ k=k,
165
+ reranker=self.reranker
166
+ )
167
+ else:
168
+ response = self.data_query.query(
169
+ query_text=query_text,
170
+ k=k,
171
+ )
172
+ return response, context_docs
173
+ else:
174
+ return route_response, file_path
Code/Chatbot/data_ingester.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from uuid import uuid4
3
+ from langchain_core.documents import Document
4
+ from data_loader import ChatbotDataLoader
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from langchain_text_splitters import SpacyTextSplitter
7
+
8
+
9
+ class ChatbotDataIngester:
10
+ def __init__(self, vector_store, embeddings):
11
+ """
12
+ Initialize the ChatbotDataIngester with an external vector store and embeddings model.
13
+ Raise an exception if either of them is None.
14
+ """
15
+ if vector_store in [None, '']:
16
+ raise ValueError("Vector store cannot be None/empty")
17
+ if embeddings in [None, '']:
18
+ raise ValueError("Embeddings model cannot be None/empty")
19
+
20
+ self.loader = ChatbotDataLoader()
21
+ self.vector_store = vector_store
22
+ self.embeddings = embeddings
23
+ self.text_splitter = RecursiveCharacterTextSplitter(
24
+ chunk_size=1000,
25
+ chunk_overlap=200,
26
+ length_function=len,
27
+ )
28
+
29
+ def embed_content(self, content):
30
+ """
31
+ Embed the text content using the provided embedding model.
32
+ """
33
+ return self.embeddings.embed_query(content)
34
+
35
+ def load_and_ingest(self, dir_path, empty_db=False):
36
+ """
37
+ Load documents from the directory, generate embeddings, and ingest them into the vector store.
38
+
39
+ :param dir_path: Directory path to load the documents from.
40
+ :param empty_db: If True, the vector store will be emptied before adding new documents.
41
+ """
42
+ # Optionally clear the vector store
43
+ if empty_db:
44
+ self.clear_vector_store()
45
+
46
+ # Load files from the directory
47
+ file_contents = self.loader.load_directory(dir_path)
48
+
49
+ # Create documents from the file contents
50
+ documents = [
51
+ Document(page_content=content, metadata={"source": file_path})
52
+ for file_path, content in file_contents.items()
53
+ ]
54
+
55
+ print(f'{len(documents)} documents loaded from the database')
56
+
57
+ split_docs = self.text_splitter.split_documents(documents)
58
+
59
+ # Generate UUIDs for documents
60
+ uuids = [str(uuid4()) for _ in range(len(split_docs))]
61
+
62
+ print(f'{len(documents)} documents splitted into {len(split_docs)} chunks')
63
+
64
+ # Ingest documents into the vector store
65
+ self.ingest_to_vector_store(split_docs, uuids)
66
+
67
+ def clear_vector_store(self):
68
+ """
69
+ Clear all documents in the vector store.
70
+ """
71
+ try:
72
+ current_index = self.vector_store.get_pinecone_index('test')
73
+ check = False
74
+ for ids in current_index.list(namespace='default'):
75
+ check = True
76
+ break
77
+ if not check:
78
+ print("The vector store is already empty.")
79
+ return
80
+ else:
81
+ self.vector_store.delete(delete_all=True)
82
+ print("Cleared the vector store.")
83
+ except Exception as e:
84
+ print(f"Failed to clear the vector store: {str(e)}")
85
+
86
+ def ingest_to_vector_store(self, documents, uuids):
87
+ """
88
+ Ingest the documents into the vector store.
89
+ """
90
+ try:
91
+ self.vector_store.add_documents(documents, ids=uuids)
92
+ print(f'Ingested {len(documents)} chunks to the vector store')
93
+ except Exception as e:
94
+ print(f'Failed to ingest documents: {str(e)}')
Code/Chatbot/data_loader.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from docx import Document
3
+ import PyPDF2
4
+
5
+ class ChatbotDataLoader:
6
+ def __init__(self):
7
+ pass
8
+
9
+ def read_docx(self, file_path):
10
+ """
11
+ Reads content from a .docx file.
12
+ """
13
+ doc = Document(file_path)
14
+ content = "\n".join([para.text for para in doc.paragraphs])
15
+ return content
16
+
17
+ def read_pdf(self, file_path):
18
+ """
19
+ Reads content from a .pdf file.
20
+ """
21
+ with open(file_path, "rb") as file:
22
+ reader = PyPDF2.PdfReader(file)
23
+ content = ""
24
+ for page in range(len(reader.pages)):
25
+ content += reader.pages[page].extract_text()
26
+ return content
27
+
28
+ def load_file(self, file_path):
29
+ """
30
+ Reads content from a .docx or .pdf file based on the file extension.
31
+ """
32
+ if file_path.endswith(".docx"):
33
+ return self.read_docx(file_path)
34
+ elif file_path.endswith(".pdf"):
35
+ return self.read_pdf(file_path)
36
+ else:
37
+ raise ValueError(f"Unsupported file type: {file_path}")
38
+
39
+ def load_directory(self, dir_path):
40
+ """
41
+ Iterates through the directory, loads all .docx and .pdf files, and returns their content.
42
+ """
43
+ file_contents = {}
44
+ for root, _, files in os.walk(dir_path):
45
+ for file in files:
46
+ file_path = os.path.join(root, file)
47
+ if file.endswith((".docx", ".pdf")):
48
+ try:
49
+ content = self.load_file(file_path)
50
+ file_contents[file_path] = content
51
+ except Exception as e:
52
+ print(f"Failed to load {file_path}: {str(e)}")
53
+ return file_contents
Code/Chatbot/data_query.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import getpass
2
+ import os
3
+ from langchain_core.runnables import RunnablePassthrough
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain.chains import create_retrieval_chain
7
+ from langchain.chains.combine_documents import create_stuff_documents_chain
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+ from langchain_core.documents import Document
10
+
11
+ def generate_openai_response(input_prompt, system_prompt=None):
12
+ if system_prompt is None:
13
+ system_prompt = '''You are an assistant designed to provide answers when no (0) relevant documents are retrieved from the vector database. When this happens, you should follow these steps:
14
+ 1) First, determine if you can answer the user's query using general knowledge or internal information. If so, generate a confident, helpful response in a straightforward narrative style. Do not use phrases such as 'According to me,' 'As of my knowledge,' 'I don’t know but,' or mention knowledge cutoffs or lack of information. Simply provide the answer as if you are certain of the facts.
15
+ 2) If the question is domain-specific, too specific (e.g., about a particular person or object that could mislead), or outside your knowledge, do not attempt to answer. Politely respond with: 'I'm sorry, I currently do not have enough information to answer your question.
16
+
17
+ When asked about you, answer from the file(s):
18
+ - Who-is-Wagner-Chatbot-Response.docx
19
+
20
+ When asked about Daniel, answer from the file(s):
21
+ - Ringel_Daniel_CV_V1.docx
22
+ '''
23
+ llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
24
+ return 'The number of retrieved documents from RAG pipeline was 0, so the answer is based on LLM\s internal knowledge.\n' + llm(system_prompt+input_prompt).content
25
+
26
+ class ChatbotDataQuery:
27
+ def __init__(self, vector_store):
28
+ self.llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
29
+
30
+ self.system_prompt = '''You are Wagner, a highly intelligent and friendly AI assistant.
31
+ You are developed to provide answers to given prompts about you, Daniel and Daniel's work. You will be given some context to generate the answer from to the user.
32
+ Answer in a very affirmative, concise, and clear manner. Never refuse to answer, always provide relevant information.
33
+
34
+ When asked about you, answer from the file(s):
35
+ - Who-is-Wagner-Chatbot-Response.docx
36
+
37
+ When asked about Daniel, answer from the file(s):
38
+ - Ringel_Daniel_CV_V1.docx
39
+ '''
40
+
41
+ if vector_store is None:
42
+ raise ValueError("Vector store cannot be None")
43
+ else:
44
+ self.vector_store = vector_store
45
+
46
+ def initialize_reranker(self):
47
+ """
48
+ Initialize the custom reranker.
49
+ """
50
+ return CustomReranker()
51
+
52
+ def __generate_response(self, query_text, retriever, reranker=None, reranker_docs=0):
53
+ context_docs = retriever.invoke(query_text)
54
+ if len(context_docs) == 0:
55
+ response = generate_openai_response(input_prompt=query_text)
56
+ return response
57
+
58
+ context_docs_texts = [doc.page_content for doc in context_docs]
59
+
60
+ if reranker is not None and reranker_docs > 0:
61
+ # Use the custom reranker to rerank the context_docs
62
+ relevant_docs = reranker.rerank(query_text, context_docs_texts, k=reranker_docs)
63
+
64
+ All_Scores = [doc['score'] for doc in relevant_docs]
65
+ Min = min(All_Scores)
66
+ Max = max(All_Scores)
67
+ Normalized_Scores = [(doc['score'] - Min) / (Max - Min) for doc in relevant_docs]
68
+
69
+ for idx,doc in enumerate(relevant_docs):
70
+ doc['score'] = Normalized_Scores[idx]
71
+
72
+ final_reranked_docs = []
73
+ for reranked_doc in relevant_docs:
74
+ if reranked_doc['score'] < 0.35:
75
+ continue
76
+ else:
77
+ idx_of_content_in_context_doc = reranked_doc['result_index']
78
+ meta_data = context_docs[idx_of_content_in_context_doc].metadata
79
+ final_reranked_docs.append(Document(page_content=reranked_doc['content'], metadata=meta_data))
80
+
81
+ context_docs = final_reranked_docs
82
+
83
+ prompt = ChatPromptTemplate.from_template(
84
+ "You are a helpful assistant that only answers questions about the context. "
85
+ "You try your best to extract the relevant answers from the context. "
86
+ "The context is:\n\n{context}\n\n"
87
+ "Question: {question}\n"
88
+ "Helpful Answer:"
89
+ )
90
+
91
+ # print(f'---\nThe Retrieved Documents are:')
92
+ # for idx, doc in enumerate(context_docs):
93
+ # print(idx, '-', doc.metadata)
94
+ # print('---\n\n')
95
+
96
+ chain = create_stuff_documents_chain(
97
+ llm=self.llm,
98
+ prompt=prompt,
99
+ document_variable_name="context",
100
+ )
101
+
102
+ context = '\n\n'.join([doc.page_content for doc in context_docs])
103
+ query = [
104
+ ("system", f"{self.system_prompt}"),
105
+ ("human", f"context: {context}\nInput: {query_text}"),
106
+ ]
107
+
108
+ response = ''
109
+ for chunk in self.llm.stream(query):
110
+ response += chunk.content
111
+ return response, context_docs
112
+ # return {'response': response, 'context_docs': context_docs}
113
+ # yield chunk.content
114
+ # return context_docs
115
+
116
+ def query(self, query_text, k=1, reranker=None):
117
+ retriever = self.vector_store.as_retriever(
118
+ search_kwargs={"k": k},
119
+ search_type="similarity",
120
+ )
121
+ try:
122
+ return self.__generate_response(query_text=query_text, retriever=retriever, reranker=reranker, reranker_docs=k//2)
123
+ except Exception as e:
124
+ print(f"Failed to retrieve documents: {str(e)}")
125
+ return None
Data/3 Published Papers/for_Embedding/Matthe-Ringel-Skiera_2023_Mapping-Market-Structure-Evolution.docx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c878e169ad075e3a128d174ddad66e08ac1ec9078977b6373aff1ecc1e4313a
3
+ size 2852959
Data/3 Published Papers/for_Embedding/Ringel-2023-Multimarket_Membership_Mapping.docx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1b86cb993925516abb35c5d145b654be6fa86c46c3e6da88283b20f0960e8ff
3
+ size 18283400
Data/3 Published Papers/for_Embedding/Ringel-Skiera-2016-Visualizing_Asymmetric-Competition_among_More_than_1000_Products_Using_Big_Search_Data.docx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfb8bef2648e45dc910c2c50d0d57e44af1146c16040b21f967bb40b696670c9
3
+ size 4679800
Data/3 Published Papers/for_Website/Matthe-Ringel-Skiera-2023-Mapping-Market-Structure-Evolution-MktSci.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce62b17d1c7173289c4b7fba884be8b98cb2bf184a8ce9d62cc22cf6aea4482f
3
+ size 2941458
Data/3 Published Papers/for_Website/Ringel-2023-Multimarket-Membership-Mapping-JMR.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2d9a0ca2e8a61ed9c11f84ae14ffbb3e3d2eb4bf120bf1dafc8501bd830fd23
3
+ size 3394926
Data/3 Published Papers/for_Website/Ringel-Skiera-2016-Visualizing-Competition-Between-1000-Products-MktSci.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6efcc41eebe77b0dfa86d3f5bf6dab466a89804a74569d184e9e8618cc1039f
3
+ size 1815358
Data/5 Working Papers/Malhotra_Ringel_Zhao_Cui_2024_Brand_Alliance_Opportunities.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb49ba0a7e2a8b04238543b492a21664f315b18df9dbc58da4f06cdb029f29f0
3
+ size 2479623
Data/CV/Ringel_Daniel_CV_V1.docx ADDED
Binary file (59.3 kB). View file
 
Data/Wagner/Who-is-Wagner-Chatbot-Response.docx ADDED
Binary file (13.4 kB). View file