SalehAhmad committed
Commit 10d250d · verified · 1 Parent(s): d884eee

Upload 6 files

Files changed (6)
  1. app.py +42 -0
  2. chatbot.py +145 -0
  3. data_ingester.py +82 -0
  4. data_loader.py +53 -0
  5. data_query.py +97 -0
  6. requirements.txt +193 -0
app.py ADDED
@@ -0,0 +1,42 @@
+ import streamlit as st
+ from chatbot import RAGChatbot
+ import os
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ @st.cache_resource
+ def initialize_chatbot():
+     # Initialize the chatbot with the necessary API keys and settings
+     chatbot = RAGChatbot(
+         pinecone_api_key=os.getenv('PINECONE_API_KEY'),
+         index_name='test',
+     )
+     return chatbot
+
+ chatbot = initialize_chatbot()
+
+ # Streamlit app layout
+ st.title("RAG Chatbot")
+ st.write("Ask the chatbot anything and get real-time responses.")
+
+ # Input prompt from the user
+ prompt = st.text_input("Enter your prompt:", "")
+
+ if prompt:
+     # Query the chatbot and get the response
+     response = chatbot.query_chatbot(prompt, k=15, rerank=True)
+
+     # Display the LLM response (query_chatbot returns None if retrieval fails)
+     st.subheader("LLM Response")
+     if response and 'response' in response:
+         st.write(response['response'])  # Display the entire response in a readable format
+
+     # Display reranked relevant documents with metadata
+     st.subheader("Relevant Documents")
+     if response and 'context_docs' in response:
+         reranked_docs = response['context_docs']
+         for i, doc in enumerate(reranked_docs):
+             st.write(f"**Document {i+1} Metadata:**")
+             st.json(doc.metadata)  # Display metadata as JSON for better structure
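
For reference, the same pipeline can be exercised without Streamlit. A minimal sketch (it assumes a .env file providing PINECONE_API_KEY and OPENAI_API_KEY, and that the 'test' index already holds ingested documents):

    import os
    from dotenv import load_dotenv
    from chatbot import RAGChatbot

    load_dotenv()  # expects PINECONE_API_KEY and OPENAI_API_KEY in .env
    bot = RAGChatbot(pinecone_api_key=os.getenv('PINECONE_API_KEY'), index_name='test')
    result = bot.query_chatbot('What topics do the documents cover?', k=15, rerank=True)
    if result:  # query_chatbot returns None if retrieval fails
        print(result['response'])
        for doc in result['context_docs']:
            print(doc.metadata)
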
chatbot.py ADDED
@@ -0,0 +1,145 @@
+ import os
+ import time
+ import torch
+ import yaml
+ from langchain_pinecone import PineconeVectorStore
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ from data_ingester import ChatbotDataIngester
+ from data_query import ChatbotDataQuery
+ from getpass import getpass
+ from pinecone import Pinecone, ServerlessSpec
+ from ragatouille import RAGPretrainedModel
+
+ import torch.nn.functional as F
+ from transformers import AutoModel
+
+ class CustomReranker:
+     def __init__(self, model_name="nvidia/NV-Embed-v2", max_length=32768):
+         """
+         Initialize the reranker with the embedding model.
+         """
+         self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
+         self.max_length = max_length
+
+     def _encode(self, texts, instruction=""):
+         """
+         Helper to encode the input texts with the model.
+         """
+         return self.model.encode(texts, instruction=instruction, max_length=self.max_length)
+
+     def rerank(self, query, passages, k=1):
+         """
+         Rerank the passages by their similarity to the query.
+
+         Args:
+         - query (str): The query text.
+         - passages (list of str): Passages to rerank.
+         - k (int): Number of top documents to return after reranking.
+
+         Returns:
+         - A list of the top-k passages with their similarity scores.
+         """
+         query_prefix = "Instruct: Given a question, retrieve passages that answer the question\nQuery: "
+         passage_prefix = ""
+
+         # Get the query and passage embeddings
+         query_embeddings = self._encode([query], instruction=query_prefix)
+         passage_embeddings = self._encode(passages, instruction=passage_prefix)
+
+         # Normalize embeddings
+         query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
+         passage_embeddings = F.normalize(passage_embeddings, p=2, dim=1)
+
+         # Compute similarity scores
+         scores = (query_embeddings @ passage_embeddings.T) * 100
+         scores = scores.tolist()[0]
+
+         # Sort passages by their scores
+         sorted_passages = sorted(
+             [{"content": passage, "score": score, "result_index": idx}
+              for idx, (passage, score) in enumerate(zip(passages, scores))],
+             key=lambda x: x['score'], reverse=True
+         )
+
+         return sorted_passages[:k]  # Return the top-k reranked passages
+
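
A quick sketch of how CustomReranker is meant to be called (the passages are hypothetical; note that nvidia/NV-Embed-v2 is a large model, so this assumes hardware that can load it):

    reranker = CustomReranker()  # downloads nvidia/NV-Embed-v2 on first use
    passages = [
        'Pinecone is a managed vector database service.',
        'The Eiffel Tower is located in Paris.',
    ]
    top = reranker.rerank('What is Pinecone?', passages, k=1)
    print(top[0]['content'], top[0]['score'])  # highest-scoring passage first
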
+ class RAGChatbot:
+     def __init__(self, pinecone_api_key=None, index_name="test-index", config_path="../config.yml"):
+         """
+         Initialize the RAGChatbot. Handles embeddings, vector store, data ingestion, and querying.
+         """
+         self.pinecone_api_key = pinecone_api_key or os.getenv("PINECONE_API_KEY")  # or getpass("Enter your Pinecone API key: ")
+         self.index_name = index_name
+         self.embeddings = self.initialize_embeddings()
+         self.dimensions = len(self.embeddings.embed_query("Hello World!"))
+         self.vector_store = self.initialize_vector_store()
+         self.data_ingester = ChatbotDataIngester(vector_store=self.vector_store, embeddings=self.embeddings)
+         self.data_query = ChatbotDataQuery(vector_store=self.vector_store)
+         # self.reranker = self.initialize_reranker()
+         # self.reranker = CustomReranker()
+
+     def load_config(self, config_path):
+         """
+         Load the configuration file (config.yml).
+         """
+         with open(config_path, 'r') as file:
+             return yaml.safe_load(file)
+
+     def initialize_embeddings(self):
+         """
+         Initialize the embedding model (BAAI/bge-large-en-v1.5).
+         """
+         model_name = "BAAI/bge-large-en-v1.5"
+         model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
+         encode_kwargs = {"normalize_embeddings": True}
+         hf = HuggingFaceBgeEmbeddings(
+             model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
+         return hf
+
+     def initialize_reranker(self):
+         """
+         Initialize the ColBERT reranker.
+         """
+         return RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+
+     def initialize_vector_store(self):
+         """
+         Initialize the Pinecone vector store, creating the index if it does not exist.
+         """
+         pc = Pinecone(api_key=self.pinecone_api_key)
+         existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
+
+         if self.index_name not in existing_indexes:
+             pc.create_index(
+                 name=self.index_name,
+                 dimension=self.dimensions,
+                 metric="cosine",
+                 spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+             )
+             while not pc.describe_index(self.index_name).status["ready"]:
+                 time.sleep(1)
+
+         return PineconeVectorStore(index=pc.Index(self.index_name), embedding=self.embeddings)
+
+     def ingest_data(self, dir_path, empty=False):
+         """
+         Ingest data from a directory using the ChatbotDataIngester.
+         """
+         self.data_ingester.load_and_ingest(dir_path, empty_db=empty)
+
+     def query_chatbot(self, query_text, k=1, rerank=False):  # , fetch_k=2, lambda_mult=0.5
+         """
+         Query the chatbot using the provided query text and optional search parameters.
+         NOTE: the reranker is currently disabled (its initialization is commented
+         out above), so both branches below issue the same query.
+         """
+         if rerank:
+             response = self.data_query.query(
+                 query_text=query_text,
+                 k=k,
+                 # reranker=self.reranker
+             )
+         else:
+             response = self.data_query.query(
+                 query_text=query_text,
+                 k=k,
+             )
+         return response
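
Taken together, a typical end-to-end flow with this class looks like the sketch below (the ./docs path is hypothetical; ingestion only needs to run once per corpus):

    import os
    from chatbot import RAGChatbot

    bot = RAGChatbot(pinecone_api_key=os.getenv('PINECONE_API_KEY'), index_name='test')
    bot.ingest_data('./docs', empty=True)  # wipe the index, then ingest ./docs
    answer = bot.query_chatbot('Summarize the onboarding policy.', k=10)
    if answer:
        print(answer['response'])
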
data_ingester.py ADDED
@@ -0,0 +1,82 @@
+ import os
+ from uuid import uuid4
+ from langchain_core.documents import Document
+ from data_loader import ChatbotDataLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_text_splitters import SpacyTextSplitter
+
+
+ class ChatbotDataIngester:
+     def __init__(self, vector_store, embeddings):
+         """
+         Initialize the ChatbotDataIngester with an external vector store and embeddings model.
+         Raise an exception if either of them is missing.
+         """
+         if vector_store in [None, '']:
+             raise ValueError("Vector store cannot be None/empty")
+         if embeddings in [None, '']:
+             raise ValueError("Embeddings model cannot be None/empty")
+
+         self.loader = ChatbotDataLoader()
+         self.vector_store = vector_store
+         self.embeddings = embeddings
+         # SpacyTextSplitter takes a single separator string, not a list of separators
+         self.text_splitter = SpacyTextSplitter(
+             separator="\n\n",
+             chunk_size=1000,
+             chunk_overlap=200,
+         )
+
+     def embed_content(self, content):
+         """
+         Embed the text content using the provided embedding model.
+         """
+         return self.embeddings.embed_query(content)
+
+     def load_and_ingest(self, dir_path, empty_db=False):
+         """
+         Load documents from the directory, generate embeddings, and ingest them into the vector store.
+
+         :param dir_path: Directory path to load the documents from.
+         :param empty_db: If True, the vector store is emptied before adding new documents.
+         """
+         # Optionally clear the vector store
+         if empty_db:
+             self.clear_vector_store()
+
+         # Load files from the directory
+         file_contents = self.loader.load_directory(dir_path)
+
+         # Create documents from the file contents
+         documents = [
+             Document(page_content=content, metadata={"source": file_path})
+             for file_path, content in file_contents.items()
+         ]
+
+         split_docs = self.text_splitter.split_documents(documents)
+
+         # Generate UUIDs for the chunks
+         uuids = [str(uuid4()) for _ in range(len(split_docs))]
+
+         print(f'{len(documents)} documents split into {len(split_docs)} chunks')
+
+         # Ingest documents into the vector store
+         self.ingest_to_vector_store(split_docs, uuids)
+
+     def clear_vector_store(self):
+         """
+         Clear all documents in the vector store.
+         """
+         try:
+             self.vector_store.delete(delete_all=True)
+             print("Cleared the vector store.")
+         except Exception as e:
+             print(f"Failed to clear the vector store: {str(e)}")
+
+     def ingest_to_vector_store(self, documents, uuids):
+         """
+         Ingest the documents into the vector store.
+         """
+         try:
+             self.vector_store.add_documents(documents, ids=uuids)
+             print(f'Ingested {len(documents)} chunks to the vector store')
+         except Exception as e:
+             print(f'Failed to ingest documents: {str(e)}')
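
The ingester can also be driven on its own, outside RAGChatbot. A sketch under the same assumptions as chatbot.py (an existing Pinecone index named 'test', PINECONE_API_KEY in the environment, and a hypothetical ./docs directory):

    import os
    from pinecone import Pinecone
    from langchain_pinecone import PineconeVectorStore
    from langchain_community.embeddings import HuggingFaceBgeEmbeddings
    from data_ingester import ChatbotDataIngester

    embeddings = HuggingFaceBgeEmbeddings(model_name='BAAI/bge-large-en-v1.5')
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    store = PineconeVectorStore(index=pc.Index('test'), embedding=embeddings)

    ingester = ChatbotDataIngester(vector_store=store, embeddings=embeddings)
    ingester.load_and_ingest('./docs', empty_db=False)
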
data_loader.py ADDED
@@ -0,0 +1,53 @@
+ import os
+ from docx import Document
+ import PyPDF2
+
+ class ChatbotDataLoader:
+     def __init__(self):
+         pass
+
+     def read_docx(self, file_path):
+         """
+         Reads content from a .docx file.
+         """
+         doc = Document(file_path)
+         content = "\n".join([para.text for para in doc.paragraphs])
+         return content
+
+     def read_pdf(self, file_path):
+         """
+         Reads content from a .pdf file.
+         """
+         with open(file_path, "rb") as file:
+             reader = PyPDF2.PdfReader(file)
+             content = ""
+             for page in reader.pages:
+                 # extract_text() can return None for pages without extractable text
+                 content += page.extract_text() or ""
+         return content
+
+     def load_file(self, file_path):
+         """
+         Reads content from a .docx or .pdf file based on the file extension.
+         """
+         if file_path.endswith(".docx"):
+             return self.read_docx(file_path)
+         elif file_path.endswith(".pdf"):
+             return self.read_pdf(file_path)
+         else:
+             raise ValueError(f"Unsupported file type: {file_path}")
+
+     def load_directory(self, dir_path):
+         """
+         Walks the directory, loads all .docx and .pdf files, and returns their content.
+         """
+         file_contents = {}
+         for root, _, files in os.walk(dir_path):
+             for file in files:
+                 file_path = os.path.join(root, file)
+                 if file.endswith((".docx", ".pdf")):
+                     try:
+                         content = self.load_file(file_path)
+                         file_contents[file_path] = content
+                     except Exception as e:
+                         print(f"Failed to load {file_path}: {str(e)}")
+         return file_contents
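
A minimal sketch of using the loader on its own (the directory path is hypothetical):

    from data_loader import ChatbotDataLoader

    loader = ChatbotDataLoader()
    contents = loader.load_directory('./docs')  # maps file path -> extracted text
    for path, text in contents.items():
        print(path, len(text), 'characters')
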
data_query.py ADDED
@@ -0,0 +1,97 @@
+ import getpass
+ import os
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_openai import ChatOpenAI
+ from langchain.chains import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.documents import Document
+
+ def generate_openai_response(input_prompt):
+     print('In generate_openai_response')
+     system_prompt = '''You are an assistant designed to provide answers when no (0) relevant documents are retrieved from the vector database. When this happens, you should follow these steps:
+ 1) First, determine if you can answer the user's query using general knowledge or internal information. If so, generate a confident, helpful response in a straightforward narrative style. Do not use phrases such as 'According to me,' 'As of my knowledge,' 'I don't know but,' or mention knowledge cutoffs or lack of information. Simply provide the answer as if you are certain of the facts.
+ 2) If the question is domain-specific, too specific (e.g., about a particular person or object that could mislead), or outside your knowledge, do not attempt to answer. Politely respond with: 'I'm sorry, I currently do not have enough information to answer your question.' '''
+     llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
+     return "The number of retrieved documents from the RAG pipeline was 0, so the answer is based on the LLM's internal knowledge.\n" + llm.invoke(system_prompt + input_prompt).content
+
+ class ChatbotDataQuery:
+     def __init__(self, vector_store):
+         self.llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
+
+         self.system_prompt = '''You are Wagner, a highly intelligent and friendly AI assistant...'''
+
+         if vector_store is None:
+             raise ValueError("Vector store cannot be None")
+         else:
+             self.vector_store = vector_store
+
+     def initialize_reranker(self):
+         """
+         Initialize the custom reranker.
+         """
+         # Deferred import: CustomReranker lives in chatbot.py, which itself imports
+         # this module, so a top-level import would be circular.
+         from chatbot import CustomReranker
+         return CustomReranker()
+
+     def __generate_response(self, query_text, retriever, reranker=None, reranker_docs=0):
+         context_docs = retriever.invoke(query_text)
+         if len(context_docs) == 0:
+             # No documents retrieved: fall back to the LLM's internal knowledge
+             response = generate_openai_response(input_prompt=query_text)
+             return response
+
+         context_docs_texts = [doc.page_content for doc in context_docs]
+
+         if reranker is not None and reranker_docs > 0:
+             # Use the custom reranker to rerank the context_docs
+             relevant_docs = reranker.rerank(query_text, context_docs_texts, k=reranker_docs)
+
+             final_reranked_docs = []
+             for reranked_doc in relevant_docs:
+                 idx_of_content_in_context_doc = reranked_doc['result_index']
+                 meta_data = context_docs[idx_of_content_in_context_doc].metadata
+                 final_reranked_docs.append(Document(page_content=reranked_doc['content'], metadata=meta_data))
+
+             context_docs = final_reranked_docs
+
+         prompt = ChatPromptTemplate.from_template(
+             "You are a helpful assistant that only answers questions about the context. "
+             "You try your best to extract the relevant answers from the context. "
+             "The context is:\n\n{context}\n\n"
+             "Question: {question}\n"
+             "Helpful Answer:"
+         )
+
+         print('---\nThe Retrieved Documents are:')
+         for idx, doc in enumerate(context_docs):
+             print(idx, '-', doc.metadata)
+         print('---\n\n')
+
+         # NOTE: this chain is constructed but not used below; the response is
+         # streamed directly from the LLM with a system/human message pair instead.
+         chain = create_stuff_documents_chain(
+             llm=self.llm,
+             prompt=prompt,
+             document_variable_name="context",
+         )
+
+         context = '\n\n'.join([doc.page_content for doc in context_docs])
+         query = [
+             ("system", f"{self.system_prompt}"),
+             ("human", f"context: {context}\nInput: {query_text}"),
+         ]
+
+         response = ''
+         for chunk in self.llm.stream(query):
+             response += chunk.content
+         return {'response': response, 'context_docs': context_docs}
+         # yield chunk.content
+         # return context_docs
+
+     def query(self, query_text, k=1, reranker=None):
+         retriever = self.vector_store.as_retriever(
+             search_kwargs={"k": k},
+             search_type="similarity",
+         )
+         try:
+             return self.__generate_response(query_text=query_text, retriever=retriever, reranker=reranker, reranker_docs=k//2)
+         except Exception as e:
+             print(f"Failed to retrieve documents: {str(e)}")
+             return None
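
This layer can also be queried directly, bypassing RAGChatbot. A sketch assuming the same index and embedding model as chatbot.py, plus OPENAI_API_KEY in the environment:

    import os
    from pinecone import Pinecone
    from langchain_pinecone import PineconeVectorStore
    from langchain_community.embeddings import HuggingFaceBgeEmbeddings
    from data_query import ChatbotDataQuery

    embeddings = HuggingFaceBgeEmbeddings(model_name='BAAI/bge-large-en-v1.5')
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    store = PineconeVectorStore(index=pc.Index('test'), embedding=embeddings)

    dq = ChatbotDataQuery(vector_store=store)
    result = dq.query('What is covered in the handbook?', k=5)
    if result:
        print(result['response'])
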
requirements.txt ADDED
@@ -0,0 +1,193 @@
+ accelerate==0.34.2
+ aiohttp==3.9.5
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyio==4.5.0
+ asttokens==2.4.1
+ attrs==24.2.0
+ beautifulsoup4==4.12.3
+ bitarray==2.9.2
+ blinker==1.8.2
+ blis==0.7.11
+ catalogue==2.0.10
+ certifi==2024.8.30
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.19.0
+ colbert-ai==0.2.19
+ comm==0.2.2
+ confection==0.1.5
+ cymem==2.0.8
+ dataclasses-json==0.6.7
+ datasets==3.0.0
+ debugpy==1.8.5
+ decorator==5.1.1
+ Deprecated==1.2.14
+ dill==0.3.8
+ dirtyjson==1.0.8
+ distro==1.9.0
+ einops==0.8.0
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
+ executing==2.1.0
+ faiss-cpu==1.8.0.post1
+ fast-pytorch-kmeans==0.2.0.1
+ filelock==3.16.1
+ Flask==3.0.3
+ frozenlist==1.4.1
+ fsspec==2024.6.1
+ git-python==1.0.3
+ gitdb==4.0.11
+ GitPython==3.1.43
+ greenlet==3.1.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.2
+ huggingface-hub==0.25.0
+ idna==3.10
+ InstructorEmbedding==1.0.1
+ ipykernel==6.29.5
+ ipython==8.27.0
+ itsdangerous==2.2.0
+ jedi==0.19.1
+ Jinja2==3.1.4
+ jiter==0.5.0
+ joblib==1.4.2
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ langchain==0.3.0
+ langchain-community==0.3.0
+ langchain-core==0.3.2
+ langchain-huggingface==0.1.0
+ langchain-openai==0.2.0
+ langchain-pinecone==0.2.0
+ langchain-text-splitters==0.3.0
+ langcodes==3.4.0
+ langsmith==0.1.125
+ language_data==1.2.0
+ llama-cloud==0.1.0
+ llama-index==0.11.13
+ llama-index-agent-openai==0.3.4
+ llama-index-cli==0.3.1
+ llama-index-core==0.11.13.post1
+ llama-index-embeddings-openai==0.2.5
+ llama-index-indices-managed-llama-cloud==0.3.1
+ llama-index-legacy==0.9.48.post3
+ llama-index-llms-openai==0.2.9
+ llama-index-multi-modal-llms-openai==0.2.1
+ llama-index-program-openai==0.2.0
+ llama-index-question-gen-openai==0.2.0
+ llama-index-readers-file==0.2.2
+ llama-index-readers-llama-parse==0.3.0
+ llama-parse==0.5.6
+ lxml==5.3.0
+ marisa-trie==1.2.0
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ marshmallow==3.22.0
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multidict==6.1.0
+ multiprocess==0.70.16
+ murmurhash==1.0.10
+ mypy-extensions==1.0.0
+ nest-asyncio==1.6.0
+ networkx==3.3
+ ninja==1.11.1.1
+ nltk==3.9.1
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.6.68
+ nvidia-nvtx-cu12==12.1.105
+ onnx==1.16.2
+ openai==1.46.1
+ orjson==3.10.7
+ packaging==24.1
+ pandas==2.2.2
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==10.4.0
+ pinecone-client==5.0.1
+ pinecone-plugin-inference==1.1.0
+ pinecone-plugin-interface==0.0.7
+ platformdirs==4.3.6
+ preshed==3.0.9
+ prompt_toolkit==3.0.47
+ protobuf==5.28.2
+ psutil==6.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==17.0.0
+ pydantic==2.9.2
+ pydantic-settings==2.5.2
+ pydantic_core==2.23.4
+ Pygments==2.18.0
+ pynvml==11.5.3
+ pypdf==4.3.1
+ PyPDF2==3.0.1
+ python-dateutil==2.9.0.post0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ pytz==2024.2
+ PyYAML==6.0.2
+ pyzmq==26.2.0
+ RAGatouille==0.0.8.post4
+ regex==2024.9.11
+ requests==2.32.3
+ rich==13.8.1
+ safetensors==0.4.5
+ scikit-learn==1.5.2
+ scipy==1.14.1
+ sentence-transformers==2.7.0
+ sentencepiece==0.2.0
+ shellingham==1.5.4
+ six==1.16.0
+ smart-open==7.0.4
+ smmap==5.0.1
+ sniffio==1.3.1
+ soupsieve==2.6
+ spacy==3.7.6
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ SQLAlchemy==2.0.35
+ srsly==2.4.8
+ stack-data==0.6.3
+ striprtf==0.0.26
+ sympy==1.13.3
+ tenacity==8.5.0
+ thinc==8.2.5
+ threadpoolctl==3.5.0
+ tiktoken==0.7.0
+ tokenizers==0.19.1
+ torch==2.4.1
+ torchvision==0.19.1
+ tornado==6.4.1
+ tqdm==4.66.5
+ traitlets==5.14.3
+ transformers==4.44.2
+ triton==3.0.0
+ typer==0.12.5
+ typing-inspect==0.9.0
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ ujson==5.10.0
+ urllib3==2.2.3
+ voyager==2.0.9
+ wasabi==1.1.3
+ wcwidth==0.2.13
+ weasel==0.4.1
+ Werkzeug==3.0.4
+ wrapt==1.16.0
+ xxhash==3.5.0
+ yarl==1.11.1