captain-awesome committed on
Commit
d9b4100
·
1 Parent(s): 96d88aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -1
app.py CHANGED
@@ -92,4 +92,33 @@ def load_model():
92
  # max_new_tokens=max_new_tokens, # type: ignore
93
  # temperature=temperature, # type: ignore
94
  )
95
- return llm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  # max_new_tokens=max_new_tokens, # type: ignore
93
  # temperature=temperature, # type: ignore
94
  )
95
+ return llm
96
+
97
def create_vector_database(loaded_documents, persist_directory: str = "db"):
    """Build and persist a Chroma vector database from already-loaded documents.

    The documents are split into overlapping chunks, each chunk is embedded
    with the "BAAI/bge-large-en" model through ``HuggingFaceBgeEmbeddings``,
    and the result is stored in a Chroma database persisted to
    ``persist_directory``.

    This function does not read any files itself; loading the documents is
    the caller's responsibility.

    Args:
        loaded_documents: The documents to index. They are passed unchanged
            to ``RecursiveCharacterTextSplitter.split_documents``.
        persist_directory: Directory the Chroma database is persisted to.
            Defaults to ``"db"``, the value that was previously hard-coded.

    Returns:
        The Chroma vector store built from the chunked documents.
    """
    # Chunk sizes are measured in characters because length_function is len.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=30,
        length_function=len,
    )
    chunked_documents = text_splitter.split_documents(loaded_documents)

    embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en")

    # Create the Chroma vector database from the chunks and write it to disk.
    db = Chroma.from_documents(
        documents=chunked_documents,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    db.persist()
    return db