# Spaces: Build error
import logging | |
import os | |
import yaml | |
from modules.embedding_model_loader import EmbeddingModelLoader | |
from langchain.vectorstores import FAISS | |
from modules.data_loader import DataLoader | |
from modules.constants import * | |
from modules.helpers import * | |
class VectorDB:
    """Build, persist, and reload the vector store described by ``config``.

    All settings are read from ``config["embedding_options"]``:

    * ``db_option``     -- vector store backend; only ``"FAISS"`` is handled.
    * ``data_path``     -- directory whose entries are passed to the data loader.
    * ``url_file_path`` -- file handed to ``get_urls_from_file``.
    * ``expand_urls``   -- when truthy, each URL is expanded via the crawler.
    * ``db_path``       -- parent directory of the saved store.
    * ``model``         -- embedding model name; also part of the store's
      directory name.
    """

    def __init__(self, config, logger=None):
        """Store the configuration and set up logging.

        Args:
            config: Mapping with an ``"embedding_options"`` section (see the
                class docstring for the keys that are read).
            logger: Logger to use. When ``None``, the module logger is used
                and given a console handler and a ``vector_db.log`` file
                handler.
        """
        self.config = config
        self.db_option = config["embedding_options"]["db_option"]
        self.document_names = None
        self.webpage_crawler = WebpageCrawler()

        if logger is None:
            self.logger = self._build_default_logger()
        else:
            self.logger = logger

        self.logger.info("VectorDB instance instantiated")

    @staticmethod
    def _build_default_logger():
        """Return the module logger, attaching console/file handlers only once."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)

        # logging.getLogger returns the same object on every call, so adding
        # handlers unconditionally would duplicate every message (and reopen
        # the log file in "w" mode) each time a VectorDB is created.
        if logger.handlers:
            return logger

        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

        # File handler
        log_file_path = "vector_db.log"  # Change this to your desired log file path
        file_handler = logging.FileHandler(log_file_path, mode="w")
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

        return logger

    def _database_dir(self):
        """Return ``<db_path>/db_<db_option>_<model>``, shared by save and load."""
        options = self.config["embedding_options"]
        return os.path.join(
            options["db_path"],
            f"db_{options['db_option']}_{options['model']}",
        )

    def load_files(self):
        """Collect the local file paths and URLs to ingest.

        Returns:
            A ``(files, urls)`` tuple. ``files`` holds every entry of
            ``data_path`` joined onto that directory. ``urls`` comes from
            ``get_urls_from_file``; when ``expand_urls`` is truthy it is
            replaced by the concatenation of
            ``webpage_crawler.get_all_pages(url, base_url)`` for each URL.
        """
        options = self.config["embedding_options"]
        data_path = options["data_path"]
        files = [os.path.join(data_path, name) for name in os.listdir(data_path)]

        urls = get_urls_from_file(options["url_file_path"])
        if options["expand_urls"]:
            all_urls = []
            for url in urls:
                base_url = get_base_url(url)
                all_urls.extend(self.webpage_crawler.get_all_pages(url, base_url))
            urls = all_urls

        return files, urls

    def create_embedding_model(self):
        """Load the embedding model and keep it on ``self.embedding_model``."""
        self.logger.info("Creating embedding function")
        self.embedding_model_loader = EmbeddingModelLoader(self.config)
        self.embedding_model = self.embedding_model_loader.load_embedding_model()

    def initialize_database(self, document_chunks: list, document_names: list):
        """Build the vector store from ``document_chunks``.

        Requires ``create_embedding_model`` to have been called first, since
        ``self.embedding_model`` is used to embed the chunks.

        Args:
            document_chunks: Documents passed to ``FAISS.from_documents``.
            document_names: Names associated with the chunks; stored on
                ``self.document_names`` once the store has been built.

        Raises:
            ValueError: If ``db_option`` is anything other than ``"FAISS"``.
        """
        self.logger.info("Initializing vector_db")
        self.logger.info("\tUsing %s as db_option", self.db_option)

        # Fail here rather than logging success and hitting an AttributeError
        # on the missing ``self.vector_db`` later in ``save_database``.
        if self.db_option != "FAISS":
            raise ValueError(
                f"Unsupported db_option {self.db_option!r}; only 'FAISS' is supported"
            )

        self.vector_db = FAISS.from_documents(
            documents=document_chunks, embedding=self.embedding_model
        )
        self.document_names = document_names
        self.logger.info("Completed initializing vector_db")

    def create_database(self):
        """Load the data, create the embedding model, and build the store."""
        data_loader = DataLoader(self.config)
        self.logger.info("Loading data")
        files, urls = self.load_files()
        document_chunks, document_names = data_loader.get_chunks(files, urls)
        self.logger.info("Completed loading data")
        self.create_embedding_model()
        self.initialize_database(document_chunks, document_names)

    def save_database(self):
        """Write the in-memory store to ``<db_path>/db_<db_option>_<model>``."""
        self.vector_db.save_local(self._database_dir())
        self.logger.info("Saved database")

    def load_database(self):
        """Load the saved store from disk and return it.

        The embedding model is (re)created first because ``FAISS.load_local``
        needs it.

        Returns:
            The loaded store, which is also kept on ``self.vector_db``.
        """
        self.create_embedding_model()
        # NOTE(review): FAISS.load_local unpickles the stored docstore, so only
        # load directories this application wrote itself. Newer LangChain
        # releases appear to require an explicit
        # ``allow_dangerous_deserialization=True`` here -- confirm against the
        # pinned LangChain version before upgrading.
        self.vector_db = FAISS.load_local(
            self._database_dir(),
            self.embedding_model,
        )
        self.logger.info("Loaded database")
        return self.vector_db
if __name__ == "__main__":
    # Script entry point: read config.yml, echo the parsed settings, then
    # build the vector store and write it to disk.
    with open("config.yml", "r") as config_file:
        config = yaml.safe_load(config_file)
    print(config)

    database = VectorDB(config)
    database.create_database()
    database.save_database()