# vectorstore.py
import os

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS


def load_and_split_document(file_path, chunk_size=1000, chunk_overlap=150):
    """
    Load a document from a file and split it into chunks.

    Args:
        file_path: Path to the text file.
        chunk_size: The maximum size of each chunk.
        chunk_overlap: The overlap between chunks.

    Returns:
        A list of document chunks.
    """
    loader = TextLoader(
        file_path,
        encoding='utf-8',
        autodetect_encoding=True
    )
    try:
        documents = loader.load()
    except RuntimeError:
        # Fall back to latin-1 if UTF-8 and encoding autodetection both fail
        loader = TextLoader(
            file_path,
            encoding='latin-1',
            autodetect_encoding=False
        )
        documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_documents(documents)
    return chunks
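
# A minimal usage sketch, kept as a comment so nothing executes on import;
# "docs/manual.txt" is a hypothetical path, not a file this module provides:
#
#     chunks = load_and_split_document("docs/manual.txt", chunk_size=500)
#     print(len(chunks), chunks[0].page_content[:80])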


def create_vector_stores(doc_paths, embeddings):
    """
    Create one vector store per document path.

    Args:
        doc_paths: List of paths to document files.
        embeddings: The embeddings model to use.

    Returns:
        A dictionary of vector stores, keyed by file stem.
    """
    vector_stores = {}
    os.makedirs("vector_stores", exist_ok=True)
    for doc_path in doc_paths:
        # splitext keeps dotted names such as "my.notes.txt" intact
        store_name = os.path.splitext(os.path.basename(doc_path))[0]
        chunks = load_and_split_document(doc_path)
        print(f"Processing {store_name}: {len(chunks)} chunks created")
        vectorstore = FAISS.from_documents(chunks, embeddings)
        vectorstore.save_local(f"vector_stores/{store_name}")
        vector_stores[store_name] = vectorstore
    return vector_stores


def create_vector_store_from_folder(folder_path, embeddings):
    """
    Create a single vector store from all text files in a folder.

    Args:
        folder_path: Path to the folder containing text files.
        embeddings: The embeddings model to use.

    Returns:
        A dictionary containing the created vector store.
    """
    vector_stores = {}
    os.makedirs("vector_stores", exist_ok=True)
    all_chunks = []
    file_names = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            chunks = load_and_split_document(file_path)
            all_chunks.extend(chunks)
            file_names.append(filename)
    print(f"Processing {folder_path}: {len(all_chunks)} chunks created from {len(file_names)} files")
    if not all_chunks:
        # FAISS.from_documents fails on an empty list, so stop with a clear error
        raise ValueError(f"No .txt files found in {folder_path}")
    vectorstore = FAISS.from_documents(all_chunks, embeddings)
    # normpath drops any trailing separator so basename returns the folder name
    store_name = os.path.basename(os.path.normpath(folder_path))
    vectorstore.save_local(f"vector_stores/{store_name}")
    vector_stores[store_name] = vectorstore
    return vector_stores


def load_all_vector_stores(embeddings):
    """
    Load all vector stores from the 'vector_stores' directory.

    Args:
        embeddings: The embeddings model to use.

    Returns:
        A dictionary of loaded vector stores.
    """
    vector_stores = {}
    store_dir = "vector_stores"
    if not os.path.isdir(store_dir):
        # Nothing has been saved yet
        return vector_stores
    for store_name in os.listdir(store_dir):
        store_path = os.path.join(store_dir, store_name)
        if os.path.isdir(store_path):
            # FAISS indexes are pickled on disk, so loading requires explicit opt-in
            vector_stores[store_name] = FAISS.load_local(
                store_path, embeddings, allow_dangerous_deserialization=True
            )
    return vector_stores
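

# The demo below is a sketch of how the module might be wired together; it
# assumes HuggingFaceEmbeddings from langchain_community (any LangChain
# Embeddings implementation can be substituted) and placeholder .txt files
# under a hypothetical "docs/" folder.
if __name__ == "__main__":
    from langchain_community.embeddings import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Build one persisted store per file under vector_stores/<file stem>/
    create_vector_stores(["docs/manual.txt", "docs/faq.txt"], embeddings)

    # Reload every saved store and run a quick similarity search against one
    stores = load_all_vector_stores(embeddings)
    results = stores["manual"].similarity_search("How do I reset my password?", k=3)
    for doc in results:
        print(doc.page_content[:200])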