Spaces:

Tanmay09516
/

langchat

Sleeping

App Files Files Community

langchat / vectorstore.py

Tanmay09516

Upload 14 files

dd87c4b verified 12 months ago

raw

history blame

3.7 kB

	# vectorstore.py

	import os
	from langchain_community.document_loaders import TextLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import FAISS

	def load_and_split_document(file_path, chunk_size=1000, chunk_overlap=150):
	"""
	Load a document from a file and split it into chunks.

	Args:
	file_path: Path to the text file.
	chunk_size: The maximum size of each chunk.
	chunk_overlap: The overlap between chunks.

	Returns:
	A list of document chunks.
	"""
	loader = TextLoader(
	file_path,
	encoding='utf-8',
	autodetect_encoding=True
	)

	try:
	documents = loader.load()
	except RuntimeError:
	# Fallback to a different encoding if autodetection fails
	loader = TextLoader(
	file_path,
	encoding='latin-1',
	autodetect_encoding=False
	)
	documents = loader.load()

	text_splitter = RecursiveCharacterTextSplitter(
	chunk_size=chunk_size,
	chunk_overlap=chunk_overlap,
	length_function=len
	)

	chunks = text_splitter.split_documents(documents)
	return chunks

	def create_vector_stores(doc_paths, embeddings):
	"""
	Create vector stores from a list of document paths.

	Args:
	doc_paths: List of paths to document files.
	embeddings: The embeddings model to use.

	Returns:
	A dictionary of vector stores.
	"""
	vector_stores = {}
	os.makedirs("vector_stores", exist_ok=True)

	for doc_path in doc_paths:
	store_name = os.path.basename(doc_path).split('.')[0]
	chunks = load_and_split_document(doc_path)
	print(f"Processing {store_name}: {len(chunks)} chunks created")
	vectorstore = FAISS.from_documents(chunks, embeddings)
	vectorstore.save_local(f"vector_stores/{store_name}")
	vector_stores[store_name] = vectorstore

	return vector_stores

	def create_vector_store_from_folder(folder_path, embeddings):
	"""
	Create a single vector store from all text files in a folder.

	Args:
	folder_path: Path to the folder containing text files.
	embeddings: The embeddings model to use.

	Returns:
	A dictionary containing the created vector store.
	"""
	vector_stores = {}
	os.makedirs("vector_stores", exist_ok=True)
	all_chunks = []
	file_names = []

	for filename in os.listdir(folder_path):
	if filename.endswith(".txt"):
	file_path = os.path.join(folder_path, filename)
	chunks = load_and_split_document(file_path)
	all_chunks.extend(chunks)
	file_names.append(filename)

	print(f"Processing {folder_path}: {len(all_chunks)} chunks created from {len(file_names)} files")
	vectorstore = FAISS.from_documents(all_chunks, embeddings)
	store_name = os.path.basename(folder_path.rstrip('/'))
	vectorstore.save_local(f"vector_stores/{store_name}")
	vector_stores[store_name] = vectorstore

	return vector_stores

	def load_all_vector_stores(embeddings):
	"""
	Load all vector stores from the 'vector_stores' directory.

	Args:
	embeddings: The embeddings model to use.

	Returns:
	A dictionary of loaded vector stores.
	"""
	vector_stores = {}
	store_dir = "vector_stores"

	for store_name in os.listdir(store_dir):
	store_path = os.path.join(store_dir, store_name)
	if os.path.isdir(store_path):
	vector_stores[store_name] = FAISS.load_local(store_path, embeddings, allow_dangerous_deserialization=True)
	return vector_stores