|
import os
|
|
from typing import Optional, List
|
|
import shutil
|
|
from zipfile import ZipFile
|
|
from langchain_community.vectorstores import FAISS
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_core.embeddings import Embeddings
|
|
|
|
from document_processor import DocumentProcessor
|
|
|
|
|
|
class VectorStoreManager:
    """Manage a FAISS vectorstore: creation, deletion, persistence and
    similarity search over its documents."""

    def __init__(self, path: str, name: str, embeddings: Embeddings):
        """
        Initialize the manager for one named vectorstore.

        Parameters:
        - path: str - directory containing the source documents (usually
          "database", the directory where the databases are stored).
        - name: str - vectorstore name (usually the name of the database
          that holds the documents).
        - embeddings: Embeddings - embeddings model used by the vectorstore.
        """
        self.path = path
        self.name = name
        self.embeddings = embeddings
        # Loaded lazily from disk (or built) on first use.
        self.vectorstore = None
def create_vectorstore(self) -> bool:
|
|
documents = DocumentProcessor(self.path).files_to_texts()
|
|
text_splitter = RecursiveCharacterTextSplitter(
|
|
chunk_size=1000, chunk_overlap=200, length_function=len
|
|
)
|
|
texts = text_splitter.split_documents(documents)
|
|
self.vectorstore = FAISS.from_documents(
|
|
documents=texts, embedding=self.embeddings
|
|
)
|
|
base_de_datos_dir = os.path.join("database", self.name)
|
|
self.vectorstore.save_local(folder_path=base_de_datos_dir)
|
|
return True
|
|
|
|
def delete_vectorstore(self) -> bool:
|
|
try:
|
|
shutil.rmtree(f"database/{self.name}")
|
|
except FileNotFoundError:
|
|
return False
|
|
return True
|
|
|
|
def search_similarity(self, query: str, fuente: Optional[str] = None) -> str:
|
|
"""
|
|
Modo de uso:
|
|
debe ingresar la query y la fuente (opcional) para buscar documentos similares en el vectorstore.
|
|
|
|
Nota: debe estar definido el vectorstore para poder realizar la b煤squeda.
|
|
|
|
Par谩metros:
|
|
query: str - texto de la query.
|
|
fuente: str - fuente de los documentos a buscar.
|
|
|
|
Retorna:
|
|
str - documentos similares.
|
|
"""
|
|
if not self.vectorstore:
|
|
self.vectorstore = self.load_vectorstore()
|
|
|
|
if fuente:
|
|
filtro = {"source": fuente}
|
|
retriever = self.vectorstore.similarity_search(
|
|
query=query, k=5, filter=filtro
|
|
)
|
|
else:
|
|
retriever = self.vectorstore.similarity_search(query=query, k=5)
|
|
busqueda = [
|
|
{
|
|
"content": doc.page_content,
|
|
"title": doc.metadata.get("title", None),
|
|
"source": doc.metadata.get("source", None),
|
|
}
|
|
for doc in retriever
|
|
]
|
|
|
|
return str(busqueda)
|
|
|
|
def list_sources(self) -> List[str]:
|
|
if not self.vectorstore:
|
|
self.vectorstore = self.load_vectorstore()
|
|
|
|
docstore_dict = self.vectorstore.docstore._dict
|
|
source_metadata = {}
|
|
for doc_id, document in docstore_dict.items():
|
|
source = document.metadata.get("source", None)
|
|
source_metadata[doc_id] = source
|
|
|
|
return list(set(source_metadata.values()))
|
|
|
|
def extract_texts_by_source(self, source: str) -> List[str]:
|
|
if not self.vectorstore:
|
|
self.vectorstore = self.load_vectorstore()
|
|
|
|
docstore_dict = self.vectorstore.docstore._dict
|
|
texts = []
|
|
for document in docstore_dict.values():
|
|
source_doc = document.metadata.get("source", None)
|
|
if source_doc == source:
|
|
texts.append(document.page_content)
|
|
return texts
|
|
|
|
def save_text_to_file_temp(self, source: str) -> bool:
|
|
texts = self.extract_texts_by_source(source)
|
|
carpeta = "temp"
|
|
target_source_safe = source.replace("\\", "_").replace("/", "_")
|
|
file_path = os.path.join(carpeta, target_source_safe + ".txt")
|
|
|
|
try:
|
|
if os.path.exists(carpeta):
|
|
shutil.rmtree(carpeta)
|
|
os.makedirs(carpeta)
|
|
|
|
with open(file_path, "w", encoding="utf-8") as file:
|
|
for text in texts:
|
|
file.write(text)
|
|
file.write("\n")
|
|
return True
|
|
except Exception:
|
|
return False
|
|
|
|
def load_vectorstore(self) -> FAISS:
|
|
return FAISS.load_local(
|
|
folder_path=os.path.join("database", self.name),
|
|
embeddings=self.embeddings,
|
|
allow_dangerous_deserialization=True,
|
|
)
|
|
|
|
def add_files_vectorstore(self) -> Optional[FAISS]:
|
|
temp_folder = "docs"
|
|
if not os.path.exists(temp_folder):
|
|
os.makedirs(temp_folder)
|
|
return None
|
|
|
|
documents = DocumentProcessor(temp_folder).files_to_texts()
|
|
if not documents:
|
|
return None
|
|
|
|
text_splitter = RecursiveCharacterTextSplitter(
|
|
chunk_size=1000, chunk_overlap=200, length_function=len
|
|
)
|
|
texts = text_splitter.split_documents(documents)
|
|
self.vectorstore = self.load_vectorstore()
|
|
self.vectorstore.add_documents(documents=texts)
|
|
self.vectorstore.save_local(folder_path=os.path.join("database", self.name))
|
|
return self.vectorstore
|
|
|
|
def download_vectorstore(self):
|
|
|
|
with ZipFile("temp/vectorstore.zip", "w") as zip:
|
|
for root, dirs, files in os.walk(f"database/{self.name}"):
|
|
for file in files:
|
|
zip.write(os.path.join(root, file))
|
|
return "temp/vectorstore.zip"
|
|
|