RAG3 / vector_store.py
jeongsoo's picture
Initial commit
2d0ccb4
"""
๊ฐœ์„ ๋œ ๋ฒกํ„ฐ ์Šคํ† ์–ด ๋ชจ๋“ˆ - Milvus ์„ค์ • ์ตœ์ ํ™” ๋ฐ ์˜ˆ์™ธ ์ฒ˜๋ฆฌ ๊ฐ•ํ™”
"""
import os
import logging
from typing import List, Dict, Any, Optional
import uuid
from langchain.schema import Document
# Logging setup
logger = logging.getLogger("VectorStore")
# Exception classes for vector store operations
class VectorStoreInitError(Exception):
    """Raised when a vector store backend cannot be initialized."""
class EmbeddingModelError(Exception):
    """Raised when the embedding model cannot be initialized."""
class DocumentIndexError(Exception):
    """Raised when documents cannot be indexed into the vector store."""
class VectorSearchError(Exception):
    """Raised when a vector similarity search fails."""
class PersistenceError(Exception):
    """Raised when saving or loading a local index fails."""
# Vector store imports
# Resolve the vector store / embedding classes. The dedicated integration
# packages are tried first; if any of them is missing, everything is taken
# from langchain_community instead. MODERN_IMPORTS records which path won.
try:
    # Current package layout
    from langchain_milvus import Milvus
    from langchain_community.vectorstores import FAISS
    from langchain_huggingface import HuggingFaceEmbeddings
    MODERN_IMPORTS = True
    logger.info("์ตœ์‹  langchain ํŒจํ‚ค์ง€ ์ž„ํฌํŠธ ์„ฑ๊ณต")
except ImportError:
    try:
        # Legacy package layout
        from langchain_community.vectorstores import Milvus, FAISS
        from langchain_community.embeddings import HuggingFaceEmbeddings
        MODERN_IMPORTS = False
        logger.info("๋ ˆ๊ฑฐ์‹œ langchain_community ํŒจํ‚ค์ง€ ์‚ฌ์šฉ")
    except ImportError as e:
        logger.error(f"ํ•„์ˆ˜ ๋ฒกํ„ฐ ์Šคํ† ์–ด ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋ฅผ ์ž„ํฌํŠธํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {e}")
        # Chain the ImportError so the missing module name stays in the traceback.
        raise VectorStoreInitError(f"ํ•„์ˆ˜ ๋ฒกํ„ฐ ์Šคํ† ์–ด ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋ฅผ ์ž„ํฌํŠธํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {str(e)}") from e
from config import MILVUS_HOST, MILVUS_PORT, MILVUS_COLLECTION, EMBEDDING_MODEL
class VectorStore:
    """Document vector store backed by Milvus, with FAISS as a local fallback.

    The embedding model named by ``EMBEDDING_MODEL`` is loaded in ``__init__``.
    The backing store is created later by ``create_or_load``, ``add_documents``
    or ``load_local``. ``create_or_load`` and ``load_local`` switch the instance
    to FAISS (setting ``use_milvus`` to False) when Milvus fails.
    """

    def __init__(self, use_milvus: bool = True):
        """Load the embedding model and record the preferred backend.

        Args:
            use_milvus: Use Milvus when True; use FAISS when False.

        Raises:
            EmbeddingModelError: If the embedding model cannot be constructed.
        """
        self.use_milvus = use_milvus
        self.vector_store = None

        logger.info(f"์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋“œ ์ค‘: {EMBEDDING_MODEL}")
        model_kwargs = {
            "device": "cpu",
            # NOTE(security): this lets the model repository run its own Python
            # code at load time; only point EMBEDDING_MODEL at trusted models.
            "trust_remote_code": True,
        }
        encode_kwargs = {"normalize_embeddings": True}
        try:
            self.embeddings = HuggingFaceEmbeddings(
                model_name=EMBEDDING_MODEL,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs,
            )
            logger.info(f"์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ์ดˆ๊ธฐํ™” ์™„๋ฃŒ: {EMBEDDING_MODEL}")
        except Exception as e:
            logger.error(f"์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ์ดˆ๊ธฐํ™” ์‹คํŒจ: {e}", exc_info=True)
            raise EmbeddingModelError(f"์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ '{EMBEDDING_MODEL}' ์ดˆ๊ธฐํ™” ์‹คํŒจ: {str(e)}") from e

    @staticmethod
    def _milvus_connection_args() -> Dict[str, Any]:
        """Return the Milvus connection settings read from ``config``."""
        return {
            "host": MILVUS_HOST,
            "port": MILVUS_PORT,
        }

    @staticmethod
    def _milvus_index_params() -> Dict[str, Any]:
        """Return the index settings used for every Milvus collection here."""
        return {
            "index_type": "FLAT",     # FLAT index, chosen for accuracy
            "metric_type": "COSINE",  # cosine similarity; embeddings are normalized
            "params": {},             # no extra parameters are passed for FLAT
        }

    def init_milvus(self) -> Milvus:
        """Connect to the configured Milvus collection.

        Returns:
            A Milvus vector store bound to ``MILVUS_COLLECTION``.

        Raises:
            VectorStoreInitError: If constructing the Milvus store fails.
        """
        try:
            logger.info(f"Milvus ์—ฐ๊ฒฐ ์‹œ๋„ ์ค‘: {MILVUS_HOST}:{MILVUS_PORT}")
            milvus_store = Milvus(
                embedding_function=self.embeddings,
                collection_name=MILVUS_COLLECTION,
                connection_args=self._milvus_connection_args(),
                index_params=self._milvus_index_params(),
            )
            logger.info(f"Milvus ์—ฐ๊ฒฐ ์„ฑ๊ณต: {MILVUS_COLLECTION}")
            return milvus_store
        except Exception as e:
            logger.error(f"Milvus ์ดˆ๊ธฐํ™” ์‹คํŒจ: {e}", exc_info=True)
            raise VectorStoreInitError(f"Milvus ๋ฒกํ„ฐ ์Šคํ† ์–ด ์ดˆ๊ธฐํ™” ์‹คํŒจ: {str(e)}") from e

    def init_faiss(self) -> FAISS:
        """Create an empty FAISS store (local fallback).

        Returns:
            A FAISS vector store containing no documents.

        Raises:
            VectorStoreInitError: If the FAISS store cannot be created.
        """
        try:
            logger.info("FAISS ๋ฒกํ„ฐ ์Šคํ† ์–ด ์ดˆ๊ธฐํ™” ์ค‘")
            # FAISS sizes its index from the first embedding, so building it
            # from an empty document list fails. Seed it with one throwaway
            # entry under a unique id, then delete that entry again.
            placeholder_id = str(uuid.uuid4())
            faiss_store = FAISS.from_texts(
                ["placeholder"],
                self.embeddings,
                ids=[placeholder_id],
            )
            faiss_store.delete([placeholder_id])
            logger.info("FAISS ๋ฒกํ„ฐ ์Šคํ† ์–ด ์ดˆ๊ธฐํ™” ์™„๋ฃŒ")
            return faiss_store
        except Exception as e:
            logger.error(f"FAISS ์ดˆ๊ธฐํ™” ์‹คํŒจ: {e}", exc_info=True)
            raise VectorStoreInitError(f"FAISS ๋ฒกํ„ฐ ์Šคํ† ์–ด ์ดˆ๊ธฐํ™” ์‹คํŒจ: {str(e)}") from e

    def _build_milvus_or_fallback(self, documents: List[Document]) -> None:
        """Rebuild the Milvus collection from *documents*, else index them in FAISS."""
        try:
            logger.info(f"Milvus ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ ์ค‘: {MILVUS_COLLECTION} (๊ธฐ์กด ์ปฌ๋ ‰์…˜ ์‚ญ์ œ)")
            self.vector_store = Milvus.from_documents(
                documents=documents,
                embedding=self.embeddings,
                collection_name=MILVUS_COLLECTION,
                connection_args=self._milvus_connection_args(),
                index_params=self._milvus_index_params(),
                drop_old=True,  # drop the existing collection (full rebuild)
            )
            logger.info(f"Milvus ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ ์™„๋ฃŒ: {len(documents)}๊ฐœ ๋ฌธ์„œ ์ธ๋ฑ์‹ฑ๋จ")
        except Exception as e:
            logger.error(f"Milvus ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ ์‹คํŒจ: {e}", exc_info=True)
            logger.warning("Milvus ์‹คํŒจ๋กœ FAISS๋กœ ๋Œ€์ฒดํ•ฉ๋‹ˆ๋‹ค")
            self.use_milvus = False
            try:
                self.vector_store = FAISS.from_documents(documents, self.embeddings)
                logger.info(f"FAISS๋กœ ๋Œ€์ฒด ์„ฑ๊ณต: {len(documents)}๊ฐœ ๋ฌธ์„œ ์ธ๋ฑ์‹ฑ๋จ")
            except Exception as faiss_err:
                logger.error(f"FAISS ๋Œ€์ฒด ์‹คํŒจ: {faiss_err}", exc_info=True)
                raise DocumentIndexError(f"๋ฌธ์„œ ์ธ๋ฑ์‹ฑ ์‹คํŒจ (Milvus ๋ฐ FAISS): {str(e)} / {str(faiss_err)}") from faiss_err

    def _load_milvus_or_fallback(self) -> None:
        """Attach to the existing Milvus collection, else create an empty FAISS store."""
        try:
            self.vector_store = self.init_milvus()
        except VectorStoreInitError as e:
            logger.error(f"Milvus ์ปฌ๋ ‰์…˜ ๋กœ๋“œ ์‹คํŒจ: {e}")
            logger.warning("Milvus ์‹คํŒจ๋กœ FAISS๋กœ ๋Œ€์ฒดํ•ฉ๋‹ˆ๋‹ค")
            self.use_milvus = False
            try:
                self.vector_store = self.init_faiss()
            except VectorStoreInitError as faiss_err:
                logger.error(f"FAISS ๋Œ€์ฒด ์‹คํŒจ: {faiss_err}", exc_info=True)
                raise VectorStoreInitError(f"๋ฒกํ„ฐ ์Šคํ† ์–ด ์ดˆ๊ธฐํ™” ์‹คํŒจ (Milvus ๋ฐ FAISS): {str(e)} / {str(faiss_err)}") from faiss_err

    def create_or_load(self, documents: Optional[List[Document]] = None) -> Any:
        """Create the vector store from documents, or attach to / create an empty one.

        With Milvus enabled, a non-empty *documents* list rebuilds the
        collection (``drop_old=True``); otherwise the existing collection is
        opened. Any Milvus failure switches the instance to FAISS.

        Args:
            documents: Documents to index. ``None`` or an empty list means
                "load existing / create empty".

        Returns:
            The vector store instance now held in ``self.vector_store``.

        Raises:
            DocumentIndexError: If indexing the documents fails on every
                backend that was tried.
            VectorStoreInitError: If no empty/existing store could be opened.
        """
        if self.use_milvus:
            if documents:
                self._build_milvus_or_fallback(documents)
            else:
                self._load_milvus_or_fallback()
        elif documents:
            try:
                logger.info(f"FAISS ์ธ๋ฑ์Šค ์ƒ์„ฑ ์ค‘: {len(documents)}๊ฐœ ๋ฌธ์„œ")
                self.vector_store = FAISS.from_documents(documents, self.embeddings)
                logger.info("FAISS ์ธ๋ฑ์Šค ์ƒ์„ฑ ์™„๋ฃŒ")
            except Exception as e:
                logger.error(f"FAISS ์ธ๋ฑ์Šค ์ƒ์„ฑ ์‹คํŒจ: {e}", exc_info=True)
                raise DocumentIndexError(f"FAISS ๋ฌธ์„œ ์ธ๋ฑ์‹ฑ ์‹คํŒจ: {str(e)}") from e
        else:
            # init_faiss already logs and raises VectorStoreInitError itself.
            self.vector_store = self.init_faiss()
        return self.vector_store

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the vector store, creating the store if needed.

        Args:
            documents: Documents to add. An empty list is a logged no-op.

        Raises:
            DocumentIndexError: If creating the store or adding documents fails.
        """
        if not documents:
            logger.warning("์ถ”๊ฐ€ํ•  ๋ฌธ์„œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค")
            return
        try:
            if self.vector_store is None:
                # NOTE(review): with Milvus enabled this goes through the
                # drop_old=True path of create_or_load, which replaces any
                # existing collection rather than appending - confirm intended.
                logger.info("๋ฒกํ„ฐ ์Šคํ† ์–ด๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ์ƒˆ ๋ฒกํ„ฐ ์Šคํ† ์–ด๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.")
                self.create_or_load(documents)
            else:
                logger.info(f"{len(documents)}๊ฐœ ๋ฌธ์„œ๋ฅผ ๊ธฐ์กด ๋ฒกํ„ฐ ์Šคํ† ์–ด์— ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค")
                self.vector_store.add_documents(documents)
                logger.info(f"{len(documents)}๊ฐœ ๋ฌธ์„œ ์ถ”๊ฐ€ ์™„๋ฃŒ")
        except Exception as e:
            logger.error(f"๋ฌธ์„œ ์ถ”๊ฐ€ ์‹คํŒจ: {e}", exc_info=True)
            raise DocumentIndexError(f"๋ฒกํ„ฐ ์Šคํ† ์–ด์— ๋ฌธ์„œ ์ถ”๊ฐ€ ์‹คํŒจ: {str(e)}") from e

    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
        """Run a vector similarity search.

        Args:
            query: Search text. Empty or whitespace-only queries return ``[]``.
            k: Number of results requested from the store.

        Returns:
            The documents returned by the underlying store's similarity search.

        Raises:
            VectorSearchError: If the store is not initialized or the search fails.
        """
        if not query or not query.strip():
            logger.warning("๋นˆ ์ฟผ๋ฆฌ๋กœ ๊ฒ€์ƒ‰ ์‹œ๋„")
            return []
        if self.vector_store is None:
            logger.error("๋ฒกํ„ฐ ์Šคํ† ์–ด๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค")
            raise VectorSearchError("๋ฒกํ„ฐ ์Šคํ† ์–ด๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค")
        try:
            # Only the first 50 characters of the query are logged.
            logger.info(f"๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ ์‹คํ–‰: '{query[:50]}{'...' if len(query) > 50 else ''}', ์ƒ์œ„ {k}๊ฐœ ๊ฒฐ๊ณผ ์š”์ฒญ")
            results = self.vector_store.similarity_search(query, k=k)
            logger.info(f"๊ฒ€์ƒ‰ ์™„๋ฃŒ: {len(results)}๊ฐœ ๊ฒฐ๊ณผ ์ฐพ์Œ")
            return results
        except Exception as e:
            logger.error(f"๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}", exc_info=True)
            raise VectorSearchError(f"๋ฒกํ„ฐ ๊ฒ€์ƒ‰ ์‹คํŒจ: {str(e)}") from e

    def save_local(self, path: str = "faiss_index") -> bool:
        """Persist the FAISS index to disk; a no-op when Milvus is in use.

        Args:
            path: Target location handed to the FAISS store's ``save_local``.

        Returns:
            True on success (including the Milvus no-op case).

        Raises:
            PersistenceError: If the store is not initialized or saving fails.
        """
        if self.vector_store is None:
            logger.error("์ €์žฅํ•  ๋ฒกํ„ฐ ์Šคํ† ์–ด๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค")
            raise PersistenceError("์ €์žฅํ•  ๋ฒกํ„ฐ ์Šคํ† ์–ด๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค")

        # Only FAISS is saved locally.
        if self.use_milvus:
            logger.info("Milvus๋Š” ๋กœ์ปฌ ์ €์žฅ์ด ํ•„์š”ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค")
            return True

        try:
            # Create the parent directory, or the path itself when it has no parent part.
            os.makedirs(os.path.dirname(path) or path, exist_ok=True)
            self.vector_store.save_local(path)
            logger.info(f"FAISS ์ธ๋ฑ์Šค ๋กœ์ปฌ ์ €์žฅ ์™„๋ฃŒ: {path}")
            return True
        except Exception as e:
            logger.error(f"FAISS ์ธ๋ฑ์Šค ์ €์žฅ ์‹คํŒจ: {e}", exc_info=True)
            raise PersistenceError(f"๋ฒกํ„ฐ ์ธ๋ฑ์Šค ์ €์žฅ ์‹คํŒจ: {str(e)}") from e

    def load_local(self, path: str = "faiss_index") -> bool:
        """Load the FAISS index from disk, or just connect when Milvus is in use.

        With Milvus enabled, a successful connection returns immediately; a
        failed one switches the instance to FAISS and continues with *path*.

        Args:
            path: Location of the saved FAISS index.

        Returns:
            True if Milvus connected or the FAISS index was loaded; False if
            loading failed and a fresh empty FAISS store was created instead.

        Raises:
            FileNotFoundError: If *path* does not exist.
            PersistenceError: If FAISS reports a missing index file.
            VectorStoreInitError: If the replacement empty FAISS store cannot
                be created after a failed load.
        """
        if self.use_milvus:
            logger.info("Milvus ์‚ฌ์šฉ ์ค‘์ด๋ฏ€๋กœ ๋กœ์ปฌ ๋กœ๋“œ๋ฅผ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค")
            try:
                self.vector_store = self.init_milvus()
                return True
            except Exception as e:
                logger.error(f"Milvus ์—ฐ๊ฒฐ ์‹คํŒจ, FAISS๋กœ ๋Œ€์ฒด: {e}")
                self.use_milvus = False
                # Fall through and try the local FAISS index.

        if not os.path.exists(path):
            logger.warning(f"์ธ๋ฑ์Šค ๊ฒฝ๋กœ๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Œ: {path}")
            raise FileNotFoundError(f"๋ฒกํ„ฐ ์ธ๋ฑ์Šค ๊ฒฝ๋กœ๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Œ: {path}")

        try:
            logger.info(f"FAISS ์ธ๋ฑ์Šค ๋กœ๋“œ ์ค‘: {path}")
            # NOTE(security): allow_dangerous_deserialization opts in to loading
            # pickled data; only load index directories this application wrote.
            self.vector_store = FAISS.load_local(
                path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
            logger.info(f"FAISS ์ธ๋ฑ์Šค ๋กœ๋“œ ์™„๋ฃŒ: {path}")
            return True
        except FileNotFoundError as e:
            logger.error(f"FAISS ์ธ๋ฑ์Šค ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Œ: {e}")
            raise PersistenceError(f"๋ฒกํ„ฐ ์ธ๋ฑ์Šค ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Œ: {str(e)}") from e
        except Exception as e:
            logger.error(f"FAISS ์ธ๋ฑ์Šค ๋กœ๋“œ ์‹คํŒจ: {e}", exc_info=True)
            # Also emit the traceback as plain message text.
            import traceback
            logger.error(f"์ƒ์„ธ ์˜ค๋ฅ˜: {traceback.format_exc()}")
            # Replace the unreadable index with a fresh empty one.
            logger.warning("์ธ๋ฑ์Šค ๋กœ๋“œ ์‹คํŒจ๋กœ ์ƒˆ FAISS ์ธ๋ฑ์Šค ์ดˆ๊ธฐํ™”")
            self.vector_store = self.init_faiss()
            return False