""" | |
λλ²κΉ μ μν μ½λ μΆκ° - κ²½λ‘ κ΄λ ¨ λ¬Έμ ν΄κ²° | |
""" | |
import os
import time
import hashlib
import pickle
import json
import logging
import glob
from typing import List, Dict, Tuple, Any, Optional
from logging.handlers import RotatingFileHandler
from pathlib import Path
from langchain.schema import Document
from config import (
    PDF_DIRECTORY, CACHE_DIRECTORY, CHUNK_SIZE, CHUNK_OVERLAP,
    LLM_MODEL, LOG_LEVEL, LOG_FILE, print_config, validate_config
)
from optimized_document_processor import OptimizedDocumentProcessor
from vector_store import VectorStore
import sys
print("===== Script starting =====") | |
sys.stdout.flush() # μ¦μ μΆλ ₯ κ°μ | |
# μ£Όμ ν¨μ/λ©μλ νΈμΆ μ νμλ λλ²κΉ μΆλ ₯ μΆκ° | |
print("Loading config...") | |
sys.stdout.flush() | |
# from config import ... λ±μ μ½λ | |
print("Config loaded!") | |
sys.stdout.flush() | |
# Logging setup
def setup_logging():
    """Configure application logging."""
    # Set the log level
    log_level = getattr(logging, LOG_LEVEL.upper(), logging.INFO)

    # Set the log format
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    formatter = logging.Formatter(log_format)

    # Configure the root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Handlers
    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    root_logger.addHandler(console_handler)

    # File handler (optional)
    try:
        file_handler = RotatingFileHandler(
            LOG_FILE,
            maxBytes=10*1024*1024,  # 10 MB
            backupCount=5
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
    except Exception as e:
        root_logger.warning(f"Failed to set up the log file: {e}. Using console logging only.")

    return logging.getLogger("AutoRAG")


# Set up the logger
logger = setup_logging()
# Debugging code to check the current working directory
current_dir = os.getcwd()
logger.info(f"Current working directory: {current_dir}")

# Check the configured PDF directory
abs_pdf_dir = os.path.abspath(PDF_DIRECTORY)
logger.info(f"Configured PDF directory: {PDF_DIRECTORY}")
logger.info(f"PDF directory as an absolute path: {abs_pdf_dir}")

# Check that the PDF directory exists
if os.path.exists(abs_pdf_dir):
    logger.info(f"PDF directory exists: {abs_pdf_dir}")
    # Inspect the directory contents
    pdf_files = glob.glob(os.path.join(abs_pdf_dir, "*.pdf"))
    logger.info(f"PDF files in the directory: {pdf_files}")
else:
    logger.error(f"PDF directory does not exist: {abs_pdf_dir}")
    # Inspect the parent directory contents
    parent_dir = os.path.dirname(abs_pdf_dir)
    logger.info(f"Parent directory: {parent_dir}")
    if os.path.exists(parent_dir):
        dir_contents = os.listdir(parent_dir)
        logger.info(f"Parent directory contents: {dir_contents}")

# Check the configuration status
logger.info("Validating application configuration...")
config_status = validate_config()
if config_status["status"] != "valid":
    for warning in config_status["warnings"]:
        logger.warning(f"Configuration warning: {warning}")
# Guarded imports
try:
    from rag_chain import RAGChain
    RAG_CHAIN_AVAILABLE = True
    print("RAG chain module loaded successfully!")
except ImportError as e:
    logger.warning(f"Could not load the RAG chain module: {e}")
    RAG_CHAIN_AVAILABLE = False
except Exception as e:
    logger.warning(f"Unexpected error while loading the RAG chain module: {e}")
    RAG_CHAIN_AVAILABLE = False

# Also check the fallback RAG modules up front
try:
    from fallback_rag_chain import FallbackRAGChain
    FALLBACK_AVAILABLE = True
    print("Fallback RAG chain module loaded successfully!")
except ImportError as e:
    logger.warning(f"Could not load the fallback RAG chain module: {e}")
    FALLBACK_AVAILABLE = False

try:
    from offline_fallback_rag import OfflineFallbackRAG
    OFFLINE_FALLBACK_AVAILABLE = True
    print("Offline fallback RAG module loaded successfully!")
except ImportError as e:
    logger.warning(f"Could not load the offline fallback RAG module: {e}")
    OFFLINE_FALLBACK_AVAILABLE = False
class DocumentProcessingError(Exception):
    """Raised when document processing fails."""
    pass


class VectorStoreError(Exception):
    """Raised when a vector store operation fails."""
    pass


class RAGInitializationError(Exception):
    """Raised when RAG chain initialization fails."""
    pass


class ConfigurationError(Exception):
    """Raised for configuration-related errors."""
    pass
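
# These wrapper exceptions let process_query() surface a more specific error message
# depending on whether RAG initialization, document processing, or the vector store failed.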
class AutoRAGChatApp:
    """
    RAG chatbot that automatically processes the PDF files in the documents folder.
    """

    def __init__(self):
        """
        Initialize the RAG chatbot application.
        """
        try:
            logger.info("Starting AutoRAGChatApp initialization")

            # Data directories (taken from the configuration),
            # converted to absolute paths before use
            self.pdf_directory = os.path.abspath(PDF_DIRECTORY)
            self.cache_directory = os.path.abspath(CACHE_DIRECTORY)
            self.index_file = os.path.join(self.cache_directory, "file_index.json")
            self.chunks_dir = os.path.join(self.cache_directory, "chunks")
            self.vector_index_dir = os.path.join(self.cache_directory, "vector_index")

            logger.info(f"Configured PDF directory (absolute path): {self.pdf_directory}")

            # Validate the PDF directory
            self._verify_pdf_directory()
            # Create the required directories
            self._ensure_directories_exist()

            logger.info(f"PDF document directory: '{self.pdf_directory}'")
            logger.info(f"Cache directory: '{self.cache_directory}'")

            # Initialize components
            try:
                self.document_processor = OptimizedDocumentProcessor(
                    chunk_size=CHUNK_SIZE,
                    chunk_overlap=CHUNK_OVERLAP
                )
            except Exception as e:
                logger.error(f"Document processor initialization failed: {e}")
                raise DocumentProcessingError(f"Document processor initialization failed: {str(e)}")

            # Initialize the vector store
            try:
                self.vector_store = VectorStore(use_milvus=False)
            except Exception as e:
                logger.error(f"Vector store initialization failed: {e}")
                raise VectorStoreError(f"Vector store initialization failed: {str(e)}")

            # Load the document index
            self.file_index = self._load_file_index()

            # Initialize default state
            self.documents = []
            self.processed_files = []
            self.is_initialized = False

            # Automatically load and process documents at startup
            logger.info("Starting automatic document loading and processing...")
            self.auto_process_documents()

            logger.info("AutoRAGChatApp initialization complete")
        except Exception as e:
            logger.critical(f"Critical error during application initialization: {e}", exc_info=True)
            # Fall back to a default state so the app keeps minimal functionality
            self.pdf_directory = os.path.abspath(PDF_DIRECTORY)
            self.documents = []
            self.processed_files = []
            self.is_initialized = False
            self.file_index = {}

    def _ensure_directories_exist(self) -> None:
        """
        Make sure the required directories exist, creating them if necessary.
        """
        directories = [
            self.pdf_directory,
            self.cache_directory,
            self.chunks_dir,
            self.vector_index_dir
        ]
        for directory in directories:
            try:
                os.makedirs(directory, exist_ok=True)
            except Exception as e:
                logger.error(f"Failed to create directory '{directory}': {e}")
                raise OSError(f"Failed to create directory '{directory}': {str(e)}")
    def _process_pdf_file(self, file_path: str) -> List[Document]:
        """
        Process a PDF file, falling back to PyPDFLoader if docling fails.

        Args:
            file_path: path of the PDF file to process

        Returns:
            List of processed document chunks
        """
        if not os.path.exists(file_path):
            logger.error(f"File does not exist: {file_path}")
            raise FileNotFoundError(f"File does not exist: {file_path}")

        try:
            logger.info(f"Attempting to process with docling: {file_path}")

            # Try docling first
            try:
                # Set a 60-second timeout (optional)
                import signal

                def timeout_handler(signum, frame):
                    raise TimeoutError("docling processing timed out (60 seconds)")

                # Only works on Linux/macOS (ignored on Windows)
                try:
                    signal.signal(signal.SIGALRM, timeout_handler)
                    signal.alarm(60)  # 60-second timeout
                except (AttributeError, ValueError) as se:
                    logger.warning(f"Could not set the signal handler (probably a Windows environment): {se}")

                # Process with docling
                chunks = self.document_processor.process_pdf(file_path, use_docling=True)

                # Cancel the timeout
                try:
                    signal.alarm(0)
                except (AttributeError, ValueError):
                    pass

                return chunks

            except TimeoutError as te:
                logger.warning(f"docling processing timed out: {te}")
                logger.info("Falling back to PyPDFLoader.")

                # Fall back to PyPDFLoader
                try:
                    return self.document_processor.process_pdf(file_path, use_docling=False)
                except Exception as inner_e:
                    logger.error(f"PyPDFLoader processing error: {inner_e}", exc_info=True)
                    raise DocumentProcessingError(f"PDF loading failed (PyPDFLoader): {str(inner_e)}")

            except Exception as e:
                # Inspect the docling error
                error_str = str(e)
                if "Invalid code point" in error_str or "RuntimeError" in error_str:
                    logger.warning(f"docling processing error (code point issue): {error_str}")
                    logger.info("Falling back to PyPDFLoader.")
                else:
                    logger.warning(f"docling processing error: {error_str}")
                    logger.info("Falling back to PyPDFLoader.")

                # Fall back to PyPDFLoader
                try:
                    return self.document_processor.process_pdf(file_path, use_docling=False)
                except Exception as inner_e:
                    logger.error(f"PyPDFLoader processing error: {inner_e}", exc_info=True)
                    raise DocumentProcessingError(f"PDF loading failed (PyPDFLoader): {str(inner_e)}")

        except DocumentProcessingError:
            # Re-raise exceptions that are already wrapped
            raise
        except Exception as e:
            logger.error(f"Critical error while processing the PDF: {e}", exc_info=True)
            # Return an empty chunk list so overall processing is not interrupted
            logger.warning(f"Returning an empty chunk list because '{file_path}' failed to process")
            return []
    def _load_file_index(self) -> Dict[str, Dict[str, Any]]:
        """
        Load the file index.

        Returns:
            Mapping of file path -> metadata
        """
        if os.path.exists(self.index_file):
            try:
                with open(self.index_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse the index file as JSON: {e}")
                logger.warning("The index file is corrupted; a new index will be created.")
                return {}
            except Exception as e:
                logger.error(f"Failed to load the index file: {e}")
                return {}
        return {}

    def _save_file_index(self) -> None:
        """
        Save the file index.
        """
        try:
            with open(self.index_file, 'w', encoding='utf-8') as f:
                json.dump(self.file_index, f, ensure_ascii=False, indent=2)
            logger.debug("File index saved")
        except Exception as e:
            logger.error(f"Failed to save the file index: {e}")
            raise IOError(f"Failed to save the file index: {str(e)}")

    def _calculate_file_hash(self, file_path: str) -> str:
        """
        Compute a hash of the file.

        Args:
            file_path: file path

        Returns:
            MD5 hash value
        """
        if not os.path.exists(file_path):
            logger.error(f"Hash computation failed - file does not exist: {file_path}")
            raise FileNotFoundError(f"File does not exist: {file_path}")

        try:
            hasher = hashlib.md5()
            with open(file_path, 'rb') as f:
                buf = f.read(65536)
                while len(buf) > 0:
                    hasher.update(buf)
                    buf = f.read(65536)
            return hasher.hexdigest()
        except Exception as e:
            logger.error(f"Error while computing the file hash: {e}")
            raise IOError(f"File hash computation failed: {str(e)}")
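
    # Note: MD5 is used here only as a cheap change-detection fingerprint for the chunk cache
    # (and as the chunk cache file name); it is not used for any security-sensitive purpose.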
    def _is_file_processed(self, file_path: str) -> bool:
        """
        Check whether the file has already been processed and has not changed since.

        Args:
            file_path: file path

        Returns:
            Whether the file counts as processed
        """
        # Check that the file exists
        if not os.path.exists(file_path):
            logger.warning(f"File does not exist: {file_path}")
            return False

        # Check whether the file is in the index
        if file_path not in self.file_index:
            return False

        try:
            # Compute the current hash
            current_hash = self._calculate_file_hash(file_path)

            # Compare with the stored hash
            if self.file_index[file_path]['hash'] != current_hash:
                logger.info(f"File change detected: {file_path}")
                return False

            # Check that the chunk file exists
            chunks_path = self.file_index[file_path]['chunks_path']
            if not os.path.exists(chunks_path):
                logger.warning(f"Chunk file does not exist: {chunks_path}")
                return False

            return True
        except Exception as e:
            logger.error(f"Error while checking the file's processing state: {e}")
            return False

    def _get_chunks_path(self, file_hash: str) -> str:
        """
        Build the chunk file path.

        Args:
            file_hash: file hash value

        Returns:
            Chunk file path
        """
        return os.path.join(self.chunks_dir, f"{file_hash}.pkl")
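
    # On-disk cache layout (derived from the paths set up in __init__):
    #   {CACHE_DIRECTORY}/file_index.json    - file path -> hash/metadata mapping
    #   {CACHE_DIRECTORY}/chunks/{md5}.pkl   - pickled chunk lists, keyed by file hash
    #   {CACHE_DIRECTORY}/vector_index/      - persisted vector index files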
    def _save_chunks(self, file_path: str, chunks: List[Document]) -> None:
        """
        Save chunk data.

        Args:
            file_path: original file path
            chunks: list of document chunks
        """
        try:
            # Compute the hash
            file_hash = self._calculate_file_hash(file_path)

            # Chunk file path
            chunks_path = self._get_chunks_path(file_hash)

            # Save the chunk data
            with open(chunks_path, 'wb') as f:
                pickle.dump(chunks, f)

            # Update the index
            self.file_index[file_path] = {
                'hash': file_hash,
                'chunks_path': chunks_path,
                'last_processed': time.time(),
                'chunks_count': len(chunks),
                'file_size': os.path.getsize(file_path),
                'file_name': os.path.basename(file_path)
            }

            # Save the index
            self._save_file_index()

            logger.info(f"Chunks saved: {file_path} ({len(chunks)} chunks)")
        except Exception as e:
            logger.error(f"Failed to save chunks: {e}", exc_info=True)
            raise IOError(f"Failed to save chunks: {str(e)}")

    def _load_chunks(self, file_path: str) -> List[Document]:
        """
        Load previously saved chunk data.

        Args:
            file_path: file path

        Returns:
            List of document chunks
        """
        if file_path not in self.file_index:
            logger.error(f"File is not in the index: {file_path}")
            raise KeyError(f"File is not in the index: {file_path}")

        chunks_path = self.file_index[file_path]['chunks_path']
        if not os.path.exists(chunks_path):
            logger.error(f"Chunk file does not exist: {chunks_path}")
            raise FileNotFoundError(f"Chunk file does not exist: {chunks_path}")

        try:
            with open(chunks_path, 'rb') as f:
                chunks = pickle.load(f)
            logger.info(f"Chunks loaded: {file_path} ({len(chunks)} chunks)")
            return chunks
        except pickle.UnpicklingError as e:
            logger.error(f"Failed to deserialize the chunk file: {e}")
            raise IOError(f"Chunk file is corrupted: {str(e)}")
        except Exception as e:
            logger.error(f"Failed to load chunks: {e}", exc_info=True)
            raise IOError(f"Failed to load chunks: {str(e)}")
    def _verify_pdf_directory(self):
        """Validate the PDF directory and check that PDF files exist."""
        try:
            # Check that the directory exists
            if not os.path.exists(self.pdf_directory):
                try:
                    logger.warning(f"PDF directory does not exist; creating it: {self.pdf_directory}")
                    os.makedirs(self.pdf_directory, exist_ok=True)
                except Exception as e:
                    logger.error(f"Failed to create the PDF directory: {e}")
                    raise

            # Check that the path is a directory
            if not os.path.isdir(self.pdf_directory):
                logger.error(f"The PDF path is not a directory: {self.pdf_directory}")
                raise ConfigurationError(f"The PDF path is not a directory: {self.pdf_directory}")

            # Check for PDF files
            pdf_files = [f for f in os.listdir(self.pdf_directory) if f.lower().endswith('.pdf')]
            if pdf_files:
                logger.info(f"Found {len(pdf_files)} PDF files in the PDF directory: {pdf_files}")
            else:
                # Try a few alternative paths for PDF files
                alternative_paths = [
                    "./documents",
                    "../documents",
                    "documents",
                    os.path.join(os.getcwd(), "documents")
                ]

                found_pdfs = False
                for alt_path in alternative_paths:
                    if os.path.exists(alt_path) and os.path.isdir(alt_path):
                        alt_pdf_files = [f for f in os.listdir(alt_path) if f.lower().endswith('.pdf')]
                        if alt_pdf_files:
                            logger.warning(f"Found PDF files in the alternative path '{alt_path}'. Using this path instead.")
                            self.pdf_directory = os.path.abspath(alt_path)
                            found_pdfs = True
                            break

                if not found_pdfs:
                    logger.warning(f"There are no PDF files in the PDF directory: {self.pdf_directory}")
                    logger.info("Please add PDF files to the directory.")
        except Exception as e:
            logger.error(f"Error while validating the PDF directory: {e}", exc_info=True)
            raise
    def auto_process_documents(self) -> str:
        """
        Automatically process the PDF files in the documents folder.

        Returns:
            Processing result message
        """
        try:
            start_time = time.time()

            # Collect the PDF file list, handling a few path variations
            try:
                pdf_files = []

                # Look for PDF files in the configured directory
                logger.info(f"PDF file search path: {self.pdf_directory}")
                if os.path.exists(self.pdf_directory) and os.path.isdir(self.pdf_directory):
                    # Log the directory contents (for debugging)
                    dir_contents = os.listdir(self.pdf_directory)
                    logger.info(f"Directory contents: {dir_contents}")

                    # Keep only PDF files
                    for filename in os.listdir(self.pdf_directory):
                        if filename.lower().endswith('.pdf'):
                            file_path = os.path.join(self.pdf_directory, filename)
                            if os.path.isfile(file_path):  # make sure it is a regular file
                                pdf_files.append(file_path)
                                logger.info(f"PDF file found: {file_path}")

                # Log everything that was found
                logger.info(f"All PDF files found: {pdf_files}")

            except FileNotFoundError:
                logger.error(f"PDF directory not found: {self.pdf_directory}")
                return f"Cannot find the '{self.pdf_directory}' directory. Check that it exists."
            except PermissionError:
                logger.error(f"No permission to access the PDF directory: {self.pdf_directory}")
                return f"Cannot access the '{self.pdf_directory}' directory. Check its permissions."

            if not pdf_files:
                logger.warning(f"There are no PDF files in the '{self.pdf_directory}' folder.")
                return f"There are no PDF files in the '{self.pdf_directory}' folder."

            logger.info(f"PDF files found: {len(pdf_files)}")

            # Process the PDF files in the folder
            new_files = []
            updated_files = []
            cached_files = []
            failed_files = []
            all_chunks = []

            for file_path in pdf_files:
                try:
                    if self._is_file_processed(file_path):
                        # Load chunks from the cache
                        try:
                            chunks = self._load_chunks(file_path)
                            all_chunks.extend(chunks)
                            cached_files.append(file_path)
                            self.processed_files.append(os.path.basename(file_path))
                        except Exception as e:
                            logger.error(f"Failed to load cached chunks: {e}")
                            # Reprocess the file
                            logger.info(f"Reprocessing file because the cache failed: {file_path}")
                            chunks = self._process_pdf_file(file_path)
                            if chunks:
                                self._save_chunks(file_path, chunks)
                                all_chunks.extend(chunks)
                                updated_files.append(file_path)
                                self.processed_files.append(os.path.basename(file_path))
                            else:
                                failed_files.append(file_path)
                    else:
                        # Process new or changed files
                        logger.info(f"Processing: {file_path}")

                        try:
                            # Use the improved PDF processing method
                            chunks = self._process_pdf_file(file_path)

                            if chunks:  # save only when chunks were extracted
                                # Save the chunks
                                self._save_chunks(file_path, chunks)

                                all_chunks.extend(chunks)
                                if file_path in self.file_index:
                                    updated_files.append(file_path)
                                else:
                                    new_files.append(file_path)

                                self.processed_files.append(os.path.basename(file_path))
                            else:
                                logger.warning(f"Failed to process '{file_path}': no chunks extracted")
                                failed_files.append(file_path)
                        except Exception as e:
                            logger.error(f"Error while processing '{file_path}': {e}", exc_info=True)
                            failed_files.append(file_path)
                except Exception as e:
                    logger.error(f"Error in the processing loop for '{file_path}': {e}", exc_info=True)
                    failed_files.append(file_path)

            # Keep all chunks
            self.documents = all_chunks

            processing_time = time.time() - start_time
            logger.info(f"Document processing complete: {len(all_chunks)} chunks, {processing_time:.2f}s")
            # Process the vector index
            try:
                self._process_vector_index(new_files, updated_files)
            except Exception as e:
                logger.error(f"Vector index processing failed: {e}", exc_info=True)
                return f"The documents were processed, but building the vector index failed: {str(e)}"

            # Initialize the RAG chain
            if RAG_CHAIN_AVAILABLE:
                try:
                    logger.info("Attempting initialization with RAGChain.")
                    self.rag_chain = RAGChain(self.vector_store)
                    self.is_initialized = True
                    logger.info("RAG chain initialized successfully")
                except Exception as e:
                    logger.error(f"RAG chain initialization failed: {e}", exc_info=True)

                    # Try FallbackRAGChain instead
                    try:
                        logger.info("Falling back to FallbackRAGChain...")
                        from fallback_rag_chain import FallbackRAGChain
                        self.rag_chain = FallbackRAGChain(self.vector_store)
                        self.is_initialized = True
                        logger.info("Fallback RAG chain initialized successfully")
                    except Exception as fallback_e:
                        logger.error(f"Fallback RAG chain initialization failed: {fallback_e}", exc_info=True)

                        # Try SimpleRAGChain (last resort)
                        try:
                            logger.info("Falling back to SimpleRAGChain...")
                            from simple_rag_chain import SimpleRAGChain

                            # Fetch the API settings
                            try:
                                from config import DEEPSEEK_API_KEY, DEEPSEEK_MODEL, DEEPSEEK_ENDPOINT
                                logger.info(f"Loaded the DeepSeek API settings from the config file: model={DEEPSEEK_MODEL}")
                            except ImportError:
                                # If they cannot be imported from the config file, check environment variables
                                DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
                                DEEPSEEK_MODEL = os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
                                DEEPSEEK_ENDPOINT = os.environ.get("DEEPSEEK_ENDPOINT",
                                                                   "https://api.deepseek.com/v1/chat/completions")
                                logger.info(f"Loaded the DeepSeek API settings from environment variables: model={DEEPSEEK_MODEL}")

                            # Try to initialize SimpleRAGChain
                            self.rag_chain = SimpleRAGChain(self.vector_store)
                            self.is_initialized = True
                            logger.info("SimpleRAGChain initialized successfully")
                        except Exception as simple_e:
                            logger.error(f"All RAG chain initializations failed: {simple_e}", exc_info=True)
                            return f"The documents and vector index were processed, but RAG chain initialization failed: {str(e)}"
            else:
                # RAGChain is unavailable
                try:
                    logger.info("The default RAG chain is unavailable; trying the alternative versions...")

                    # Try FallbackRAGChain
                    try:
                        from fallback_rag_chain import FallbackRAGChain
                        self.rag_chain = FallbackRAGChain(self.vector_store)
                        self.is_initialized = True
                        logger.info("Fallback RAG chain initialized successfully")
                    except Exception as fallback_e:
                        logger.error(f"Fallback RAG chain initialization failed: {fallback_e}", exc_info=True)

                        # Try SimpleRAGChain (last resort)
                        try:
                            from simple_rag_chain import SimpleRAGChain
                            self.rag_chain = SimpleRAGChain(self.vector_store)
                            self.is_initialized = True
                            logger.info("SimpleRAGChain initialized successfully")
                        except Exception as simple_e:
                            logger.error(f"All RAG chain initializations failed: {simple_e}", exc_info=True)
                            return f"The documents and vector index were processed, but RAG chain initialization failed"
                except Exception as e:
                    logger.error(f"RAG chain initialization failed: {e}", exc_info=True)
                    return f"The documents and vector index were processed, but RAG chain initialization failed: {str(e)}"

            # Build the success message
            result_message = f"""Document processing complete!
- Files processed: {len(pdf_files)}
- Files from cache: {len(cached_files)}
- New files: {len(new_files)}
- Updated files: {len(updated_files)}
- Failed files: {len(failed_files)}
- Total chunks: {len(all_chunks)}
- Processing time: {processing_time:.2f}s
Ready to answer your questions!"""
            return result_message

        except Exception as e:
            error_message = f"Error during document processing: {str(e)}"
            logger.error(error_message, exc_info=True)
            return error_message
    def _process_vector_index(self, new_files: List[str], updated_files: List[str]) -> None:
        """
        Process the vector index.

        Args:
            new_files: list of newly added files
            updated_files: list of updated files
        """
        # Check the vector index save location
        if os.path.exists(self.vector_index_dir) and any(os.listdir(self.vector_index_dir)):
            # Load the existing vector index
            try:
                logger.info("Loading the saved vector index...")
                vector_store_loaded = self.vector_store.load_local(self.vector_index_dir)

                # Check that the index loaded successfully
                if self.vector_store.vector_store is not None:
                    # Update the index if there are new or changed documents
                    if new_files or updated_files:
                        logger.info("Updating the vector index...")
                        self.vector_store.add_documents(self.documents)

                    logger.info("Vector index loaded")
                else:
                    logger.warning("The vector index was loaded but is not valid; creating a new one.")
                    self.vector_store.create_or_load(self.documents)
            except Exception as e:
                logger.error(f"Failed to load the vector index; creating a new one: {e}", exc_info=True)
                # Create a new vector index
                self.vector_store.create_or_load(self.documents)
        else:
            # Create a new vector index
            logger.info("Creating a new vector index...")
            self.vector_store.create_or_load(self.documents)

        # Save the vector index
        if self.vector_store and self.vector_store.vector_store is not None:
            try:
                logger.info(f"Saving the vector index: {self.vector_index_dir}")
                save_result = self.vector_store.save_local(self.vector_index_dir)
                logger.info(f"Vector index saved: {self.vector_index_dir}")
            except Exception as e:
                logger.error(f"Failed to save the vector index: {e}", exc_info=True)
                raise VectorStoreError(f"Failed to save the vector index: {str(e)}")
        else:
            logger.warning("The vector index is not initialized, so it will not be saved.")
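
    # Flow summary: an index already on disk is loaded and, when any files were added or changed,
    # extended with the current chunk list via add_documents(); otherwise (or when loading fails)
    # a fresh index is built from all chunks with create_or_load(), and the result is persisted
    # back to vector_index_dir.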
    def reset_cache(self) -> str:
        """
        Reset the cache.

        Returns:
            Result message
        """
        try:
            # Delete the chunk files
            try:
                for filename in os.listdir(self.chunks_dir):
                    file_path = os.path.join(self.chunks_dir, filename)
                    if os.path.isfile(file_path):
                        os.remove(file_path)
                logger.info("Chunk cache files deleted")
            except Exception as e:
                logger.error(f"Error while deleting chunk files: {e}")
                return f"Error while deleting chunk files: {str(e)}"

            # Reset the index
            self.file_index = {}
            try:
                self._save_file_index()
                logger.info("File index reset")
            except Exception as e:
                logger.error(f"Error while resetting the index file: {e}")
                return f"Error while resetting the index file: {str(e)}"

            # Delete the vector index files
            try:
                for filename in os.listdir(self.vector_index_dir):
                    file_path = os.path.join(self.vector_index_dir, filename)
                    if os.path.isfile(file_path):
                        os.remove(file_path)
                logger.info("Vector index files deleted")
            except Exception as e:
                logger.error(f"Error while deleting vector index files: {e}")
                return f"Error while deleting vector index files: {str(e)}"

            self.documents = []
            self.processed_files = []
            self.is_initialized = False

            logger.info("Cache reset complete")
            return "The cache has been reset. All documents will be reprocessed on the next run."
        except Exception as e:
            error_msg = f"Error while resetting the cache: {str(e)}"
            logger.error(error_msg, exc_info=True)
            return error_msg
    def process_query(self, query: str, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
        """
        Process a user query.

        Args:
            query: user question
            chat_history: chat history

        Returns:
            Response and updated chat history
        """
        if not query or not query.strip():
            response = "The question is empty. Please enter a question."
            chat_history.append((query, response))
            return "", chat_history

        if not self.is_initialized:
            response = "Document loading has not been initialized. Attempting automatic loading."
            chat_history.append((query, response))

            # Try an automatic load
            try:
                init_result = self.auto_process_documents()
                if not self.is_initialized:
                    response = f"Could not load documents. Check that there are PDF files in the 'documents' folder. Initialization result: {init_result}"
                    chat_history.append((query, response))
                    return "", chat_history
            except Exception as e:
                response = f"Error while loading documents: {str(e)}"
                logger.error(f"Automatic loading failed: {e}", exc_info=True)
                chat_history.append((query, response))
                return "", chat_history

        try:
            # Run the RAG chain and generate a response
            start_time = time.time()
            logger.info(f"Query processing started: {query}")

            # Make sure rag_chain has been initialized
            if not hasattr(self, 'rag_chain') or self.rag_chain is None:
                raise RAGInitializationError("The RAG chain has not been initialized")

            # 1. First try the standard RAG chain
            try:
                response = self.rag_chain.run(query)
                logger.info(f"Response generated with the default RAG chain")
            except Exception as rag_error:
                logger.error(f"Default RAG chain execution failed: {rag_error}, trying alternatives")
                # 2. Try calling the DeepSeek API directly (bypassing the RAG chain)
                try:
                    # Fetch the DeepSeek API settings
                    try:
                        from config import DEEPSEEK_API_KEY, DEEPSEEK_MODEL, DEEPSEEK_ENDPOINT
                    except ImportError:
                        # If they cannot be imported from the config module, fall back to defaults
                        DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
                        DEEPSEEK_MODEL = os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
                        DEEPSEEK_ENDPOINT = os.environ.get("DEEPSEEK_ENDPOINT",
                                                           "https://api.deepseek.com/v1/chat/completions")

                    # Define a direct API call helper (removes the external module dependency)
                    def direct_api_call(query, context, api_key, model_name, endpoint, max_retries=3, timeout=60):
                        """Call the DeepSeek API directly."""
                        import requests
                        import json
                        import time

                        # Limit the prompt length
                        if len(context) > 6000:
                            context = context[:2500] + "\n...(truncated)...\n" + context[-2500:]

                        # Build the prompt
                        prompt = f"""
Please answer the question accurately based on the following information.

Question: {query}

Reference information:
{context}

If the reference information contains the answer, base your answer on it.
If the answer is not in the reference information, you may use general knowledge, but begin with "The provided documents do not contain this information, but in general...".
Keep the answer accurate and concise, and wherever possible support it with evidence from the reference information.
Also mention the sources of the reference information.
"""

                        # Attempt the API request
                        headers = {
                            "Content-Type": "application/json",
                            "Authorization": f"Bearer {api_key}"
                        }

                        payload = {
                            "model": model_name,
                            "messages": [{"role": "user", "content": prompt}],
                            "temperature": 0.3,
                            "max_tokens": 1000
                        }

                        # Retry logic
                        retry_delay = 1.0
                        for attempt in range(max_retries):
                            try:
                                logger.info(f"Direct DeepSeek API call, attempt {attempt + 1}/{max_retries}...")
                                response = requests.post(
                                    endpoint,
                                    headers=headers,
                                    json=payload,
                                    timeout=timeout
                                )

                                if response.status_code == 200:
                                    result = response.json()
                                    content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
                                    logger.info(f"Direct DeepSeek API call succeeded")
                                    return content
                                else:
                                    logger.warning(f"API error: status code {response.status_code}")

                                    # Wait longer when rate limited
                                    if response.status_code == 429:
                                        retry_delay = min(retry_delay * 3, 15)
                                    else:
                                        retry_delay = min(retry_delay * 2, 10)

                                    if attempt < max_retries - 1:
                                        logger.info(f"Retrying in {retry_delay}s...")
                                        time.sleep(retry_delay)
                            except Exception as e:
                                logger.error(f"API call error: {e}")

                                if attempt < max_retries - 1:
                                    logger.info(f"Retrying in {retry_delay}s...")
                                    time.sleep(retry_delay)
                                    retry_delay = min(retry_delay * 2, 10)

                        # All attempts failed
                        raise Exception("Maximum number of retries exceeded")
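
                    # Note: the retry loop above uses exponential backoff (doubling the delay,
                    # capped at 10s, or tripling up to 15s when the API returns HTTP 429)
                    # before giving up after max_retries attempts.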
                    # Perform the vector search
                    if self.vector_store and hasattr(self.vector_store, "similarity_search"):
                        logger.info("Performing vector search...")
                        docs = self.vector_store.similarity_search(query, k=5)

                        # Build the context from the search results
                        context_parts = []
                        for i, doc in enumerate(docs, 1):
                            source = doc.metadata.get("source", "unknown source")
                            page = doc.metadata.get("page", "")
                            source_info = f"{source}"
                            if page:
                                source_info += f" (page: {page})"
                            context_parts.append(f"[Reference {i}] - source: {source_info}\n{doc.page_content}\n")
                        context = "\n".join(context_parts)

                        # Call the API directly
                        logger.info("Attempting a direct DeepSeek API call...")
                        response = direct_api_call(
                            query,
                            context,
                            DEEPSEEK_API_KEY,
                            DEEPSEEK_MODEL,
                            DEEPSEEK_ENDPOINT,
                            max_retries=3,
                            timeout=120
                        )
                        logger.info("Direct DeepSeek API call succeeded")
                    else:
                        raise Exception("The vector store is not initialized")

                except Exception as direct_api_error:
                    logger.error(f"Direct DeepSeek API call failed: {direct_api_error}, returning the search results instead")

                    # 3. Return at least the search results
                    try:
                        # Perform the vector search
                        if self.vector_store and hasattr(self.vector_store, "similarity_search"):
                            docs = self.vector_store.similarity_search(query, k=5)

                            # Build the context from the search results
                            context_parts = []
                            for i, doc in enumerate(docs, 1):
                                source = doc.metadata.get("source", "unknown source")
                                page = doc.metadata.get("page", "")
                                source_info = f"{source}"
                                if page:
                                    source_info += f" (page: {page})"
                                context_parts.append(f"[Reference {i}] - source: {source_info}\n{doc.page_content}\n")
                            context = "\n".join(context_parts)

                            # Generate a simple response
                            predefined_answers = {
                                "capital of south korea": "The capital of South Korea is Seoul.",
                                "capital": "The capital of South Korea is Seoul.",
                                "who are you": "I am a RAG-based question-answering system. I search the documents and find the relevant information for you.",
                                "hello": "Hello! How can I help you?",
                                "what are you doing": "I am searching the documents to answer your question. What would you like to know?"
                            }

                            # Check whether a predefined answer matches the question
                            for key, answer in predefined_answers.items():
                                if key in query.lower():
                                    response = answer
                                    logger.info(f"Providing a predefined answer: {key}")
                                    break
                            else:
                                # With no predefined answer, show only the search results
                                response = f"""
There is a problem connecting to the API server, so only the search results are shown.

Question: {query}

Related documents found:
{context}

[Note] Automatic summarization is unavailable because of the API connection problem. Please try again or ask a different question.
"""
                                logger.info("Showing only the search results")
                        else:
                            response = f"Both the API connection and the vector search failed. Please contact the system administrator."
                    except Exception as fallback_error:
                        logger.error(f"Final fallback response generation failed: {fallback_error}")

                        # 4. Last resort: return an error message as the response
                        if "Connection error" in str(rag_error) or "timeout" in str(rag_error).lower():
                            response = f"""
There is a problem connecting to the API server. Please try again in a moment.

Question: {query}

[Note] The connection to the DeepSeek API server is currently not working, so a response to your question cannot be provided.
"""
                        else:
                            response = f"An error occurred while processing the query: {str(rag_error)}"

            end_time = time.time()
            query_time = end_time - start_time
            logger.info(f"Query processing complete: {query_time:.2f}s")

            chat_history.append((query, response))
            return "", chat_history
        except RAGInitializationError as e:
            error_msg = f"RAG system initialization error: {str(e)}. Check that there are PDF files in the 'documents' folder and try restarting."
            logger.error(f"RAG initialization error while processing the query: {e}", exc_info=True)
            chat_history.append((query, error_msg))
            return "", chat_history
        except (VectorStoreError, DocumentProcessingError) as e:
            error_msg = f"Document processing system error: {str(e)}. Check that the document format is correct."
            logger.error(f"Document/vector store error while processing the query: {e}", exc_info=True)
            chat_history.append((query, error_msg))
            return "", chat_history
        except Exception as e:
            error_msg = f"Error while processing the query: {str(e)}"
            logger.error(f"Unexpected error while processing the query: {e}", exc_info=True)
            chat_history.append((query, error_msg))
            return "", chat_history
    def launch_app(self) -> None:
        """
        Launch the Gradio app.
        """
        try:
            import gradio as gr
        except ImportError:
            logger.error("The Gradio library was not found. Install it with `pip install gradio`.")
            print("The Gradio library was not found. Install it with `pip install gradio`.")
            return

        # Keep a closure variable so the inner functions can reach the current instance (self)
        app_instance = self

        try:
            with gr.Blocks(title="PDF document-based RAG chatbot") as app:
                gr.Markdown("# PDF document-based RAG chatbot")
                gr.Markdown(f"* LLM model in use: **{LLM_MODEL}**")

                # Show the actual path in use
                actual_pdf_dir = self.pdf_directory.replace('\\', '\\\\') if os.name == 'nt' else self.pdf_directory
                gr.Markdown(f"* PDF document folder: **{actual_pdf_dir}**")

                with gr.Row():
                    with gr.Column(scale=1):
                        # Document status section
                        status_box = gr.Textbox(
                            label="Document processing status",
                            value=self._get_status_message(),
                            lines=5,
                            interactive=False
                        )

                        # Cache management buttons
                        refresh_button = gr.Button("Reload documents", variant="primary")
                        reset_button = gr.Button("Reset cache", variant="stop")

                        # Status and error display
                        status_info = gr.Markdown(
                            value=f"System status: {'initialized' if self.is_initialized else 'not initialized'}"
                        )

                        # Processed file details
                        with gr.Accordion("Cache details", open=False):
                            cache_info = gr.Textbox(
                                label="Cached file information",
                                value=self._get_cache_info(),
                                lines=5,
                                interactive=False
                            )

                    with gr.Column(scale=2):
                        # Chat interface
                        chatbot = gr.Chatbot(
                            label="Conversation",
                            bubble_full_width=False,
                            height=500,
                            show_copy_button=True
                        )

                        # Voice recording UI
                        with gr.Row():
                            with gr.Column(scale=4):
                                # Question input and submit button
                                query_box = gr.Textbox(
                                    label="Question",
                                    placeholder="Ask about the content of the processed documents...",
                                    lines=2
                                )
                            with gr.Column(scale=1):
                                # Voice recording component
                                audio_input = gr.Audio(
                                    sources=["microphone"],
                                    type="numpy",
                                    label="Ask by voice"
                                )

                        with gr.Row():
                            submit_btn = gr.Button("Submit", variant="primary")
                            clear_chat_button = gr.Button("Clear conversation")
                # Speech recognition handler (process_audio in app.py, original version)
                def process_audio(audio):
                    logger.info("Starting speech recognition...")
                    try:
                        from clova_stt import ClovaSTT
                        import numpy as np
                        import soundfile as sf
                        import tempfile
                        import os

                        if audio is None:
                            return "No speech was recorded."

                        # Save the audio data to a temporary file
                        sr, y = audio
                        logger.info(f"Audio recording received: sample rate={sr}Hz, length={len(y)} samples")

                        if len(y) / sr < 1.0:
                            return "The recorded speech is too short. Please try again."

                        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                            temp_path = temp_file.name
                            sf.write(temp_path, y, sr, format="WAV")
                            logger.info(f"Temporary WAV file saved: {temp_path}")

                        # Run speech recognition
                        stt_client = ClovaSTT()
                        with open(temp_path, "rb") as f:
                            audio_bytes = f.read()
                        result = stt_client.recognize(audio_bytes)

                        # Delete the temporary file
                        try:
                            os.unlink(temp_path)
                            logger.info("Temporary audio file deleted")
                        except Exception as e:
                            logger.warning(f"Failed to delete the temporary file: {e}")

                        if result["success"]:
                            recognized_text = result["text"]
                            logger.info(f"Speech recognition succeeded: {recognized_text}")
                            return recognized_text
                        else:
                            error_msg = f"Speech recognition failed: {result.get('error', 'unknown error')}"
                            logger.error(error_msg)
                            return error_msg

                    except ImportError as e:
                        logger.error(f"Required library is missing: {e}")
                        return "The libraries required for speech recognition are not installed. Please run `pip install soundfile numpy requests`."
                    except Exception as e:
                        logger.error(f"Error during speech processing: {e}", exc_info=True)
                        return f"Error during speech processing: {str(e)}"
                # Newly added process_audio_and_submit function
                def process_audio_and_submit(audio, chat_history):
                    """
                    When recording stops, run speech recognition and then process the question automatically.

                    Inputs:
                        - audio: recording data (value of gr.Audio)
                        - chat_history: current chat history (value of gr.Chatbot)
                    Outputs:
                        - query_box: empty string (clears the question input)
                        - chatbot: updated chat history
                    """
                    recognized_text = process_audio(audio)
                    # If the speech recognition result is an error message, return it unchanged
                    if not recognized_text or recognized_text.startswith("Speech recognition failed") or recognized_text.startswith(
                            "Error during speech processing"):
                        return recognized_text, chat_history
                    # Process the question using the recognized text
                    return app_instance.process_query(recognized_text, chat_history)

                # update_ui_after_refresh, adapted to use app_instance instead of self
                def update_ui_after_refresh(result):
                    return (
                        result,  # status message
                        app_instance._get_status_message(),  # refresh the status box
                        f"System status: {'initialized' if app_instance.is_initialized else 'not initialized'}",  # refresh the status info
                        app_instance._get_cache_info()  # refresh the cache info
                    )

                # --- Gradio event handlers ---
                # The stop_recording event of the audio_input component is wired as follows
                audio_input.stop_recording(
                    fn=process_audio_and_submit,
                    inputs=[audio_input, chatbot],
                    outputs=[query_box, chatbot]
                )

                # Also put the speech recognition result into the question box
                audio_input.stop_recording(
                    fn=process_audio,
                    inputs=[audio_input],
                    outputs=[query_box]
                )
                # "Reload documents" button
                refresh_button.click(
                    fn=lambda: update_ui_after_refresh(self.auto_process_documents()),
                    inputs=[],
                    outputs=[status_box, status_box, status_info, cache_info]
                )

                # "Reset cache" button
                def reset_and_process():
                    reset_result = self.reset_cache()
                    process_result = self.auto_process_documents()
                    return update_ui_after_refresh(f"{reset_result}\n\n{process_result}")

                reset_button.click(
                    fn=reset_and_process,
                    inputs=[],
                    outputs=[status_box, status_box, status_info, cache_info]
                )

                # Submit button click event
                submit_btn.click(
                    fn=self.process_query,
                    inputs=[query_box, chatbot],
                    outputs=[query_box, chatbot]
                )

                # Enter key event
                query_box.submit(
                    fn=self.process_query,
                    inputs=[query_box, chatbot],
                    outputs=[query_box, chatbot]
                )

                # "Clear conversation" button
                clear_chat_button.click(
                    fn=lambda: [],
                    outputs=[chatbot]
                )

            # Launch the app
            app.launch(share=False)
        except Exception as e:
            logger.error(f"Error while launching the Gradio app: {e}", exc_info=True)
            print(f"Error while launching the Gradio app: {e}")
    def _get_status_message(self) -> str:
        """
        Build the current processing status message.

        Returns:
            Status message
        """
        if not self.processed_files:
            return "No documents have been processed. Click the 'Reload documents' button."

        # Check the DeepSeek API status
        from config import USE_DEEPSEEK, DEEPSEEK_API_KEY, DEEPSEEK_MODEL

        model_info = ""
        if USE_DEEPSEEK and DEEPSEEK_API_KEY:
            # Test the DeepSeek API
            try:
                # Try to import the test helper
                try:
                    from deepseek_utils import test_deepseek_api

                    # Fetch the DeepSeek settings
                    from config import DEEPSEEK_ENDPOINT

                    # Test the API
                    test_result = test_deepseek_api(DEEPSEEK_API_KEY, DEEPSEEK_ENDPOINT, DEEPSEEK_MODEL)
                    if test_result["success"]:
                        model_info = f"\nDeepSeek API status: OK ({DEEPSEEK_MODEL})"
                    else:
                        model_info = f"\nDeepSeek API status: error - {test_result['message']}"
                except ImportError:
                    # Run the test directly
                    import requests
                    import json

                    # Fetch the DeepSeek settings
                    from config import DEEPSEEK_ENDPOINT

                    # Simple test prompt
                    test_prompt = "Hello, please respond with a short greeting."

                    # API request headers and payload
                    headers = {
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
                    }

                    payload = {
                        "model": DEEPSEEK_MODEL,
                        "messages": [{"role": "user", "content": test_prompt}],
                        "temperature": 0.7,
                        "max_tokens": 50
                    }

                    # Send the API request
                    try:
                        response = requests.post(
                            DEEPSEEK_ENDPOINT,
                            headers=headers,
                            data=json.dumps(payload),
                            timeout=5  # 5-second timeout (keeps the UI responsive)
                        )

                        # Check the response
                        if response.status_code == 200:
                            model_info = f"\nDeepSeek API status: OK ({DEEPSEEK_MODEL})"
                        else:
                            error_message = response.text[:100]
                            model_info = f"\nDeepSeek API status: error (status code: {response.status_code})"
                    except Exception as e:
                        model_info = f"\nDeepSeek API status: connection failed ({str(e)[:100]})"
            except Exception as e:
                model_info = f"\nDeepSeek API status check failed: {str(e)[:100]}"

        return f"Processed documents ({len(self.processed_files)}): {', '.join(self.processed_files)}{model_info}"
    def _get_cache_info(self) -> str:
        """
        Build the cache details message.

        Returns:
            Cache info message
        """
        if not self.file_index:
            return "There are no cached files."

        file_info = ""
        for file_path, info in self.file_index.items():
            file_name = info.get('file_name', os.path.basename(file_path))
            chunks_count = info.get('chunks_count', 0)
            file_size = info.get('file_size', 0)
            last_processed = info.get('last_processed', 0)

            # Convert the file size into a human-readable form
            if file_size < 1024:
                size_str = f"{file_size} bytes"
            elif file_size < 1024 * 1024:
                size_str = f"{file_size / 1024:.1f} KB"
            else:
                size_str = f"{file_size / (1024 * 1024):.1f} MB"

            # Convert the last processed timestamp into a date/time string
            if last_processed:
                from datetime import datetime
                last_time = datetime.fromtimestamp(last_processed).strftime('%Y-%m-%d %H:%M:%S')
            else:
                last_time = "unknown"

            file_info += f"- {file_name}: {chunks_count} chunks, {size_str}, last processed: {last_time}\n"

        return file_info
if __name__ == "__main__":
    app = AutoRAGChatApp()
    app.launch_app()