RAG3_Voice / dir
jeongsoo's picture
Add voice recognition feature with Naver Clova API
14586a6
"""
λ™μ˜μ–΄ 처리 λͺ¨λ“ˆ
"""
import os
import sys
import re
from typing import Dict, List, Optional, Set
# κΈ°λ³Έ λ™μ˜μ–΄ 사전 (MP_synonyms.py 파일이 없을 경우 μ‚¬μš©)
DEFAULT_SYNONYMS = {
"μ—‘μΈ„λ ˆμ΄ν„°": "앑츄에이터",
"앑츄에이터": "앑츄에이터",
"λͺ¨ν„°": "앑츄에이터",
"컨박": "μ»¨νŠΈλ‘€λ°•μŠ€"
}
class SynonymsHandler:
    """
    Resolve part-name synonyms to their standard part names.

    The synonym table maps a keyword (a variant spelling or nickname) to the
    standard name. It is read from a ``MP_synonyms.py``-style file when one is
    available, otherwise a copy of ``DEFAULT_SYNONYMS`` is used.

    Attributes:
        synonyms: Mapping of keyword -> standard part name.
        loaded: True once a table (from file or defaults) has been installed.
    """

    def __init__(self, synonyms_file: Optional[str] = None):
        """
        Initialise the handler and load a synonym table.

        Candidate locations are probed in order and the first existing one is
        loaded: the explicit ``synonyms_file`` (if given),
        ``.venv/SYNONYMS/MP_synonyms.py``, then ``MP_synonyms.py`` in the
        current working directory. If none exists the default table is used.

        Args:
            synonyms_file: Optional path to a synonyms file.
        """
        self.synonyms: Dict[str, str] = {}
        self.loaded = False

        candidates = (
            synonyms_file,
            ".venv/SYNONYMS/MP_synonyms.py",
            "MP_synonyms.py",
        )
        for candidate in candidates:
            if candidate and os.path.exists(candidate):
                self._load_from_file(candidate)
                break
        else:
            # No candidate file exists: fall back to the built-in table.
            print("λ™μ˜μ–΄ νŒŒμΌμ„ 찾을 수 μ—†μ–΄ κΈ°λ³Έ λ™μ˜μ–΄ 사전을 μ‚¬μš©ν•©λ‹ˆλ‹€.")
            self._use_defaults()

    def _use_defaults(self) -> None:
        """Install a private copy of ``DEFAULT_SYNONYMS`` as the active table."""
        # Copy so that later mutation of this instance's table can never leak
        # into the module-level constant or into other handler instances.
        self.synonyms = dict(DEFAULT_SYNONYMS)
        self.loaded = True

    def _load_from_file(self, file_path: str) -> None:
        """
        Load the synonym table from a file without executing it.

        The file is read as UTF-8 text, the first ``SYNONYMS = {...}`` literal
        is located with a regular expression, and every
        ``"key": "value"`` pair of double-quoted strings inside it is
        collected. If the file cannot be read or contains no such literal, the
        default table is installed instead.

        Args:
            file_path: Path of the synonyms file.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: undecodable bytes
            # (UnicodeDecodeError) or an invalid path such as an embedded NUL.
            print(f"λ™μ˜μ–΄ 사전 λ‘œλ“œ 쀑 였λ₯˜: {e}")
            self._use_defaults()
            return

        # Non-greedy match: the literal is taken up to the first closing brace.
        synonyms_match = re.search(r'SYNONYMS\s*=\s*\{(.*?)\}', content, re.DOTALL)
        if not synonyms_match:
            print(f"νŒŒμΌμ—μ„œ SYNONYMS λ”•μ…”λ„ˆλ¦¬λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {file_path}")
            self._use_defaults()
            return

        # Parse the pairs textually rather than evaluating the file's code.
        pairs = re.findall(r'"([^"]*)"\s*:\s*"([^"]*)"', synonyms_match.group(1))
        self.synonyms = dict(pairs)
        self.loaded = True
        print(f"λ™μ˜μ–΄ 사전 λ‘œλ“œ μ™„λ£Œ: {file_path}, {len(self.synonyms)}개 ν•­λͺ©")

    def find_in_text(self, text: str) -> List[str]:
        """
        Find the standard part names whose synonyms occur in a text.

        Matching is a case-insensitive substring test (both sides are
        lower-cased; whitespace is not altered). Empty keywords are ignored
        because an empty string is a substring of every text.

        Args:
            text: Text to search.

        Returns:
            Unique standard part names, ordered by the first matching keyword
            in the synonym table. Empty when ``text`` is empty or no table is
            loaded.
        """
        if not text or not self.loaded:
            return []

        haystack = text.lower()
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        found = dict.fromkeys(
            standard_name
            for keyword, standard_name in self.synonyms.items()
            if keyword and keyword.lower() in haystack
        )
        return list(found)

    def standardize(self, part_name: str) -> str:
        """
        Map a part name to its standard name.

        The name is lower-cased and stripped of surrounding whitespace, then
        compared for equality against each lower-cased keyword.

        Args:
            part_name: Part name to standardise.

        Returns:
            The standard name of the first matching keyword, or ``part_name``
            unchanged when it is empty, no table is loaded, or nothing matches.
        """
        if not part_name or not self.loaded:
            return part_name

        needle = part_name.lower().strip()
        for keyword, standard_name in self.synonyms.items():
            if needle == keyword.lower():
                return standard_name

        # No synonym matched: hand back the caller's original spelling.
        return part_name

    def standardize_parts_list(self, parts: List[str]) -> List[str]:
        """
        Standardise a list of part names and remove duplicates.

        Falsy entries (empty strings, ``None``) are dropped.

        Args:
            parts: Part names to standardise.

        Returns:
            Unique standardised names in order of first appearance. When
            ``parts`` is empty or no table is loaded, ``parts`` itself is
            returned unchanged.
        """
        if not parts or not self.loaded:
            return parts

        unique = dict.fromkeys(self.standardize(part) for part in parts if part)
        return list(unique)