{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle, os, numpy as np\n",
    "from tqdm import tqdm\n",
    "from langchain.schema import Document\n",
    "from langchain.vectorstores import FAISS\n",
    "from langchain.schema import Document\n",
    "from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain_community.retrievers import BM25Retriever\n",
    "from langchain.retrievers import EnsembleRetriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "데이터 로드 중...\n",
      "총 2736개의 배치가 로드되었습니다.\n"
     ]
    }
   ],
   "source": [
    "# Turn off HuggingFace tokenizers parallelism via its environment switch.\n",
    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
    "\n",
    "# Location of the pickled batches. Override it with the CASES_PKL_PATH\n",
    "# environment variable so the notebook is not tied to one machine; the default\n",
    "# keeps the path used when the notebook was authored.\n",
    "CASES_PKL_PATH = os.environ.get(\n",
    "    \"CASES_PKL_PATH\",\n",
    "    \"/Users/anpigon/Documents/Embed/법원판례/Result2.pkl\",\n",
    ")\n",
    "\n",
    "# SECURITY: pickle.load can execute arbitrary code while deserializing, so only\n",
    "# point this at a file you produced yourself or otherwise trust.\n",
    "print(\"데이터 로드 중...\")\n",
    "with open(CASES_PKL_PATH, \"rb\") as file:\n",
    "    data = pickle.load(file)\n",
    "\n",
    "print(f\"총 {len(data)}개의 배치가 로드되었습니다.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embedding model configuration. No document is embedded in this cell; the\n",
    "# object is only constructed here and is later handed to FAISS.load_local.\n",
    "embeddings = HuggingFaceBgeEmbeddings(model_name=\"BAAI/bge-m3\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text splitter configuration; lengths are measured with len().\n",
    "CHUNK_SIZE = 2000\n",
    "CHUNK_OVERLAP = 200\n",
    "\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    length_function=len,\n",
    "    chunk_overlap=CHUNK_OVERLAP,\n",
    "    chunk_size=CHUNK_SIZE,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "문서 처리 및 청킹 중...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2736/2736 [00:42<00:00, 64.15it/s] \n"
     ]
    }
   ],
   "source": [
    "# Split every source sentence into chunks. Each chunk becomes a Document and is\n",
    "# also paired with the embedding vector of the sentence it was cut from.\n",
    "print(\"문서 처리 및 청킹 중...\")\n",
    "documents = []\n",
    "text_embedding_pairs = []\n",
    "\n",
    "for batch in tqdm(data):\n",
    "    # batch[0]: embedding vectors, batch[1]: original sentences\n",
    "    # (the original author's note says 32 of each per batch).\n",
    "    embedding_vectors, original_sentences = batch[0], batch[1]\n",
    "\n",
    "    for sentence, vector in zip(original_sentences, embedding_vectors):\n",
    "        chunks = text_splitter.split_text(sentence)\n",
    "        documents.extend(Document(page_content=chunk) for chunk in chunks)\n",
    "        text_embedding_pairs.extend((chunk, vector) for chunk in chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FAISS 인덱스 불러오기\n",
      "FAISS 인덱스 불러오기 완료\n"
     ]
    }
   ],
   "source": [
    "# Load the previously saved FAISS index from disk (nothing is built here).\n",
    "# NOTE(review): allow_dangerous_deserialization=True opts in to loading\n",
    "# serialized data from the index folder; only use it with files you trust.\n",
    "FAISS_DB_INDEX = \"./index_faiss\"\n",
    "\n",
    "print(\"FAISS 인덱스 불러오기\")\n",
    "faiss_db = FAISS.load_local(\n",
    "    FAISS_DB_INDEX,\n",
    "    embeddings,\n",
    "    allow_dangerous_deserialization=True,\n",
    ")\n",
    "# Retriever over the loaded index, configured for MMR search with k=10.\n",
    "faiss_retriever = faiss_db.as_retriever(\n",
    "    search_type=\"mmr\",\n",
    "    search_kwargs={\"k\": 10},\n",
    ")\n",
    "print(\"FAISS 인덱스 불러오기 완료\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BM25Retriever 불러오기\n",
      "BM25 리트리버 불러오기 완료\n"
     ]
    }
   ],
   "source": [
    "from kiwipiepy import Kiwi\n",
    "from typing import List\n",
    "\n",
    "# Single shared Kiwi analyzer instance used by kiwi_tokenize.\n",
    "kiwi = Kiwi()\n",
    "\n",
    "\n",
    "def kiwi_tokenize(text: str) -> List[str]:\n",
    "    \"\"\"Return the ``form`` of every token that ``kiwi.tokenize`` yields for ``text``.\"\"\"\n",
    "    tokens = kiwi.tokenize(text)\n",
    "    return [tok.form for tok in tokens]\n",
    "\n",
    "\n",
    "# NOTE(review): presumably the pickled retriever refers to `kiwi_tokenize`, which\n",
    "# is why it is defined before unpickling - confirm against the code that built\n",
    "# ./index_bm25/kiwi.pkl. pickle.load can run arbitrary code; trusted files only.\n",
    "print(\"BM25Retriever 불러오기\")\n",
    "# Alternative kept from the original (builds the retriever from `documents`):\n",
    "# bm25_retriever = BM25Retriever.from_documents(documents, k=10)\n",
    "with open(\"./index_bm25/kiwi.pkl\", \"rb\") as f:\n",
    "    bm25_retriever = pickle.load(f)\n",
    "print(\"BM25 리트리버 불러오기 완료\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hybrid retriever combining the BM25 and FAISS retrievers with weights\n",
    "# 0.7 (BM25) and 0.3 (FAISS).\n",
    "# The former `search_type=\"mmr\"` keyword was removed: EnsembleRetriever declares\n",
    "# no such field, so it had no effect here. MMR is configured on faiss_retriever\n",
    "# itself via as_retriever(search_type=\"mmr\").\n",
    "ensemble_retriever = EnsembleRetriever(\n",
    "    retrievers=[bm25_retriever, faiss_retriever],\n",
    "    weights=[0.7, 0.3],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "from operator import itemgetter\n",
    "from langchain.callbacks.base import BaseCallbackHandler\n",
    "from langchain_core.prompts import (\n",
    "    HumanMessagePromptTemplate,\n",
    "    SystemMessagePromptTemplate,\n",
    ")\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain_anthropic import ChatAnthropic\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_community.chat_message_histories import ChatMessageHistory\n",
    "from langchain.schema import HumanMessage, AIMessage, SystemMessage\n",
    "from langchain.schema.runnable import RunnablePassthrough\n",
    "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "\n",
    "\n",
    "class StreamCallback(BaseCallbackHandler):\n",
    "    \"\"\"Callback handler that echoes each new LLM token to stdout.\"\"\"\n",
    "\n",
    "    def on_llm_new_token(self, token: str, **kwargs) -> None:\n",
    "        \"\"\"Print ``token`` immediately, with no trailing newline, and flush.\"\"\"\n",
    "        print(token, end=\"\", flush=True)\n",
    "\n",
    "\n",
    "# System prompt template (Korean). `{context}` and `{question}` are template\n",
    "# variables filled in when the prompt is rendered.\n",
    "# Two garbled instructions were corrected so the model receives well-formed\n",
    "# Korean: \"소개랍니다\" -> \"소개합니다\" and \"대힌 답변 사\" -> \"대한 답변 시\".\n",
    "prompt_template = \"\"\"\n",
    "당신은 판사이자 20년차 법률 전문가입니다. 주어진 질문에 대해 문서의 정보를 최대한 활용하여 답변하세요.\n",
    "질문자는 자기 상황을 설명할 것이며, 질문자의 상황과 비슷한 판례를 설명해줘야 하며, 가장 최근 사건 순으로 소개합니다.\n",
    "최대한 자세하게 답변합니다. 초등학생이 이해할 정도로 이해하기 쉽도록 답변하고, 한글로 작성하세요.\n",
    "질문에 대한 답변 시, [사건명 1]..., [사건명 2]... 순서로 설명해야 합니다.\n",
    "문서에서 답변을 찾을 수 없는 경우, \"문서에 답변이 없습니다.\"라고 답변하세요.\n",
    "답변의 출처(source)를 반드시 표기해주세요. 출처는 메타데이터의 판례일련번호, 사건명, 사건번호 순으로 표기 합니다.\n",
    "\n",
    "# 주어진 문서:\n",
    "{context}\n",
    "\n",
    "# 질문: {question}\n",
    "\n",
    "# 답변:\n",
    "\n",
    "# 출처:\n",
    "- source1\n",
    "- source2\n",
    "- ...\n",
    "\"\"\"\n",
    "\n",
    "# Chat model and output parser.\n",
    "# StreamCallback is attached so tokens are printed as they are generated.\n",
    "llm = ChatOpenAI(\n",
    "    callbacks=[StreamCallback()],\n",
    "    streaming=True,\n",
    "    verbose=True,\n",
    "    temperature=0,\n",
    "    model=\"gpt-4o\",\n",
    ")\n",
    "# Alternative backend, kept disabled as in the original:\n",
    "# llm = ChatAnthropic(model=\"claude-3-5-sonnet-20240620\", temperature=0, streaming=True, callbacks=[StreamCallback()])\n",
    "\n",
    "# Last step of the chain; parses the model output to a string.\n",
    "output_parser = StrOutputParser()\n",
    "\n",
    "# In-memory store for the running conversation.\n",
    "chat_history = ChatMessageHistory()\n",
    "\n",
    "# Prompt layout: system instructions, prior turns, then the new question.\n",
    "# \"history\" is deliberately not bound with .partial(): a partial captures the\n",
    "# list object that exists at construction time, so the prompt would keep using\n",
    "# it if chat_history.messages were ever rebound (as clear() may do).\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "    [\n",
    "        (\"system\", prompt_template),\n",
    "        MessagesPlaceholder(variable_name=\"history\"),\n",
    "        (\"human\", \"{question}\"),\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Enrich the input dict on every invocation:\n",
    "#   context - whatever ensemble_retriever returns for the question\n",
    "#   history - the conversation so far, read at call time\n",
    "runnable = RunnablePassthrough.assign(\n",
    "    context=itemgetter(\"question\") | ensemble_retriever,\n",
    "    history=lambda _inputs: chat_history.messages,\n",
    ")\n",
    "# LCEL chain: enrich inputs -> render prompt -> call LLM -> parse output\n",
    "chain = runnable | prompt | llm | output_parser\n",
    "\n",
    "\n",
    "def rag_chain(question: str) -> str:\n",
    "    \"\"\"Answer ``question`` with the retrieval chain and record the exchange.\n",
    "\n",
    "    Args:\n",
    "        question: The user's question; it is passed to the chain under the\n",
    "            \"question\" key.\n",
    "\n",
    "    Returns:\n",
    "        The value produced by ``chain.invoke`` (the parsed model response).\n",
    "    \"\"\"\n",
    "    response = chain.invoke({\"question\": question})\n",
    "    # The turn is stored only after invoke returns, so a failed call leaves\n",
    "    # the history unchanged.\n",
    "    chat_history.add_user_message(question)\n",
    "    chat_history.add_ai_message(response)\n",
    "    return response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "안녕하세요. 판사님입니다. 질문하신 상황과 비슷한 판례를 찾아보았습니다. 아래에 두 가지 사례를 소개해드리겠습니다.\n",
      "\n",
      "### [사건명 1] 불공정한 법률행위에 관한 법리를 오해한 위법이 있는 실례\n",
      "- **출처**: 214987, 손해배상등, 68다88, 1968.07.30\n",
      "- **사건 내용**: 매도인이 부동산을 매도할 당시, 매수인이 매도인의 궁박한 사정을 알고 있었고, 매도인이 팔기를 꺼려하는 부분까지 매수인의 요구에 의해 함께 팔지 않을 수 없었습니다. 매매목적물의 경계확정측량도 매수인이 일방적으로 하고, 부동산 가격도 매우 저렴하게 책정되었습니다. 이 사건에서 법원은 이러한 매매행위가 불공정한 법률행위에 해당한다고 판단하였습니다.\n",
      "\n",
      "### [사건명 2] 원고의 주장이 착오로 인한 의사표시의 취소로도 보여지므로, 이를 석명치 않은 위법이 있는 예\n",
      "- **출처**: 153300, 매매대금반환등, 66다1289, 1966.09.20\n",
      "- **사건 내용**: 원고가 피고로부터 매수한 논 1,389평 중 일부는 하천으로 되어 있어 경작할 수 없는 땅이었고, 나머지 땅은 이미 다른 사람들이 경작하고 있었습니다. 원고는 이러한 사실을 알지 못한 채 매매계약을 체결하였고, 나중에 이를 알게 되어 계약을 무효로 주장하였습니다. 법원은 원고의 주장이 착오로 인한 의사표시의 취소로도 볼 수 있다고 판단하였습니다.\n",
      "\n",
      "이 두 사건 모두 매수인이 매매 대상 부동산의 실제 상태를 제대로 알지 못한 채 계약을 체결한 후, 그 사실을 알게 되어 법적 분쟁이 발생한 사례입니다. 질문자님의 상황과 유사한 점이 많으므로 참고하시기 바랍니다.\n",
      "\n",
      "### 요약\n",
      "- **사건명 1**: 매도인의 궁박한 사정을 이용하여 부동산을 저렴하게 매수한 경우.\n",
      "- **사건명 2**: 매수한 부동산이 실제로는 경작할 수 없는 땅이었음을 나중에 알게 된 경우.\n",
      "\n",
      "이와 같은 사례를 통해 법적 대응 방안을 모색해보시기 바랍니다. 추가적인 법적 조언이 필요하시면 변호사와 상담하시기를 권장드립니다.\n",
      "\n",
      "감사합니다.\n",
      "\n",
      "### 출처\n",
      "- 214987, 손해배상등, 68다88, 1968.07.30\n",
      "- 153300, 매매대금반환등, 66다1289, 1966.09.20"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'안녕하세요. 판사님입니다. 질문하신 상황과 비슷한 판례를 찾아보았습니다. 아래에 두 가지 사례를 소개해드리겠습니다.\\n\\n### [사건명 1] 불공정한 법률행위에 관한 법리를 오해한 위법이 있는 실례\\n- **출처**: 214987, 손해배상등, 68다88, 1968.07.30\\n- **사건 내용**: 매도인이 부동산을 매도할 당시, 매수인이 매도인의 궁박한 사정을 알고 있었고, 매도인이 팔기를 꺼려하는 부분까지 매수인의 요구에 의해 함께 팔지 않을 수 없었습니다. 매매목적물의 경계확정측량도 매수인이 일방적으로 하고, 부동산 가격도 매우 저렴하게 책정되었습니다. 이 사건에서 법원은 이러한 매매행위가 불공정한 법률행위에 해당한다고 판단하였습니다.\\n\\n### [사건명 2] 원고의 주장이 착오로 인한 의사표시의 취소로도 보여지므로, 이를 석명치 않은 위법이 있는 예\\n- **출처**: 153300, 매매대금반환등, 66다1289, 1966.09.20\\n- **사건 내용**: 원고가 피고로부터 매수한 논 1,389평 중 일부는 하천으로 되어 있어 경작할 수 없는 땅이었고, 나머지 땅은 이미 다른 사람들이 경작하고 있었습니다. 원고는 이러한 사실을 알지 못한 채 매매계약을 체결하였고, 나중에 이를 알게 되어 계약을 무효로 주장하였습니다. 법원은 원고의 주장이 착오로 인한 의사표시의 취소로도 볼 수 있다고 판단하였습니다.\\n\\n이 두 사건 모두 매수인이 매매 대상 부동산의 실제 상태를 제대로 알지 못한 채 계약을 체결한 후, 그 사실을 알게 되어 법적 분쟁이 발생한 사례입니다. 질문자님의 상황과 유사한 점이 많으므로 참고하시기 바랍니다.\\n\\n### 요약\\n- **사건명 1**: 매도인의 궁박한 사정을 이용하여 부동산을 저렴하게 매수한 경우.\\n- **사건명 2**: 매수한 부동산이 실제로는 경작할 수 없는 땅이었음을 나중에 알게 된 경우.\\n\\n이와 같은 사례를 통해 법적 대응 방안을 모색해보시기 바랍니다. 추가적인 법적 조언이 필요하시면 변호사와 상담하시기를 권장드립니다.\\n\\n감사합니다.\\n\\n### 출처\\n- 214987, 손해배상등, 68다88, 1968.07.30\\n- 153300, 매매대금반환등, 66다1289, 1966.09.20'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sample query (Korean): the asker bought roughly 2,000 pyeong of farmland that\n",
    "# turned out to be unbuildable and asks for similar fraud-like cases.\n",
    "sample_question = \"논밭은 약 2천평을 샀는데, 알고 보니 집을 지을 수 없는 땅이야. 이런 사기와 비슷한 걸 알려줘!\"\n",
    "rag_chain(sample_question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langchain",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}