{ "cells": [
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2879" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "import bs4\n",
  "from langchain_community.document_loaders import WebBaseLoader\n",
  "\n",
  "# Parse only the elements carrying the post title, header and content classes.\n",
  "bs4_strainer = bs4.SoupStrainer(class_=(\"post-title\", \"post-header\", \"post-content\"))\n",
  "\n",
  "# Pages to load, hosted on um.ugm.ac.id and mkom.ugm.ac.id.\n",
  "page_urls = (\n",
  "    \"https://um.ugm.ac.id/ragam-seleksi-pascasarjana/\",\n",
  "    \"https://um.ugm.ac.id/persyaratan-pendaftaran-magister/\",\n",
  "    \"https://um.ugm.ac.id/persyaratan-pendaftaran-program-spesialis/\",\n",
  "    \"https://um.ugm.ac.id/persyaratan-pendaftaran-subspesialis/\",\n",
  "    \"https://um.ugm.ac.id/persyaratan-pendaftaran-doktor/\",\n",
  "    \"https://um.ugm.ac.id/prosedur-pendaftaran-magister/\",\n",
  "    \"https://um.ugm.ac.id/prosedur-pendaftaran-program-spesialis/\",\n",
  "    \"https://um.ugm.ac.id/prosedur-pendaftaran-program-subspesialis/\",\n",
  "    \"https://um.ugm.ac.id/prosedur-pendaftaran-doktor-2/\",\n",
  "    \"https://um.ugm.ac.id/program-studi-program-magister-2/\",\n",
  "    \"https://um.ugm.ac.id/program-studi-dan-daya-tampung-program-spesialis/\",\n",
  "    \"https://um.ugm.ac.id/program-studi-program-doktor/\",\n",
  "    \"https://um.ugm.ac.id/jadwal-seleksi-magister-dan-doktor/\",\n",
  "    \"https://um.ugm.ac.id/jadwal-kegiatan-seleksi-program-spesialis/\",\n",
  "    \"https://mkom.ugm.ac.id/alur-pendaftaran-magister/\",\n",
  "    \"https://mkom.ugm.ac.id/informasi-pendaftaran-program-pra-s2-ilmu-komputer/\",\n",
  "    \"https://mkom.ugm.ac.id/informasi-pendaftaran-program-s2-magister/\",\n",
  "    \"https://mkom.ugm.ac.id/program-dual-degree-double-degree-magister-ilmu-komputer/\",\n",
  "    \"https://mkom.ugm.ac.id/informasi-pendaftaran-program-s3-doktor/\",\n",
  ")\n",
  "\n",
  "loader = WebBaseLoader(web_paths=page_urls, bs_kwargs={\"parse_only\": bs4_strainer})\n",
  "docs = loader.load()\n",
  "\n",
  "# Length, in characters, of the first loaded page.\n",
  "len(docs[0].page_content)" ] 
}, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ragam Seleksi Program Pascasarjana\n", "Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana terdiri dari:\n", "\n", "Jalur Reguler, dengan skema:\n", "\n", "\n", "Biaya Sendiri\n", "Diperuntukkan bagi pendaftar dengan biaya sendiri/swadana.\n", "Kerja sama\n", "Diperuntukkan bagi:\n", "1. pendaftar yang telah ditetapkan sebagai penerima beasiswa oleh lembaga/instansi pemberi beasiswa (dibuktikan dengan adanya surat penetapan sebagai penerima beasiswa); atau\n", "2. pendaftar yang pendidikanya dibiayai oleh mitra kerja sama UGM yang dibuktikan denga\n" ] } ], "source": [
  "# Preview the first 500 characters of the first loaded page.\n",
  "print(docs[0].page_content[:500])" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "111" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
  "\n",
  "# Split the pages into overlapping chunks; add_start_index records each chunk's\n",
  "# offset within its source page under the `start_index` metadata key.\n",
  "text_splitter = RecursiveCharacterTextSplitter(\n",
  "    chunk_size=1000, chunk_overlap=200, add_start_index=True\n",
  ")\n",
  "all_splits = text_splitter.split_documents(docs)\n",
  "\n",
  "len(all_splits)" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "835" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "len(all_splits[0].page_content)" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'source': 'https://um.ugm.ac.id/persyaratan-pendaftaran-magister/',\n", " 'start_index': 4748}" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "all_splits[10].metadata" ] },
 { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [
  "# Import from langchain_community: the `langchain.vectorstores` and\n",
  "# `langchain.embeddings` import paths are deprecated.\n",
  "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
  "from langchain_community.vectorstores import FAISS\n",
  "\n",
  "# Embed every chunk with the 'firqaaa/indo-sentence-bert-base' model and index the vectors in FAISS.\n",
  "embeddings = HuggingFaceEmbeddings(model_name='firqaaa/indo-sentence-bert-base')\n",
  "vectorstore = FAISS.from_documents(all_splits, embeddings)" ] },
 { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Similarity search that returns the k=6 closest chunks for a query.\n",
  "retriever = vectorstore.as_retriever(search_type=\"similarity\", search_kwargs={\"k\": 6})\n",
  "\n",
  "retrieved_docs = retriever.invoke(\"Apa saja skema Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler?\")\n",
  "\n",
  "len(retrieved_docs)" ] },
 { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ragam Seleksi Program Pascasarjana\n", "Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana terdiri dari:\n", "\n", "Jalur Reguler, dengan skema:\n", "\n", "\n", "Biaya Sendiri\n", "Diperuntukkan bagi pendaftar dengan biaya sendiri/swadana.\n", "Kerja sama\n", "Diperuntukkan bagi:\n", "1. pendaftar yang telah ditetapkan sebagai penerima beasiswa oleh lembaga/instansi pemberi beasiswa (dibuktikan dengan adanya surat penetapan sebagai penerima beasiswa); atau\n", "2. 
pendaftar yang pendidikanya dibiayai oleh mitra kerja sama UGM yang dibuktikan dengan adanya Memorandum of Understanding (MoU) atau Perjanian Kerja Sama (PKS) yang berlaku (daftar mitra kerja sama dapat dipilih pada saat mengisi pendaftaran online).\n", "Pelamar Beasiswa\n", "Diperuntukkan bagi pendaftar yang sedang mendaftar beasiswa atau menunggu hasil seleksi beasiswa dari berbagai lembaga/instansi/ pihak pemberi beasiswa.\n" ] } ], "source": [ "print(retrieved_docs[0].page_content)" ] },
 { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
  "import os\n",
  "from getpass import getpass\n",
  "\n",
  "from langchain_cohere import ChatCohere\n",
  "\n",
  "# Keep the API key out of the notebook: use COHERE_API_KEY from the environment\n",
  "# and only prompt for it (without echoing) when it is not already set.\n",
  "# NOTE: any key that was ever committed in this cell must be treated as leaked and revoked.\n",
  "if not os.environ.get(\"COHERE_API_KEY\"):\n",
  "    os.environ[\"COHERE_API_KEY\"] = getpass(\"Cohere API key: \")\n",
  "\n",
  "llm = ChatCohere(model=\"command-r\")" ] },
 { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[HumanMessage(content=\"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\\nQuestion: filler question \\nContext: filler context \\nAnswer:\")]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "from langchain import hub\n",
  "\n",
  "# Pull the shared RAG prompt from the LangChain Hub.\n",
  "prompt = hub.pull(\"rlm/rag-prompt\")\n",
  "\n",
  "# Render it with placeholder values to inspect the exact wording sent to the model.\n",
  "example_messages = prompt.invoke(\n",
  "    {\"context\": \"filler context\", \"question\": \"filler question\"}\n",
  ").to_messages()\n",
  "\n",
  "example_messages" ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.\n", "Question: filler question \n", "Context: filler context \n", "Answer:\n" ] } ], "source": [ "print(example_messages[0].content)" ] },
 { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler dilakukan melalui tiga skema: biaya sendiri, kerja sama, dan pelamar beasiswa. Jalur ini terbuka untuk semua pendaftar tanpa batasan institusi atau negara asal ijazah.Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler dilakukan melalui tiga skema: biaya sendiri, kerja sama, dan pelamar beasiswa. Jalur ini terbuka untuk semua pendaftar tanpa batasan institusi atau negara asal ijazah." ] } ], "source": [
  "from langchain_core.output_parsers import StrOutputParser\n",
  "from langchain_core.runnables import RunnablePassthrough\n",
  "\n",
  "\n",
  "def format_docs(docs):\n",
  "    \"\"\"Join the page_content of every document, separated by a blank line.\"\"\"\n",
  "    page_texts = [doc.page_content for doc in docs]\n",
  "    return \"\\n\\n\".join(page_texts)\n",
  "\n",
  "\n",
  "# The retrieved chunks feed the prompt's `context` input; the question is passed through as-is.\n",
  "rag_inputs = {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
  "rag_chain = rag_inputs | prompt | llm | StrOutputParser()\n",
  "\n",
  "# Print the answer piece by piece as the chain streams it.\n",
  "answer_stream = rag_chain.stream(\"Apa saja skema Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler?\")\n",
  "for chunk in answer_stream:\n",
  "    print(chunk, end=\"\", flush=True)" ] },
 { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'274-513109, 0274-548516, 085292000355.\\n\\nTerima kasih sudah bertanya!'" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "from langchain_core.prompts import PromptTemplate\n",
  "\n",
  "# Indonesian prompt: answer from the given context, admit when the answer is unknown,\n",
  "# and always close with a thank-you sentence.\n",
  "template = \"\"\"Gunakan konteks berikut untuk menjawab pertanyaan pada bagian akhir.\n",
  "Jika kamu tidak tahu jawabannya, katakan saja bahwa kamu tidak tahu, jangan mencoba untuk mengarang jawaban.\n",
  "Selalu katakan \"Terima kasih sudah bertanya!\" pada setiap akhir 
jawaban.\n", "\n", "{context}\n", "\n", "Pertanyaan: {question}\n", "\n", "Jawaban:\"\"\"\n",
  "\n",
  "custom_rag_prompt = PromptTemplate.from_template(template)\n",
  "\n",
  "# Retriever output fills {context} and the raw question fills {question} in the custom prompt.\n",
  "rag_chain = (\n",
  "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
  "    | custom_rag_prompt\n",
  "    | llm\n",
  "    | StrOutputParser()\n",
  ")\n",
  "\n",
  "rag_chain.invoke(\"Berapa nomor telepon PROGRAM MAGISTER (S2) SEMESTER GASAL TA 2024/2025 program studi Magister Akuntansi?\")" ] } ],
 "metadata": { "kernelspec": { "display_name": "rag", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }