"""Streamlit front end for a small RAG app over UGM graduate-admission pages.

On first use the app scrapes a fixed list of admission pages, splits them into
overlapping chunks, indexes the chunks in an in-memory FAISS store and wires a
retriever, a prompt and a Cohere chat model into one LangChain runnable.  Each
question typed into the text box is answered by that runnable.
"""

import bs4
import streamlit as st
from dotenv import load_dotenv
# NOTE(review): the two `langchain.*` import paths below may be deprecated in
# newer LangChain releases in favour of `langchain_community.*` -- confirm
# against the pinned version before changing them.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_cohere import ChatCohere
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Pages that make up the knowledge base.
SOURCE_URLS = (
    "https://um.ugm.ac.id/ragam-seleksi-pascasarjana/",
    "https://um.ugm.ac.id/persyaratan-pendaftaran-magister/",
    "https://um.ugm.ac.id/persyaratan-pendaftaran-program-spesialis/",
    "https://um.ugm.ac.id/persyaratan-pendaftaran-subspesialis/",
    "https://um.ugm.ac.id/persyaratan-pendaftaran-doktor/",
    "https://um.ugm.ac.id/prosedur-pendaftaran-magister/",
    "https://um.ugm.ac.id/prosedur-pendaftaran-program-spesialis/",
    "https://um.ugm.ac.id/prosedur-pendaftaran-program-subspesialis/",
    "https://um.ugm.ac.id/prosedur-pendaftaran-doktor-2/",
    "https://um.ugm.ac.id/program-studi-program-magister-2/",
    "https://um.ugm.ac.id/program-studi-dan-daya-tampung-program-spesialis/",
    "https://um.ugm.ac.id/program-studi-program-doktor/",
    "https://um.ugm.ac.id/jadwal-seleksi-magister-dan-doktor/",
    "https://um.ugm.ac.id/jadwal-kegiatan-seleksi-program-spesialis/",
    "https://mkom.ugm.ac.id/alur-pendaftaran-magister/",
    "https://mkom.ugm.ac.id/informasi-pendaftaran-program-pra-s2-ilmu-komputer/",
    "https://mkom.ugm.ac.id/informasi-pendaftaran-program-s2-magister/",
    "https://mkom.ugm.ac.id/program-dual-degree-double-degree-magister-ilmu-komputer/",
    "https://mkom.ugm.ac.id/informasi-pendaftaran-program-s3-doktor/",
)

# Prompt sent to the chat model; `{context}` and `{question}` are filled in by
# the chain.  The text is user-facing runtime data and is kept verbatim.
# NOTE(review): the prompt is a single line, so the retrieved context is
# separated from the instructions only by spaces -- confirm whether line
# breaks around {context} were intended.
PROMPT_TEMPLATE = (
    "Gunakan konteks berikut untuk menjawab pertanyaan pada bagian akhir. "
    "Jika kamu tidak tahu jawabannya, katakan saja bahwa kamu tidak tahu, "
    "jangan mencoba untuk mengarang jawaban. "
    'Selalu katakan "Terima kasih sudah bertanya!" pada setiap akhir jawaban. '
    "{context} Pertanyaan: {question} Jawaban:"
)

# Loads variables from the local `.env` file into the process environment
# (presumably the Cohere API key used by ChatCohere -- verify).
load_dotenv('.env')

st.header("MKOM UGM RAG App")


def _format_docs(docs):
    """Join the page content of the retrieved documents with blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


@st.cache_resource
def get_rag_chain():
    """Build the retrieval-augmented generation chain.

    The function is wrapped in ``st.cache_resource`` so the scrape/index work
    is not repeated on every Streamlit rerun.

    Returns:
        A LangChain runnable that takes a question string and returns the
        model's answer as a string.

    Raises:
        ValueError: If the configured pages produced no text chunks to index.
    """
    # Only keep post title, headers, and content from the full HTML.
    bs4_strainer = bs4.SoupStrainer(
        class_=("post-title", "post-header", "post-content")
    )
    loader = WebBaseLoader(
        web_paths=SOURCE_URLS,
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, add_start_index=True
    )
    all_splits = text_splitter.split_documents(docs)
    if not all_splits:
        # Fail with an explicit message instead of an opaque error from the
        # vector store when nothing could be scraped.
        raise ValueError(
            "No content could be extracted from the configured source pages."
        )

    vectorstore = FAISS.from_documents(
        all_splits,
        HuggingFaceEmbeddings(model_name='firqaaa/indo-sentence-bert-base'),
    )
    retriever = vectorstore.as_retriever(
        search_type="similarity", search_kwargs={"k": 6}
    )

    llm = ChatCohere(model="command-r")
    custom_rag_prompt = PromptTemplate.from_template(PROMPT_TEMPLATE)

    # The incoming question is fed both to the retriever (to build the
    # context) and, unchanged, to the prompt's `question` slot.
    rag_chain = (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | custom_rag_prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain


rag_chain = get_rag_chain()

question = st.text_input("Tanya ujian masuk Pascasarjana Universitas Gadjah Mada")
# Ignore blank or whitespace-only input so no empty query reaches the model.
question = question.strip() if question else ""
if question:
    response = rag_chain.invoke(question)
    st.write(response)