robitalhazmi committed on
Commit
29b19df
1 Parent(s): cb641ab

add cache_resource

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +54 -49
  3. rag_notebook.ipynb +22 -22
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
app.py CHANGED
@@ -13,62 +13,67 @@ load_dotenv('.env')
13
 
14
  st.header("MKOM UGM RAG App")
15
 
16
- # Only keep post title, headers, and content from the full HTML.
17
- bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
18
- loader = WebBaseLoader(
19
- web_paths=(
20
- "https://um.ugm.ac.id/ragam-seleksi-pascasarjana/",
21
- "https://um.ugm.ac.id/persyaratan-pendaftaran-magister/",
22
- "https://um.ugm.ac.id/persyaratan-pendaftaran-program-spesialis/",
23
- "https://um.ugm.ac.id/persyaratan-pendaftaran-subspesialis/",
24
- "https://um.ugm.ac.id/persyaratan-pendaftaran-doktor/",
25
- "https://um.ugm.ac.id/prosedur-pendaftaran-magister/",
26
- "https://um.ugm.ac.id/prosedur-pendaftaran-program-spesialis/",
27
- "https://um.ugm.ac.id/prosedur-pendaftaran-program-subspesialis/",
28
- "https://um.ugm.ac.id/prosedur-pendaftaran-doktor-2/",
29
- "https://um.ugm.ac.id/program-studi-program-magister-2/",
30
- "https://um.ugm.ac.id/program-studi-dan-daya-tampung-program-spesialis/",
31
- "https://um.ugm.ac.id/program-studi-program-doktor/",
32
- "https://um.ugm.ac.id/jadwal-seleksi-magister-dan-doktor/",
33
- "https://um.ugm.ac.id/jadwal-kegiatan-seleksi-program-spesialis/",
34
- "https://mkom.ugm.ac.id/alur-pendaftaran-magister/",
35
- "https://mkom.ugm.ac.id/informasi-pendaftaran-program-pra-s2-ilmu-komputer/",
36
- "https://mkom.ugm.ac.id/informasi-pendaftaran-program-s2-magister/",
37
- "https://mkom.ugm.ac.id/program-dual-degree-double-degree-magister-ilmu-komputer/",
38
- "https://mkom.ugm.ac.id/informasi-pendaftaran-program-s3-doktor/"
39
- ),
40
- bs_kwargs={"parse_only": bs4_strainer},
41
- )
42
- docs = loader.load()
43
- text_splitter = RecursiveCharacterTextSplitter(
44
- chunk_size=1000, chunk_overlap=200, add_start_index=True
45
- )
46
- all_splits = text_splitter.split_documents(docs)
47
- vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name='firqaaa/indo-sentence-bert-base'))
48
- retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
49
- llm = ChatCohere(model="command-r")
 
 
50
 
51
- def format_docs(docs):
52
- return "\n\n".join(doc.page_content for doc in docs)
53
 
54
- template = """Gunakan konteks berikut untuk menjawab pertanyaan pada bagian akhir.
55
- Jika kamu tidak tahu jawabannya, katakan saja bahwa kamu tidak tahu, jangan mencoba untuk mengarang jawaban.
56
- Selalu katakan "Terima kasih sudah bertanya!" pada setiap akhir jawaban.
57
 
58
- {context}
59
 
60
- Pertanyaan: {question}
61
 
62
- Jawaban:"""
63
 
64
- custom_rag_prompt = PromptTemplate.from_template(template)
65
 
66
- rag_chain = (
67
- {"context": retriever | format_docs, "question": RunnablePassthrough()}
68
- | custom_rag_prompt
69
- | llm
70
- | StrOutputParser()
71
- )
 
 
 
72
 
73
  question = st.text_input("Tanya ujian masuk Pascasarjana Universitas Gadjah Mada")
74
  if question:
 
13
 
14
  st.header("MKOM UGM RAG App")
15
 
16
+ @st.cache_resource
17
+ def get_rag_chain():
18
+ # Only keep post title, headers, and content from the full HTML.
19
+ bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
20
+ loader = WebBaseLoader(
21
+ web_paths=(
22
+ "https://um.ugm.ac.id/ragam-seleksi-pascasarjana/",
23
+ "https://um.ugm.ac.id/persyaratan-pendaftaran-magister/",
24
+ "https://um.ugm.ac.id/persyaratan-pendaftaran-program-spesialis/",
25
+ "https://um.ugm.ac.id/persyaratan-pendaftaran-subspesialis/",
26
+ "https://um.ugm.ac.id/persyaratan-pendaftaran-doktor/",
27
+ "https://um.ugm.ac.id/prosedur-pendaftaran-magister/",
28
+ "https://um.ugm.ac.id/prosedur-pendaftaran-program-spesialis/",
29
+ "https://um.ugm.ac.id/prosedur-pendaftaran-program-subspesialis/",
30
+ "https://um.ugm.ac.id/prosedur-pendaftaran-doktor-2/",
31
+ "https://um.ugm.ac.id/program-studi-program-magister-2/",
32
+ "https://um.ugm.ac.id/program-studi-dan-daya-tampung-program-spesialis/",
33
+ "https://um.ugm.ac.id/program-studi-program-doktor/",
34
+ "https://um.ugm.ac.id/jadwal-seleksi-magister-dan-doktor/",
35
+ "https://um.ugm.ac.id/jadwal-kegiatan-seleksi-program-spesialis/",
36
+ "https://mkom.ugm.ac.id/alur-pendaftaran-magister/",
37
+ "https://mkom.ugm.ac.id/informasi-pendaftaran-program-pra-s2-ilmu-komputer/",
38
+ "https://mkom.ugm.ac.id/informasi-pendaftaran-program-s2-magister/",
39
+ "https://mkom.ugm.ac.id/program-dual-degree-double-degree-magister-ilmu-komputer/",
40
+ "https://mkom.ugm.ac.id/informasi-pendaftaran-program-s3-doktor/"
41
+ ),
42
+ bs_kwargs={"parse_only": bs4_strainer},
43
+ )
44
+ docs = loader.load()
45
+ text_splitter = RecursiveCharacterTextSplitter(
46
+ chunk_size=1000, chunk_overlap=200, add_start_index=True
47
+ )
48
+ all_splits = text_splitter.split_documents(docs)
49
+ vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name='firqaaa/indo-sentence-bert-base'))
50
+ retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
51
+ llm = ChatCohere(model="command-r")
52
 
53
+ def format_docs(docs):
54
+ return "\n\n".join(doc.page_content for doc in docs)
55
 
56
+ template = """Gunakan konteks berikut untuk menjawab pertanyaan pada bagian akhir.
57
+ Jika kamu tidak tahu jawabannya, katakan saja bahwa kamu tidak tahu, jangan mencoba untuk mengarang jawaban.
58
+ Selalu katakan "Terima kasih sudah bertanya!" pada setiap akhir jawaban.
59
 
60
+ {context}
61
 
62
+ Pertanyaan: {question}
63
 
64
+ Jawaban:"""
65
 
66
+ custom_rag_prompt = PromptTemplate.from_template(template)
67
 
68
+ rag_chain = (
69
+ {"context": retriever | format_docs, "question": RunnablePassthrough()}
70
+ | custom_rag_prompt
71
+ | llm
72
+ | StrOutputParser()
73
+ )
74
+ return rag_chain
75
+
76
+ rag_chain = get_rag_chain()
77
 
78
  question = st.text_input("Tanya ujian masuk Pascasarjana Universitas Gadjah Mada")
79
  if question:
rag_notebook.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 94,
6
  "metadata": {},
7
  "outputs": [
8
  {
@@ -11,7 +11,7 @@
11
  "2879"
12
  ]
13
  },
14
- "execution_count": 94,
15
  "metadata": {},
16
  "output_type": "execute_result"
17
  }
@@ -53,7 +53,7 @@
53
  },
54
  {
55
  "cell_type": "code",
56
- "execution_count": 95,
57
  "metadata": {},
58
  "outputs": [
59
  {
@@ -81,7 +81,7 @@
81
  },
82
  {
83
  "cell_type": "code",
84
- "execution_count": 96,
85
  "metadata": {},
86
  "outputs": [
87
  {
@@ -90,7 +90,7 @@
90
  "111"
91
  ]
92
  },
93
- "execution_count": 96,
94
  "metadata": {},
95
  "output_type": "execute_result"
96
  }
@@ -108,7 +108,7 @@
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": 97,
112
  "metadata": {},
113
  "outputs": [
114
  {
@@ -117,7 +117,7 @@
117
  "835"
118
  ]
119
  },
120
- "execution_count": 97,
121
  "metadata": {},
122
  "output_type": "execute_result"
123
  }
@@ -128,7 +128,7 @@
128
  },
129
  {
130
  "cell_type": "code",
131
- "execution_count": 98,
132
  "metadata": {},
133
  "outputs": [
134
  {
@@ -138,7 +138,7 @@
138
  " 'start_index': 4748}"
139
  ]
140
  },
141
- "execution_count": 98,
142
  "metadata": {},
143
  "output_type": "execute_result"
144
  }
@@ -149,7 +149,7 @@
149
  },
150
  {
151
  "cell_type": "code",
152
- "execution_count": 99,
153
  "metadata": {},
154
  "outputs": [
155
  {
@@ -170,7 +170,7 @@
170
  },
171
  {
172
  "cell_type": "code",
173
- "execution_count": null,
174
  "metadata": {},
175
  "outputs": [
176
  {
@@ -179,7 +179,7 @@
179
  "6"
180
  ]
181
  },
182
- "execution_count": 73,
183
  "metadata": {},
184
  "output_type": "execute_result"
185
  }
@@ -194,7 +194,7 @@
194
  },
195
  {
196
  "cell_type": "code",
197
- "execution_count": null,
198
  "metadata": {},
199
  "outputs": [
200
  {
@@ -224,7 +224,7 @@
224
  },
225
  {
226
  "cell_type": "code",
227
- "execution_count": null,
228
  "metadata": {},
229
  "outputs": [],
230
  "source": [
@@ -239,7 +239,7 @@
239
  },
240
  {
241
  "cell_type": "code",
242
- "execution_count": null,
243
  "metadata": {},
244
  "outputs": [
245
  {
@@ -248,7 +248,7 @@
248
  "[HumanMessage(content=\"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\\nQuestion: filler question \\nContext: filler context \\nAnswer:\")]"
249
  ]
250
  },
251
- "execution_count": 76,
252
  "metadata": {},
253
  "output_type": "execute_result"
254
  }
@@ -267,7 +267,7 @@
267
  },
268
  {
269
  "cell_type": "code",
270
- "execution_count": null,
271
  "metadata": {},
272
  "outputs": [
273
  {
@@ -287,14 +287,14 @@
287
  },
288
  {
289
  "cell_type": "code",
290
- "execution_count": null,
291
  "metadata": {},
292
  "outputs": [
293
  {
294
  "name": "stdout",
295
  "output_type": "stream",
296
  "text": [
297
- "Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler terdiri dari dua skema: biaya sendiri dan kerja sama. Skema kerja sama diperuntukkan bagi penerima beasiswa atau yang pendidikannya dibiayai mitra UGM.Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler terdiri dari dua skema: biaya sendiri dan kerja sama. Skema kerja sama diperuntukkan bagi penerima beasiswa atau yang pendidikannya dibiayai mitra UGM."
298
  ]
299
  }
300
  ],
@@ -320,16 +320,16 @@
320
  },
321
  {
322
  "cell_type": "code",
323
- "execution_count": null,
324
  "metadata": {},
325
  "outputs": [
326
  {
327
  "data": {
328
  "text/plain": [
329
- "'Nomor telepon untuk PROGRAM MAGISTER (S2) SEMESTER GASAL TA 2024/2025 program studi Magister Akuntansi adalah 274-513109, 0274-548516, dan 085292000355.\\n\\nTerima kasih sudah bertanya!'"
330
  ]
331
  },
332
- "execution_count": 93,
333
  "metadata": {},
334
  "output_type": "execute_result"
335
  }
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 12,
6
  "metadata": {},
7
  "outputs": [
8
  {
 
11
  "2879"
12
  ]
13
  },
14
+ "execution_count": 12,
15
  "metadata": {},
16
  "output_type": "execute_result"
17
  }
 
53
  },
54
  {
55
  "cell_type": "code",
56
+ "execution_count": 13,
57
  "metadata": {},
58
  "outputs": [
59
  {
 
81
  },
82
  {
83
  "cell_type": "code",
84
+ "execution_count": 14,
85
  "metadata": {},
86
  "outputs": [
87
  {
 
90
  "111"
91
  ]
92
  },
93
+ "execution_count": 14,
94
  "metadata": {},
95
  "output_type": "execute_result"
96
  }
 
108
  },
109
  {
110
  "cell_type": "code",
111
+ "execution_count": 15,
112
  "metadata": {},
113
  "outputs": [
114
  {
 
117
  "835"
118
  ]
119
  },
120
+ "execution_count": 15,
121
  "metadata": {},
122
  "output_type": "execute_result"
123
  }
 
128
  },
129
  {
130
  "cell_type": "code",
131
+ "execution_count": 16,
132
  "metadata": {},
133
  "outputs": [
134
  {
 
138
  " 'start_index': 4748}"
139
  ]
140
  },
141
+ "execution_count": 16,
142
  "metadata": {},
143
  "output_type": "execute_result"
144
  }
 
149
  },
150
  {
151
  "cell_type": "code",
152
+ "execution_count": 17,
153
  "metadata": {},
154
  "outputs": [
155
  {
 
170
  },
171
  {
172
  "cell_type": "code",
173
+ "execution_count": 18,
174
  "metadata": {},
175
  "outputs": [
176
  {
 
179
  "6"
180
  ]
181
  },
182
+ "execution_count": 18,
183
  "metadata": {},
184
  "output_type": "execute_result"
185
  }
 
194
  },
195
  {
196
  "cell_type": "code",
197
+ "execution_count": 19,
198
  "metadata": {},
199
  "outputs": [
200
  {
 
224
  },
225
  {
226
  "cell_type": "code",
227
+ "execution_count": 20,
228
  "metadata": {},
229
  "outputs": [],
230
  "source": [
 
239
  },
240
  {
241
  "cell_type": "code",
242
+ "execution_count": 21,
243
  "metadata": {},
244
  "outputs": [
245
  {
 
248
  "[HumanMessage(content=\"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\\nQuestion: filler question \\nContext: filler context \\nAnswer:\")]"
249
  ]
250
  },
251
+ "execution_count": 21,
252
  "metadata": {},
253
  "output_type": "execute_result"
254
  }
 
267
  },
268
  {
269
  "cell_type": "code",
270
+ "execution_count": 22,
271
  "metadata": {},
272
  "outputs": [
273
  {
 
287
  },
288
  {
289
  "cell_type": "code",
290
+ "execution_count": 23,
291
  "metadata": {},
292
  "outputs": [
293
  {
294
  "name": "stdout",
295
  "output_type": "stream",
296
  "text": [
297
+ "Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler dilakukan melalui tiga skema: biaya sendiri, kerja sama, dan pelamar beasiswa. Jalur ini terbuka untuk semua pendaftar tanpa batasan institusi atau negara asal ijazah.Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler dilakukan melalui tiga skema: biaya sendiri, kerja sama, dan pelamar beasiswa. Jalur ini terbuka untuk semua pendaftar tanpa batasan institusi atau negara asal ijazah."
298
  ]
299
  }
300
  ],
 
320
  },
321
  {
322
  "cell_type": "code",
323
+ "execution_count": 24,
324
  "metadata": {},
325
  "outputs": [
326
  {
327
  "data": {
328
  "text/plain": [
329
+ "'274-513109, 0274-548516, 085292000355.\\n\\nTerima kasih sudah bertanya!'"
330
  ]
331
  },
332
+ "execution_count": 24,
333
  "metadata": {},
334
  "output_type": "execute_result"
335
  }