Spaces:
Sleeping
Sleeping
robitalhazmi
committed on
Commit
•
29b19df
1
Parent(s):
cb641ab
add cache_resource
Browse files
- .gitignore +1 -0
- app.py +54 -49
- rag_notebook.ipynb +22 -22
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.env
|
app.py
CHANGED
@@ -13,62 +13,67 @@ load_dotenv('.env')
|
|
13 |
|
14 |
st.header("MKOM UGM RAG App")
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
)
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
50 |
|
51 |
-
def format_docs(docs):
|
52 |
-
|
53 |
|
54 |
-
template = """Gunakan konteks berikut untuk menjawab pertanyaan pada bagian akhir.
|
55 |
-
Jika kamu tidak tahu jawabannya, katakan saja bahwa kamu tidak tahu, jangan mencoba untuk mengarang jawaban.
|
56 |
-
Selalu katakan "Terima kasih sudah bertanya!" pada setiap akhir jawaban.
|
57 |
|
58 |
-
{context}
|
59 |
|
60 |
-
Pertanyaan: {question}
|
61 |
|
62 |
-
Jawaban:"""
|
63 |
|
64 |
-
custom_rag_prompt = PromptTemplate.from_template(template)
|
65 |
|
66 |
-
rag_chain = (
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
)
|
|
|
|
|
|
|
72 |
|
73 |
question = st.text_input("Tanya ujian masuk Pascasarjana Universitas Gadjah Mada")
|
74 |
if question:
|
|
|
13 |
|
14 |
st.header("MKOM UGM RAG App")
|
15 |
|
16 |
+
@st.cache_resource
|
17 |
+
def get_rag_chain():
|
18 |
+
# Only keep post title, headers, and content from the full HTML.
|
19 |
+
bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
|
20 |
+
loader = WebBaseLoader(
|
21 |
+
web_paths=(
|
22 |
+
"https://um.ugm.ac.id/ragam-seleksi-pascasarjana/",
|
23 |
+
"https://um.ugm.ac.id/persyaratan-pendaftaran-magister/",
|
24 |
+
"https://um.ugm.ac.id/persyaratan-pendaftaran-program-spesialis/",
|
25 |
+
"https://um.ugm.ac.id/persyaratan-pendaftaran-subspesialis/",
|
26 |
+
"https://um.ugm.ac.id/persyaratan-pendaftaran-doktor/",
|
27 |
+
"https://um.ugm.ac.id/prosedur-pendaftaran-magister/",
|
28 |
+
"https://um.ugm.ac.id/prosedur-pendaftaran-program-spesialis/",
|
29 |
+
"https://um.ugm.ac.id/prosedur-pendaftaran-program-subspesialis/",
|
30 |
+
"https://um.ugm.ac.id/prosedur-pendaftaran-doktor-2/",
|
31 |
+
"https://um.ugm.ac.id/program-studi-program-magister-2/",
|
32 |
+
"https://um.ugm.ac.id/program-studi-dan-daya-tampung-program-spesialis/",
|
33 |
+
"https://um.ugm.ac.id/program-studi-program-doktor/",
|
34 |
+
"https://um.ugm.ac.id/jadwal-seleksi-magister-dan-doktor/",
|
35 |
+
"https://um.ugm.ac.id/jadwal-kegiatan-seleksi-program-spesialis/",
|
36 |
+
"https://mkom.ugm.ac.id/alur-pendaftaran-magister/",
|
37 |
+
"https://mkom.ugm.ac.id/informasi-pendaftaran-program-pra-s2-ilmu-komputer/",
|
38 |
+
"https://mkom.ugm.ac.id/informasi-pendaftaran-program-s2-magister/",
|
39 |
+
"https://mkom.ugm.ac.id/program-dual-degree-double-degree-magister-ilmu-komputer/",
|
40 |
+
"https://mkom.ugm.ac.id/informasi-pendaftaran-program-s3-doktor/"
|
41 |
+
),
|
42 |
+
bs_kwargs={"parse_only": bs4_strainer},
|
43 |
+
)
|
44 |
+
docs = loader.load()
|
45 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
46 |
+
chunk_size=1000, chunk_overlap=200, add_start_index=True
|
47 |
+
)
|
48 |
+
all_splits = text_splitter.split_documents(docs)
|
49 |
+
vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name='firqaaa/indo-sentence-bert-base'))
|
50 |
+
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
|
51 |
+
llm = ChatCohere(model="command-r")
|
52 |
|
53 |
+
def format_docs(docs):
|
54 |
+
return "\n\n".join(doc.page_content for doc in docs)
|
55 |
|
56 |
+
template = """Gunakan konteks berikut untuk menjawab pertanyaan pada bagian akhir.
|
57 |
+
Jika kamu tidak tahu jawabannya, katakan saja bahwa kamu tidak tahu, jangan mencoba untuk mengarang jawaban.
|
58 |
+
Selalu katakan "Terima kasih sudah bertanya!" pada setiap akhir jawaban.
|
59 |
|
60 |
+
{context}
|
61 |
|
62 |
+
Pertanyaan: {question}
|
63 |
|
64 |
+
Jawaban:"""
|
65 |
|
66 |
+
custom_rag_prompt = PromptTemplate.from_template(template)
|
67 |
|
68 |
+
rag_chain = (
|
69 |
+
{"context": retriever | format_docs, "question": RunnablePassthrough()}
|
70 |
+
| custom_rag_prompt
|
71 |
+
| llm
|
72 |
+
| StrOutputParser()
|
73 |
+
)
|
74 |
+
return rag_chain
|
75 |
+
|
76 |
+
rag_chain = get_rag_chain()
|
77 |
|
78 |
question = st.text_input("Tanya ujian masuk Pascasarjana Universitas Gadjah Mada")
|
79 |
if question:
|
rag_notebook.ipynb
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
@@ -11,7 +11,7 @@
|
|
11 |
"2879"
|
12 |
]
|
13 |
},
|
14 |
-
"execution_count":
|
15 |
"metadata": {},
|
16 |
"output_type": "execute_result"
|
17 |
}
|
@@ -53,7 +53,7 @@
|
|
53 |
},
|
54 |
{
|
55 |
"cell_type": "code",
|
56 |
-
"execution_count":
|
57 |
"metadata": {},
|
58 |
"outputs": [
|
59 |
{
|
@@ -81,7 +81,7 @@
|
|
81 |
},
|
82 |
{
|
83 |
"cell_type": "code",
|
84 |
-
"execution_count":
|
85 |
"metadata": {},
|
86 |
"outputs": [
|
87 |
{
|
@@ -90,7 +90,7 @@
|
|
90 |
"111"
|
91 |
]
|
92 |
},
|
93 |
-
"execution_count":
|
94 |
"metadata": {},
|
95 |
"output_type": "execute_result"
|
96 |
}
|
@@ -108,7 +108,7 @@
|
|
108 |
},
|
109 |
{
|
110 |
"cell_type": "code",
|
111 |
-
"execution_count":
|
112 |
"metadata": {},
|
113 |
"outputs": [
|
114 |
{
|
@@ -117,7 +117,7 @@
|
|
117 |
"835"
|
118 |
]
|
119 |
},
|
120 |
-
"execution_count":
|
121 |
"metadata": {},
|
122 |
"output_type": "execute_result"
|
123 |
}
|
@@ -128,7 +128,7 @@
|
|
128 |
},
|
129 |
{
|
130 |
"cell_type": "code",
|
131 |
-
"execution_count":
|
132 |
"metadata": {},
|
133 |
"outputs": [
|
134 |
{
|
@@ -138,7 +138,7 @@
|
|
138 |
" 'start_index': 4748}"
|
139 |
]
|
140 |
},
|
141 |
-
"execution_count":
|
142 |
"metadata": {},
|
143 |
"output_type": "execute_result"
|
144 |
}
|
@@ -149,7 +149,7 @@
|
|
149 |
},
|
150 |
{
|
151 |
"cell_type": "code",
|
152 |
-
"execution_count":
|
153 |
"metadata": {},
|
154 |
"outputs": [
|
155 |
{
|
@@ -170,7 +170,7 @@
|
|
170 |
},
|
171 |
{
|
172 |
"cell_type": "code",
|
173 |
-
"execution_count":
|
174 |
"metadata": {},
|
175 |
"outputs": [
|
176 |
{
|
@@ -179,7 +179,7 @@
|
|
179 |
"6"
|
180 |
]
|
181 |
},
|
182 |
-
"execution_count":
|
183 |
"metadata": {},
|
184 |
"output_type": "execute_result"
|
185 |
}
|
@@ -194,7 +194,7 @@
|
|
194 |
},
|
195 |
{
|
196 |
"cell_type": "code",
|
197 |
-
"execution_count":
|
198 |
"metadata": {},
|
199 |
"outputs": [
|
200 |
{
|
@@ -224,7 +224,7 @@
|
|
224 |
},
|
225 |
{
|
226 |
"cell_type": "code",
|
227 |
-
"execution_count":
|
228 |
"metadata": {},
|
229 |
"outputs": [],
|
230 |
"source": [
|
@@ -239,7 +239,7 @@
|
|
239 |
},
|
240 |
{
|
241 |
"cell_type": "code",
|
242 |
-
"execution_count":
|
243 |
"metadata": {},
|
244 |
"outputs": [
|
245 |
{
|
@@ -248,7 +248,7 @@
|
|
248 |
"[HumanMessage(content=\"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\\nQuestion: filler question \\nContext: filler context \\nAnswer:\")]"
|
249 |
]
|
250 |
},
|
251 |
-
"execution_count":
|
252 |
"metadata": {},
|
253 |
"output_type": "execute_result"
|
254 |
}
|
@@ -267,7 +267,7 @@
|
|
267 |
},
|
268 |
{
|
269 |
"cell_type": "code",
|
270 |
-
"execution_count":
|
271 |
"metadata": {},
|
272 |
"outputs": [
|
273 |
{
|
@@ -287,14 +287,14 @@
|
|
287 |
},
|
288 |
{
|
289 |
"cell_type": "code",
|
290 |
-
"execution_count":
|
291 |
"metadata": {},
|
292 |
"outputs": [
|
293 |
{
|
294 |
"name": "stdout",
|
295 |
"output_type": "stream",
|
296 |
"text": [
|
297 |
-
"Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler
|
298 |
]
|
299 |
}
|
300 |
],
|
@@ -320,16 +320,16 @@
|
|
320 |
},
|
321 |
{
|
322 |
"cell_type": "code",
|
323 |
-
"execution_count":
|
324 |
"metadata": {},
|
325 |
"outputs": [
|
326 |
{
|
327 |
"data": {
|
328 |
"text/plain": [
|
329 |
-
"'
|
330 |
]
|
331 |
},
|
332 |
-
"execution_count":
|
333 |
"metadata": {},
|
334 |
"output_type": "execute_result"
|
335 |
}
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 12,
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
|
|
11 |
"2879"
|
12 |
]
|
13 |
},
|
14 |
+
"execution_count": 12,
|
15 |
"metadata": {},
|
16 |
"output_type": "execute_result"
|
17 |
}
|
|
|
53 |
},
|
54 |
{
|
55 |
"cell_type": "code",
|
56 |
+
"execution_count": 13,
|
57 |
"metadata": {},
|
58 |
"outputs": [
|
59 |
{
|
|
|
81 |
},
|
82 |
{
|
83 |
"cell_type": "code",
|
84 |
+
"execution_count": 14,
|
85 |
"metadata": {},
|
86 |
"outputs": [
|
87 |
{
|
|
|
90 |
"111"
|
91 |
]
|
92 |
},
|
93 |
+
"execution_count": 14,
|
94 |
"metadata": {},
|
95 |
"output_type": "execute_result"
|
96 |
}
|
|
|
108 |
},
|
109 |
{
|
110 |
"cell_type": "code",
|
111 |
+
"execution_count": 15,
|
112 |
"metadata": {},
|
113 |
"outputs": [
|
114 |
{
|
|
|
117 |
"835"
|
118 |
]
|
119 |
},
|
120 |
+
"execution_count": 15,
|
121 |
"metadata": {},
|
122 |
"output_type": "execute_result"
|
123 |
}
|
|
|
128 |
},
|
129 |
{
|
130 |
"cell_type": "code",
|
131 |
+
"execution_count": 16,
|
132 |
"metadata": {},
|
133 |
"outputs": [
|
134 |
{
|
|
|
138 |
" 'start_index': 4748}"
|
139 |
]
|
140 |
},
|
141 |
+
"execution_count": 16,
|
142 |
"metadata": {},
|
143 |
"output_type": "execute_result"
|
144 |
}
|
|
|
149 |
},
|
150 |
{
|
151 |
"cell_type": "code",
|
152 |
+
"execution_count": 17,
|
153 |
"metadata": {},
|
154 |
"outputs": [
|
155 |
{
|
|
|
170 |
},
|
171 |
{
|
172 |
"cell_type": "code",
|
173 |
+
"execution_count": 18,
|
174 |
"metadata": {},
|
175 |
"outputs": [
|
176 |
{
|
|
|
179 |
"6"
|
180 |
]
|
181 |
},
|
182 |
+
"execution_count": 18,
|
183 |
"metadata": {},
|
184 |
"output_type": "execute_result"
|
185 |
}
|
|
|
194 |
},
|
195 |
{
|
196 |
"cell_type": "code",
|
197 |
+
"execution_count": 19,
|
198 |
"metadata": {},
|
199 |
"outputs": [
|
200 |
{
|
|
|
224 |
},
|
225 |
{
|
226 |
"cell_type": "code",
|
227 |
+
"execution_count": 20,
|
228 |
"metadata": {},
|
229 |
"outputs": [],
|
230 |
"source": [
|
|
|
239 |
},
|
240 |
{
|
241 |
"cell_type": "code",
|
242 |
+
"execution_count": 21,
|
243 |
"metadata": {},
|
244 |
"outputs": [
|
245 |
{
|
|
|
248 |
"[HumanMessage(content=\"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\\nQuestion: filler question \\nContext: filler context \\nAnswer:\")]"
|
249 |
]
|
250 |
},
|
251 |
+
"execution_count": 21,
|
252 |
"metadata": {},
|
253 |
"output_type": "execute_result"
|
254 |
}
|
|
|
267 |
},
|
268 |
{
|
269 |
"cell_type": "code",
|
270 |
+
"execution_count": 22,
|
271 |
"metadata": {},
|
272 |
"outputs": [
|
273 |
{
|
|
|
287 |
},
|
288 |
{
|
289 |
"cell_type": "code",
|
290 |
+
"execution_count": 23,
|
291 |
"metadata": {},
|
292 |
"outputs": [
|
293 |
{
|
294 |
"name": "stdout",
|
295 |
"output_type": "stream",
|
296 |
"text": [
|
297 |
+
"Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler dilakukan melalui tiga skema: biaya sendiri, kerja sama, dan pelamar beasiswa. Jalur ini terbuka untuk semua pendaftar tanpa batasan institusi atau negara asal ijazah.Penerimaan Mahasiswa Baru (PMB) Program Pascasarjana jalur reguler dilakukan melalui tiga skema: biaya sendiri, kerja sama, dan pelamar beasiswa. Jalur ini terbuka untuk semua pendaftar tanpa batasan institusi atau negara asal ijazah."
|
298 |
]
|
299 |
}
|
300 |
],
|
|
|
320 |
},
|
321 |
{
|
322 |
"cell_type": "code",
|
323 |
+
"execution_count": 24,
|
324 |
"metadata": {},
|
325 |
"outputs": [
|
326 |
{
|
327 |
"data": {
|
328 |
"text/plain": [
|
329 |
+
"'274-513109, 0274-548516, 085292000355.\\n\\nTerima kasih sudah bertanya!'"
|
330 |
]
|
331 |
},
|
332 |
+
"execution_count": 24,
|
333 |
"metadata": {},
|
334 |
"output_type": "execute_result"
|
335 |
}
|