update
Browse files- rag.py +69 -7
- raw_data/faq.xlsx +0 -0
rag.py
CHANGED
@@ -27,6 +27,68 @@ vectorstore3, retriever3 = process_data(data3, child_text_splitter, embedding, "
|
|
27 |
|
28 |
##############################################################################
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
ANYSCALE_API_BASE = "credential-1711634141163"
|
31 |
ANYSCALE_API_KEY = "esecret_chitz7splr5ut6vfvqpn72itd3"
|
32 |
ANYSCALE_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
|
@@ -159,13 +221,13 @@ rag_chain_with_source3 = RunnableParallel(
|
|
159 |
|
160 |
############################################################################################
|
161 |
|
162 |
-
from flashtext import KeywordProcessor
|
163 |
-
keyword_processor = KeywordProcessor()
|
164 |
-
# keyword_processor.add_keyword(<unclean name>, <standardised name>)
|
165 |
-
keyword_processor.add_keyword('thạc sĩ')
|
166 |
-
keyword_processor.add_keyword('học viên')
|
167 |
-
keyword_processor.add_keyword('nghiên cứu sinh')
|
168 |
-
keyword_processor.add_keyword('tiến sĩ')
|
169 |
|
170 |
################################################################################
|
171 |
|
|
|
27 |
|
28 |
##############################################################################
|
29 |
|
30 |
+
from flashtext import KeywordProcessor
|
31 |
+
# Keyword matcher for the degree-level terms used further down to route FAQ
# entries. Each term is registered on its own, i.e. add_keyword(<term>) rather
# than add_keyword(<unclean name>, <standardised name>).
keyword_processor = KeywordProcessor()
for _degree_term in ('thạc sĩ', 'học viên', 'nghiên cứu sinh', 'tiến sĩ'):
    keyword_processor.add_keyword(_degree_term)
|
37 |
+
|
38 |
+
################################################################################
|
39 |
+
|
40 |
+
import pandas as pd
|
41 |
+
|
42 |
+
# Load the FAQ spreadsheet; the "question" and "answer" columns are read below.
faq = "raw_data/faq.xlsx"
df = pd.read_excel(faq)
# Blank spreadsheet cells are read as NaN (a float), which can neither be
# keyword-scanned nor used as document text, so rows missing either field are
# dropped before anything else touches them.
df = df.dropna(subset=["question", "answer"])
questions = df["question"].tolist()
answers = df["answer"].tolist()

# FAQ subsets selected by the keywords found in each question:
#   faq_thsi_* - questions mentioning 'thạc sĩ' or 'học viên'
#   faq_tsi_*  - questions mentioning 'nghiên cứu sinh' or 'tiến sĩ'
faq_thsi_q = []
faq_thsi_a = []
faq_tsi_q = []
faq_tsi_a = []

for question, answer in zip(questions, answers):
    keywords_found = keyword_processor.extract_keywords(question)
    if 'thạc sĩ' in keywords_found or 'học viên' in keywords_found:
        faq_thsi_q.append(question)
        faq_thsi_a.append(answer)
    # The first group takes precedence: a question matching both groups is
    # only placed in the faq_thsi_* lists.
    elif 'nghiên cứu sinh' in keywords_found or 'tiến sĩ' in keywords_found:
        faq_tsi_q.append(question)
        faq_tsi_a.append(answer)
|
61 |
+
|
62 |
+
import uuid
|
63 |
+
from langchain_core.documents import Document
|
64 |
+
|
65 |
+
def add_faq(retriever, vectorstore, questions, answers):
    """Index FAQ questions for retrieval and link each one to its answer.

    Every question becomes a document in ``retriever.vectorstore`` carrying a
    freshly generated UUID in its metadata; the answer at the same position is
    stored in ``retriever.docstore`` under that same UUID.

    Args:
        retriever: Object exposing ``vectorstore.add_documents`` and
            ``docstore.mset``. If it has an ``id_key`` attribute, that value
            is used as the metadata key; otherwise ``"doc_id"`` is used.
        vectorstore: Unused; the store is reached through ``retriever``.
            Kept so existing callers keep working.
        questions: Question texts.
        answers: Answer texts, parallel to ``questions``.

    Raises:
        ValueError: If ``questions`` and ``answers`` differ in length.
    """
    questions = list(questions)
    answers = list(answers)
    if len(questions) != len(answers):
        raise ValueError(
            f"questions and answers must have the same length "
            f"(got {len(questions)} and {len(answers)})"
        )
    # A keyword-filtered FAQ subset can legitimately be empty; skip the store
    # calls entirely rather than hand them an empty batch.
    if not questions:
        return

    # Use the retriever's configured metadata key when it has one so lookups
    # stay consistent with however the retriever was built.
    id_key = getattr(retriever, "id_key", "doc_id")

    doc_ids = [str(uuid.uuid4()) for _ in answers]

    question_docs = [
        Document(page_content=question, metadata={id_key: doc_id})
        for question, doc_id in zip(questions, doc_ids)
    ]
    answer_docs = [Document(page_content=answer) for answer in answers]

    retriever.vectorstore.add_documents(question_docs)
    retriever.docstore.mset(list(zip(doc_ids, answer_docs)))
|
80 |
+
|
81 |
+
# Add FAQ to vectorstore
|
82 |
+
|
83 |
+
# Register the FAQ entries with each retriever, in this order: the
# 'thạc sĩ'/'học viên' subset goes to retriever2, the
# 'nghiên cứu sinh'/'tiến sĩ' subset to retriever3, and the complete FAQ list
# to retriever1.
for _retriever, _store, _faq_questions, _faq_answers in (
    (retriever2, vectorstore2, faq_thsi_q, faq_thsi_a),
    (retriever3, vectorstore3, faq_tsi_q, faq_tsi_a),
    (retriever1, vectorstore1, questions, answers),
):
    add_faq(_retriever, _store, _faq_questions, _faq_answers)
|
88 |
+
|
89 |
+
|
90 |
+
##################################################################################
|
91 |
+
|
92 |
import os

# Anyscale endpoint settings. The base URL and API key are read from
# environment variables of the same name when set, so deployments do not have
# to rely on values committed to the repository.
# SECURITY NOTE(review): the literal fallbacks below keep existing setups
# working, but an API key is checked into source here. Rotate that key and
# remove the fallback once every environment provides ANYSCALE_API_KEY.
ANYSCALE_API_BASE = os.environ.get("ANYSCALE_API_BASE", "credential-1711634141163")
ANYSCALE_API_KEY = os.environ.get("ANYSCALE_API_KEY", "esecret_chitz7splr5ut6vfvqpn72itd3")
ANYSCALE_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
|
|
|
221 |
|
222 |
############################################################################################
|
223 |
|
224 |
+
# from flashtext import KeywordProcessor
|
225 |
+
# keyword_processor = KeywordProcessor()
|
226 |
+
# # keyword_processor.add_keyword(<unclean name>, <standardised name>)
|
227 |
+
# keyword_processor.add_keyword('thạc sĩ')
|
228 |
+
# keyword_processor.add_keyword('học viên')
|
229 |
+
# keyword_processor.add_keyword('nghiên cứu sinh')
|
230 |
+
# keyword_processor.add_keyword('tiến sĩ')
|
231 |
|
232 |
################################################################################
|
233 |
|
raw_data/faq.xlsx
ADDED
Binary file (22.2 kB). View file
|
|