nnngoc committed on
Commit
66bb03b
1 Parent(s): 9524ba3
Files changed (2) hide show
  1. rag.py +69 -7
  2. raw_data/faq.xlsx +0 -0
rag.py CHANGED
@@ -27,6 +27,68 @@ vectorstore3, retriever3 = process_data(data3, child_text_splitter, embedding, "
27
 
28
  ##############################################################################
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  ANYSCALE_API_BASE = "credential-1711634141163"
31
  ANYSCALE_API_KEY = "esecret_chitz7splr5ut6vfvqpn72itd3"
32
  ANYSCALE_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -159,13 +221,13 @@ rag_chain_with_source3 = RunnableParallel(
159
 
160
  ############################################################################################
161
 
162
- from flashtext import KeywordProcessor
163
- keyword_processor = KeywordProcessor()
164
- # keyword_processor.add_keyword(<unclean name>, <standardised name>)
165
- keyword_processor.add_keyword('thạc sĩ')
166
- keyword_processor.add_keyword('học viên')
167
- keyword_processor.add_keyword('nghiên cứu sinh')
168
- keyword_processor.add_keyword('tiến sĩ')
169
 
170
  ################################################################################
171
 
 
27
 
28
  ##############################################################################
29
 
30
+ from flashtext import KeywordProcessor
31
# Keyword matcher used to sort FAQ questions into programme-specific subsets.
# keyword_processor.add_keyword(<unclean name>, <standardised name>)
# No standardised name is supplied here, only the keyword text itself.
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(
    ['thạc sĩ', 'học viên', 'nghiên cứu sinh', 'tiến sĩ']
)
37
+
38
+ ################################################################################
39
+
40
+ import pandas as pd
41
+
42
# Load the FAQ spreadsheet; it must provide "question" and "answer" columns.
faq = "raw_data/faq.xlsx"
df = pd.read_excel(faq)
questions = df["question"].tolist()
answers = df["answer"].tolist()

# Keyword-selected subsets of the FAQ, kept as parallel question/answer lists.
faq_thsi_q, faq_thsi_a = [], []
faq_tsi_q, faq_tsi_a = [], []

for question, answer in zip(questions, answers):
    keywords_found = keyword_processor.extract_keywords(question)
    # The first keyword group takes precedence when a question matches both;
    # a question matching neither group is added to neither subset.
    if 'thạc sĩ' in keywords_found or 'học viên' in keywords_found:
        faq_thsi_q.append(question)
        faq_thsi_a.append(answer)
    elif 'nghiên cứu sinh' in keywords_found or 'tiến sĩ' in keywords_found:
        faq_tsi_q.append(question)
        faq_tsi_a.append(answer)
61
+
62
+ import uuid
63
+ from langchain_core.documents import Document
64
+
65
def add_faq(retriever, vectorstore, questions, answers):
    """Index FAQ questions in a retriever and store their answers.

    Each question is wrapped in a ``Document`` whose metadata carries a fresh
    UUID and is added to ``retriever.vectorstore``; the matching answer is
    written to ``retriever.docstore`` under that same UUID.

    Args:
        retriever: Object exposing ``vectorstore.add_documents`` and
            ``docstore.mset``. If it has an ``id_key`` attribute, that key is
            used for the metadata link; otherwise ``"doc_id"`` is used.
        vectorstore: Unused (the store is reached through ``retriever``);
            kept so existing call sites keep working.
        questions: FAQ question strings.
        answers: FAQ answer strings, parallel to ``questions``.

    Raises:
        ValueError: If ``questions`` and ``answers`` differ in length.
    """
    total = len(questions)
    if total != len(answers):
        # Pairing is positional, so a mismatch would either fail midway or
        # leave answers in the docstore that no question points to.
        raise ValueError(
            f"questions and answers must have the same length "
            f"(got {total} and {len(answers)})"
        )
    if total == 0:
        # Nothing to index. NOTE(review): some vector store backends are
        # believed to reject empty batches - confirm for the store in use.
        return

    # NOTE(review): presumably the retriever looks answers up through its own
    # id_key; fall back to the previously hard-coded "doc_id" when absent.
    id_key = getattr(retriever, "id_key", "doc_id")

    doc_ids = [str(uuid.uuid4()) for _ in range(total)]

    question_docs = [
        Document(page_content=question, metadata={id_key: doc_id})
        for question, doc_id in zip(questions, doc_ids)
    ]
    answer_docs = [Document(page_content=answer) for answer in answers]

    retriever.vectorstore.add_documents(question_docs)
    retriever.docstore.mset(list(zip(doc_ids, answer_docs)))
80
+
81
# Add FAQ to vectorstore

# Questions that matched the 'thạc sĩ' / 'học viên' keywords go to retriever2.
add_faq(retriever2, vectorstore2, faq_thsi_q, faq_thsi_a)

# Questions that matched the 'nghiên cứu sinh' / 'tiến sĩ' keywords go to retriever3.
add_faq(retriever3, vectorstore3, faq_tsi_q, faq_tsi_a)

# retriever1 receives every FAQ row, whether or not it matched a keyword.
add_faq(retriever1, vectorstore1, questions, answers)
88
+
89
+
90
+ ##################################################################################
91
+
92
import os

# Anyscale endpoint settings. The base and key can be overridden through
# environment variables of the same name; the literals are only fallbacks so
# existing deployments keep working.
# TODO(review): the key below is committed to source control - rotate it and
# drop the fallback once the environment variable is provisioned.
ANYSCALE_API_BASE = os.environ.get("ANYSCALE_API_BASE", "credential-1711634141163")
ANYSCALE_API_KEY = os.environ.get(
    "ANYSCALE_API_KEY", "esecret_chitz7splr5ut6vfvqpn72itd3"
)
ANYSCALE_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
 
221
 
222
  ############################################################################################
223
 
224
+ # from flashtext import KeywordProcessor
225
+ # keyword_processor = KeywordProcessor()
226
+ # # keyword_processor.add_keyword(<unclean name>, <standardised name>)
227
+ # keyword_processor.add_keyword('thạc sĩ')
228
+ # keyword_processor.add_keyword('học viên')
229
+ # keyword_processor.add_keyword('nghiên cứu sinh')
230
+ # keyword_processor.add_keyword('tiến sĩ')
231
 
232
  ################################################################################
233
 
raw_data/faq.xlsx ADDED
Binary file (22.2 kB). View file