Sirinoot committed on
Commit 65303a1
1 Parent(s): b86a43f

Update app.py

Files changed (1)
  1. app.py +467 -215
app.py CHANGED
@@ -1,4 +1,245 @@
 # @title web interface demo
+ # import random
+ # import gradio as gr
+ # import time
+ # import numpy as np
+ # import pandas as pd
+ # import torch
+ # import faiss
+ # from sklearn.preprocessing import normalize
+ # from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+ # from sentence_transformers import SentenceTransformer, util
+ # from pythainlp import Tokenizer
+ # import pickle
+ # import re
+ # from pythainlp.tokenize import sent_tokenize
+
+ # DEFAULT_MODEL = 'wangchanberta-hyp'
+ # DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
+
+ # MODEL_DICT = {
+ #     'wangchanberta': 'Chananchida/wangchanberta-xet_ref-params',
+ #     'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
+ # }
+
+ # EMBEDDINGS_PATH = 'data/embeddings.pkl'
+ # DATA_PATH='data/dataset.xlsx'
+
+
+ # def load_data(path=DATA_PATH):
+ #     df = pd.read_excel(path, sheet_name='Default')
+ #     df['Context'] = pd.read_excel(path, sheet_name='mdeberta')['Context']
+ #     print(len(df))
+ #     print('Load data done')
+ #     return df
+
+
+ # def load_model(model_name=DEFAULT_MODEL):
+ #     model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
+ #     tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
+ #     print('Load model done')
+ #     return model, tokenizer
+
+ # def load_embedding_model(model_name=DEFAULT_SENTENCE_EMBEDDING_MODEL):
+ #     # if torch.cuda.is_available():
+ #     #     embedding_model = SentenceTransformer(model_name, device='cuda')
+ #     # else:
+ #     embedding_model = SentenceTransformer(model_name)
+ #     print('Load sentence embedding model done')
+ #     return embedding_model
+
+
+ # def set_index(vector):
+ #     if torch.cuda.is_available():
+ #         res = faiss.StandardGpuResources()
+ #         index = faiss.IndexFlatL2(vector.shape[1])
+ #         gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
+ #         gpu_index_flat.add(vector)
+ #         index = gpu_index_flat
+ #     else:
+ #         index = faiss.IndexFlatL2(vector.shape[1])
+ #         index.add(vector)
+ #     return index
+
+
+ # def get_embeddings(embedding_model, text_list):
+ #     return embedding_model.encode(text_list)
+
+
+ # def prepare_sentences_vector(encoded_list):
+ #     encoded_list = [i.reshape(1, -1) for i in encoded_list]
+ #     encoded_list = np.vstack(encoded_list).astype('float32')
+ #     encoded_list = normalize(encoded_list)
+ #     return encoded_list
+
+
+ # def store_embeddings(df, embeddings):
+ #     with open('embeddings.pkl', "wb") as fOut:
+ #         pickle.dump({'sentences': df['Question'], 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
+ #     print('Store embeddings done')
+
+
+ # def load_embeddings(file_path=EMBEDDINGS_PATH):
+ #     with open(file_path, "rb") as fIn:
+ #         stored_data = pickle.load(fIn)
+ #         stored_sentences = stored_data['sentences']
+ #         stored_embeddings = stored_data['embeddings']
+ #     print('Load (questions) embeddings done')
+ #     return stored_embeddings
+
+
+ # def model_pipeline(model, tokenizer, question, similar_context):
+ #     inputs = tokenizer(question, similar_context, return_tensors="pt")
+ #     with torch.no_grad():
+ #         outputs = model(**inputs)
+ #     answer_start_index = outputs.start_logits.argmax()
+ #     answer_end_index = outputs.end_logits.argmax()
+ #     predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
+ #     Answer = tokenizer.decode(predict_answer_tokens)
+ #     return Answer.replace('<unk>','@')
+
+
+ # def faiss_search(index, question_vector, k=1):
+ #     distances, indices = index.search(question_vector, k)
+ #     return distances,indices
+
+ # def create_segment_index(vector):
+ #     segment_index = faiss.IndexFlatL2(vector.shape[1])
+ #     segment_index.add(vector)
+ #     return segment_index
+
+
+ # def predict_faiss(model, tokenizer, embedding_model, df, question, index):
+ #     t = time.time()
+ #     question = question.strip()
+ #     question_vector = get_embeddings(embedding_model, question)
+ #     question_vector = prepare_sentences_vector([question_vector])
+ #     distances,indices = faiss_search(index, question_vector)
+ #     Answers = [df['Answer'][i] for i in indices[0]]
+ #     _time = time.time() - t
+ #     output = {
+ #         "user_question": question,
+ #         "answer": Answers[0],
+ #         "totaltime": round(_time, 3),
+ #         "score": round(distances[0][0], 4)
+ #     }
+ #     return output
+
+ # def predict(model, tokenizer, embedding_model, df, question, index):
+ #     t = time.time()
+ #     question = question.strip()
+ #     question_vector = get_embeddings(embedding_model, question)
+ #     question_vector = prepare_sentences_vector([question_vector])
+ #     distances,indices = faiss_search(index, question_vector)
+
+ #     # Answer = model_pipeline(model, tokenizer, df['Question'][indices[0][0]], df['Context'][indices[0][0]])
+ #     Answer = model_pipeline(model, tokenizer, question, df['Context'][indices[0][0]])
+ #     _time = time.time() - t
+ #     output = {
+ #         "user_question": question,
+ #         "answer": Answer,
+ #         "totaltime": round(_time, 3),
+ #         "distance": round(distances[0][0], 4)
+ #     }
+ #     return Answer
+
+ # def predict_test(model, tokenizer, embedding_model, df, question, index):  # sent_tokenize pythainlp
+ #     t = time.time()
+ #     question = question.strip()
+ #     question_vector = get_embeddings(embedding_model, question)
+ #     question_vector = prepare_sentences_vector([question_vector])
+ #     distances,indices = faiss_search(index, question_vector)
+
+ #     mostSimContext = df['Context'][indices[0][0]]
+ #     pattern = r'(?<=\s{10}).*'
+ #     matches = re.search(pattern, mostSimContext, flags=re.DOTALL)
+
+ #     if matches:
+ #         mostSimContext = matches.group(0)
+
+ #     mostSimContext = mostSimContext.strip()
+ #     mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
+
+ #     segments = sent_tokenize(mostSimContext, engine="crfcut")
+
+ #     segment_embeddings = get_embeddings(embedding_model, segments)
+ #     segment_embeddings = prepare_sentences_vector(segment_embeddings)
+ #     segment_index = create_segment_index(segment_embeddings)
+
+ #     _distances,_indices = faiss_search(segment_index, question_vector)
+ #     mostSimSegment = segments[_indices[0][0]]
+
+ #     Answer = model_pipeline(model, tokenizer, question, mostSimSegment)
+
+ #     if len(Answer) <= 2:
+ #         Answer = mostSimSegment
+
+ #     # Find the start and end indices of mostSimSegment within mostSimContext
+ #     start_index = mostSimContext.find(Answer)
+ #     end_index = start_index + len(Answer)
+
+ #     print(f"answer {len(Answer)} => {Answer} || startIndex =>{start_index} || endIndex =>{end_index}")
+ #     print(f"mostSimContext{len(mostSimContext)}=>{mostSimContext}\nsegments{len(segments)}=>{segments}\nmostSimSegment{len(mostSimSegment)}=>{mostSimSegment}")
+
+ #     _time = time.time() - t
+ #     output = {
+ #         "user_question": question,
+ #         "answer": df['Answer'][indices[0][0]],
+ #         "totaltime": round(_time, 3),
+ #         "distance": round(distances[0][0], 4),
+ #         "highlight_start": start_index,
+ #         "highlight_end": end_index
+ #     }
+ #     return output
+
+ # def highlight_text(text, start_index, end_index):
+ #     if start_index < 0:
+ #         start_index = 0
+ #     if end_index > len(text):
+ #         end_index = len(text)
+ #     highlighted_text = ""
+ #     for i, char in enumerate(text):
+ #         if i == start_index:
+ #             highlighted_text += "<mark>"
+ #         highlighted_text += char
+ #         if i == end_index - 1:
+ #             highlighted_text += "</mark>"
+ #     return highlighted_text
+
+ # def chat_interface_before(question, history):
+ #     response = predict(model, tokenizer, embedding_model, df, question, index)
+ #     return response
+
+ # def chat_interface_after(question, history):
+ #     response = predict_test(model, tokenizer, embedding_model, df, question, index)
+ #     highlighted_answer = highlight_text(response["answer"], response["highlight_start"], response["highlight_end"])
+ #     return highlighted_answer
+
+ # examples=[
+ #     'ขอเลขที่บัญชีของบริษัทหน่อย',
+ #     'บริษัทตั้งอยู่ที่ถนนอะไร',
+ #     'ขอช่องทางติดตามข่าวสารทาง Line หน่อย',
+ #     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 ในแต่ละแพลตฟอร์ม',
+ #     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 บน Twitter',
+ #     # 'ช่องทางติดตามข่าวสารของเรา',
+ # ]
+ # demo_before = gr.ChatInterface(fn=chat_interface_before,
+ #                                examples=examples)
+
+ # demo_after = gr.ChatInterface(fn=chat_interface_after,
+ #                               examples=examples)
+
+ # interface = gr.TabbedInterface([demo_before, demo_after], ["Before", "After"])
+
+ # if __name__ == "__main__":
+ #     # Load your model, tokenizer, data, and index here...
+ #     df = load_data()
+ #     model, tokenizer = load_model('wangchanberta-hyp')
+ #     embedding_model = load_embedding_model()
+ #     index = set_index(prepare_sentences_vector(load_embeddings(EMBEDDINGS_PATH)))
+ #     interface.launch()
+
+
 import random
 import gradio as gr
 import time
@@ -17,224 +258,235 @@ from pythainlp.tokenize import sent_tokenize
 DEFAULT_MODEL = 'wangchanberta-hyp'
 DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'

- MODEL_DICT = {
-     'wangchanberta': 'Chananchida/wangchanberta-xet_ref-params',
-     'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
- }
-
- EMBEDDINGS_PATH = 'data/embeddings.pkl'
- DATA_PATH='data/dataset.xlsx'
-
-
- def load_data(path=DATA_PATH):
-     df = pd.read_excel(path, sheet_name='Default')
-     df['Context'] = pd.read_excel(path, sheet_name='mdeberta')['Context']
-     print(len(df))
-     print('Load data done')
-     return df
-
-
- def load_model(model_name=DEFAULT_MODEL):
-     model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
-     print('Load model done')
-     return model, tokenizer
-
- def load_embedding_model(model_name=DEFAULT_SENTENCE_EMBEDDING_MODEL):
-     # if torch.cuda.is_available():
-     #     embedding_model = SentenceTransformer(model_name, device='cuda')
-     # else:
-     embedding_model = SentenceTransformer(model_name)
-     print('Load sentence embedding model done')
-     return embedding_model
-
-
- def set_index(vector):
-     if torch.cuda.is_available():
-         res = faiss.StandardGpuResources()
-         index = faiss.IndexFlatL2(vector.shape[1])
-         gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
-         gpu_index_flat.add(vector)
-         index = gpu_index_flat
-     else:
-         index = faiss.IndexFlatL2(vector.shape[1])
-         index.add(vector)
-     return index
-
-
- def get_embeddings(embedding_model, text_list):
-     return embedding_model.encode(text_list)
-
-
- def prepare_sentences_vector(encoded_list):
-     encoded_list = [i.reshape(1, -1) for i in encoded_list]
-     encoded_list = np.vstack(encoded_list).astype('float32')
-     encoded_list = normalize(encoded_list)
-     return encoded_list
-
-
- def store_embeddings(df, embeddings):
-     with open('embeddings.pkl', "wb") as fOut:
-         pickle.dump({'sentences': df['Question'], 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
-     print('Store embeddings done')
-
-
- def load_embeddings(file_path=EMBEDDINGS_PATH):
-     with open(file_path, "rb") as fIn:
-         stored_data = pickle.load(fIn)
-         stored_sentences = stored_data['sentences']
-         stored_embeddings = stored_data['embeddings']
-     print('Load (questions) embeddings done')
-     return stored_embeddings
-
-
- def model_pipeline(model, tokenizer, question, similar_context):
-     inputs = tokenizer(question, similar_context, return_tensors="pt")
-     with torch.no_grad():
-         outputs = model(**inputs)
-     answer_start_index = outputs.start_logits.argmax()
-     answer_end_index = outputs.end_logits.argmax()
-     predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
-     Answer = tokenizer.decode(predict_answer_tokens)
-     return Answer.replace('<unk>','@')
-
-
- def faiss_search(index, question_vector, k=1):
-     distances, indices = index.search(question_vector, k)
-     return distances,indices
-
- def create_segment_index(vector):
-     segment_index = faiss.IndexFlatL2(vector.shape[1])
-     segment_index.add(vector)
-     return segment_index
-
-
- def predict_faiss(model, tokenizer, embedding_model, df, question, index):
-     t = time.time()
-     question = question.strip()
-     question_vector = get_embeddings(embedding_model, question)
-     question_vector = prepare_sentences_vector([question_vector])
-     distances,indices = faiss_search(index, question_vector)
-     Answers = [df['Answer'][i] for i in indices[0]]
-     _time = time.time() - t
-     output = {
-         "user_question": question,
-         "answer": Answers[0],
-         "totaltime": round(_time, 3),
-         "score": round(distances[0][0], 4)
-     }
-     return output
-
- def predict(model, tokenizer, embedding_model, df, question, index):
-     t = time.time()
-     question = question.strip()
-     question_vector = get_embeddings(embedding_model, question)
-     question_vector = prepare_sentences_vector([question_vector])
-     distances,indices = faiss_search(index, question_vector)
-
-     # Answer = model_pipeline(model, tokenizer, df['Question'][indices[0][0]], df['Context'][indices[0][0]])
-     Answer = model_pipeline(model, tokenizer, question, df['Context'][indices[0][0]])
-     _time = time.time() - t
-     output = {
-         "user_question": question,
-         "answer": Answer,
-         "totaltime": round(_time, 3),
-         "distance": round(distances[0][0], 4)
-     }
-     return Answer
-
- def predict_test(model, tokenizer, embedding_model, df, question, index):  # sent_tokenize pythainlp
-     t = time.time()
-     question = question.strip()
-     question_vector = get_embeddings(embedding_model, question)
-     question_vector = prepare_sentences_vector([question_vector])
-     distances,indices = faiss_search(index, question_vector)
-
-     mostSimContext = df['Context'][indices[0][0]]
-     pattern = r'(?<=\s{10}).*'
-     matches = re.search(pattern, mostSimContext, flags=re.DOTALL)
-
-     if matches:
-         mostSimContext = matches.group(0)
-
-     mostSimContext = mostSimContext.strip()
-     mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
-
-     segments = sent_tokenize(mostSimContext, engine="crfcut")
-
-     segment_embeddings = get_embeddings(embedding_model, segments)
-     segment_embeddings = prepare_sentences_vector(segment_embeddings)
-     segment_index = create_segment_index(segment_embeddings)
-
-     _distances,_indices = faiss_search(segment_index, question_vector)
-     mostSimSegment = segments[_indices[0][0]]
-
-     Answer = model_pipeline(model, tokenizer, question, mostSimSegment)
-
-     if len(Answer) <= 2:
-         Answer = mostSimSegment
-
-     # Find the start and end indices of mostSimSegment within mostSimContext
-     start_index = mostSimContext.find(Answer)
-     end_index = start_index + len(Answer)
-
-     print(f"answer {len(Answer)} => {Answer} || startIndex =>{start_index} || endIndex =>{end_index}")
-     print(f"mostSimContext{len(mostSimContext)}=>{mostSimContext}\nsegments{len(segments)}=>{segments}\nmostSimSegment{len(mostSimSegment)}=>{mostSimSegment}")
-
-     _time = time.time() - t
-     output = {
-         "user_question": question,
-         "answer": df['Answer'][indices[0][0]],
-         "totaltime": round(_time, 3),
-         "distance": round(distances[0][0], 4),
-         "highlight_start": start_index,
-         "highlight_end": end_index
-     }
-     return output
-
- def highlight_text(text, start_index, end_index):
-     if start_index < 0:
-         start_index = 0
-     if end_index > len(text):
-         end_index = len(text)
-     highlighted_text = ""
-     for i, char in enumerate(text):
-         if i == start_index:
-             highlighted_text += "<mark>"
-         highlighted_text += char
-         if i == end_index - 1:
-             highlighted_text += "</mark>"
-     return highlighted_text
-
- def chat_interface_before(question, history):
-     response = predict(model, tokenizer, embedding_model, df, question, index)
-     return response
-
- def chat_interface_after(question, history):
-     response = predict_test(model, tokenizer, embedding_model, df, question, index)
-     highlighted_answer = highlight_text(response["answer"], response["highlight_start"], response["highlight_end"])
-     return highlighted_answer
-
- examples=[
-     'ขอเลขที่บัญชีของบริษัทหน่อย',
-     'บริษัทตั้งอยู่ที่ถนนอะไร',
-     'ขอช่องทางติดตามข่าวสารทาง Line หน่อย',
-     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 ในแต่ละแพลตฟอร์ม',
-     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 บน Twitter',
-     # 'ช่องทางติดตามข่าวสารของเรา',
- ]
- demo_before = gr.ChatInterface(fn=chat_interface_before,
-                                examples=examples)
-
- demo_after = gr.ChatInterface(fn=chat_interface_after,
-                               examples=examples)
-
- interface = gr.TabbedInterface([demo_before, demo_after], ["Before", "After"])
-
+ MODEL_NAME = 'Chananchida/wangchanberta-xet_hyp-params'
+
+
+ EMBEDDINGS_PATH = '/content/embeddings.pkl'
+ DATA_PATH = '/content/dataset.xlsx'
+
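+ # Pipeline: multilingual-e5 embeddings of the stored questions are indexed with FAISS;
+ # a query retrieves the top-k questions/contexts, the best context is re-searched at
+ # sentence level, and WangchanBERTa extracts the answer span.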
+ class ChatBot:
+     SHEET_NAME_MDEBERTA = 'mdeberta'
+     SHEET_NAME_DEFAULT = 'Default'
+     UNKNOWN_ANSWERS = ["กรุณาลงรายละเอียดมากกว่านี้ได้มั้ยคะ", "ขอโทษค่ะลูกค้า ดิฉันไม่ทราบจริง ๆ"]
+
+     def __init__(self, df_path=None, model_path=None, tokenizer_path=None, embedding_model_name=None, embeddingsPath=None):
+         self.df = None
+         self.model = None
+         self.tokenizer = None
+         self.embedding_model = None
+         self.index = None
+         self.k = 5
+         if all(arg is not None for arg in (df_path, model_path, tokenizer_path, embedding_model_name, embeddingsPath)):
+             self.set_df(df_path)
+             self.set_model(model_path)
+             self.set_tokenizer(tokenizer_path)
+             self.set_embedding_model(embedding_model_name)
+             sentences_vector = self.load_embeddings(embeddingsPath)
+             prepared_vector = self.prepare_sentences_vector(sentences_vector)
+             self.set_index(prepared_vector)
+
+     def set_index(self, vector):
+         if torch.cuda.is_available():  # Check if GPU is available
+             res = faiss.StandardGpuResources()
+             index = faiss.IndexFlatL2(vector.shape[1])
+             gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
+             gpu_index_flat.add(vector)
+             self.index = gpu_index_flat
+         else:  # If GPU is not available, use CPU-based Faiss index
+             self.index = faiss.IndexFlatL2(vector.shape[1])
+             self.index.add(vector)
+         return self.index
+
+     def set_df(self, path):
+         self.df = pd.read_excel(path, sheet_name=self.SHEET_NAME_DEFAULT)
+         self.df.rename(columns={'Response': 'Answer'}, inplace=True)
+         self.df['Context'] = pd.read_excel(path, sheet_name=self.SHEET_NAME_MDEBERTA)['Context']
+
+     def set_model(self, model):
+         self.model = AutoModelForQuestionAnswering.from_pretrained(model)
+
+     def set_tokenizer(self, tokenizer):
+         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+
+     def set_embedding_model(self, model):
+         self.embedding_model = SentenceTransformer(model)
+
+     def set_k(self, k_value):
+         self.k = k_value
+
+     def get_df(self):
+         return self.df
+
+     def get_model(self):
+         return self.model
+
+     def get_tokenizer(self):
+         return self.tokenizer
+
+     def get_embedding_model(self):
+         return self.embedding_model
+
+     def get_index(self):
+         return self.index
+
+     def get_k(self):
+         return self.k
+
+     def get_embeddings(self, text_list):
+         return self.embedding_model.encode(text_list)
+
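+     # prepare_sentences_vector() L2-normalizes every embedding, so the L2 distances
+     # produced by IndexFlatL2 rank results identically to cosine similarity.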
+     def prepare_sentences_vector(self, encoded_list):
+         encoded_list = [i.reshape(1, -1) for i in encoded_list]
+         encoded_list = np.vstack(encoded_list).astype('float32')
+         encoded_list = normalize(encoded_list)
+         return encoded_list
+
+     def load_embeddings(self, file_path):
+         with open(file_path, "rb") as fIn:
+             stored_data = pickle.load(fIn)
+             stored_sentences = stored_data['sentences']
+             stored_embeddings = stored_data['embeddings']
+         return stored_embeddings
+
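+     # Extractive QA: encode the (question, context) pair, then take the argmax of the
+     # start/end logits to slice the predicted answer span out of the input ids.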
+     def model_pipeline(self, question, similar_context):
+         inputs = self.tokenizer(question, similar_context, return_tensors="pt")
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+         answer_start_index = outputs.start_logits.argmax()
+         answer_end_index = outputs.end_logits.argmax()
+         predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
+         Answer = self.tokenizer.decode(predict_answer_tokens)
+         return Answer.replace('<unk>', '@')
+
+     def faiss_search(self, index, question_vector):
+         if index is None:
+             raise ValueError("Index has not been initialized.")
+         distances, indices = index.search(question_vector, self.k)
+         similar_questions = [self.df['Question'][indices[0][i]] for i in range(self.k)]
+         similar_contexts = [self.df['Context'][indices[0][i]] for i in range(self.k)]
+         return similar_questions, similar_contexts, distances, indices
+
+     def faiss_segment_search(self, index, question_vector, x=1):
+         if index is None:
+             raise ValueError("Index has not been initialized.")
+         distances, indices = index.search(question_vector, x)
+         return distances, indices
+
+     def create_segment_index(self, vector):
+         segment_index = faiss.IndexFlatL2(vector.shape[1])
+         segment_index.add(vector)
+         return segment_index
+
+     def predict_test(self, question):
+         list_context_for_show = []
+         list_distance_for_show = []
+         list_similar_question = []
+
+         question = question.strip()
+         question_vector = self.get_embeddings([question])
+         question_vector = self.prepare_sentences_vector([question_vector])
+         similar_questions, similar_contexts, distances, indices = self.faiss_search(self.index, question_vector)
+
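+         # Stage two of retrieval: the best document-level hit is re-split into sentences
+         # below and searched again, so the QA model reads only the closest segment.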
+         mostSimContext = similar_contexts[0]
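+         # Strip everything before a run of 10 whitespace characters, if one exists
+         # (apparently leading boilerplate in some sheet cells), then collapse whitespace.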
+         pattern = r'(?<=\s{10}).*'
+         matches = re.search(pattern, mostSimContext, flags=re.DOTALL)
+
+         if matches:
+             mostSimContext = matches.group(0)
+
+         mostSimContext = mostSimContext.strip()
+         mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
+
+         segments = sent_tokenize(mostSimContext, engine="crfcut")
+
+         segment_embeddings = self.get_embeddings(segments)
+         segment_embeddings = self.prepare_sentences_vector(segment_embeddings)
+         segment_index = self.create_segment_index(segment_embeddings)
+
+         _distances, _indices = self.faiss_segment_search(segment_index, question_vector)
+
+         mostSimSegment = segments[_indices[0][0]]
+
+         print(f"_indices => {_indices[0][0]}")
+         answer = self.model_pipeline(question, mostSimSegment)
+
+         if len(answer) <= 2:
+             answer = mostSimSegment
+
+         start_index = mostSimContext.find(answer)
+         end_index = start_index + len(answer)
+
+         print(f"mostSimContext {len(mostSimContext)} =>{mostSimContext}\nsegments {len(segments)} =>{segments}\nmostSimSegment {len(mostSimSegment)} =>{mostSimSegment}")
+         print(f"answer {len(answer)} => {answer} || startIndex =>{start_index} || endIndex =>{end_index}")
+
+         for i in range(min(5, self.k)):
+             index = indices[0][i]
+             similar_question = similar_questions[i]
+             similar_context = similar_contexts[i]
+
+             list_similar_question.append(similar_question)
+             list_context_for_show.append(similar_context)
+             list_distance_for_show.append(str(1 - distances[0][i]))
+
+         distance = list_distance_for_show[0]
+
+         # Fall back to a canned "I don't know" reply when the best match is not similar enough.
+         final_answer = self.df['Answer'][indices[0][0]]
+         if float(distance) < 0.5:
+             final_answer = random.choice(self.UNKNOWN_ANSWERS)
+
+         output = {
+             "user_question": question,
+             "answer": final_answer,
+             "distance": distance,
+             "highlight_start": start_index,
+             "highlight_end": end_index,
+             "list_context": list_context_for_show,
+             "list_distance": list_distance_for_show
+         }
+         return output
+
+     def highlight_text(self, text, start_index, end_index):
+         if start_index < 0:
+             start_index = 0
+         if end_index > len(text):
+             end_index = len(text)
+         highlighted_text = ""
+         for i, char in enumerate(text):
+             if i == start_index:
+                 highlighted_text += "<mark>"
+             highlighted_text += char
+             if i == end_index - 1:
+                 highlighted_text += "</mark>"
+         return highlighted_text
+
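+     def predict(self, question):
+         # Reconstructed from the module-level predict() commented out above:
+         # chat_interface_before() still calls self.predict(), which this class
+         # otherwise lacks. Runs span QA over the full top-1 context, no highlighting.
+         question = question.strip()
+         question_vector = self.get_embeddings([question])
+         question_vector = self.prepare_sentences_vector([question_vector])
+         similar_questions, similar_contexts, distances, indices = self.faiss_search(self.index, question_vector)
+         answer = self.model_pipeline(question, similar_contexts[0])
+         return answer
+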
+     def chat_interface_before(self, question, history):
+         response = self.predict(question)
+         return response
+
+     def chat_interface_after(self, question, history):
+         response = self.predict_test(question)
+         highlighted_answer = self.highlight_text(response["answer"], response["highlight_start"], response["highlight_end"])
+         return highlighted_answer
+
 if __name__ == "__main__":
-     # Load your model, tokenizer, data, and index here...
-     df = load_data()
-     model, tokenizer = load_model('wangchanberta-hyp')
-     embedding_model = load_embedding_model()
-     index = set_index(prepare_sentences_vector(load_embeddings(EMBEDDINGS_PATH)))
-     interface.launch()
+     bot = ChatBot(df_path=DATA_PATH, model_path=MODEL_NAME, tokenizer_path=MODEL_NAME, embedding_model_name=DEFAULT_SENTENCE_EMBEDDING_MODEL, embeddingsPath=EMBEDDINGS_PATH)
+     # bot.load_data()
+     # bot.load_model()
+     # bot.load_embedding_model()
+     # embeddings = bot.load_embeddings(EMBEDDINGS_PATH)
+     # bot.set_index(bot.prepare_sentences_vector(embeddings))
+
+     examples = [
+         'ขอเลขที่บัญชีของบริษัทหน่อย',
+         'บริษัทตั้งอยู่ที่ถนนอะไร',
+         'ขอช่องทางติดตามข่าวสารทาง Line หน่อย',
+         'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 ในแต่ละแพลตฟอร์ม',
+         'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 บน Twitter',
+         # 'ช่องทางติดตามข่าวสารของเรา',
+     ]
+
+     demo_before = gr.ChatInterface(fn=bot.chat_interface_before, examples=examples)
+     demo_after = gr.ChatInterface(fn=bot.chat_interface_after, examples=examples)
+
+     interface = gr.TabbedInterface([demo_before, demo_after], ["Before", "After"])
+     interface.launch()