Sirinoot committed
Commit 8164131
1 parent: f09c3e1

Update app.py

Files changed (1): app.py +210 -462
app.py CHANGED
@@ -1,245 +1,4 @@
  # @title web interface demo
- # import random
- # import gradio as gr
- # import time
- # import numpy as np
- # import pandas as pd
- # import torch
- # import faiss
- # from sklearn.preprocessing import normalize
- # from transformers import AutoTokenizer, AutoModelForQuestionAnswering
- # from sentence_transformers import SentenceTransformer, util
- # from pythainlp import Tokenizer
- # import pickle
- # import re
- # from pythainlp.tokenize import sent_tokenize
-
- # DEFAULT_MODEL = 'wangchanberta-hyp'
- # DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
-
- # MODEL_DICT = {
- #     'wangchanberta': 'Chananchida/wangchanberta-xet_ref-params',
- #     'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
- # }
-
- # EMBEDDINGS_PATH = 'data/embeddings.pkl'
- # DATA_PATH='data/dataset.xlsx'
-
-
- # def load_data(path=DATA_PATH):
- #     df = pd.read_excel(path, sheet_name='Default')
- #     df['Context'] = pd.read_excel(path, sheet_name='mdeberta')['Context']
- #     print(len(df))
- #     print('Load data done')
- #     return df
-
-
- # def load_model(model_name=DEFAULT_MODEL):
- #     model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
- #     tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
- #     print('Load model done')
- #     return model, tokenizer
-
- # def load_embedding_model(model_name=DEFAULT_SENTENCE_EMBEDDING_MODEL):
- #     # if torch.cuda.is_available():
- #     #     embedding_model = SentenceTransformer(model_name, device='cuda')
- #     # else:
- #     embedding_model = SentenceTransformer(model_name)
- #     print('Load sentence embedding model done')
- #     return embedding_model
-
-
- # def set_index(vector):
- #     if torch.cuda.is_available():
- #         res = faiss.StandardGpuResources()
- #         index = faiss.IndexFlatL2(vector.shape[1])
- #         gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
- #         gpu_index_flat.add(vector)
- #         index = gpu_index_flat
- #     else:
- #         index = faiss.IndexFlatL2(vector.shape[1])
- #         index.add(vector)
- #     return index
-
-
- # def get_embeddings(embedding_model, text_list):
- #     return embedding_model.encode(text_list)
-
-
- # def prepare_sentences_vector(encoded_list):
- #     encoded_list = [i.reshape(1, -1) for i in encoded_list]
- #     encoded_list = np.vstack(encoded_list).astype('float32')
- #     encoded_list = normalize(encoded_list)
- #     return encoded_list
-
-
- # def store_embeddings(df, embeddings):
- #     with open('embeddings.pkl', "wb") as fOut:
- #         pickle.dump({'sentences': df['Question'], 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
- #     print('Store embeddings done')
-
-
- # def load_embeddings(file_path=EMBEDDINGS_PATH):
- #     with open(file_path, "rb") as fIn:
- #         stored_data = pickle.load(fIn)
- #         stored_sentences = stored_data['sentences']
- #         stored_embeddings = stored_data['embeddings']
- #     print('Load (questions) embeddings done')
- #     return stored_embeddings
-
-
- # def model_pipeline(model, tokenizer, question, similar_context):
- #     inputs = tokenizer(question, similar_context, return_tensors="pt")
- #     with torch.no_grad():
- #         outputs = model(**inputs)
- #     answer_start_index = outputs.start_logits.argmax()
- #     answer_end_index = outputs.end_logits.argmax()
- #     predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
- #     Answer = tokenizer.decode(predict_answer_tokens)
- #     return Answer.replace('<unk>','@')
-
-
- # def faiss_search(index, question_vector, k=1):
- #     distances, indices = index.search(question_vector, k)
- #     return distances,indices
-
- # def create_segment_index(vector):
- #     segment_index = faiss.IndexFlatL2(vector.shape[1])
- #     segment_index.add(vector)
- #     return segment_index
-
-
- # def predict_faiss(model, tokenizer, embedding_model, df, question, index):
- #     t = time.time()
- #     question = question.strip()
- #     question_vector = get_embeddings(embedding_model, question)
- #     question_vector = prepare_sentences_vector([question_vector])
- #     distances,indices = faiss_search(index, question_vector)
- #     Answers = [df['Answer'][i] for i in indices[0]]
- #     _time = time.time() - t
- #     output = {
- #         "user_question": question,
- #         "answer": Answers[0],
- #         "totaltime": round(_time, 3),
- #         "score": round(distances[0][0], 4)
- #     }
- #     return output
-
- # def predict(model, tokenizer, embedding_model, df, question, index):
- #     t = time.time()
- #     question = question.strip()
- #     question_vector = get_embeddings(embedding_model, question)
- #     question_vector = prepare_sentences_vector([question_vector])
- #     distances,indices = faiss_search(index, question_vector)
-
- #     # Answer = model_pipeline(model, tokenizer, df['Question'][indices[0][0]], df['Context'][indices[0][0]])
- #     Answer = model_pipeline(model, tokenizer, question, df['Context'][indices[0][0]])
- #     _time = time.time() - t
- #     output = {
- #         "user_question": question,
- #         "answer": Answer,
- #         "totaltime": round(_time, 3),
- #         "distance": round(distances[0][0], 4)
- #     }
- #     return Answer
-
- # def predict_test(model, tokenizer, embedding_model, df, question, index): # sent_tokenize pythainlp
- #     t = time.time()
- #     question = question.strip()
- #     question_vector = get_embeddings(embedding_model, question)
- #     question_vector = prepare_sentences_vector([question_vector])
- #     distances,indices = faiss_search(index, question_vector)
-
- #     mostSimContext = df['Context'][indices[0][0]]
- #     pattern = r'(?<=\s{10}).*'
- #     matches = re.search(pattern, mostSimContext, flags=re.DOTALL)
-
- #     if matches:
- #         mostSimContext = matches.group(0)
-
- #     mostSimContext = mostSimContext.strip()
- #     mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
-
- #     segments = sent_tokenize(mostSimContext, engine="crfcut")
-
- #     segment_embeddings = get_embeddings(embedding_model, segments)
- #     segment_embeddings = prepare_sentences_vector(segment_embeddings)
- #     segment_index = create_segment_index(segment_embeddings)
-
- #     _distances,_indices = faiss_search(segment_index, question_vector)
- #     mostSimSegment = segments[_indices[0][0]]
-
- #     Answer = model_pipeline(model, tokenizer,question,mostSimSegment)
-
- #     if len(Answer) <= 2:
- #         Answer = mostSimSegment
-
- #     # Find the start and end indices of mostSimSegment within mostSimContext
- #     start_index = mostSimContext.find(Answer)
- #     end_index = start_index + len(Answer)
-
- #     print(f"answer {len(Answer)} => {Answer} || startIndex =>{start_index} || endIndex =>{end_index}")
- #     print(f"mostSimContext{len(mostSimContext)}=>{mostSimContext}\nsegments{len(segments)}=>{segments}\nmostSimSegment{len(mostSimSegment)}=>{mostSimSegment}")
-
- #     _time = time.time() - t
- #     output = {
- #         "user_question": question,
- #         "answer": df['Answer'][indices[0][0]],
- #         "totaltime": round(_time, 3),
- #         "distance": round(distances[0][0], 4),
- #         "highlight_start": start_index,
- #         "highlight_end": end_index
- #     }
- #     return output
-
- # def highlight_text(text, start_index, end_index):
- #     if start_index < 0:
- #         start_index = 0
- #     if end_index > len(text):
- #         end_index = len(text)
- #     highlighted_text = ""
- #     for i, char in enumerate(text):
- #         if i == start_index:
- #             highlighted_text += "<mark>"
- #         highlighted_text += char
- #         if i == end_index - 1:
- #             highlighted_text += "</mark>"
- #     return highlighted_text
-
- # def chat_interface_before(question, history):
- #     response = predict(model, tokenizer, embedding_model, df, question, index)
- #     return response
-
- # def chat_interface_after(question, history):
- #     response = predict_test(model, tokenizer, embedding_model, df, question, index)
- #     highlighted_answer = highlight_text(response["answer"], response["highlight_start"], response["highlight_end"])
- #     return highlighted_answer
-
- # examples=[
- #     'ขอเลขที่บัญชีของบริษัทหน่อย',
- #     'บริษัทตั้งอยู่ที่ถนนอะไร',
- #     'ขอช่องทางติดตามข่าวสารทาง Line หน่อย',
- #     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 ในแต่ละแพลตฟอร์ม',
- #     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 บน Twitter',
- #     # 'ช่องทางติดตามข่าวสารของเรา',
- # ]
- # demo_before = gr.ChatInterface(fn=chat_interface_before,
- #                                examples=examples)
-
- # demo_after = gr.ChatInterface(fn=chat_interface_after,
- #                               examples=examples)
-
- # interface = gr.TabbedInterface([demo_before, demo_after], ["Before", "After"])
-
- # if __name__ == "__main__":
- #     # Load your model, tokenizer, data, and index here...
- #     df = load_data()
- #     model, tokenizer = load_model('wangchanberta-hyp')
- #     embedding_model = load_embedding_model()
- #     index = set_index(prepare_sentences_vector(load_embeddings(EMBEDDINGS_PATH)))
- #     interface.launch()
-
-
  import random
  import gradio as gr
  import time
@@ -258,235 +17,224 @@ from pythainlp.tokenize import sent_tokenize
  DEFAULT_MODEL = 'wangchanberta-hyp'
  DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
 
- MODEL_DICT = 'Chananchida/wangchanberta-xet_hyp-params'
-
+ MODEL_DICT = {
+     'wangchanberta': 'Chananchida/wangchanberta-xet_ref-params',
+     'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
+ }
 
  EMBEDDINGS_PATH = 'data/embeddings.pkl'
  DATA_PATH='data/dataset.xlsx'
 
- class ChatBot:
-     SHEET_NAME_MDEBERTA = 'mdeberta'
-     SHEET_NAME_DEFAULT = 'Default'
-     UNKNOWN_ANSWERS = ["กรุณาลงรายระเอียดมากกว่านี้ได้มั้ยคะ", "ขอโทษค่ะลูกค้า ดิฉันไม่ทราบจริง ๆ"]
-
-     def __init__(self, df_path=None, model_path=None, tokenizer_path=None, embedding_model_name=None, embeddingsPath=None):
-         self.df = None
-         self.model = None
-         self.tokenizer = None
-         self.embedding_model = None
-         self.index = None
-         self.k = 5
-         if all(arg is not None for arg in (df_path, model_path, tokenizer_path, embedding_model_name, embeddingsPath)):
-             self.set_df(df_path)
-             self.set_model(model_path)
-             self.set_tokenizer(tokenizer_path)
-             self.set_embedding_model(embedding_model_name)
-             sentences_vector = self.load_embeddings(embeddingsPath)
-             repared_vector = self.prepare_sentences_vector(sentences_vector)
-             self.set_index(repared_vector)
-
-     def set_index(self, vector):
-         if torch.cuda.is_available():  # Check if GPU is available
-             res = faiss.StandardGpuResources()
-             index = faiss.IndexFlatL2(vector.shape[1])
-             gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
-             gpu_index_flat.add(vector)
-             self.index = gpu_index_flat
-         else:  # If GPU is not available, use CPU-based Faiss index
-             self.index = faiss.IndexFlatL2(vector.shape[1])
-             self.index.add(vector)
-         return self.index
-
-     def set_df(self, path):
-         self.df = pd.read_excel(path, sheet_name=self.SHEET_NAME_DEFAULT)
-         self.df.rename(columns={'Response': 'Answer'}, inplace=True)
-         self.df['Context'] = pd.read_excel(path, self.SHEET_NAME_MDEBERTA)['Context']
-
-     def set_model(self, model):
-         self.model = AutoModelForQuestionAnswering.from_pretrained(model)
-
-     def set_tokenizer(self, tokenizer):
-         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
-
-     def set_embedding_model(self, model):
-         self.embedding_model = SentenceTransformer(model)
-
-     def set_k(self, k_value):
-         self.k = k_value
-
-     def get_df(self):
-         return self.df
-
-     def get_model(self):
-         return self.model
-
-     def get_tokenizer(self):
-         return self.tokenizer
-
-     def get_embedding_model(self):
-         return self.embedding_model
-
-     def get_index(self):
-         return self.index
-
-     def get_k(self):
-         return self.k
-
-     def get_embeddings(self, text_list):
-         return self.embedding_model.encode(text_list)
-
-     def prepare_sentences_vector(self, encoded_list):
-         encoded_list = [i.reshape(1, -1) for i in encoded_list]
-         encoded_list = np.vstack(encoded_list).astype('float32')
-         encoded_list = normalize(encoded_list)
-         return encoded_list
-
-     def load_embeddings(self, file_path):
-         with open(file_path, "rb") as fIn:
-             stored_data = pickle.load(fIn)
-             stored_sentences = stored_data['sentences']
-             stored_embeddings = stored_data['embeddings']
-         return stored_embeddings
-
-     def model_pipeline(self, question, similar_context):
-         inputs = self.tokenizer(question, similar_context, return_tensors="pt")
-         with torch.no_grad():
-             outputs = self.model(**inputs)
-         answer_start_index = outputs.start_logits.argmax()
-         answer_end_index = outputs.end_logits.argmax()
-         predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
-         Answer = self.tokenizer.decode(predict_answer_tokens)
-         return Answer.replace('<unk>','@')
-
-     def faiss_search(self, index, question_vector):
-         if index is None:
-             raise ValueError("Index has not been initialized.")
-         distances, indices = index.search(question_vector, self.k)
-         similar_questions = [self.df['Question'][indices[0][i]] for i in range(self.k)]
-         similar_contexts = [self.df['Context'][indices[0][i]] for i in range(self.k)]
-         return similar_questions, similar_contexts, distances, indices
-
-     def faiss_segment_search(self, index, question_vector, x=1):
-         if index is None:
-             raise ValueError("Index has not been initialized.")
-         distances, indices = index.search(question_vector, x)
-         return distances, indices
-
-     def create_segment_index(self, vector):
-         segment_index = faiss.IndexFlatL2(vector.shape[1])
-         segment_index.add(vector)
-         return segment_index
-
-     def predict_test(self, question):
-         list_context_for_show = []
-         list_distance_for_show = []
-         list_similar_question = []
-
-         question = question.strip()
-         question_vector = self.get_embeddings([question])
-         question_vector = self.prepare_sentences_vector([question_vector])
-         similar_questions, similar_contexts, distances, indices = self.faiss_search(self.index, question_vector)
-
-         mostSimContext = similar_contexts[0]
-         pattern = r'(?<=\s{10}).*'
-         matches = re.search(pattern, mostSimContext, flags=re.DOTALL)
-
-         if matches:
-             mostSimContext = matches.group(0)
-
-         mostSimContext = mostSimContext.strip()
-         mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
-
-         segments = sent_tokenize(mostSimContext, engine="crfcut")
-
-         segment_embeddings = self.get_embeddings(segments)
-         segment_embeddings = self.prepare_sentences_vector(segment_embeddings)
-         segment_index = self.create_segment_index(segment_embeddings)
-
-         _distances, _indices = self.faiss_segment_search(segment_index, question_vector)
-         mostSimSegment = segments[_indices[0][0]]
-
-         print(f"_indices => {_indices[0][0]}")
-         answer = self.model_pipeline(question, mostSimSegment)
-
-         if len(answer) <= 2:
-             answer = mostSimSegment
-
-         start_index = mostSimContext.find(answer)
-         end_index = start_index + len(answer)
-
-         print(f"mostSimContext {len(mostSimContext)} =>{mostSimContext}\nsegments {len(segments)} =>{segments}\nmostSimSegment {len(mostSimSegment)} =>{mostSimSegment}")
-         print(f"answer {len(answer)} => {answer} || startIndex =>{start_index} || endIndex =>{end_index}")
-
-         for i in range(min(5, self.k)):
-             index = indices[0][i]
-             similar_question = similar_questions[i]
-             similar_context = similar_contexts[i]
-
-             list_similar_question.append(similar_question)
-             list_context_for_show.append(similar_context)
-             list_distance_for_show.append(str(1 - distances[0][i]))
-
-         distance = list_distance_for_show[0]
-
-         if float(distance) < 0.5:
-             answer = random.choice(self.UNKNOWN_ANSWERS)
-
-         output = {
-             "user_question": question,
-             "answer": self.df['Answer'][indices[0][0]],
-             "distance": distance,
-             "highlight_start": start_index,
-             "highlight_end": end_index,
-             "list_context": list_context_for_show,
-             "list_distance": list_distance_for_show
-         }
-         return output
-
-     def highlight_text(self, text, start_index, end_index):
-         if start_index < 0:
-             start_index = 0
-         if end_index > len(text):
-             end_index = len(text)
-         highlighted_text = ""
-         for i, char in enumerate(text):
-             if i == start_index:
-                 highlighted_text += "<mark>"
-             highlighted_text += char
-             if i == end_index - 1:
-                 highlighted_text += "</mark>"
-         return highlighted_text
-
-     def chat_interface_before(self, question, history):
-         response = self.predict(question)
-         return response
-
-     def chat_interface_after(self, question, history):
-         response = self.predict_test(question)
-         highlighted_answer = self.highlight_text(response["answer"], response["highlight_start"], response["highlight_end"])
-         return highlighted_answer
-
+
+ def load_data(path=DATA_PATH):
+     df = pd.read_excel(path, sheet_name='Default')
+     df['Context'] = pd.read_excel(path, sheet_name='mdeberta')['Context']
+     print(len(df))
+     print('Load data done')
+     return df
+
+
+ def load_model(model_name=DEFAULT_MODEL):
+     model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
+     print('Load model done')
+     return model, tokenizer
+
+ def load_embedding_model(model_name=DEFAULT_SENTENCE_EMBEDDING_MODEL):
+     # if torch.cuda.is_available():
+     #     embedding_model = SentenceTransformer(model_name, device='cuda')
+     # else:
+     embedding_model = SentenceTransformer(model_name)
+     print('Load sentence embedding model done')
+     return embedding_model
+
+
+ def set_index(vector):
+     if torch.cuda.is_available():
+         res = faiss.StandardGpuResources()
+         index = faiss.IndexFlatL2(vector.shape[1])
+         gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
+         gpu_index_flat.add(vector)
+         index = gpu_index_flat
+     else:
+         index = faiss.IndexFlatL2(vector.shape[1])
+         index.add(vector)
+     return index
+
+
+ def get_embeddings(embedding_model, text_list):
+     return embedding_model.encode(text_list)
+
+
+ def prepare_sentences_vector(encoded_list):
+     encoded_list = [i.reshape(1, -1) for i in encoded_list]
+     encoded_list = np.vstack(encoded_list).astype('float32')
+     encoded_list = normalize(encoded_list)
+     return encoded_list
+
+
+ def store_embeddings(df, embeddings):
+     with open('embeddings.pkl', "wb") as fOut:
+         pickle.dump({'sentences': df['Question'], 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
+     print('Store embeddings done')
+
+
+ def load_embeddings(file_path=EMBEDDINGS_PATH):
+     with open(file_path, "rb") as fIn:
+         stored_data = pickle.load(fIn)
+         stored_sentences = stored_data['sentences']
+         stored_embeddings = stored_data['embeddings']
+     print('Load (questions) embeddings done')
+     return stored_embeddings
+
+
+ def model_pipeline(model, tokenizer, question, similar_context):
+     inputs = tokenizer(question, similar_context, return_tensors="pt")
+     with torch.no_grad():
+         outputs = model(**inputs)
+     answer_start_index = outputs.start_logits.argmax()
+     answer_end_index = outputs.end_logits.argmax()
+     predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
+     Answer = tokenizer.decode(predict_answer_tokens)
+     return Answer.replace('<unk>','@')
+
+
+ def faiss_search(index, question_vector, k=1):
+     distances, indices = index.search(question_vector, k)
+     return distances,indices
+
+ def create_segment_index(vector):
+     segment_index = faiss.IndexFlatL2(vector.shape[1])
+     segment_index.add(vector)
+     return segment_index
+
+
+ def predict_faiss(model, tokenizer, embedding_model, df, question, index):
+     t = time.time()
+     question = question.strip()
+     question_vector = get_embeddings(embedding_model, question)
+     question_vector = prepare_sentences_vector([question_vector])
+     distances,indices = faiss_search(index, question_vector)
+     Answers = [df['Answer'][i] for i in indices[0]]
+     _time = time.time() - t
+     output = {
+         "user_question": question,
+         "answer": Answers[0],
+         "totaltime": round(_time, 3),
+         "score": round(distances[0][0], 4)
+     }
+     return output
+
+ def predict(model, tokenizer, embedding_model, df, question, index):
+     t = time.time()
+     question = question.strip()
+     question_vector = get_embeddings(embedding_model, question)
+     question_vector = prepare_sentences_vector([question_vector])
+     distances,indices = faiss_search(index, question_vector)
+
+     # Answer = model_pipeline(model, tokenizer, df['Question'][indices[0][0]], df['Context'][indices[0][0]])
+     Answer = model_pipeline(model, tokenizer, question, df['Context'][indices[0][0]])
+     _time = time.time() - t
+     output = {
+         "user_question": question,
+         "answer": Answer,
+         "totaltime": round(_time, 3),
+         "distance": round(distances[0][0], 4)
+     }
+     return Answer
+
+ def predict_test(model, tokenizer, embedding_model, df, question, index): # sent_tokenize pythainlp
+     t = time.time()
+     question = question.strip()
+     question_vector = get_embeddings(embedding_model, question)
+     question_vector = prepare_sentences_vector([question_vector])
+     distances,indices = faiss_search(index, question_vector)
+
+     mostSimContext = df['Context'][indices[0][0]]
+     pattern = r'(?<=\s{10}).*'
+     matches = re.search(pattern, mostSimContext, flags=re.DOTALL)
+
+     if matches:
+         mostSimContext = matches.group(0)
+
+     mostSimContext = mostSimContext.strip()
+     mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
+
+     segments = sent_tokenize(mostSimContext, engine="crfcut")
+
+     segment_embeddings = get_embeddings(embedding_model, segments)
+     segment_embeddings = prepare_sentences_vector(segment_embeddings)
+     segment_index = create_segment_index(segment_embeddings)
+
+     _distances,_indices = faiss_search(segment_index, question_vector)
+     mostSimSegment = segments[_indices[0][0]]
+
+     Answer = model_pipeline(model, tokenizer,question,mostSimSegment)
+
+     if len(Answer) <= 2:
+         Answer = mostSimSegment
+
+     # Find the start and end indices of mostSimSegment within mostSimContext
+     start_index = mostSimContext.find(Answer)
+     end_index = start_index + len(Answer)
+
+     print(f"answer {len(Answer)} => {Answer} || startIndex =>{start_index} || endIndex =>{end_index}")
+     print(f"mostSimContext{len(mostSimContext)}=>{mostSimContext}\nsegments{len(segments)}=>{segments}\nmostSimSegment{len(mostSimSegment)}=>{mostSimSegment}")
+
+     _time = time.time() - t
+     output = {
+         "user_question": question,
+         "answer": df['Answer'][indices[0][0]],
+         "totaltime": round(_time, 3),
+         "distance": round(distances[0][0], 4),
+         "highlight_start": start_index,
+         "highlight_end": end_index
+     }
+     return output
+
+ def highlight_text(text, start_index, end_index):
+     if start_index < 0:
+         start_index = 0
+     if end_index > len(text):
+         end_index = len(text)
+     highlighted_text = ""
+     for i, char in enumerate(text):
+         if i == start_index:
+             highlighted_text += "<mark>"
+         highlighted_text += char
+         if i == end_index - 1:
+             highlighted_text += "</mark>"
+     return highlighted_text
+
+ def chat_interface_before(question, history):
+     response = predict(model, tokenizer, embedding_model, df, question, index)
+     return response
+
+ def chat_interface_after(question, history):
+     response = predict_test(model, tokenizer, embedding_model, df, question, index)
+     highlighted_answer = highlight_text(response["answer"], response["highlight_start"], response["highlight_end"])
+     return highlighted_answer
+
+ examples=[
+     'ขอเลขที่บัญชีของบริษัทหน่อย',
+     'บริษัทตั้งอยู่ที่ถนนอะไร',
+     'ขอช่องทางติดตามข่าวสารทาง Line หน่อย',
+     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 ในแต่ละแพลตฟอร์ม',
+     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 บน Twitter',
+     # 'ช่องทางติดตามข่าวสารของเรา',
+ ]
+ demo_before = gr.ChatInterface(fn=chat_interface_before,
+                                examples=examples)
+
+ demo_after = gr.ChatInterface(fn=chat_interface_after,
+                               examples=examples)
+
+ interface = gr.TabbedInterface([demo_before, demo_after], ["Before", "After"])
+
  if __name__ == "__main__":
-     bot = ChatBot(df_path=DATA_PATH, model_path=MODEL_DICT, tokenizer_path=MODEL_DICT, embedding_model_name=DEFAULT_SENTENCE_EMBEDDING_MODEL, embeddingsPath=EMBEDDINGS_PATH)
-     # bot.load_data()
-     # bot.load_model()
-     # bot.load_embedding_model()
-     # embeddings = bot.load_embeddings(EMBEDDINGS_PATH)
-     # bot.set_index(bot.prepare_sentences_vector(embeddings))
-
-     examples = [
-         'ขอเลขที่บัญชีของบริษัทหน่อย',
-         'บริษัทตั้งอยู่ที่ถนนอะไร',
-         'ขอช่องทางติดตามข่าวสารทาง Line หน่อย',
-         'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 ในแต่ละแพลตฟอร์ม',
-         'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 บน Twitter',
-         # 'ช่องทางติดตามข่าวสารของเรา',
-     ]
-
-     demo_before = gr.ChatInterface(fn=bot.chat_interface_before, examples=examples)
-     demo_after = gr.ChatInterface(fn=bot.chat_interface_after, examples=examples)
-
-     interface = gr.TabbedInterface([demo_before, demo_after], ["Before", "After"])
-     interface.launch()
+     # Load your model, tokenizer, data, and index here...
+     df = load_data()
+     model, tokenizer = load_model('wangchanberta-hyp')
+     embedding_model = load_embedding_model()
+     index = set_index(prepare_sentences_vector(load_embeddings(EMBEDDINGS_PATH)))
+     interface.launch()
 
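For reference: both the removed ChatBot class and the restored module-level functions share the same retrieval scheme. Sentence embeddings are L2-normalized (prepare_sentences_vector) and searched with an exact faiss.IndexFlatL2 index; the removed ChatBot.predict_test scored each hit as str(1 - distances[0][i]) and fell back to UNKNOWN_ANSWERS below 0.5. A minimal sketch of that scheme, assuming faiss-cpu, numpy, and scikit-learn are installed (the random 100x768 matrix is illustrative only and stands in for real intfloat/multilingual-e5-base embeddings):

    import numpy as np
    import faiss
    from sklearn.preprocessing import normalize

    rng = np.random.default_rng(0)
    # Stand-in corpus embeddings, L2-normalized to unit length as in prepare_sentences_vector()
    corpus = normalize(rng.normal(size=(100, 768)).astype('float32'))
    index = faiss.IndexFlatL2(corpus.shape[1])  # exact L2 search, as in set_index()
    index.add(corpus)

    query = normalize(rng.normal(size=(1, 768)).astype('float32'))
    distances, indices = index.search(query, 5)  # IndexFlatL2 returns squared L2 distances
    # On unit vectors, ||q - x||^2 = 2 - 2*cos(q, x), so the exact cosine similarity is:
    cosine = 1.0 - distances / 2.0
    print(indices[0], cosine[0])

Because FAISS returns squared L2 distances, the app's 1 - d score on unit vectors equals 2*cos - 1 rather than the cosine itself; it is monotonic in the cosine, so the ranking and the 0.5 cut-off still order results consistently.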
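The predict_test path kept by this commit also does a second, per-query search: the best-matching context is split into Thai sentences with PyThaiNLP's crfcut engine, the sentences get their own throwaway FAISS index, and the sentence nearest the question vector becomes the passage the QA model reads. A condensed sketch of that logic, not the commit's code (it assumes pythainlp, sentence-transformers, faiss-cpu, and scikit-learn are installed, and reuses the embedding model name from above; best_segment is a hypothetical helper):

    import faiss
    from pythainlp.tokenize import sent_tokenize
    from sentence_transformers import SentenceTransformer
    from sklearn.preprocessing import normalize

    embedder = SentenceTransformer('intfloat/multilingual-e5-base')

    def best_segment(question: str, context: str) -> str:
        # Stage 2 of the lookup: sentence-segment the retrieved context (Thai-aware),
        # index the segments, and return the one closest to the question vector.
        segments = sent_tokenize(context, engine='crfcut')
        seg_vecs = normalize(embedder.encode(segments).astype('float32'))
        q_vec = normalize(embedder.encode([question]).astype('float32'))
        seg_index = faiss.IndexFlatL2(seg_vecs.shape[1])  # throwaway per-query index
        seg_index.add(seg_vecs)
        _, idx = seg_index.search(q_vec, 1)  # nearest sentence to the question
        return segments[idx[0][0]]

In app.py the selected segment (mostSimSegment) is what model_pipeline() reads for span extraction, with a fallback to the whole segment when the extracted answer is at most two characters long.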