Update app.py
app.py CHANGED
@@ -1,245 +1,4 @@
 # @title web interface demo
-# import random
-# import gradio as gr
-# import time
-# import numpy as np
-# import pandas as pd
-# import torch
-# import faiss
-# from sklearn.preprocessing import normalize
-# from transformers import AutoTokenizer, AutoModelForQuestionAnswering
-# from sentence_transformers import SentenceTransformer, util
-# from pythainlp import Tokenizer
-# import pickle
-# import re
-# from pythainlp.tokenize import sent_tokenize
-
-# DEFAULT_MODEL = 'wangchanberta-hyp'
-# DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
-
-# MODEL_DICT = {
-#     'wangchanberta': 'Chananchida/wangchanberta-xet_ref-params',
-#     'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
-# }
-
-# EMBEDDINGS_PATH = 'data/embeddings.pkl'
-# DATA_PATH='data/dataset.xlsx'
-
-
-# def load_data(path=DATA_PATH):
-#     df = pd.read_excel(path, sheet_name='Default')
-#     df['Context'] = pd.read_excel(path, sheet_name='mdeberta')['Context']
-#     print(len(df))
-#     print('Load data done')
-#     return df
-
-
-# def load_model(model_name=DEFAULT_MODEL):
-#     model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
-#     tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
-#     print('Load model done')
-#     return model, tokenizer
-
-# def load_embedding_model(model_name=DEFAULT_SENTENCE_EMBEDDING_MODEL):
-#     # if torch.cuda.is_available():
-#     #     embedding_model = SentenceTransformer(model_name, device='cuda')
-#     # else:
-#     embedding_model = SentenceTransformer(model_name)
-#     print('Load sentence embedding model done')
-#     return embedding_model
-
-
-# def set_index(vector):
-#     if torch.cuda.is_available():
-#         res = faiss.StandardGpuResources()
-#         index = faiss.IndexFlatL2(vector.shape[1])
-#         gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
-#         gpu_index_flat.add(vector)
-#         index = gpu_index_flat
-#     else:
-#         index = faiss.IndexFlatL2(vector.shape[1])
-#         index.add(vector)
-#     return index
-
-
-# def get_embeddings(embedding_model, text_list):
-#     return embedding_model.encode(text_list)
-
-
-# def prepare_sentences_vector(encoded_list):
-#     encoded_list = [i.reshape(1, -1) for i in encoded_list]
-#     encoded_list = np.vstack(encoded_list).astype('float32')
-#     encoded_list = normalize(encoded_list)
-#     return encoded_list
-
-
-# def store_embeddings(df, embeddings):
-#     with open('embeddings.pkl', "wb") as fOut:
-#         pickle.dump({'sentences': df['Question'], 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
-#     print('Store embeddings done')
-
-
-# def load_embeddings(file_path=EMBEDDINGS_PATH):
-#     with open(file_path, "rb") as fIn:
-#         stored_data = pickle.load(fIn)
-#         stored_sentences = stored_data['sentences']
-#         stored_embeddings = stored_data['embeddings']
-#     print('Load (questions) embeddings done')
-#     return stored_embeddings
-
-
-# def model_pipeline(model, tokenizer, question, similar_context):
-#     inputs = tokenizer(question, similar_context, return_tensors="pt")
-#     with torch.no_grad():
-#         outputs = model(**inputs)
-#     answer_start_index = outputs.start_logits.argmax()
-#     answer_end_index = outputs.end_logits.argmax()
-#     predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
-#     Answer = tokenizer.decode(predict_answer_tokens)
-#     return Answer.replace('<unk>','@')
-
-
-# def faiss_search(index, question_vector, k=1):
-#     distances, indices = index.search(question_vector, k)
-#     return distances,indices
-
-# def create_segment_index(vector):
-#     segment_index = faiss.IndexFlatL2(vector.shape[1])
-#     segment_index.add(vector)
-#     return segment_index
-
-
-# def predict_faiss(model, tokenizer, embedding_model, df, question, index):
-#     t = time.time()
-#     question = question.strip()
-#     question_vector = get_embeddings(embedding_model, question)
-#     question_vector = prepare_sentences_vector([question_vector])
-#     distances,indices = faiss_search(index, question_vector)
-#     Answers = [df['Answer'][i] for i in indices[0]]
-#     _time = time.time() - t
-#     output = {
-#         "user_question": question,
-#         "answer": Answers[0],
-#         "totaltime": round(_time, 3),
-#         "score": round(distances[0][0], 4)
-#     }
-#     return output
-
-# def predict(model, tokenizer, embedding_model, df, question, index):
-#     t = time.time()
-#     question = question.strip()
-#     question_vector = get_embeddings(embedding_model, question)
-#     question_vector = prepare_sentences_vector([question_vector])
-#     distances,indices = faiss_search(index, question_vector)
-
-#     # Answer = model_pipeline(model, tokenizer, df['Question'][indices[0][0]], df['Context'][indices[0][0]])
-#     Answer = model_pipeline(model, tokenizer, question, df['Context'][indices[0][0]])
-#     _time = time.time() - t
-#     output = {
-#         "user_question": question,
-#         "answer": Answer,
-#         "totaltime": round(_time, 3),
-#         "distance": round(distances[0][0], 4)
-#     }
-#     return Answer
-
-# def predict_test(model, tokenizer, embedding_model, df, question, index): # sent_tokenize pythainlp
-#     t = time.time()
-#     question = question.strip()
-#     question_vector = get_embeddings(embedding_model, question)
-#     question_vector = prepare_sentences_vector([question_vector])
-#     distances,indices = faiss_search(index, question_vector)
-
-#     mostSimContext = df['Context'][indices[0][0]]
-#     pattern = r'(?<=\s{10}).*'
-#     matches = re.search(pattern, mostSimContext, flags=re.DOTALL)
-
-#     if matches:
-#         mostSimContext = matches.group(0)
-
-#     mostSimContext = mostSimContext.strip()
-#     mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
-
-#     segments = sent_tokenize(mostSimContext, engine="crfcut")
-
-#     segment_embeddings = get_embeddings(embedding_model, segments)
-#     segment_embeddings = prepare_sentences_vector(segment_embeddings)
-#     segment_index = create_segment_index(segment_embeddings)
-
-#     _distances,_indices = faiss_search(segment_index, question_vector)
-#     mostSimSegment = segments[_indices[0][0]]
-
-#     Answer = model_pipeline(model, tokenizer,question,mostSimSegment)
-
-#     if len(Answer) <= 2:
-#         Answer = mostSimSegment
-
-#     # Find the start and end indices of mostSimSegment within mostSimContext
-#     start_index = mostSimContext.find(Answer)
-#     end_index = start_index + len(Answer)
-
-#     print(f"answer {len(Answer)} => {Answer} || startIndex =>{start_index} || endIndex =>{end_index}")
-#     print(f"mostSimContext{len(mostSimContext)}=>{mostSimContext}\nsegments{len(segments)}=>{segments}\nmostSimSegment{len(mostSimSegment)}=>{mostSimSegment}")
-
-#     _time = time.time() - t
-#     output = {
-#         "user_question": question,
-#         "answer": df['Answer'][indices[0][0]],
-#         "totaltime": round(_time, 3),
-#         "distance": round(distances[0][0], 4),
-#         "highlight_start": start_index,
-#         "highlight_end": end_index
-#     }
-#     return output
-
-# def highlight_text(text, start_index, end_index):
-#     if start_index < 0:
-#         start_index = 0
-#     if end_index > len(text):
-#         end_index = len(text)
-#     highlighted_text = ""
-#     for i, char in enumerate(text):
-#         if i == start_index:
-#             highlighted_text += "<mark>"
-#         highlighted_text += char
-#         if i == end_index - 1:
-#             highlighted_text += "</mark>"
-#     return highlighted_text
-
-# def chat_interface_before(question, history):
-#     response = predict(model, tokenizer, embedding_model, df, question, index)
-#     return response
-
-# def chat_interface_after(question, history):
-#     response = predict_test(model, tokenizer, embedding_model, df, question, index)
-#     highlighted_answer = highlight_text(response["answer"], response["highlight_start"], response["highlight_end"])
-#     return highlighted_answer
-
-# examples=[
-#     'ขอเลขที่บัญชีของบริษัทหน่อย',
-#     'บริษัทตั้งอยู่ที่ถนนอะไร',
-#     'ขอช่องทางติดตามข่าวสารทาง Line หน่อย',
-#     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 ในแต่ละแพลตฟอร์ม',
-#     'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 บน Twitter',
-#     # 'ช่องทางติดตามข่าวสารของเรา',
-# ]
-# demo_before = gr.ChatInterface(fn=chat_interface_before,
-#                                examples=examples)
-
-# demo_after = gr.ChatInterface(fn=chat_interface_after,
-#                               examples=examples)
-
-# interface = gr.TabbedInterface([demo_before, demo_after], ["Before", "After"])
-
-# if __name__ == "__main__":
-#     # Load your model, tokenizer, data, and index here...
-#     df = load_data()
-#     model, tokenizer = load_model('wangchanberta-hyp')
-#     embedding_model = load_embedding_model()
-#     index = set_index(prepare_sentences_vector(load_embeddings(EMBEDDINGS_PATH)))
-#     interface.launch()
-
-
 import random
 import gradio as gr
 import time
@@ -258,235 +17,224 @@ from pythainlp.tokenize import sent_tokenize
 DEFAULT_MODEL = 'wangchanberta-hyp'
 DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
 
-MODEL_DICT =
-
+MODEL_DICT = {
+    'wangchanberta': 'Chananchida/wangchanberta-xet_ref-params',
+    'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
+}
 
 EMBEDDINGS_PATH = 'data/embeddings.pkl'
 DATA_PATH='data/dataset.xlsx'
 
-class ChatBot:
-    SHEET_NAME_MDEBERTA = 'mdeberta'
-    SHEET_NAME_DEFAULT = 'Default'
-    UNKNOWN_ANSWERS = ["กรุณาลงรายระเอียดมากกว่านี้ได้มั้ยคะ", "ขอโทษค่ะลูกค้า ดิฉันไม่ทราบจริง ๆ"]
-
-    def __init__(self, df_path=None, model_path=None, tokenizer_path=None, embedding_model_name=None, embeddingsPath=None):
-        self.df = None
-        self.model = None
-        self.tokenizer = None
-        self.embedding_model = None
-        self.index = None
-        self.k = 5
-        if all(arg is not None for arg in (df_path, model_path, tokenizer_path, embedding_model_name, embeddingsPath)):
-            self.set_df(df_path)
-            self.set_model(model_path)
-            self.set_tokenizer(tokenizer_path)
-            self.set_embedding_model(embedding_model_name)
-            sentences_vector = self.load_embeddings(embeddingsPath)
-            repared_vector = self.prepare_sentences_vector(sentences_vector)
-            self.set_index(repared_vector)
-
-    def set_index(self, vector):
-        if torch.cuda.is_available():  # Check if GPU is available
-            res = faiss.StandardGpuResources()
-            index = faiss.IndexFlatL2(vector.shape[1])
-            gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
-            gpu_index_flat.add(vector)
-            self.index = gpu_index_flat
-        else:  # If GPU is not available, use CPU-based Faiss index
-            self.index = faiss.IndexFlatL2(vector.shape[1])
-            self.index.add(vector)
-        return self.index
-
-    def set_df(self, path):
-        self.df = pd.read_excel(path, sheet_name=self.SHEET_NAME_DEFAULT)
-        self.df.rename(columns={'Response': 'Answer'}, inplace=True)
-        self.df['Context'] = pd.read_excel(path, self.SHEET_NAME_MDEBERTA)['Context']
-
-    def set_model(self, model):
-        self.model = AutoModelForQuestionAnswering.from_pretrained(model)
-
-    def set_tokenizer(self, tokenizer):
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
-
-    def set_embedding_model(self, model):
-        self.embedding_model = SentenceTransformer(model)
-
-    def set_k(self, k_value):
-        self.k = k_value
-
-    def get_df(self):
-        return self.df
-
-    def get_model(self):
-        return self.model
-
-    def get_tokenizer(self):
-        return self.tokenizer
-
-    def get_embedding_model(self):
-        return self.embedding_model
-
-    def get_index(self):
-        return self.index
-
-    def get_k(self):
-        return self.k
-
-    def get_embeddings(self, text_list):
-        return self.embedding_model.encode(text_list)
-
-    def prepare_sentences_vector(self, encoded_list):
-        encoded_list = [i.reshape(1, -1) for i in encoded_list]
-        encoded_list = np.vstack(encoded_list).astype('float32')
-        encoded_list = normalize(encoded_list)
-        return encoded_list
-
-    def load_embeddings(self, file_path):
-        with open(file_path, "rb") as fIn:
-            stored_data = pickle.load(fIn)
-            stored_sentences = stored_data['sentences']
-            stored_embeddings = stored_data['embeddings']
-        return stored_embeddings
-
-    def model_pipeline(self, question, similar_context):
-        inputs = self.tokenizer(question, similar_context, return_tensors="pt")
-        with torch.no_grad():
-            outputs = self.model(**inputs)
-        answer_start_index = outputs.start_logits.argmax()
-        answer_end_index = outputs.end_logits.argmax()
-        predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
-        Answer = self.tokenizer.decode(predict_answer_tokens)
-        return Answer.replace('<unk>','@')
-
-    def faiss_search(self, index, question_vector):
-        if index is None:
-            raise ValueError("Index has not been initialized.")
-        distances, indices = index.search(question_vector, self.k)
-        similar_questions = [self.df['Question'][indices[0][i]] for i in range(self.k)]
-        similar_contexts = [self.df['Context'][indices[0][i]] for i in range(self.k)]
-        return similar_questions, similar_contexts, distances, indices
-
-    def faiss_segment_search(self, index, question_vector, x=1):
-        if index is None:
-            raise ValueError("Index has not been initialized.")
-        distances, indices = index.search(question_vector, x)
-        return distances, indices
-
-    def create_segment_index(self, vector):
-        segment_index = faiss.IndexFlatL2(vector.shape[1])
-        segment_index.add(vector)
-        return segment_index
-
-    def predict_test(self, question):
-        list_context_for_show = []
-        list_distance_for_show = []
-        list_similar_question = []
-
-        question = question.strip()
-        question_vector = self.get_embeddings([question])
-        question_vector = self.prepare_sentences_vector([question_vector])
-        similar_questions, similar_contexts, distances, indices = self.faiss_search(self.index, question_vector)
-
-        mostSimContext = self.df['Context'][indices[0][0]]
-        pattern = r'(?<=\s{10}).*'
-        matches = re.search(pattern, mostSimContext, flags=re.DOTALL)
-
-        if matches:
-            mostSimContext = matches.group(0)
-
-        mostSimContext = mostSimContext.strip()
-        mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
-
-        segments = sent_tokenize(mostSimContext, engine="crfcut")
-
-        segment_embeddings = self.get_embeddings(segments)
-        segment_embeddings = self.prepare_sentences_vector(segment_embeddings)
-        segment_index = self.create_segment_index(segment_embeddings)
-
-        _distances, _indices = self.faiss_segment_search(segment_index, question_vector)
-        mostSimSegment = segments[_indices[0][0]]
-
-        print(f"_indices => {_indices[0][0]}")
-        answer = self.model_pipeline(question, mostSimSegment)
-
-        if len(answer) <= 2:
-            answer = mostSimSegment
-
-        start_index = mostSimContext.find(answer)
-        end_index = start_index + len(answer)
-
-        print(f"mostSimContext {len(mostSimContext)} =>{mostSimContext}\nsegments {len(segments)} =>{segments}\nmostSimSegment {len(mostSimSegment)} =>{mostSimSegment}")
-        print(f"answer {len(answer)} => {answer} || startIndex =>{start_index} || endIndex =>{end_index}")
-
-        for i in range(min(5, self.k)):
-            index = indices[0][i]
-            similar_question = similar_questions[i]
-            similar_context = similar_contexts[i]
-
-            list_similar_question.append(similar_question)
-            list_context_for_show.append(similar_context)
-            list_distance_for_show.append(str(1 - distances[0][i]))
-
-        distance = list_distance_for_show[0]
-
-        if float(distance) < 0.5:
-            answer = random.choice(self.UNKNOWN_ANSWERS)
-
-        output = {
-            "user_question": question,
-            "answer": self.df['Answer'][indices[0][0]],
-            "distance": distance,
-            "highlight_start": start_index,
-            "highlight_end": end_index,
-            "list_context": list_context_for_show,
-            "list_distance": list_distance_for_show
-        }
-        return output
-
-    def highlight_text(self, text, start_index, end_index):
-        if start_index < 0:
-            start_index = 0
-        if end_index > len(text):
-            end_index = len(text)
-        highlighted_text = ""
-        for i, char in enumerate(text):
-            if i == start_index:
-                highlighted_text += "<mark>"
-            highlighted_text += char
-            if i == end_index - 1:
-                highlighted_text += "</mark>"
-        return highlighted_text
-
-    def chat_interface_before(self, question, history):
-        response = self.predict(question)
-        return response
-
-    def chat_interface_after(self, question, history):
-        response = self.predict_test(question)
-        highlighted_answer = self.highlight_text(response["answer"], response["highlight_start"], response["highlight_end"])
-        return highlighted_answer
-
+def load_data(path=DATA_PATH):
+    df = pd.read_excel(path, sheet_name='Default')
+    df['Context'] = pd.read_excel(path, sheet_name='mdeberta')['Context']
+    print(len(df))
+    print('Load data done')
+    return df
+
+
+def load_model(model_name=DEFAULT_MODEL):
+    model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
+    print('Load model done')
+    return model, tokenizer
+
+def load_embedding_model(model_name=DEFAULT_SENTENCE_EMBEDDING_MODEL):
+    # if torch.cuda.is_available():
+    #     embedding_model = SentenceTransformer(model_name, device='cuda')
+    # else:
+    embedding_model = SentenceTransformer(model_name)
+    print('Load sentence embedding model done')
+    return embedding_model
+
+
+def set_index(vector):
+    if torch.cuda.is_available():
+        res = faiss.StandardGpuResources()
+        index = faiss.IndexFlatL2(vector.shape[1])
+        gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
+        gpu_index_flat.add(vector)
+        index = gpu_index_flat
+    else:
+        index = faiss.IndexFlatL2(vector.shape[1])
+        index.add(vector)
+    return index
+
+
+def get_embeddings(embedding_model, text_list):
+    return embedding_model.encode(text_list)
+
+
+def prepare_sentences_vector(encoded_list):
+    encoded_list = [i.reshape(1, -1) for i in encoded_list]
+    encoded_list = np.vstack(encoded_list).astype('float32')
+    encoded_list = normalize(encoded_list)
+    return encoded_list
+
+
+def store_embeddings(df, embeddings):
+    with open('embeddings.pkl', "wb") as fOut:
+        pickle.dump({'sentences': df['Question'], 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
+    print('Store embeddings done')
+
+
+def load_embeddings(file_path=EMBEDDINGS_PATH):
+    with open(file_path, "rb") as fIn:
+        stored_data = pickle.load(fIn)
+        stored_sentences = stored_data['sentences']
+        stored_embeddings = stored_data['embeddings']
+    print('Load (questions) embeddings done')
+    return stored_embeddings
+
+
+def model_pipeline(model, tokenizer, question, similar_context):
+    inputs = tokenizer(question, similar_context, return_tensors="pt")
+    with torch.no_grad():
+        outputs = model(**inputs)
+    answer_start_index = outputs.start_logits.argmax()
+    answer_end_index = outputs.end_logits.argmax()
+    predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
+    Answer = tokenizer.decode(predict_answer_tokens)
+    return Answer.replace('<unk>','@')
+
+
+def faiss_search(index, question_vector, k=1):
+    distances, indices = index.search(question_vector, k)
+    return distances,indices
+
+def create_segment_index(vector):
+    segment_index = faiss.IndexFlatL2(vector.shape[1])
+    segment_index.add(vector)
+    return segment_index
+
+
+def predict_faiss(model, tokenizer, embedding_model, df, question, index):
+    t = time.time()
+    question = question.strip()
+    question_vector = get_embeddings(embedding_model, question)
+    question_vector = prepare_sentences_vector([question_vector])
+    distances,indices = faiss_search(index, question_vector)
+    Answers = [df['Answer'][i] for i in indices[0]]
+    _time = time.time() - t
+    output = {
+        "user_question": question,
+        "answer": Answers[0],
+        "totaltime": round(_time, 3),
+        "score": round(distances[0][0], 4)
+    }
+    return output
+
+def predict(model, tokenizer, embedding_model, df, question, index):
+    t = time.time()
+    question = question.strip()
+    question_vector = get_embeddings(embedding_model, question)
+    question_vector = prepare_sentences_vector([question_vector])
+    distances,indices = faiss_search(index, question_vector)
+
+    # Answer = model_pipeline(model, tokenizer, df['Question'][indices[0][0]], df['Context'][indices[0][0]])
+    Answer = model_pipeline(model, tokenizer, question, df['Context'][indices[0][0]])
+    _time = time.time() - t
+    output = {
+        "user_question": question,
+        "answer": Answer,
+        "totaltime": round(_time, 3),
+        "distance": round(distances[0][0], 4)
+    }
+    return Answer
+
+def predict_test(model, tokenizer, embedding_model, df, question, index): # sent_tokenize pythainlp
+    t = time.time()
+    question = question.strip()
+    question_vector = get_embeddings(embedding_model, question)
+    question_vector = prepare_sentences_vector([question_vector])
+    distances,indices = faiss_search(index, question_vector)
+
+    mostSimContext = df['Context'][indices[0][0]]
+    pattern = r'(?<=\s{10}).*'
+    matches = re.search(pattern, mostSimContext, flags=re.DOTALL)
+
+    if matches:
+        mostSimContext = matches.group(0)
+
+    mostSimContext = mostSimContext.strip()
+    mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
+
+    segments = sent_tokenize(mostSimContext, engine="crfcut")
+
+    segment_embeddings = get_embeddings(embedding_model, segments)
+    segment_embeddings = prepare_sentences_vector(segment_embeddings)
+    segment_index = create_segment_index(segment_embeddings)
+
+    _distances,_indices = faiss_search(segment_index, question_vector)
+    mostSimSegment = segments[_indices[0][0]]
+
+    Answer = model_pipeline(model, tokenizer,question,mostSimSegment)
+
+    if len(Answer) <= 2:
+        Answer = mostSimSegment
+
+    # Find the start and end indices of mostSimSegment within mostSimContext
+    start_index = mostSimContext.find(Answer)
+    end_index = start_index + len(Answer)
+
+    print(f"answer {len(Answer)} => {Answer} || startIndex =>{start_index} || endIndex =>{end_index}")
+    print(f"mostSimContext{len(mostSimContext)}=>{mostSimContext}\nsegments{len(segments)}=>{segments}\nmostSimSegment{len(mostSimSegment)}=>{mostSimSegment}")
+
+    _time = time.time() - t
+    output = {
+        "user_question": question,
+        "answer": df['Answer'][indices[0][0]],
+        "totaltime": round(_time, 3),
+        "distance": round(distances[0][0], 4),
+        "highlight_start": start_index,
+        "highlight_end": end_index
+    }
+    return output
+
+def highlight_text(text, start_index, end_index):
+    if start_index < 0:
+        start_index = 0
+    if end_index > len(text):
+        end_index = len(text)
+    highlighted_text = ""
+    for i, char in enumerate(text):
+        if i == start_index:
+            highlighted_text += "<mark>"
+        highlighted_text += char
+        if i == end_index - 1:
+            highlighted_text += "</mark>"
+    return highlighted_text
+
+def chat_interface_before(question, history):
+    response = predict(model, tokenizer, embedding_model, df, question, index)
+    return response
+
+def chat_interface_after(question, history):
+    response = predict_test(model, tokenizer, embedding_model, df, question, index)
+    highlighted_answer = highlight_text(response["answer"], response["highlight_start"], response["highlight_end"])
+    return highlighted_answer
+
+examples=[
+    'ขอเลขที่บัญชีของบริษัทหน่อย',
+    'บริษัทตั้งอยู่ที่ถนนอะไร',
+    'ขอช่องทางติดตามข่าวสารทาง Line หน่อย',
+    'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 ในแต่ละแพลตฟอร์ม',
+    'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 บน Twitter',
+    # 'ช่องทางติดตามข่าวสารของเรา',
+]
+demo_before = gr.ChatInterface(fn=chat_interface_before,
+                               examples=examples)
+
+demo_after = gr.ChatInterface(fn=chat_interface_after,
+                              examples=examples)
+
+interface = gr.TabbedInterface([demo_before, demo_after], ["Before", "After"])
 
 if __name__ == "__main__":
-
-
-
-
-
-
-
-    examples = [
-        'ขอเลขที่บัญชีของบริษัทหน่อย',
-        'บริษัทตั้งอยู่ที่ถนนอะไร',
-        'ขอช่องทางติดตามข่าวสารทาง Line หน่อย',
-        'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 ในแต่ละแพลตฟอร์ม',
-        'อยากทราบความถี่ในการดึงข้อมูลของ DXT360 บน Twitter',
-        # 'ช่องทางติดตามข่าวสารของเรา',
-    ]
-
-    demo_before = gr.ChatInterface(fn=bot.chat_interface_before, examples=examples)
-    demo_after = gr.ChatInterface(fn=bot.chat_interface_after, examples=examples)
-
-    interface = gr.TabbedInterface([demo_before, demo_after], ["Before", "After"])
-    interface.launch()
+    # Load your model, tokenizer, data, and index here...
+    df = load_data()
+    model, tokenizer = load_model('wangchanberta-hyp')
+    embedding_model = load_embedding_model()
+    index = set_index(prepare_sentences_vector(load_embeddings(EMBEDDINGS_PATH)))
+    interface.launch()
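
Note on the retrieval math (not part of the commit): prepare_sentences_vector L2-normalizes every embedding before it goes into faiss.IndexFlatL2, so the squared-L2 distances the index returns are a monotone transform of cosine similarity. For unit vectors u and v, ||u - v||^2 = 2 - 2*cos(u, v), so the nearest neighbor by L2 distance is also the most cosine-similar row. A minimal self-contained sketch of that identity, using random stand-in vectors rather than real sentence embeddings:

import faiss
import numpy as np
from sklearn.preprocessing import normalize

rng = np.random.default_rng(0)
corpus = normalize(rng.normal(size=(8, 4)).astype('float32'))  # stand-in embeddings
query = normalize(rng.normal(size=(1, 4)).astype('float32'))

index = faiss.IndexFlatL2(corpus.shape[1])  # same index type app.py uses
index.add(corpus)
distances, indices = index.search(query, 3)

# IndexFlatL2 reports squared L2 distances, so 1 - d/2 recovers the cosine
cosines = (corpus @ query.T).ravel()
print(1 - distances[0] / 2)        # cosine similarity of the 3 nearest rows
print(np.sort(cosines)[::-1][:3])  # top-3 cosines computed directly: same values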
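For reference, highlight_text marks the half-open character range [start_index, end_index) with <mark> tags after clamping out-of-range indices. A self-contained check of that contract (the function body is copied from app.py; the sample string and indices are invented):

def highlight_text(text, start_index, end_index):
    # clamp indices that fall outside the text before marking
    if start_index < 0:
        start_index = 0
    if end_index > len(text):
        end_index = len(text)
    highlighted_text = ""
    for i, char in enumerate(text):
        if i == start_index:
            highlighted_text += "<mark>"
        highlighted_text += char
        if i == end_index - 1:
            highlighted_text += "</mark>"
    return highlighted_text

print(highlight_text("the quick brown fox", 4, 9))
# -> the <mark>quick</mark> brown fox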
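The Gradio wiring above relies on the gr.ChatInterface callback contract: fn receives the new user message plus the running chat history and returns the bot reply as a string, and gr.TabbedInterface simply hosts the two chat demos under separate tabs. A minimal runnable sketch of that contract (the echo bot is invented for illustration; assumes a recent gradio release):

import gradio as gr

def echo_bot(message, history):
    # history holds the earlier user/bot turns; Gradio manages it for you
    return f"echo: {message}"

demo = gr.ChatInterface(fn=echo_bot, examples=["hello", "ping"])

if __name__ == "__main__":
    demo.launch()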