chidung7271 committed on
Commit
67abb7e
·
1 Parent(s): b708a2d
Files changed (1) hide show
  1. app.py +64 -46
app.py CHANGED
@@ -1,67 +1,85 @@
1
- import py_vncorenlp
2
- import gradio as gr
3
- import os
4
- import shutil
5
- from sentence_transformers import CrossEncoder
6
 
7
- save_dir = './vncorenlp'
8
 
9
- models_dir = os.path.join(save_dir, 'models')
10
 
11
- #if os.path.exists(models_dir):
12
- #j shutil.rmtree(models_dir)
13
- # print("[DEBUG]: Delete model")
14
 
15
 
16
- #print("[DEBUG]: Tao lai folder model")
17
- #os.makedirs(save_dir + "/models", exist_ok=True)
18
 
19
 
20
- print("[DEBUG]: Download model")
21
- py_vncorenlp.download_model(save_dir=save_dir+'/')
22
- print("[DEBUG]: Downdload model complete!")
23
 
24
- #py_vncorenlp.download_model(save_dir='/absolute/path/to/vncorenlp')
25
- print("[DEBUG] rdsegmenter setep")
26
- rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=save_dir)
27
 
28
- def rerank(query,sentences):
29
- print("[DEBUG]: Start rerank function...")
30
- tokenized_query = rdrsegmenter.word_segment(query)
31
- tokenized_sentences = [rdrsegmenter.word_segment(sent) for sent in sentences]
32
 
33
- tokenized_pairs = [[tokenized_query, sent] for sent in tokenized_sentences]
34
 
35
- MODEL_ID = 'itdainb/PhoRanker'
36
- MAX_LENGTH = 512
37
 
38
- model = CrossEncoder(MODEL_ID, max_length=MAX_LENGTH)
39
 
40
- # For fp16 usage
41
- model.model.half()
 
 
 
 
 
 
42
 
43
- scores = model.predict(tokenized_pairs)
44
 
45
- # 0.982, 0.2444, 0.9253
46
- #print(scores)
47
- return scores
48
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
 
 
 
 
 
 
51
 
52
- # Create Gradio interface
53
- interface = gr.Interface(
54
- fn=rerank,
55
- inputs=[
56
- gr.Textbox(label="Query", placeholder="Enter your query"),
57
- gr.Textbox(label="Documents (one per line)", lines=5, placeholder="Enter documents to rank"),
58
- ],
59
- outputs=gr.Textbox(label="Reranked Documents"),
60
- title="MonoT5 Reranking",
61
- description="Provide a query and a list of documents to rerank them using MonoT5."
62
- )
63
 
64
- # Launch the app
65
- if __name__ == "__main__":
66
- interface.launch()
67
 
 
 
 
1
+ # import py_vncorenlp
2
+ # import gradio as gr
3
+ # import os
4
+ # import shutil
5
+ # from sentence_transformers import CrossEncoder
6
 
7
+ # save_dir = './vncorenlp'
8
 
9
+ # models_dir = os.path.join(save_dir, 'models')
10
 
11
+ # #if os.path.exists(models_dir):
12
+ # #j shutil.rmtree(models_dir)
13
+ # # print("[DEBUG]: Delete model")
14
 
15
 
16
+ # #print("[DEBUG]: Tao lai folder model")
17
+ # #os.makedirs(save_dir + "/models", exist_ok=True)
18
 
19
 
20
+ # print("[DEBUG]: Download model")
21
+ # py_vncorenlp.download_model(save_dir=save_dir+'/')
22
+ # print("[DEBUG]: Downdload model complete!")
23
 
24
+ # #py_vncorenlp.download_model(save_dir='/absolute/path/to/vncorenlp')
25
+ # print("[DEBUG] rdsegmenter setep")
26
+ # rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=save_dir)
27
 
28
+ # def rerank(query,sentences):
29
+ # print("[DEBUG]: Start rerank function...")
30
+ # tokenized_query = rdrsegmenter.word_segment(query)
31
+ # tokenized_sentences = [rdrsegmenter.word_segment(sent) for sent in sentences]
32
 
33
+ # tokenized_pairs = [[tokenized_query, sent] for sent in tokenized_sentences]
34
 
35
+ # MODEL_ID = 'itdainb/PhoRanker'
36
+ # MAX_LENGTH = 512
37
 
38
+ # model = CrossEncoder(MODEL_ID, max_length=MAX_LENGTH)
39
 
40
+ # # For fp16 usage
41
+ # model.model.half()
42
+
43
+ # scores = model.predict(tokenized_pairs)
44
+
45
+ # # 0.982, 0.2444, 0.9253
46
+ # #print(scores)
47
+ # return scores
48
 
 
49
 
 
 
 
50
 
51
 
52
+ # # Create Gradio interface
53
+ # interface = gr.Interface(
54
+ # fn=rerank,
55
+ # inputs=[
56
+ # gr.Textbox(label="Query", placeholder="Enter your query"),
57
+ # gr.Textbox(label="Documents (one per line)", lines=5, placeholder="Enter documents to rank"),
58
+ # ],
59
+ # outputs=gr.Textbox(label="Reranked Documents"),
60
+ # title="MonoT5 Reranking",
61
+ # description="Provide a query and a list of documents to rerank them using MonoT5."
62
+ # )
63
+
64
+ # # Launch the app
65
+ # if __name__ == "__main__":
66
+ # interface.launch()
67
+
68
+ import py_vncorenlp
69
+ py_vncorenlp.download_model(save_dir='/absolute/path/to/vncorenlp')
70
+ rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/absolute/path/to/vncorenlp')
71
 
72
+ query = "Trường UIT là gì?"
73
+ sentences = [
74
+ "Trường Đại học Công nghệ Thông tin có tên tiếng Anh là University of Information Technology (viết tắt là UIT) là thành viên của Đại học Quốc Gia TP.HCM.",
75
+ "Trường Đại học Kinh tế – Luật (tiếng Anh: University of Economics and Law – UEL) là trường đại học đào tạo và nghiên cứu khối ngành kinh tế, kinh doanh và luật hàng đầu Việt Nam.",
76
+ "Quĩ uỷ thác đầu tư (tiếng Anh: Unit Investment Trusts; viết tắt: UIT) là một công ty đầu tư mua hoặc nắm giữ một danh mục đầu tư cố định"
77
+ ]
78
 
79
+ tokenized_query = rdrsegmenter.word_segment(query)
80
+ tokenized_sentences = [rdrsegmenter.word_segment(sent) for sent in sentences]
 
 
 
 
 
 
 
 
 
81
 
82
+ tokenized_pairs = [[tokenized_query, sent] for sent in tokenized_sentences]
 
 
83
 
84
+ MODEL_ID = 'itdainb/PhoRanker'
85
+ MAX_LENGTH = 256