Upload 80 files
This view is limited to 50 files because it contains too many changes.
- Model/MultimodelNER/Ner_processing.py +95 -0
- Model/MultimodelNER/UMT.py +290 -0
- Model/MultimodelNER/VLSP2016/Filetxt/list.txt +106 -0
- Model/MultimodelNER/VLSP2016/Filetxt/output.txt +6 -0
- Model/MultimodelNER/VLSP2016/Filetxt/test.txt +97 -0
- Model/MultimodelNER/VLSP2016/Image/014716.jpg +0 -0
- Model/MultimodelNER/VLSP2016/Image/My model.jpg +0 -0
- Model/MultimodelNER/VLSP2016/Image/bully.jpeg +0 -0
- Model/MultimodelNER/VLSP2016/Image/bully.jpg +0 -0
- Model/MultimodelNER/VLSP2016/Image/maria.jpg +0 -0
- Model/MultimodelNER/VLSP2016/Image/penguin.jpg +0 -0
- Model/MultimodelNER/VLSP2016/Image/pero.jpg +0 -0
- Model/MultimodelNER/VLSP2016/Image/pero2.jpg +0 -0
- Model/MultimodelNER/VLSP2016/Image/taybannha.jpg +0 -0
- Model/MultimodelNER/VLSP2016/MNER_2016.py +106 -0
- Model/MultimodelNER/VLSP2016/__pycache__/MNER_2016.cpython-39.pyc +0 -0
- Model/MultimodelNER/VLSP2016/__pycache__/dataset_roberta.cpython-39.pyc +0 -0
- Model/MultimodelNER/VLSP2016/__pycache__/train_umt_2016.cpython-39.pyc +0 -0
- Model/MultimodelNER/VLSP2016/best_model/bert_config.json +28 -0
- Model/MultimodelNER/VLSP2016/best_model/eval_results.txt +11 -0
- Model/MultimodelNER/VLSP2016/best_model/model_config.json +1 -0
- Model/MultimodelNER/VLSP2016/best_model/mtmner_pred.txt +0 -0
- Model/MultimodelNER/VLSP2016/best_model/pytorch_encoder.bin +3 -0
- Model/MultimodelNER/VLSP2016/best_model/pytorch_model.bin +3 -0
- Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/.no_exist/2b51e367d92093c9688112098510e6a58bab67cd/model.safetensors +3 -0
- Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/.no_exist/2b51e367d92093c9688112098510e6a58bab67cd/model.safetensors.index.json +0 -0
- Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/refs/main +1 -0
- Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/snapshots/2b51e367d92093c9688112098510e6a58bab67cd/config.json +27 -0
- Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/snapshots/2b51e367d92093c9688112098510e6a58bab67cd/pytorch_model.bin +3 -0
- Model/MultimodelNER/VLSP2016/dataset_roberta.py +452 -0
- Model/MultimodelNER/VLSP2016/list.txt +5 -0
- Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/.no_exist/2b51e367d92093c9688112098510e6a58bab67cd/model.safetensors +3 -0
- Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/.no_exist/2b51e367d92093c9688112098510e6a58bab67cd/model.safetensors.index.json +0 -0
- Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/refs/main +1 -0
- Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/snapshots/2b51e367d92093c9688112098510e6a58bab67cd/config.json +27 -0
- Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/snapshots/2b51e367d92093c9688112098510e6a58bab67cd/pytorch_model.bin +3 -0
- Model/MultimodelNER/VLSP2016/test.txt +78 -0
- Model/MultimodelNER/VLSP2016/train_umt_2016.py +352 -0
- Model/MultimodelNER/VLSP2021/Filetxt/test.txt +97 -0
- Model/MultimodelNER/VLSP2021/Image/taybannha.jpg +0 -0
- Model/MultimodelNER/VLSP2021/MNER_2021.py +151 -0
- Model/MultimodelNER/VLSP2021/__pycache__/MNER_2021.cpython-39.pyc +0 -0
- Model/MultimodelNER/VLSP2021/__pycache__/dataset_roberta.cpython-39.pyc +0 -0
- Model/MultimodelNER/VLSP2021/__pycache__/train_umt_2021.cpython-39.pyc +0 -0
- Model/MultimodelNER/VLSP2021/best_model/bert_config.json +28 -0
- Model/MultimodelNER/VLSP2021/best_model/eval_results.txt +50 -0
- Model/MultimodelNER/VLSP2021/best_model/model_config.json +1 -0
- Model/MultimodelNER/VLSP2021/best_model/mtmner_pred.txt +0 -0
- Model/MultimodelNER/VLSP2021/best_model/pytorch_encoder.bin +3 -0
- Model/MultimodelNER/VLSP2021/best_model/pytorch_model.bin +3 -0
Model/MultimodelNER/Ner_processing.py
ADDED
@@ -0,0 +1,95 @@
def format_predictions(words, predictions):
    '''
    Convert the lists of words and predictions into (word, label) pairs.
    '''
    formatted = []
    for word, label in zip(words, predictions):
        formatted.append((word, label))
    return formatted

def process_predictions(predictions):
    '''
    Split words joined by underscores back into space-separated words with the same label.
    '''
    formatted = []
    for word, label in predictions:
        if '_' in word:
            formatted.append((word.replace('_', ' '), label))
        else:
            formatted.append((word, label))
    return formatted


def combine_entities(predictions):
    combined = []
    temp_entity = []
    temp_label = None

    for word, label in predictions:
        if label.startswith('B-'):
            if temp_entity:
                combined.append((' '.join(temp_entity), temp_label))
                temp_entity = []
            temp_entity.append(word)
            temp_label = label
        elif label.startswith('I-') and temp_label and label[2:] == temp_label[2:]:
            temp_entity.append(word)
        else:
            if temp_entity:
                combined.append((' '.join(temp_entity), temp_label))
                temp_entity = []
                temp_label = None
            combined.append((word, label))

    if temp_entity:
        combined.append((' '.join(temp_entity), temp_label))

    return combined


def remove_B_prefix(entities):
    modified_entities = []
    for word, label in entities:
        if label.startswith('B-'):
            label = label[2:]  # Strip the 'B-' prefix from the label
        modified_entities.append((word, label))
    return modified_entities


def combine_i_tags(tokens_labels):
    combined = []
    current_combination = []
    current_label = None

    for token, label in tokens_labels:
        if label.startswith('I-'):
            label = label[2:]  # Remove the 'I-' prefix
            if current_label is None:
                current_label = label
                current_combination.append(token)
            elif current_label == label:
                current_combination.append(token)
            else:
                combined.append((' '.join(current_combination), current_label))
                current_combination = [token]
                current_label = label
        else:
            if current_combination:
                combined.append((' '.join(current_combination), current_label))
                current_combination = []
                current_label = None
            combined.append((token, label))

    if current_combination:
        combined.append((' '.join(current_combination), current_label))

    return combined

tokens_labels = [('Dân', 'O'), ('trí', 'O'), ('Chức', 'O'), ('vô', 'O'), ('địch', 'O'), ('Euro 2008', 'EVENT-SPORT'), ('đầy', 'O'), ('thuyết', 'O'), ('phục', 'O'), ('của', 'O'), ('Tây Ban Nha', 'LOCATION'), ('trên', 'O'), ('đất', 'O'), ('Áo', 'LOCATION'), ('và', 'O'), ('Thụy Sĩ', 'PERSON'), ('đã', 'O'), ('mở', 'O'), ('ra', 'O'), ('kỷ', 'O'), ('nguyên', 'O'), ('vinh', 'O'), ('quanh', 'O'), ('của', 'O'), ('La', 'ORGANIZATION'), ('Furia', 'I-ORGANIZATION-SPORTS'), ('Roja', 'I-ORGANIZATION-SPORTS'), (',', 'O'), ('với', 'O'), ('lối', 'O'), ('chơi', 'O'), ('tiqui', 'O'), ('taka', 'O'), ('đầy', 'O'), ('biến', 'O'), ('ảo', 'O'), ('.', 'O'), ('Trong', 'O'), ('quá', 'O'), ('khứ', 'O'), (',', 'O'), ('Tây Ban Nha', 'LOCATION'), ('nổi', 'O'), ('tiếng', 'O'), ('với', 'O'), ('biệt', 'O'), ('danh', 'O'), ('Vua', 'O'), ('vòng', 'O'), ('loại', 'O'), ('.', 'O'), ('Họ', 'O'), ('thường', 'O'), ('thi', 'O'), ('đấu', 'O'), ('rất', 'O'), ('tốt', 'O'), ('ở', 'O'), ('vòng', 'O'), ('loại', 'O'), ('nhưng', 'O'), ('lại', 'O'), ('chưa', 'O'), ('bao', 'O'), ('giờ', 'O'), ('chứng', 'O'), ('minh', 'O'), ('được', 'O'), ('sức', 'O'), ('mạnh', 'O'), ('ở', 'O'), ('vòng', 'O'), ('chung', 'O'), ('kết', 'O'), ('giải', 'O'), ('đấu', 'O'), ('lớn', 'O'), ('.', 'O'), ('Lần', 'O'), ('duy', 'O'), ('nhất', 'O'), ('họ', 'O'), ('lên', 'O'), ('ngôi', 'O'), ('là', 'O'), ('ở', 'O'), ('kỳ', 'O'), ('Euro', 'EVENT-SPORT'), ('1964', 'O'), ('.', 'O')]

combined_tokens_labels = combine_i_tags(tokens_labels)
print(combined_tokens_labels)
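Taken together, these helpers form the post-processing chain that the app applies to raw tag sequences: pair words with tags, split underscore-joined syllables, merge B-/I- runs into entity spans, and strip BIO prefixes. A minimal sketch of the chain, assuming the functions above are in scope (the tokens and tags below are made up for illustration, not taken from a real model run):

# Toy input: one underscore-joined word and one two-token entity.
words = ['Tây_Ban_Nha', 'vô', 'địch', 'Euro', '2008']
tags = ['B-LOC', 'O', 'O', 'B-MISC', 'I-MISC']

pairs = format_predictions(words, tags)   # [(word, tag), ...]
pairs = process_predictions(pairs)        # 'Tây_Ban_Nha' -> 'Tây Ban Nha'
spans = combine_entities(pairs)           # merge the B-/I- run into 'Euro 2008'
spans = remove_B_prefix(spans)            # 'B-LOC' -> 'LOC'
spans = combine_i_tags(spans)             # merge any remaining I- runs
print(spans)
# [('Tây Ban Nha', 'LOC'), ('vô', 'O'), ('địch', 'O'), ('Euro 2008', 'MISC')]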
Model/MultimodelNER/UMT.py
ADDED
@@ -0,0 +1,290 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""

from __future__ import absolute_import, division, print_function, unicode_literals

import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open
from torchcrf import CRF

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

import torch.nn.functional as F
from torch.autograd import Variable

logger = logging.getLogger(__name__)


def gelu(x):
    """Implementation of the gelu activation function.
    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def swish(x):
    return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}

from transformers import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaLayer, RobertaPreTrainedModel, RobertaOutput, \
    RobertaSelfOutput, RobertaIntermediate


class RobertaSelfEncoder(nn.Module):
    def __init__(self, config):
        super(RobertaSelfEncoder, self).__init__()
        layer = RobertaLayer(config)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(1)])

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
        all_encoder_layers = []
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, attention_mask)
            if output_all_encoded_layers:
                all_encoder_layers.append(hidden_states)
        if not output_all_encoded_layers:
            all_encoder_layers.append(hidden_states)
        return all_encoder_layers


class RobertaCrossEncoder(nn.Module):
    def __init__(self, config, layer_num):
        super(RobertaCrossEncoder, self).__init__()
        layer = RobertaCrossAttentionLayer(config)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(layer_num)])

    def forward(self, s1_hidden_states, s2_hidden_states, s2_attention_mask, output_all_encoded_layers=True):
        all_encoder_layers = []
        for layer_module in self.layer:
            s1_hidden_states = layer_module(s1_hidden_states, s2_hidden_states, s2_attention_mask)
            if output_all_encoded_layers:
                all_encoder_layers.append(s1_hidden_states)
        if not output_all_encoded_layers:
            all_encoder_layers.append(s1_hidden_states)
        return all_encoder_layers


class RobertaCoAttention(nn.Module):
    def __init__(self, config):
        super(RobertaCoAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, s1_hidden_states, s2_hidden_states, s2_attention_mask):
        mixed_query_layer = self.query(s1_hidden_states)
        mixed_key_layer = self.key(s2_hidden_states)
        mixed_value_layer = self.value(s2_hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in BertModel forward() function)
        attention_scores = attention_scores + s2_attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer


class RobertaCrossAttention(nn.Module):
    def __init__(self, config):
        super(RobertaCrossAttention, self).__init__()
        self.self = RobertaCoAttention(config)
        self.output = RobertaSelfOutput(config)

    def forward(self, s1_input_tensor, s2_input_tensor, s2_attention_mask):
        s1_cross_output = self.self(s1_input_tensor, s2_input_tensor, s2_attention_mask)
        attention_output = self.output(s1_cross_output, s1_input_tensor)
        return attention_output


class RobertaCrossAttentionLayer(nn.Module):
    def __init__(self, config):
        super(RobertaCrossAttentionLayer, self).__init__()
        self.attention = RobertaCrossAttention(config)
        self.intermediate = RobertaIntermediate(config)
        self.output = RobertaOutput(config)

    def forward(self, s1_hidden_states, s2_hidden_states, s2_attention_mask):
        attention_output = self.attention(s1_hidden_states, s2_hidden_states, s2_attention_mask)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class UMT(RobertaPreTrainedModel):
    """Coupled Cross-Modal Attention BERT model for token-level classification with CRF on top.
    """

    def __init__(self, config, layer_num1=1, layer_num2=1, layer_num3=1, num_labels_=2, auxnum_labels=2):
        super(UMT, self).__init__(config)
        self.num_labels = num_labels_
        self.roberta = RobertaModel(config)
        # self.trans_matrix = torch.zeros(num_labels, auxnum_labels)
        self.self_attention = RobertaSelfEncoder(config)
        self.self_attention_v2 = RobertaSelfEncoder(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.vismap2text = nn.Linear(2048, config.hidden_size)
        self.vismap2text_v2 = nn.Linear(2048, config.hidden_size)
        self.txt2img_attention = RobertaCrossEncoder(config, layer_num1)
        self.img2txt_attention = RobertaCrossEncoder(config, layer_num2)
        self.txt2txt_attention = RobertaCrossEncoder(config, layer_num3)
        self.gate = nn.Linear(config.hidden_size * 2, config.hidden_size)
        ### self.self_attention = BertLastSelfAttention(config)
        self.classifier = nn.Linear(config.hidden_size * 2, num_labels_)
        self.aux_classifier = nn.Linear(config.hidden_size, auxnum_labels)

        self.crf = CRF(num_labels_, batch_first=True)
        self.aux_crf = CRF(auxnum_labels, batch_first=True)

        self.init_weights()

    # This forward returns the CRF loss when labels are given,
    # and decoded tag sequences otherwise.
    def forward(self, input_ids, segment_ids, input_mask, added_attention_mask, visual_embeds_att, trans_matrix,
                labels=None, auxlabels=None):
        # Get the token representations from the RoBERTa encoder
        features = self.roberta(input_ids, token_type_ids=segment_ids,
                                attention_mask=input_mask)  # batch_size * seq_len * hidden_size
        sequence_output = features["last_hidden_state"]
        sequence_output = self.dropout(sequence_output)

        extended_txt_mask = input_mask.unsqueeze(1).unsqueeze(2)
        extended_txt_mask = extended_txt_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_txt_mask = (1.0 - extended_txt_mask) * -10000.0
        aux_addon_sequence_encoder = self.self_attention(sequence_output, extended_txt_mask)

        aux_addon_sequence_output = aux_addon_sequence_encoder[-1]
        aux_addon_sequence_output = aux_addon_sequence_output[0]
        aux_bert_feats = self.aux_classifier(aux_addon_sequence_output)
        #######aux_bert_feats = self.aux_classifier(sequence_output)
        trans_matrix_tensor = torch.tensor(trans_matrix, dtype=torch.float32, device=aux_bert_feats.device)
        trans_bert_feats = torch.matmul(aux_bert_feats, trans_matrix_tensor)

        # trans_bert_feats = torch.matmul(aux_bert_feats, trans_matrix.float())

        main_addon_sequence_encoder = self.self_attention_v2(sequence_output, extended_txt_mask)
        main_addon_sequence_output = main_addon_sequence_encoder[-1]
        main_addon_sequence_output = main_addon_sequence_output[0]
        vis_embed_map = visual_embeds_att.view(-1, 2048, 49).permute(0, 2, 1)  # self.batch_size, 49, 2048
        converted_vis_embed_map = self.vismap2text(vis_embed_map)  # self.batch_size, 49, hidden_dim

        # apply txt2img attention mechanism to obtain image-based text representations
        img_mask = added_attention_mask[:, :49]
        extended_img_mask = img_mask.unsqueeze(1).unsqueeze(2)
        extended_img_mask = extended_img_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_img_mask = (1.0 - extended_img_mask) * -10000.0

        cross_encoder = self.txt2img_attention(main_addon_sequence_output, converted_vis_embed_map, extended_img_mask)
        cross_output_layer = cross_encoder[-1]  # self.batch_size * text_len * hidden_dim

        # apply img2txt attention mechanism to obtain multimodal-based text representations
        converted_vis_embed_map_v2 = self.vismap2text_v2(vis_embed_map)  # self.batch_size, 49, hidden_dim

        cross_txt_encoder = self.img2txt_attention(converted_vis_embed_map_v2, main_addon_sequence_output,
                                                   extended_txt_mask)
        cross_txt_output_layer = cross_txt_encoder[-1]  # self.batch_size * 49 * hidden_dim
        cross_final_txt_encoder = self.txt2txt_attention(main_addon_sequence_output, cross_txt_output_layer,
                                                         extended_img_mask)
        ##cross_final_txt_encoder = self.txt2txt_attention(aux_addon_sequence_output, cross_txt_output_layer, extended_img_mask)
        cross_final_txt_layer = cross_final_txt_encoder[-1]  # self.batch_size * text_len * hidden_dim
        # cross_final_txt_layer = torch.add(cross_final_txt_layer, sequence_output)

        # visual gate
        merge_representation = torch.cat((cross_final_txt_layer, cross_output_layer), dim=-1)
        gate_value = torch.sigmoid(self.gate(merge_representation))  # batch_size, text_len, hidden_dim
        gated_converted_att_vis_embed = torch.mul(gate_value, cross_output_layer)
        # reverse_gate_value = torch.neg(gate_value).add(1)
        # gated_converted_att_vis_embed = torch.add(torch.mul(reverse_gate_value, cross_final_txt_layer),
        #                                           torch.mul(gate_value, cross_output_layer))

        # direct concatenation
        # gated_converted_att_vis_embed = self.dropout(gated_converted_att_vis_embed)
        final_output = torch.cat((cross_final_txt_layer, gated_converted_att_vis_embed), dim=-1)
        ###### final_output = self.dropout(final_output)
        # middle_output = torch.cat((cross_final_txt_layer, gated_converted_att_vis_embed), dim=-1)
        # final_output = torch.cat((sequence_output, middle_output), dim=-1)

        ###### addon_sequence_output = self.self_attention(final_output, extended_txt_mask)
        bert_feats = self.classifier(final_output)

        alpha = 0.5
        final_bert_feats = torch.add(torch.mul(bert_feats, alpha), torch.mul(trans_bert_feats, 1 - alpha))

        # suggested by Hongjie
        # bert_feats = F.log_softmax(bert_feats, dim=-1)

        if labels is not None:
            beta = 0.5  # 73.87(73.50) 85.37(85.00) 0.5 5e-5 # 73.45 85.05 1.0 1 1 1 4e-5 # 73.63 0.1 1 1 1 5e-5 # old 0.1 2 1 1 85.23 0.2 1 1 85.04
            ##beta = 0.6
            aux_loss = - self.aux_crf(aux_bert_feats, auxlabels, mask=input_mask.byte(), reduction='mean')
            main_loss = - self.crf(final_bert_feats, labels, mask=input_mask.byte(), reduction='mean')
            loss = main_loss + beta * aux_loss
            return loss
        else:
            pred_tags = self.crf.decode(final_bert_feats, mask=input_mask.byte())
            return pred_tags
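As a sanity check, the model can be instantiated from a RoBERTa-style config and run on dummy inputs. This is a minimal smoke-test sketch, assuming the UMT class above is importable and that a vinai/phobert-base-v2 config can be fetched from the Hugging Face Hub; every tensor below is a random stand-in for real PhoBERT tokens and ResNet feature maps:

# Smoke test: random inputs shaped like the real pipeline's tensors.
import numpy as np
import torch
from transformers import RobertaConfig

config = RobertaConfig.from_pretrained('vinai/phobert-base-v2')
model = UMT(config, layer_num1=1, layer_num2=1, layer_num3=1,
            num_labels_=13, auxnum_labels=7)

batch, seq_len = 2, 32
input_ids = torch.randint(0, config.vocab_size, (batch, seq_len))
segment_ids = torch.zeros(batch, seq_len, dtype=torch.long)
input_mask = torch.ones(batch, seq_len, dtype=torch.long)
added_mask = torch.ones(batch, seq_len + 49, dtype=torch.long)
visual_feats = torch.randn(batch, 2048, 7, 7)  # ResNet-152 conv map: 49 regions
trans = np.random.rand(7, 13)                  # aux-label -> main-label projection

pred_tags = model(input_ids, segment_ids, input_mask, added_mask,
                  visual_feats, trans)         # no labels -> CRF decode
print(len(pred_tags), len(pred_tags[0]))       # 2 32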
Model/MultimodelNER/VLSP2016/Filetxt/list.txt
ADDED
@@ -0,0 +1,106 @@
IMGID:pero
James
và
Shan
đều
nghĩ
rằng
gửi
Pero
đi
làm
nông
thử
cho
vui
và
bắt
đầu
cho
Pero
du
hí
từ
đầu
tháng
3
năm
nay
.
Ngày
8
4
,
người
bạn
báo
tin
cho
Alan
biết
Pero
mất
tích
dù
họ
tìm
kiếm
Pero
ở
rất
nhiều
nơi
.
Hai
vợ
chồng
anh
James
đều
nghĩ
Pero
đã
mất
.
Tuy
nhiên
một
tối
nọ
khi
xuống
thăm
bầy
gia
súc
sau
khi
ăn
tối
,
Alan
đã
rất
ngạc
nhiên
khi
mở
cửa
và
thấy
Pero
ngồi
chờ
ngay
trước
cửa
.
Thấy
Alan
,
Pero
mừng
rỡ
vô
cùng
.
Model/MultimodelNER/VLSP2016/Filetxt/output.txt
ADDED
@@ -0,0 +1,6 @@
This
is
some
example
text
.
Model/MultimodelNER/VLSP2016/Filetxt/test.txt
ADDED
@@ -0,0 +1,97 @@
IMGID:taybannha
Dân
trí
Chức
vô
địch
Euro
2008
đầy
thuyết
phục
của
Tây
Ban
Nha
trên
đất
Áo
và
Thụy
Sĩ
đã
mở
ra
kỷ
nguyên
vinh
quanh
của
La
Furia
Roja
,
với
lối
chơi
tiqui
taka
đầy
biến
ảo
.
Trong
quá
khứ
,
Tây
Ban
Nha
nổi
tiếng
với
biệt
danh
Vua
vòng
loại
.
Họ
thường
thi
đấu
rất
tốt
ở
vòng
loại
nhưng
lại
chưa
bao
giờ
chứng
minh
được
sức
mạnh
ở
vòng
chung
kết
giải
đấu
lớn
.
Lần
duy
nhất
họ
lên
ngôi
là
ở
kỳ
Euro
1964
.
Model/MultimodelNER/VLSP2016/Image/014716.jpg
ADDED
Model/MultimodelNER/VLSP2016/Image/My model.jpg
ADDED
Model/MultimodelNER/VLSP2016/Image/bully.jpeg
ADDED
Model/MultimodelNER/VLSP2016/Image/bully.jpg
ADDED
Model/MultimodelNER/VLSP2016/Image/maria.jpg
ADDED
Model/MultimodelNER/VLSP2016/Image/penguin.jpg
ADDED
Model/MultimodelNER/VLSP2016/Image/pero.jpg
ADDED
Model/MultimodelNER/VLSP2016/Image/pero2.jpg
ADDED
Model/MultimodelNER/VLSP2016/Image/taybannha.jpg
ADDED
Model/MultimodelNER/VLSP2016/MNER_2016.py
ADDED
@@ -0,0 +1,106 @@
import streamlit as st
from spacy import displacy
from Model.NER.VLSP2021.Predict_Ner import ViTagger
import re
from thunghiemxuly import save_uploaded_image, convert_text_to_txt, add_string_to_txt

import os
from transformers import AutoTokenizer, BertConfig
from Model.MultimodelNER.VLSP2016.train_umt_2016 import load_model, predict
from Model.MultimodelNER.Ner_processing import format_predictions, process_predictions, combine_entities, remove_B_prefix, combine_i_tags

from Model.MultimodelNER.predict import get_test_examples_predict
from Model.MultimodelNER import resnet as resnet
from Model.MultimodelNER.resnet_utils import myResnet
import torch
import numpy as np
from Model.MultimodelNER.VLSP2016.dataset_roberta import MNERProcessor_2016


CONFIG_NAME = 'bert_config.json'
WEIGHTS_NAME = 'pytorch_model.bin'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


net = getattr(resnet, 'resnet152')()
net.load_state_dict(torch.load(os.path.join('E:/demo_datn/pythonProject1/Model/Resnet/', 'resnet152.pth')))
encoder = myResnet(net, True, device)

def process_text(text):
    # Collapse repeated whitespace and strip leading/trailing spaces
    processed_text = re.sub(r'\s+', ' ', text.strip())
    return processed_text


def show_mner_2016():
    multimodal_text = st.text_area("Enter your text for MNER:", height=300)
    multimodal_text = process_text(multimodal_text)  # Normalize the input text
    image = st.file_uploader("Upload an image (only jpg):", type=["jpg"])
    if st.button("Process Multimodal NER"):
        save_image = 'E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2016/Image'
        save_txt = 'E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2016/Filetxt/test.txt'
        image_name = image.name
        save_uploaded_image(image, save_image)
        convert_text_to_txt(multimodal_text, save_txt)
        add_string_to_txt(image_name, save_txt)
        st.image(image, caption="Uploaded Image", use_column_width=True)

        bert_model = 'vinai/phobert-base-v2'
        output_dir = 'E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2016/best_model'
        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
        output_encoder_file = os.path.join(output_dir, "pytorch_encoder.bin")
        processor = MNERProcessor_2016()
        label_list = processor.get_labels()
        auxlabel_list = processor.get_auxlabels()
        num_labels = len(label_list) + 1
        auxnum_labels = len(auxlabel_list) + 1
        trans_matrix = np.zeros((auxnum_labels, num_labels), dtype=float)
        trans_matrix[0, 0] = 1  # pad to pad
        trans_matrix[1, 1] = 1  # O to O
        trans_matrix[2, 2] = 0.25  # B to B-MISC
        trans_matrix[2, 4] = 0.25  # B to B-PER
        trans_matrix[2, 6] = 0.25  # B to B-ORG
        trans_matrix[2, 8] = 0.25  # B to B-LOC
        trans_matrix[3, 3] = 0.25  # I to I-MISC
        trans_matrix[3, 5] = 0.25  # I to I-PER
        trans_matrix[3, 7] = 0.25  # I to I-ORG
        trans_matrix[3, 9] = 0.25  # I to I-LOC
        trans_matrix[4, 10] = 1  # X to X
        trans_matrix[5, 11] = 1  # [CLS] to [CLS]
        trans_matrix[6, 12] = 1
        tokenizer = AutoTokenizer.from_pretrained(bert_model, do_lower_case=False)
        model_umt, encoder_umt = load_model(output_model_file, output_encoder_file, encoder, num_labels, auxnum_labels)
        eval_examples = get_test_examples_predict('E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2016/Filetxt/')

        y_pred, a = predict(model_umt, encoder_umt, eval_examples, tokenizer, device, save_image, trans_matrix)
        formatted_output = format_predictions(a, y_pred[0])
        final = process_predictions(formatted_output)
        final2 = combine_entities(final)
        final3 = remove_B_prefix(final2)
        final4 = combine_i_tags(final3)
        words_and_labels = final4
        # Build the list of words
        words = [word for word, _ in words_and_labels]
        # Build the entity spans (character offsets) for each word, skipping the 'O' label
        entities = [{'start': sum(len(word) + 1 for word, _ in words_and_labels[:i]),
                     'end': sum(len(word) + 1 for word, _ in words_and_labels[:i + 1]), 'label': label} for
                    i, (word, label)
                    in enumerate(words_and_labels) if label != 'O']
        # print(entities)

        # Render the visualization without color for 'O' labels
        html = displacy.render(
            {"text": " ".join(words), "ents": entities, "title": None},
            style="ent",
            manual=True,
            options={"colors": {"MISC": "#806699",
                                "ORG": "#ff6666",
                                "LOC": "#66cc66",
                                "PER": "#bf80ff",
                                "O": None}}
        )
        # print(html)
        st.markdown(html, unsafe_allow_html=True)


### Example 1: A brawl broke out at the ice-hockey game between the Penguins and the Islanders in the US (image: penguin)
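The trans_matrix built inside show_mner_2016 projects the seven coarse auxiliary classes (padding plus O/B/I/X and the sentence markers) onto the thirteen fine-grained classes, spreading the B and I rows uniformly over the four entity types so the auxiliary classifier's logits can be mixed into the main ones. A standalone sketch of the same construction, independent of the Streamlit app; the column meanings follow the processor's get_labels() enumeration, with index 0 reserved for padding:

# Standalone sketch of the auxiliary-to-main label projection used above.
import numpy as np

auxnum_labels, num_labels = 7, 13
trans_matrix = np.zeros((auxnum_labels, num_labels), dtype=float)
trans_matrix[0, 0] = 1.0               # padding maps to padding
trans_matrix[1, 1] = 1.0               # deterministic one-to-one row
trans_matrix[2, [2, 4, 6, 8]] = 0.25   # a B-type aux tag spread over four main labels
trans_matrix[3, [3, 5, 7, 9]] = 0.25   # an I-type aux tag spread over four main labels
trans_matrix[4, 10] = 1.0
trans_matrix[5, 11] = 1.0
trans_matrix[6, 12] = 1.0

print(trans_matrix.sum(axis=1))        # every auxiliary class distributes mass 1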
Model/MultimodelNER/VLSP2016/__pycache__/MNER_2016.cpython-39.pyc
ADDED
Binary file (4.34 kB).
Model/MultimodelNER/VLSP2016/__pycache__/dataset_roberta.cpython-39.pyc
ADDED
Binary file (9.5 kB).
Model/MultimodelNER/VLSP2016/__pycache__/train_umt_2016.cpython-39.pyc
ADDED
Binary file (8.82 kB).
Model/MultimodelNER/VLSP2016/best_model/bert_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "_name_or_path": "vinai/phobert-base-v2",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "PhobertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
}
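Because the checkpoint stores its configuration under the non-default name bert_config.json, rebuilding the model means loading the file explicitly rather than relying on from_pretrained's default config.json lookup. A small sketch; the path is a placeholder for the best_model directory:

# Illustrative: rebuild the config from the saved bert_config.json.
from transformers import RobertaConfig

config = RobertaConfig.from_json_file('best_model/bert_config.json')
print(config.hidden_size, config.vocab_size)  # 768 64001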
Model/MultimodelNER/VLSP2016/best_model/eval_results.txt
ADDED
@@ -0,0 +1,11 @@
              precision    recall  f1-score   support

         LOC     0.9570    0.9618    0.9594       996
        MISC     0.9143    0.8889    0.9014        36
         ORG     0.8129    0.7975    0.8051       158
         PER     0.9835    0.9788    0.9812       851

   micro avg     0.9563    0.9549    0.9556      2041
   macro avg     0.9169    0.9068    0.9118      2041
weighted avg     0.9561    0.9549    0.9555      2041
Overall: 0.9563297350343474 0.9549240568348849 0.9556263790144643
Model/MultimodelNER/VLSP2016/best_model/model_config.json
ADDED
@@ -0,0 +1 @@
{"bert_model": "vinai/phobert-base-v2", "do_lower": false, "max_seq_length": 256, "num_labels": 13, "label_map": {"1": "B-ORG", "2": "B-MISC", "3": "I-PER", "4": "I-ORG", "5": "B-LOC", "6": "I-MISC", "7": "I-LOC", "8": "O", "9": "B-PER", "10": "X", "11": "<s>", "12": "</s>"}}
Model/MultimodelNER/VLSP2016/best_model/mtmner_pred.txt
ADDED
The diff for this file is too large to render.
Model/MultimodelNER/VLSP2016/best_model/pytorch_encoder.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ab29aaf11c3beb874e34fc9bccaa1fb838d94701cf4a4189c37d768a7678e958
size 241699561
Model/MultimodelNER/VLSP2016/best_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c950c331c48a229744b1b727a49d3dc248f28377ba8efbd86612daf2721e4368
size 699285929
Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/.no_exist/2b51e367d92093c9688112098510e6a58bab67cd/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
size 0
Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/.no_exist/2b51e367d92093c9688112098510e6a58bab67cd/model.safetensors.index.json
ADDED
File without changes
Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/refs/main
ADDED
@@ -0,0 +1 @@
2b51e367d92093c9688112098510e6a58bab67cd
Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/snapshots/2b51e367d92093c9688112098510e6a58bab67cd/config.json
ADDED
@@ -0,0 +1,27 @@
{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "PhobertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
}
Model/MultimodelNER/VLSP2016/cache/models--vinai--phobert-base-v2/snapshots/2b51e367d92093c9688112098510e6a58bab67cd/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ba09eb4c244a5b3a49ad76d52d129ac085b61f5c6287de7f99508b02be589f9
size 540322347
Model/MultimodelNER/VLSP2016/dataset_roberta.py
ADDED
@@ -0,0 +1,452 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
|
5 |
+
logger = logging.getLogger(__name__)
|
6 |
+
from torchvision import transforms
|
7 |
+
from PIL import Image
|
8 |
+
|
9 |
+
|
10 |
+
class SBInputExample(object):
|
11 |
+
"""A single training/test example for simple sequence classification."""
|
12 |
+
|
13 |
+
def __init__(self, guid, text_a, text_b, img_id, label=None, auxlabel=None):
|
14 |
+
"""Constructs a InputExample.
|
15 |
+
|
16 |
+
Args:
|
17 |
+
guid: Unique id for the example.
|
18 |
+
text_a: string. The untokenized text of the first sequence. For single
|
19 |
+
sequence tasks, only this sequence must be specified.
|
20 |
+
text_b: (Optional) string. The untokenized text of the second sequence.
|
21 |
+
Only must be specified for sequence pair tasks.
|
22 |
+
label: (Optional) string. The label of the example. This should be
|
23 |
+
specified for train and dev examples, but not for test examples.
|
24 |
+
"""
|
25 |
+
self.guid = guid
|
26 |
+
self.text_a = text_a
|
27 |
+
self.text_b = text_b
|
28 |
+
self.img_id = img_id
|
29 |
+
self.label = label
|
30 |
+
# Please note that the auxlabel is not used in SB
|
31 |
+
# it is just kept in order not to modify the original code
|
32 |
+
self.auxlabel = auxlabel
|
33 |
+
|
34 |
+
|
35 |
+
class SBInputFeatures(object):
|
36 |
+
"""A single set of features of data"""
|
37 |
+
|
38 |
+
def __init__(self, input_ids, input_mask, added_input_mask, segment_ids, img_feat, label_id, auxlabel_id):
|
39 |
+
self.input_ids = input_ids
|
40 |
+
self.input_mask = input_mask
|
41 |
+
self.added_input_mask = added_input_mask
|
42 |
+
self.segment_ids = segment_ids
|
43 |
+
self.img_feat = img_feat
|
44 |
+
self.label_id = label_id
|
45 |
+
self.auxlabel_id = auxlabel_id
|
46 |
+
|
47 |
+
|
48 |
+
def sbreadfile(filename):
|
49 |
+
'''
|
50 |
+
Đọc dữ liệu từ tệp và trả về dưới dạng danh sách các cặp từ và nhãn, cùng với danh sách hình ảnh và nhãn phụ.
|
51 |
+
'''
|
52 |
+
print("Chuẩn bị dữ liệu cho ", filename)
|
53 |
+
f = open(filename, encoding='utf8')
|
54 |
+
data = []
|
55 |
+
imgs = []
|
56 |
+
auxlabels = []
|
57 |
+
sentence = []
|
58 |
+
label = []
|
59 |
+
auxlabel = []
|
60 |
+
imgid = ''
|
61 |
+
|
62 |
+
for line in f:
|
63 |
+
line = line.strip() # Loại bỏ các dấu cách thừa ở đầu và cuối dòng
|
64 |
+
if line.startswith('IMGID:'):
|
65 |
+
imgid = line.split('IMGID:')[1] + '.jpg'
|
66 |
+
continue
|
67 |
+
if line == '':
|
68 |
+
if len(sentence) > 0:
|
69 |
+
data.append((sentence, label))
|
70 |
+
imgs.append(imgid)
|
71 |
+
auxlabels.append(auxlabel)
|
72 |
+
sentence = []
|
73 |
+
label = []
|
74 |
+
auxlabel = []
|
75 |
+
imgid = ''
|
76 |
+
continue
|
77 |
+
splits = line.split('\t')
|
78 |
+
if len(splits) == 2: # Đảm bảo dòng có ít nhất một từ và một nhãn
|
79 |
+
word, cur_label = splits
|
80 |
+
sentence.append(word)
|
81 |
+
label.append(cur_label)
|
82 |
+
auxlabel.append(cur_label[0]) # Lấy ký tự đầu tiên của nhãn làm nhãn phụ
|
83 |
+
|
84 |
+
if len(sentence) > 0: # Xử lý dữ liệu cuối cùng trong tệp
|
85 |
+
data.append((sentence, label))
|
86 |
+
imgs.append(imgid)
|
87 |
+
auxlabels.append(auxlabel)
|
88 |
+
|
89 |
+
print("Số lượng mẫu: " + str(len(data)))
|
90 |
+
print("Số lượng hình ảnh: " + str(len(imgs)))
|
91 |
+
return data, imgs, auxlabels
|
92 |
+
|
93 |
+
|
94 |
+
# def sbreadfile(filename): #code gốc
|
95 |
+
# '''
|
96 |
+
# read file
|
97 |
+
# return format :
|
98 |
+
# [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ]
|
99 |
+
# '''
|
100 |
+
# print("prepare data for ",filename)
|
101 |
+
# f = open(filename,encoding='utf8')
|
102 |
+
# data = []
|
103 |
+
# imgs = []
|
104 |
+
# auxlabels = []
|
105 |
+
# sentence = []
|
106 |
+
# label = []
|
107 |
+
# auxlabel = []
|
108 |
+
# imgid = ''
|
109 |
+
# a = 0
|
110 |
+
# for line in f:
|
111 |
+
# if line.startswith('IMGID:'):
|
112 |
+
# imgid = line.strip().split('IMGID:')[1] + '.jpg'
|
113 |
+
# continue
|
114 |
+
# if line[0] == "\n":
|
115 |
+
# if len(sentence) > 0:
|
116 |
+
# data.append((sentence, label))
|
117 |
+
# imgs.append(imgid)
|
118 |
+
# auxlabels.append(auxlabel)
|
119 |
+
# sentence = []
|
120 |
+
# label = []
|
121 |
+
# imgid = ''
|
122 |
+
# auxlabel = []
|
123 |
+
# continue
|
124 |
+
# splits = line.split('\t')
|
125 |
+
# sentence.append(splits[0])
|
126 |
+
# cur_label = splits[-1][:-1]
|
127 |
+
# # if cur_label == 'B-OTHER':
|
128 |
+
# # cur_label = 'B-MISC'
|
129 |
+
# # elif cur_label == 'I-OTHER':
|
130 |
+
# # cur_label = 'I-MISC'
|
131 |
+
# label.append(cur_label)
|
132 |
+
# auxlabel.append(cur_label[0])
|
133 |
+
|
134 |
+
# if len(sentence) > 0:
|
135 |
+
# data.append((sentence, label))
|
136 |
+
# imgs.append(imgid)
|
137 |
+
# auxlabels.append(auxlabel)
|
138 |
+
# sentence = []
|
139 |
+
# label = []
|
140 |
+
# auxlabel = []
|
141 |
+
|
142 |
+
# print("The number of samples: " + str(len(data)))
|
143 |
+
# print("The number of images: " + str(len(imgs)))
|
144 |
+
# return data, imgs, auxlabels
|
145 |
+
|
146 |
+
class DataProcessor(object):
|
147 |
+
"""Base class for data converters for sequence classification data sets."""
|
148 |
+
|
149 |
+
def get_train_examples(self, data_dir):
|
150 |
+
"""Gets a collection of `InputExample`s for the train set."""
|
151 |
+
raise NotImplementedError()
|
152 |
+
|
153 |
+
def get_dev_examples(self, data_dir):
|
154 |
+
"""Gets a collection of `InputExample`s for the dev set."""
|
155 |
+
raise NotImplementedError()
|
156 |
+
|
157 |
+
def get_labels(self):
|
158 |
+
"""Gets the list of labels for this data set."""
|
159 |
+
raise NotImplementedError()
|
160 |
+
|
161 |
+
@classmethod
|
162 |
+
def _read_sbtsv(cls, input_file, quotechar=None):
|
163 |
+
"""Reads a tab separated value file."""
|
164 |
+
return sbreadfile(input_file)
|
165 |
+
|
166 |
+
|
167 |
+
class MNERProcessor_2016(DataProcessor):
|
168 |
+
"""Processor for the CoNLL-2003 data set."""
|
169 |
+
|
170 |
+
def get_train_examples(self, data_dir):
|
171 |
+
"""See base class."""
|
172 |
+
data, imgs, auxlabels = self._read_sbtsv(os.path.join(data_dir, "train.txt"))
|
173 |
+
return self._create_examples(data, imgs, auxlabels, "train")
|
174 |
+
|
175 |
+
def get_dev_examples(self, data_dir):
|
176 |
+
"""See base class."""
|
177 |
+
data, imgs, auxlabels = self._read_sbtsv(os.path.join(data_dir, "dev.txt"))
|
178 |
+
return self._create_examples(data, imgs, auxlabels, "dev")
|
179 |
+
|
180 |
+
def get_test_examples(self, data_dir):
|
181 |
+
"""See base class."""
|
182 |
+
data, imgs, auxlabels = self._read_sbtsv(os.path.join(data_dir, "test.txt"))
|
183 |
+
return self._create_examples(data, imgs, auxlabels, "test")
|
184 |
+
|
185 |
+
def get_labels(self):
|
186 |
+
# return [
|
187 |
+
# "O","I-PRODUCT-AWARD",
|
188 |
+
# "B-MISCELLANEOUS",
|
189 |
+
# "B-QUANTITY-NUM",
|
190 |
+
# "B-ORGANIZATION-SPORTS",
|
191 |
+
# "B-DATETIME",
|
192 |
+
# "I-ADDRESS",
|
193 |
+
# "I-PERSON",
|
194 |
+
# "I-EVENT-SPORT",
|
195 |
+
# "B-ADDRESS",
|
196 |
+
# "B-EVENT-NATURAL",
|
197 |
+
# "I-LOCATION-GPE",
|
198 |
+
# "B-EVENT-GAMESHOW",
|
199 |
+
# "B-DATETIME-TIMERANGE",
|
200 |
+
# "I-QUANTITY-NUM",
|
201 |
+
# "I-QUANTITY-AGE",
|
202 |
+
# "B-EVENT-CUL",
|
203 |
+
# "I-QUANTITY-TEM",
|
204 |
+
# "I-PRODUCT-LEGAL",
|
205 |
+
# "I-LOCATION-STRUC",
|
206 |
+
# "I-ORGANIZATION",
|
207 |
+
# "B-PHONENUMBER",
|
208 |
+
# "B-IP",
|
209 |
+
# "B-QUANTITY-AGE",
|
210 |
+
# "I-DATETIME-TIME",
|
211 |
+
# "I-DATETIME",
|
212 |
+
# "B-ORGANIZATION-MED",
|
213 |
+
# "B-DATETIME-SET",
|
214 |
+
# "I-EVENT-CUL",
|
215 |
+
# "B-QUANTITY-DIM",
|
216 |
+
# "I-QUANTITY-DIM",
|
217 |
+
# "B-EVENT",
|
218 |
+
# "B-DATETIME-DATERANGE",
|
219 |
+
# "I-EVENT-GAMESHOW",
|
220 |
+
# "B-PRODUCT-AWARD",
|
221 |
+
# "B-LOCATION-STRUC",
|
222 |
+
# "B-LOCATION",
|
223 |
+
# "B-PRODUCT",
|
224 |
+
# "I-MISCELLANEOUS",
|
225 |
+
# "B-SKILL",
|
226 |
+
# "I-QUANTITY-ORD",
|
227 |
+
# "I-ORGANIZATION-STOCK",
|
228 |
+
# "I-LOCATION-GEO",
|
229 |
+
# "B-PERSON",
|
230 |
+
# "B-PRODUCT-COM",
|
231 |
+
# "B-PRODUCT-LEGAL",
|
232 |
+
# "I-LOCATION",
|
233 |
+
# "B-QUANTITY-TEM",
|
234 |
+
# "I-PRODUCT",
|
235 |
+
# "B-QUANTITY-CUR",
|
236 |
+
# "I-QUANTITY-CUR",
|
237 |
+
# "B-LOCATION-GPE",
|
238 |
+
# "I-PHONENUMBER",
|
239 |
+
# "I-ORGANIZATION-MED",
|
240 |
+
# "I-EVENT-NATURAL",
|
241 |
+
# "I-EMAIL",
|
242 |
+
# "B-ORGANIZATION",
|
243 |
+
# "B-URL",
|
244 |
+
# "I-DATETIME-TIMERANGE",
|
245 |
+
# "I-QUANTITY",
|
246 |
+
# "I-IP",
|
247 |
+
# "B-EVENT-SPORT",
|
248 |
+
# "B-PERSONTYPE",
|
249 |
+
# "B-QUANTITY-PER",
|
250 |
+
# "I-QUANTITY-PER",
|
251 |
+
# "I-PRODUCT-COM",
|
252 |
+
# "I-DATETIME-DURATION",
|
253 |
+
# "B-LOCATION-GPE-GEO",
|
254 |
+
# "B-QUANTITY-ORD",
|
255 |
+
# "I-EVENT",
|
256 |
+
# "B-DATETIME-TIME",
|
257 |
+
# "B-QUANTITY",
|
258 |
+
# "I-DATETIME-SET",
|
259 |
+
# "I-LOCATION-GPE-GEO",
|
260 |
+
# "B-ORGANIZATION-STOCK",
|
261 |
+
# "I-ORGANIZATION-SPORTS",
|
262 |
+
# "I-SKILL",
|
263 |
+
# "I-URL",
|
264 |
+
# "B-DATETIME-DURATION",
|
265 |
+
# "I-DATETIME-DATE",
|
266 |
+
# "I-PERSONTYPE",
|
267 |
+
# "B-DATETIME-DATE",
|
268 |
+
# "I-DATETIME-DATERANGE",
|
269 |
+
# "B-LOCATION-GEO",
|
270 |
+
# "B-EMAIL","X","<s>", "</s>"]
|
271 |
+
|
272 |
+
# vlsp2016
|
273 |
+
return [
|
274 |
+
"B-ORG", "B-MISC",
|
275 |
+
"I-PER",
|
276 |
+
"I-ORG",
|
277 |
+
"B-LOC",
|
278 |
+
"I-MISC",
|
279 |
+
"I-LOC",
|
280 |
+
"O",
|
281 |
+
"B-PER",
|
282 |
+
"X",
|
283 |
+
"<s>",
|
284 |
+
"</s>"]
|
285 |
+
|
286 |
+
# vlsp2018
|
287 |
+
# return [
|
288 |
+
# "O","I-ORGANIZATION",
|
289 |
+
# "B-ORGANIZATION",
|
290 |
+
# "I-LOCATION",
|
291 |
+
# "B-MISCELLANEOUS",
|
292 |
+
# "I-PERSON",
|
293 |
+
# "B-PERSON",
|
294 |
+
# "I-MISCELLANEOUS",
|
295 |
+
# "B-LOCATION",
|
296 |
+
# "X",
|
297 |
+
# "<s>",
|
298 |
+
# "</s>"]
|
299 |
+
|
300 |
+
def get_auxlabels(self):
|
301 |
+
return ["O", "B", "I", "X", "<s>", "</s>"]
|
302 |
+
|
303 |
+
def get_start_label_id(self):
|
304 |
+
label_list = self.get_labels()
|
305 |
+
label_map = {label: i for i, label in enumerate(label_list, 1)}
|
306 |
+
return label_map['<s>']
|
307 |
+
|
308 |
+
def get_stop_label_id(self):
|
309 |
+
label_list = self.get_labels()
|
310 |
+
label_map = {label: i for i, label in enumerate(label_list, 1)}
|
311 |
+
return label_map['</s>']
|
312 |
+
|
313 |
+
def _create_examples(self, lines, imgs, auxlabels, set_type):
|
314 |
+
examples = []
|
315 |
+
for i, (sentence, label) in enumerate(lines):
|
316 |
+
guid = "%s-%s" % (set_type, i)
|
317 |
+
text_a = ' '.join(sentence)
|
318 |
+
text_b = None
|
319 |
+
img_id = imgs[i]
|
320 |
+
label = label
|
321 |
+
auxlabel = auxlabels[i]
|
322 |
+
examples.append(
|
323 |
+
SBInputExample(guid=guid, text_a=text_a, text_b=text_b, img_id=img_id, label=label, auxlabel=auxlabel))
|
324 |
+
return examples
|
325 |
+
|
326 |
+
|
327 |
+
def image_process(image_path, transform):
|
328 |
+
image = Image.open(image_path).convert('RGB')
|
329 |
+
image = transform(image)
|
330 |
+
return image
|
331 |
+
|
332 |
+
|
333 |
+
def convert_mm_examples_to_features(examples, label_list, auxlabel_list,
|
334 |
+
max_seq_length, tokenizer, crop_size, path_img):
|
335 |
+
label_map = {label: i for i, label in enumerate(label_list, 1)}
|
336 |
+
+    auxlabel_map = {label: i for i, label in enumerate(auxlabel_list, 1)}
+
+    features = []
+    count = 0
+
+    transform = transforms.Compose([
+        transforms.Resize([256, 256]),
+        transforms.RandomCrop(crop_size),  # args.crop_size, by default set to 224
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.485, 0.456, 0.406),
+                             (0.229, 0.224, 0.225))])
+
+    for (ex_index, example) in enumerate(examples):
+        textlist = example.text_a.split(' ')
+        labellist = example.label
+        auxlabellist = example.auxlabel
+        tokens = []
+        labels = []
+        auxlabels = []
+        for i, word in enumerate(textlist):
+            token = tokenizer.tokenize(word)
+            tokens.extend(token)
+            label_1 = labellist[i]
+            auxlabel_1 = auxlabellist[i]
+            for m in range(len(token)):
+                if m == 0:
+                    labels.append(label_1)
+                    auxlabels.append(auxlabel_1)
+                else:
+                    labels.append("X")
+                    auxlabels.append("X")
+        if len(tokens) >= max_seq_length - 1:
+            tokens = tokens[0:(max_seq_length - 2)]
+            labels = labels[0:(max_seq_length - 2)]
+            auxlabels = auxlabels[0:(max_seq_length - 2)]
+        ntokens = []
+        segment_ids = []
+        label_ids = []
+        auxlabel_ids = []
+        ntokens.append("<s>")
+        segment_ids.append(0)
+        label_ids.append(label_map["<s>"])
+        auxlabel_ids.append(auxlabel_map["<s>"])
+        for i, token in enumerate(tokens):
+            ntokens.append(token)
+            segment_ids.append(0)
+            label_ids.append(label_map[labels[i]])
+            auxlabel_ids.append(auxlabel_map[auxlabels[i]])
+        ntokens.append("</s>")
+        segment_ids.append(0)
+        label_ids.append(label_map["</s>"])
+        auxlabel_ids.append(auxlabel_map["</s>"])
+        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
+        input_mask = [1] * len(input_ids)
+        added_input_mask = [1] * (len(input_ids) + 49)  # 49 extra positions for the 7x7 grid of regional image representations
+
+        while len(input_ids) < max_seq_length:
+            input_ids.append(0)
+            input_mask.append(0)
+            added_input_mask.append(0)
+            segment_ids.append(0)
+            label_ids.append(0)
+            auxlabel_ids.append(0)
+
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+        assert len(segment_ids) == max_seq_length
+        assert len(label_ids) == max_seq_length
+        assert len(auxlabel_ids) == max_seq_length
+
+        image_name = example.img_id
+        image_path = os.path.join(path_img, image_name)
+
+        if not os.path.exists(image_path):
+            if 'NaN' not in image_path:
+                print(image_path)
+        try:
+            image = image_process(image_path, transform)
+        except Exception:
+            count += 1
+            image_path_fail = os.path.join(path_img, 'background.jpg')
+            image = image_process(image_path_fail, transform)
+        else:
+            if ex_index < 2:
+                logger.info("*** Example ***")
+                logger.info("guid: %s" % (example.guid))
+                logger.info("tokens: %s" % " ".join(
+                    [str(x) for x in tokens]))
+                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+                logger.info(
+                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+                logger.info("label: %s" % " ".join([str(x) for x in label_ids]))
+                logger.info("auxlabel: %s" % " ".join([str(x) for x in auxlabel_ids]))
+
+        features.append(
+            SBInputFeatures(input_ids=input_ids, input_mask=input_mask, added_input_mask=added_input_mask,
+                            segment_ids=segment_ids, img_feat=image, label_id=label_ids, auxlabel_id=auxlabel_ids))
+
+    print('the number of problematic samples: ' + str(count))
+    return features
+
+
+# if __name__ == "__main__":
+#     processor = MNERProcessor_2016()
+#     label_list = processor.get_labels()
+#     auxlabel_list = processor.get_auxlabels()
+#     num_labels = len(label_list) + 1  # label 0 corresponds to padding; labels in label_list start from 1
+#
+#     start_label_id = processor.get_start_label_id()
+#     stop_label_id = processor.get_stop_label_id()
+#
+#     data_dir = r'sample_data'
+#     train_examples = processor.get_train_examples(data_dir)
+#     print(train_examples[0].img_id)
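A note on the conversion routine above: only the first BPE piece of each word keeps its gold label; continuation pieces are tagged "X" so the label sequence stays aligned with the subword sequence after tokenization. A minimal sketch of the same scheme, with tokenize_word standing in for tokenizer.tokenize (the splitting lambda in the example is hypothetical):

    def align_labels(words, labels, tokenize_word):
        # First sub-token keeps the word's label; continuations get "X".
        tokens, aligned = [], []
        for word, label in zip(words, labels):
            pieces = tokenize_word(word)
            tokens.extend(pieces)
            aligned.extend([label] + ["X"] * (len(pieces) - 1))
        return tokens, aligned

    # align_labels(["Nguyễn", "Trung"], ["B-PER", "I-PER"], lambda w: [w[:2], w[2:]])
    # -> (['Ng', 'uyễn', 'Tr', 'ung'], ['B-PER', 'X', 'I-PER', 'X'])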
Model/MultimodelNER/VLSP2016/list.txt
ADDED
@@ -0,0 +1,5 @@
+IMGID:namngo
+Toi
+ten
+la
+Minh
Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/.no_exist/2b51e367d92093c9688112098510e6a58bab67cd/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+size 0
Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/.no_exist/2b51e367d92093c9688112098510e6a58bab67cd/model.safetensors.index.json
ADDED
File without changes
Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/refs/main
ADDED
@@ -0,0 +1 @@
+2b51e367d92093c9688112098510e6a58bab67cd
Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/snapshots/2b51e367d92093c9688112098510e6a58bab67cd/config.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 258,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "tokenizer_class": "PhobertTokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 64001
+}
Model/MultimodelNER/VLSP2016/models--vinai--phobert-base-v2/snapshots/2b51e367d92093c9688112098510e6a58bab67cd/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ba09eb4c244a5b3a49ad76d52d129ac085b61f5c6287de7f99508b02be589f9
+size 540322347
Model/MultimodelNER/VLSP2016/test.txt
ADDED
@@ -0,0 +1,78 @@
+IMGID:014716
+“ O
+Tôi O
+xin O
+cám_ơn O
+thượng_sĩ O
+Nguyễn B-PER
+Trung I-PER
+Hiếu I-PER
+( O
+người O
+phiên_dịch O
+tiếng B-MISC
+Anh I-MISC
+cho O
+đơn_vị O
+tình_báo O
+quân_sự O
+số O
+635 O
+của O
+quân_đội O
+Mỹ B-LOC
+biên_chế O
+bên O
+cạnh O
+lữ_đoàn B-ORG
+bộ_binh I-ORG
+số I-ORG
+11 I-ORG
+, O
+sư_đoàn B-ORG
+bộ_binh I-ORG
+23 I-ORG
+) O
+, O
+người O
+đã O
+cứu O
+cuốn O
+nhật_ký O
+của O
+chị O
+tôi O
+khỏi O
+bị O
+quẳng O
+vào O
+đống O
+lửa O
+bởi O
+anh O
+đã O
+nhận O
+ra O
+trong O
+cuốn O
+sổ O
+này O
+đã O
+chứa_đựng O
+lửa O
+rồi O
+để O
+anh O
+trao O
+lại O
+nó O
+cho O
+Fred B-PER
+như O
+một O
+lời O
+uỷ_thác O
+từ O
+chị O
+tôi O
+. O
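The file above follows the one-token-per-line CoNLL layout, with an IMGID: header naming the image that accompanies the sentence. A minimal reader for this labeled layout (a sketch; read_mner_file is an illustrative helper, not part of the upload):

    def read_mner_file(path):
        # Yields (img_id, tokens, labels) per sentence; sentences are
        # introduced by an "IMGID:<name>" line.
        img_id, tokens, labels = None, [], []
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\n")
                if line.startswith("IMGID:"):
                    if tokens:
                        yield img_id, tokens, labels
                        tokens, labels = [], []
                    img_id = line[len("IMGID:"):]
                elif line.strip():
                    token, label = line.rsplit(" ", 1)
                    tokens.append(token)
                    labels.append(label)
        if tokens:
            yield img_id, tokens, labels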
Model/MultimodelNER/VLSP2016/train_umt_2016.py
ADDED
@@ -0,0 +1,352 @@
+import os
+import sys
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+import argparse
+
+import logging
+import random
+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, BertConfig
+from Model.MultimodelNER.UMT import UMT
+from Model.MultimodelNER import resnet as resnet
+from Model.MultimodelNER.resnet_utils import myResnet
+from Model.MultimodelNER.VLSP2016.dataset_roberta import convert_mm_examples_to_features, MNERProcessor_2016
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
+from Model.MultimodelNER.ner_evaluate import evaluate_each_class, evaluate
+from seqeval.metrics import classification_report
+from tqdm import tqdm, trange
+import json
+from Model.MultimodelNER.predict import convert_mm_examples_to_features_predict, get_test_examples_predict
+from Model.MultimodelNER.Ner_processing import *
+
+CONFIG_NAME = 'bert_config.json'
+WEIGHTS_NAME = 'pytorch_model.bin'
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
+parser = argparse.ArgumentParser()
+
+## Required parameters
+parser.add_argument("--negative_rate",
+                    default=16,
+                    type=int,
+                    help="the negative samples rate")
+
+parser.add_argument('--lamb',
+                    default=0.62,
+                    type=float)
+
+parser.add_argument('--temp',
+                    type=float,
+                    default=0.179,
+                    help="parameter for CL training")
+
+parser.add_argument('--temp_lamb',
+                    type=float,
+                    default=0.7,
+                    help="parameter for CL training")
+
+parser.add_argument("--data_dir",
+                    default='./data/twitter2017',
+                    type=str,
+                    help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+parser.add_argument("--bert_model", default='vinai/phobert-base-v2', type=str)
+parser.add_argument("--task_name",
+                    default='sonba',
+                    type=str,
+                    help="The name of the task to train.")
+parser.add_argument("--output_dir",
+                    default='E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2016/best_model/',
+                    type=str,
+                    help="The output directory where the model predictions and checkpoints will be written.")
+
+## Other parameters
+parser.add_argument("--cache_dir",
+                    default="",
+                    type=str,
+                    help="Where do you want to store the pre-trained models downloaded from s3")
+
+parser.add_argument("--max_seq_length",
+                    default=128,
+                    type=int,
+                    help="The maximum total input sequence length after WordPiece tokenization. \n"
+                         "Sequences longer than this will be truncated, and sequences shorter \n"
+                         "than this will be padded.")
+
+parser.add_argument("--do_train",
+                    action='store_true',
+                    help="Whether to run training.")
+
+parser.add_argument("--do_eval",
+                    action='store_true',
+                    help="Whether to run eval on the dev set.")
+
+parser.add_argument("--do_lower_case",
+                    action='store_true',
+                    help="Set this flag if you are using an uncased model.")
+
+parser.add_argument("--train_batch_size",
+                    default=64,
+                    type=int,
+                    help="Total batch size for training.")
+
+parser.add_argument("--eval_batch_size",
+                    default=16,
+                    type=int,
+                    help="Total batch size for eval.")
+
+parser.add_argument("--learning_rate",
+                    default=5e-5,
+                    type=float,
+                    help="The initial learning rate for Adam.")
+
+parser.add_argument("--num_train_epochs",
+                    default=12.0,
+                    type=float,
+                    help="Total number of training epochs to perform.")
+
+parser.add_argument("--warmup_proportion",
+                    default=0.1,
+                    type=float,
+                    help="Proportion of training to perform linear learning rate warmup for. "
+                         "E.g., 0.1 = 10%% of training.")
+
+parser.add_argument("--no_cuda",
+                    action='store_true',
+                    help="Whether not to use CUDA when available")
+
+parser.add_argument("--local_rank",
+                    type=int,
+                    default=-1,
+                    help="local_rank for distributed training on gpus")
+
+parser.add_argument('--seed',
+                    type=int,
+                    default=37,
+                    help="random seed for initialization")
+
+parser.add_argument('--gradient_accumulation_steps',
+                    type=int,
+                    default=1,
+                    help="Number of updates steps to accumulate before performing a backward/update pass.")
+
+parser.add_argument('--fp16',
+                    action='store_true',
+                    help="Whether to use 16-bit float precision instead of 32-bit")
+
+parser.add_argument('--loss_scale',
+                    type=float, default=0,
+                    help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                         "0 (default value): dynamic loss scaling.\n"
+                         "Positive power of 2: static loss scaling value.\n")
+
+parser.add_argument('--mm_model', default='MTCCMBert', help='model name')  # 'MTCCMBert', 'NMMTCCMBert'
+parser.add_argument('--layer_num1', type=int, default=1, help='number of txt2img layers')
+parser.add_argument('--layer_num2', type=int, default=1, help='number of img2txt layers')
+parser.add_argument('--layer_num3', type=int, default=1, help='number of txt2txt layers')
+parser.add_argument('--fine_tune_cnn', action='store_true', help='fine tune pre-trained CNN if True')
+parser.add_argument('--resnet_root', default='E:/demo_datn/pythonProject1/Model/Resnet/', help='path to the pre-trained cnn models')
+parser.add_argument('--crop_size', type=int, default=224, help='crop size of image')
+parser.add_argument('--path_image', default='E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2016/Image', help='path to images')
+# parser.add_argument('--mm_model', default='TomBert', help='model name')
+parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+args = parser.parse_args()
+
+processors = {
+    "twitter2015": MNERProcessor_2016,
+    "twitter2017": MNERProcessor_2016,
+    "sonba": MNERProcessor_2016
+}
+
+random.seed(args.seed)
+np.random.seed(args.seed)
+torch.manual_seed(args.seed)
+
+task_name = args.task_name.lower()
+
+processor = processors[task_name]()
+label_list = processor.get_labels()
+auxlabel_list = processor.get_auxlabels()
+num_labels = len(label_list) + 1  # label 0 corresponds to padding; labels in label_list start from 1
+auxnum_labels = len(auxlabel_list) + 1  # same convention for the auxiliary label set
+
+start_label_id = processor.get_start_label_id()
+stop_label_id = processor.get_stop_label_id()
+
+# Initialization of the conversion matrix; in this implementation it is a 7*13 matrix initialized as follows:
+trans_matrix = np.zeros((auxnum_labels, num_labels), dtype=float)
+trans_matrix[0, 0] = 1  # pad to pad
+trans_matrix[1, 1] = 1  # O to O
+trans_matrix[2, 2] = 0.25  # B to B-MISC
+trans_matrix[2, 4] = 0.25  # B to B-PER
+trans_matrix[2, 6] = 0.25  # B to B-ORG
+trans_matrix[2, 8] = 0.25  # B to B-LOC
+trans_matrix[3, 3] = 0.25  # I to I-MISC
+trans_matrix[3, 5] = 0.25  # I to I-PER
+trans_matrix[3, 7] = 0.25  # I to I-ORG
+trans_matrix[3, 9] = 0.25  # I to I-LOC
+trans_matrix[4, 10] = 1  # X to X
+trans_matrix[5, 11] = 1  # [CLS] to [CLS]
+trans_matrix[6, 12] = 1  # [SEP] to [SEP]
+'''
+The reverse mapping (num_labels x auxnum_labels) would be initialized as:
+trans_matrix = np.zeros((num_labels, auxnum_labels), dtype=float)
+trans_matrix[0,0]=1 # pad to pad
+trans_matrix[1,1]=1
+trans_matrix[2,2]=1
+trans_matrix[4,2]=1
+trans_matrix[6,2]=1
+trans_matrix[8,2]=1
+trans_matrix[3,3]=1
+trans_matrix[5,3]=1
+trans_matrix[7,3]=1
+trans_matrix[9,3]=1
+trans_matrix[10,4]=1
+trans_matrix[11,5]=1
+trans_matrix[12,6]=1
+'''
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+
+net = getattr(resnet, 'resnet152')()
+net.load_state_dict(torch.load(os.path.join(args.resnet_root, 'resnet152.pth')))
+encoder = myResnet(net, args.fine_tune_cnn, device)
+
+output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+# output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+output_encoder_file = os.path.join(args.output_dir, "pytorch_encoder.bin")
+
+temp = args.temp
+temp_lamb = args.temp_lamb
+lamb = args.lamb
+negative_rate = args.negative_rate
+
+# load model
+# model = UMT.from_pretrained(args.bert_model,
+#                             cache_dir=args.cache_dir, layer_num1=args.layer_num1,
+#                             layer_num2=args.layer_num2,
+#                             layer_num3=args.layer_num3,
+#                             num_labels_=num_labels, auxnum_labels=auxnum_labels)
+# model.load_state_dict(torch.load(output_model_file, map_location=torch.device('cpu')))
+# model.to(device)
+# encoder_state_dict = torch.load(output_encoder_file, map_location=torch.device('cpu'))
+# encoder.load_state_dict(encoder_state_dict)
+# encoder.to(device)
+# print(model)
+
+
+def load_model(output_model_file, output_encoder_file, encoder, num_labels, auxnum_labels):
+    model = UMT.from_pretrained(args.bert_model,
+                                cache_dir=args.cache_dir, layer_num1=args.layer_num1,
+                                layer_num2=args.layer_num2,
+                                layer_num3=args.layer_num3,
+                                num_labels_=num_labels, auxnum_labels=auxnum_labels)
+    model.load_state_dict(torch.load(output_model_file, map_location=torch.device('cpu')))
+    model.to(device)
+    encoder_state_dict = torch.load(output_encoder_file, map_location=torch.device('cpu'))
+    encoder.load_state_dict(encoder_state_dict)
+    encoder.to(device)
+    return model, encoder
+
+
+model_umt, encoder_umt = load_model(output_model_file, output_encoder_file, encoder, num_labels, auxnum_labels)
+
+# sentence = 'Thương biết_mấy những Thuận, những Liên, những Luận, Xuân, Nghĩa mỗi người một hoàn_cảnh nhưng đều rất giống nhau: rất ham học, rất cố_gắng để đạt mức hiểu biết cao nhất.'
+# image_path = '/kaggle/working/data/014715.jpg'
+# crop_size = 224
+path_image = r'E:\demo_datn\pythonProject1\Model\MultimodelNER\VLSP2016\Image'
+trans_matrix = np.zeros((auxnum_labels, num_labels), dtype=float)
+trans_matrix[0, 0] = 1  # pad to pad
+trans_matrix[1, 1] = 1  # O to O
+trans_matrix[2, 2] = 0.25  # B to B-MISC
+trans_matrix[2, 4] = 0.25  # B to B-PER
+trans_matrix[2, 6] = 0.25  # B to B-ORG
+trans_matrix[2, 8] = 0.25  # B to B-LOC
+trans_matrix[3, 3] = 0.25  # I to I-MISC
+trans_matrix[3, 5] = 0.25  # I to I-PER
+trans_matrix[3, 7] = 0.25  # I to I-ORG
+trans_matrix[3, 9] = 0.25  # I to I-LOC
+trans_matrix[4, 10] = 1  # X to X
+trans_matrix[5, 11] = 1  # [CLS] to [CLS]
+trans_matrix[6, 12] = 1  # [SEP] to [SEP]
+
+
+def predict(model_umt, encoder_umt, eval_examples, tokenizer, device, path_image, trans_matrix):
+    features = convert_mm_examples_to_features_predict(eval_examples, 256, tokenizer, 224, path_image)
+
+    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    added_input_mask = torch.tensor([f.added_input_mask for f in features], dtype=torch.long)
+    segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    img_feats = torch.stack([f.img_feat for f in features])
+    print(img_feats)
+    eval_data = TensorDataset(input_ids, input_mask, added_input_mask, segment_ids, img_feats)
+    eval_sampler = SequentialSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=16)
+
+    model_umt.eval()
+    encoder_umt.eval()
+
+    y_pred = []
+    label_map = {i: label for i, label in enumerate(label_list, 1)}
+    label_map[0] = "<pad>"
+
+    for input_ids, input_mask, added_input_mask, segment_ids, img_feats in tqdm(eval_dataloader, desc="Evaluating"):
+        input_ids = input_ids.to(device)
+        input_mask = input_mask.to(device)
+        added_input_mask = added_input_mask.to(device)
+        segment_ids = segment_ids.to(device)
+        img_feats = img_feats.to(device)
+
+        with torch.no_grad():
+            imgs_f, img_mean, img_att = encoder_umt(img_feats)
+            predicted_label_seq_ids = model_umt(input_ids, segment_ids, input_mask, added_input_mask, img_att,
+                                                trans_matrix)
+
+        logits = predicted_label_seq_ids
+        input_mask = input_mask.to('cpu').numpy()
+
+        for i, mask in enumerate(input_mask):
+            temp_1 = []
+            for j, m in enumerate(mask):
+                if j == 0:
+                    continue
+                if m:
+                    if label_map[logits[i][j]] not in ["<pad>", "<s>", "</s>", "X"]:
+                        temp_1.append(label_map[logits[i][j]])
+                else:
+                    break
+            y_pred.append(temp_1)
+
+    a = eval_examples[0].text_a.split(" ")
+
+    return y_pred, a
+
+
+eval_examples = get_test_examples_predict('E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2016/Filetxt/')
+y_pred, a = predict(model_umt, encoder_umt, eval_examples, tokenizer, device, path_image, trans_matrix)
+print(y_pred)
+formatted_output = format_predictions(a, y_pred[0])
+print(formatted_output)
+final = process_predictions(formatted_output)
+final2 = combine_entities(final)
+final3 = remove_B_prefix(final2)
+final4 = combine_i_tags(final3)
+
+print(final4)
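On the trans_matrix used throughout this script: it is an auxnum_labels x num_labels (7 x 13) matrix, so right-multiplying an auxiliary-label distribution by it spreads the coarse B and I mass uniformly over the four fine-grained B-*/I-* columns. A self-contained sketch of that conversion (the probability vector is illustrative):

    import numpy as np

    # Rebuild the 7x13 conversion matrix from the snippet above.
    trans_matrix = np.zeros((7, 13))
    trans_matrix[0, 0] = trans_matrix[1, 1] = 1.0   # pad->pad, O->O
    for col in (2, 4, 6, 8):
        trans_matrix[2, col] = 0.25                 # B -> B-MISC/PER/ORG/LOC
    for col in (3, 5, 7, 9):
        trans_matrix[3, col] = 0.25                 # I -> I-MISC/PER/ORG/LOC
    trans_matrix[4, 10] = trans_matrix[5, 11] = trans_matrix[6, 12] = 1.0

    aux_probs = np.array([0.0, 0.1, 0.8, 0.1, 0.0, 0.0, 0.0])  # pad, O, B, I, X, <s>, </s>
    full_probs = aux_probs @ trans_matrix
    print(full_probs[2], full_probs[4])  # 0.2 0.2 -- coarse B mass split over the B-* types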
Model/MultimodelNER/VLSP2021/Filetxt/test.txt
ADDED
@@ -0,0 +1,97 @@
+IMGID:taybannha
+Dân
+trí
+Chức
+vô
+địch
+Euro
+2008
+đầy
+thuyết
+phục
+của
+Tây
+Ban
+Nha
+trên
+đất
+Áo
+và
+Thụy
+Sĩ
+đã
+mở
+ra
+kỷ
+nguyên
+vinh
+quanh
+của
+La
+Furia
+Roja
+,
+với
+lối
+chơi
+tiqui
+taka
+đầy
+biến
+ảo
+.
+Trong
+quá
+khứ
+,
+Tây
+Ban
+Nha
+nổi
+tiếng
+với
+biệt
+danh
+Vua
+vòng
+loại
+.
+Họ
+thường
+thi
+đấu
+rất
+tốt
+ở
+vòng
+loại
+nhưng
+lại
+chưa
+bao
+giờ
+chứng
+minh
+được
+sức
+mạnh
+ở
+vòng
+chung
+kết
+giải
+đấu
+lớn
+.
+Lần
+duy
+nhất
+họ
+lên
+ngôi
+là
+ở
+kỳ
+Euro
+1964
+.
Model/MultimodelNER/VLSP2021/Image/taybannha.jpg
ADDED
Model/MultimodelNER/VLSP2021/MNER_2021.py
ADDED
@@ -0,0 +1,151 @@
+import streamlit as st
+from spacy import displacy
+from Model.NER.VLSP2021.Predict_Ner import ViTagger
+import re
+from thunghiemxuly import save_uploaded_image, convert_text_to_txt, add_string_to_txt
+
+import os
+from transformers import AutoTokenizer, BertConfig
+from Model.MultimodelNER.VLSP2021.train_umt_2021 import load_model, predict
+from Model.MultimodelNER.Ner_processing import format_predictions, process_predictions, combine_entities, remove_B_prefix, combine_i_tags
+
+from Model.MultimodelNER.predict import get_test_examples_predict
+from Model.MultimodelNER import resnet as resnet
+from Model.MultimodelNER.resnet_utils import myResnet
+import torch
+import numpy as np
+from Model.MultimodelNER.VLSP2021.dataset_roberta import MNERProcessor_2021
+
+
+CONFIG_NAME = 'bert_config.json'
+WEIGHTS_NAME = 'pytorch_model.bin'
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+net = getattr(resnet, 'resnet152')()
+net.load_state_dict(torch.load(os.path.join('E:/demo_datn/pythonProject1/Model/Resnet/', 'resnet152.pth')))
+encoder = myResnet(net, True, device)
+
+
+def process_text(text):
+    # Strip leading/trailing whitespace and collapse runs of spaces
+    processed_text = re.sub(r'\s+', ' ', text.strip())
+    return processed_text
+
+
+def show_mner_2021():
+    multimodal_text = st.text_area("Enter your text for MNER:", height=300)
+    multimodal_text = process_text(multimodal_text)  # normalize the input text
+    image = st.file_uploader("Upload an image (only jpg):", type=["jpg"])
+    if st.button("Process Multimodal NER"):
+        save_image = 'E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2021/Image'
+        save_txt = 'E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2021/Filetxt/test.txt'
+        image_name = image.name
+        save_uploaded_image(image, save_image)
+        convert_text_to_txt(multimodal_text, save_txt)
+        add_string_to_txt(image_name, save_txt)
+        st.image(image, caption="Uploaded Image", use_column_width=True)
+
+        bert_model = 'vinai/phobert-base-v2'
+        output_dir = 'E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2021/best_model'
+        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
+        output_encoder_file = os.path.join(output_dir, "pytorch_encoder.bin")
+        processor = MNERProcessor_2021()
+        label_list = processor.get_labels()
+        auxlabel_list = processor.get_auxlabels()
+        num_labels = len(label_list) + 1
+        auxnum_labels = len(auxlabel_list) + 1
+        trans_matrix = np.zeros((auxnum_labels, num_labels), dtype=float)
+        trans_matrix[0, 0] = 1  # pad to pad
+        trans_matrix[1, 1] = 1  # O to O
+        trans_matrix[2, 2] = 0.25  # B to B-MISC
+        trans_matrix[2, 4] = 0.25  # B to B-PER
+        trans_matrix[2, 6] = 0.25  # B to B-ORG
+        trans_matrix[2, 8] = 0.25  # B to B-LOC
+        trans_matrix[3, 3] = 0.25  # I to I-MISC
+        trans_matrix[3, 5] = 0.25  # I to I-PER
+        trans_matrix[3, 7] = 0.25  # I to I-ORG
+        trans_matrix[3, 9] = 0.25  # I to I-LOC
+        trans_matrix[4, 10] = 1  # X to X
+        trans_matrix[5, 11] = 1  # [CLS] to [CLS]
+        trans_matrix[6, 12] = 1  # [SEP] to [SEP]
+        tokenizer = AutoTokenizer.from_pretrained(bert_model, do_lower_case=False)
+        model_umt, encoder_umt = load_model(output_model_file, output_encoder_file, encoder, num_labels,
+                                            auxnum_labels)
+        eval_examples = get_test_examples_predict(
+            'E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2021/Filetxt/')
+
+        y_pred, a = predict(model_umt, encoder_umt, eval_examples, tokenizer, device, save_image, trans_matrix)
+        formatted_output = format_predictions(a, y_pred[0])
+        final = process_predictions(formatted_output)
+        final2 = combine_entities(final)
+        final3 = remove_B_prefix(final2)
+        final4 = combine_i_tags(final3)
+
+        words_and_labels = final4
+        # Build the word list
+        words = [word for word, _ in words_and_labels]
+        # Build the entity span (character offsets) and label for each word, skipping the 'O' label
+        entities = [{'start': sum(len(word) + 1 for word, _ in words_and_labels[:i]),
+                     'end': sum(len(word) + 1 for word, _ in words_and_labels[:i + 1]), 'label': label} for
+                    i, (word, label)
+                    in enumerate(words_and_labels) if label != 'O']
+        # print(entities)
+
+        # Render the visualization without color for 'O' labels
+        html = displacy.render(
+            {"text": " ".join(words), "ents": entities, "title": None},
+            style="ent",
+            manual=True,
+            options={"colors": {"DATETIME-DATERANGE": "#66c2ff",
+                                "LOCATION-GPE": "#ffcc99",
+                                "O": None,  # no highlight for the 'O' label
+                                "QUANTITY-NUM": "#ffdf80",
+                                "EVENT-CUL": "#bfbfbf",
+                                "DATETIME": "#80ff80",
+                                "PERSONTYPE": "#ff80ff",
+                                "PERSON": "#bf80ff",
+                                "QUANTITY-PER": "#80cccc",
+                                "ORGANIZATION": "#ff6666",
+                                "LOCATION-GEO": "#66cc66",
+                                "LOCATION-STRUC": "#cccc66",
+                                "PRODUCT-COM": "#ffff66",
+                                "DATETIME-DATE": "#66cccc",
+                                "QUANTITY-DIM": "#6666ff",
+                                "PRODUCT": "#cc6666",
+                                "QUANTITY": "#6666cc",
+                                "DATETIME-DURATION": "#9966ff",
+                                "QUANTITY-CUR": "#ff9966",
+                                "DATETIME-TIME": "#cdbf93",
+                                "QUANTITY-TEM": "#cc9966",
+                                "DATETIME-TIMERANGE": "#cc8566",
+                                "EVENT-GAMESHOW": "#8c8c5a",
+                                "QUANTITY-AGE": "#70db70",
+                                "QUANTITY-ORD": "#e699ff",
+                                "PRODUCT-LEGAL": "#806699",
+                                "LOCATION": "#993366",
+                                "ORGANIZATION-MED": "#339933",
+                                "URL": "#ff4d4d",
+                                "PHONENUMBER": "#99cc99",
+                                "ORGANIZATION-SPORTS": "#6666ff",
+                                "EVENT-SPORT": "#ffff80",
+                                "SKILL": "#b38f66",
+                                "EVENT-NATURAL": "#ff9966",
+                                "ADDRESS": "#cc9966",
+                                "IP": "#b38f66",
+                                "EMAIL": "#cc8566",
+                                "ORGANIZATION-STOCK": "#666633",
+                                "DATETIME-SET": "#70db70",
+                                "PRODUCT-AWARD": "#e699ff",
+                                "MISCELLANEOUS": "#806699",
+                                "LOCATION-GPE-GEO": "#99ffff"}}
+        )
+        # print(html)
+        st.markdown(html, unsafe_allow_html=True)
+
+        # Use st.markdown above to display the rendered HTML
+        # Display the entered text:
+        # st.write("Entered text:", text)
+
+
+### Example 1: Một trận hỗn chiến đã xảy ra tại trận đấu khúc côn cầu giữa Penguins và Islanders ở Mỹ (image: penguin)
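One remark on the entities comprehension in show_mner_2021: recomputing each offset with a nested sum is quadratic in sentence length. The spans can be accumulated in a single pass; the sketch below produces the same start/end/label dictionaries under the same one-space-per-word assumption:

    def char_spans(words_and_labels):
        # One pass: track the running character offset instead of re-summing.
        spans, start = [], 0
        for word, label in words_and_labels:
            end = start + len(word) + 1  # +1 for the following space
            if label != 'O':
                spans.append({'start': start, 'end': end, 'label': label})
            start = end
        return spans

    # char_spans([("Tây", "LOCATION-GPE"), ("Ban", "LOCATION-GPE"), ("Nha", "LOCATION-GPE")])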
Model/MultimodelNER/VLSP2021/__pycache__/MNER_2021.cpython-39.pyc
ADDED
Binary file (5.34 kB).
Model/MultimodelNER/VLSP2021/__pycache__/dataset_roberta.cpython-39.pyc
ADDED
Binary file (10.7 kB).
Model/MultimodelNER/VLSP2021/__pycache__/train_umt_2021.cpython-39.pyc
ADDED
Binary file (8.82 kB).
Model/MultimodelNER/VLSP2021/best_model/bert_config.json
ADDED
@@ -0,0 +1,28 @@
+{
+  "_name_or_path": "vinai/phobert-base-v2",
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 258,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "tokenizer_class": "PhobertTokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.35.2",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 64001
+}
Model/MultimodelNER/VLSP2021/best_model/eval_results.txt
ADDED
@@ -0,0 +1,50 @@
+                     precision    recall  f1-score   support
+
+                /s>     0.0000    0.0000    0.0000         0
+            ADDRESS     0.0455    0.0455    0.0455        22
+           DATETIME     0.4883    0.6221    0.5472       606
+      DATETIME-DATE     0.6016    0.5204    0.5581       563
+ DATETIME-DATERANGE     0.3426    0.2701    0.3020       137
+  DATETIME-DURATION     0.7900    0.6337    0.7033       475
+       DATETIME-SET     0.0000    0.0000    0.0000         4
+      DATETIME-TIME     0.3291    0.5200    0.4031        50
+ DATETIME-TIMERANGE     0.2540    0.1231    0.1658       130
+              EMAIL     1.0000    1.0000    1.0000         2
+              EVENT     0.1687    0.0782    0.1069       179
+          EVENT-CUL     0.5000    0.4375    0.4667        16
+     EVENT-GAMESHOW     0.4085    0.5370    0.4640        54
+      EVENT-NATURAL     0.0000    0.0000    0.0000         9
+        EVENT-SPORT     0.5634    0.5517    0.5575       145
+                 IP     1.0000    1.0000    1.0000        15
+           LOCATION     0.1120    0.2668    0.1578       431
+       LOCATION-GEO     0.5556    0.3333    0.4167       120
+       LOCATION-GPE     0.7486    0.6113    0.6730      2367
+     LOCATION-STRUC     0.5286    0.5248    0.5267       141
+      MISCELLANEOUS     0.0000    0.0000    0.0000         0
+       ORGANIZATION     0.6576    0.7000    0.6782      1630
+   ORGANIZATION-MED     0.5395    0.6833    0.6029       120
+ORGANIZATION-SPORTS     0.6362    0.7827    0.7019       382
+ ORGANIZATION-STOCK     0.0000    0.0000    0.0000        29
+             PERSON     0.9347    0.9574    0.9459      2466
+         PERSONTYPE     0.5262    0.6104    0.5652       806
+        PHONENUMBER     0.7273    0.8000    0.7619        10
+            PRODUCT     0.4419    0.3834    0.4106       446
+      PRODUCT-AWARD     0.0000    0.0000    0.0000        78
+        PRODUCT-COM     0.3488    0.5068    0.4132       148
+      PRODUCT-LEGAL     0.4107    0.1322    0.2000       174
+           QUANTITY     0.0756    0.3464    0.1241       153
+       QUANTITY-AGE     0.8433    0.9187    0.8794       246
+       QUANTITY-CUR     0.8256    0.8367    0.8311       447
+       QUANTITY-DIM     0.5763    0.2787    0.3757       244
+       QUANTITY-NUM     0.8569    0.5573    0.6754      2182
+       QUANTITY-ORD     0.5388    0.2907    0.3777       454
+       QUANTITY-PER     0.9088    0.8911    0.8999       358
+       QUANTITY-TEM     0.6923    0.8182    0.7500        11
+              SKILL     0.0000    0.0000    0.0000         2
+                URL     0.5714    0.5714    0.5714         7
+                  _     0.0000    0.0000    0.0000         0
+
+          micro avg     0.6371    0.6260    0.6315     15859
+          macro avg     0.4546    0.4451    0.4386     15859
+       weighted avg     0.6838    0.6260    0.6430     15859
+Overall: 0.6368360277136259 0.6259537171322278 0.6313479823194582
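Reading the averages at the bottom: micro averaging pools every entity decision before computing precision/recall/F1, while macro averaging weights each label row equally, which is why rare all-miss classes such as DATETIME-SET and SKILL drag the macro F1 far below the micro F1. A small illustration with made-up counts (not taken from this run):

    def f1(tp, fp, fn):
        p = tp / (tp + fp) if tp + fp else 0.0
        r = tp / (tp + fn) if tp + fn else 0.0
        return 2 * p * r / (p + r) if p + r else 0.0

    # Two classes with illustrative (tp, fp, fn) counts:
    counts = {"PERSON": (2360, 165, 105), "SKILL": (0, 3, 2)}
    micro = f1(*[sum(c[i] for c in counts.values()) for i in range(3)])
    macro = sum(f1(*c) for c in counts.values()) / len(counts)
    print(micro, macro)  # the rare, all-wrong class drags macro far below micro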
Model/MultimodelNER/VLSP2021/best_model/model_config.json
ADDED
@@ -0,0 +1 @@
+{"bert_model": "vinai/phobert-base-v2", "do_lower": false, "max_seq_length": 256, "num_labels": 89, "label_map": {"1": "O", "2": "I-PRODUCT-AWARD", "3": "B-MISCELLANEOUS", "4": "B-QUANTITY-NUM", "5": "B-ORGANIZATION-SPORTS", "6": "B-DATETIME", "7": "I-ADDRESS", "8": "I-PERSON", "9": "I-EVENT-SPORT", "10": "B-ADDRESS", "11": "B-EVENT-NATURAL", "12": "I-LOCATION-GPE", "13": "B-EVENT-GAMESHOW", "14": "B-DATETIME-TIMERANGE", "15": "I-QUANTITY-NUM", "16": "I-QUANTITY-AGE", "17": "B-EVENT-CUL", "18": "I-QUANTITY-TEM", "19": "I-PRODUCT-LEGAL", "20": "I-LOCATION-STRUC", "21": "I-ORGANIZATION", "22": "B-PHONENUMBER", "23": "B-IP", "24": "B-QUANTITY-AGE", "25": "I-DATETIME-TIME", "26": "I-DATETIME", "27": "B-ORGANIZATION-MED", "28": "B-DATETIME-SET", "29": "I-EVENT-CUL", "30": "B-QUANTITY-DIM", "31": "I-QUANTITY-DIM", "32": "B-EVENT", "33": "B-DATETIME-DATERANGE", "34": "I-EVENT-GAMESHOW", "35": "B-PRODUCT-AWARD", "36": "B-LOCATION-STRUC", "37": "B-LOCATION", "38": "B-PRODUCT", "39": "I-MISCELLANEOUS", "40": "B-SKILL", "41": "I-QUANTITY-ORD", "42": "I-ORGANIZATION-STOCK", "43": "I-LOCATION-GEO", "44": "B-PERSON", "45": "B-PRODUCT-COM", "46": "B-PRODUCT-LEGAL", "47": "I-LOCATION", "48": "B-QUANTITY-TEM", "49": "I-PRODUCT", "50": "B-QUANTITY-CUR", "51": "I-QUANTITY-CUR", "52": "B-LOCATION-GPE", "53": "I-PHONENUMBER", "54": "I-ORGANIZATION-MED", "55": "I-EVENT-NATURAL", "56": "I-EMAIL", "57": "B-ORGANIZATION", "58": "B-URL", "59": "I-DATETIME-TIMERANGE", "60": "I-QUANTITY", "61": "I-IP", "62": "B-EVENT-SPORT", "63": "B-PERSONTYPE", "64": "B-QUANTITY-PER", "65": "I-QUANTITY-PER", "66": "I-PRODUCT-COM", "67": "I-DATETIME-DURATION", "68": "B-LOCATION-GPE-GEO", "69": "B-QUANTITY-ORD", "70": "I-EVENT", "71": "B-DATETIME-TIME", "72": "B-QUANTITY", "73": "I-DATETIME-SET", "74": "I-LOCATION-GPE-GEO", "75": "B-ORGANIZATION-STOCK", "76": "I-ORGANIZATION-SPORTS", "77": "I-SKILL", "78": "I-URL", "79": "B-DATETIME-DURATION", "80": "I-DATETIME-DATE", "81": "I-PERSONTYPE", "82": "B-DATETIME-DATE", "83": "I-DATETIME-DATERANGE", "84": "B-LOCATION-GEO", "85": "B-EMAIL", "86": "X", "87": "<s>", "88": "</s>"}}
Model/MultimodelNER/VLSP2021/best_model/mtmner_pred.txt
ADDED
The diff for this file is too large to render.
Model/MultimodelNER/VLSP2021/best_model/pytorch_encoder.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9314f35060b4df32a623855c3e6a665cdaee354eeb5cd4925085fc7b00cc180c
+size 241699561
Model/MultimodelNER/VLSP2021/best_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd439026a1a847a3c53a011acff0ca41b734b4084fd801d99c5e1ba962358d20
+size 699784873