Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- config.py +55 -0
- dataset.py +40 -0
- engine.py +116 -0
- main.py +123 -0
- metrics.py +76 -0
- model.py +36 -0
- predict.py +145 -0
- utils.py +25 -0
config.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import transformers
|
2 |
+
import os
|
3 |
+
import torch
|
4 |
+
|
5 |
+
# Maximum number of tokens per encoded example (256 was tried previously).
MAX_LEN = 150  # 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5

# Folder to contain all the datasets
DATASET_LOCATION = ""  #

# Fine-tuned classifier weights.  The default is the original training
# machine's absolute path; set SENTIMENT_MODEL_PATH to run anywhere else
# without editing this file.
MODEL_PATH = os.environ.get(
    "SENTIMENT_MODEL_PATH",
    "/mnt/data/group3/gaurish/SentimentAnalyserLVTwitter/bert-sentiment/src/trained_models/mbert-7epoch-lower/model.bin",
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# MBERT Raw Version
# BERT_PATH = "bert-base-multilingual-cased"

# 2 EPOCH Version
# BERT_PATH = "bert-twitter-fine-tunning/LatvianTwittermBERT-v1"

# 7 EPOCH Version
BERT_PATH = "FFZG-cleopatra/bert-emoji-latvian-twitter"

# 7 EPOCH Version + emoticons
# BERT_PATH = "bert-twitter-language-pretraining/models/LatvianTwittermBERT-v2/checkpoint-106000"

# TODO check if lower casing is required
# BertTokenizer (loaded once at import time and shared by every dataset)
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True,
)
|
34 |
+
|
35 |
+
#####################################################################################################################################
|
36 |
+
# Electra
|
37 |
+
# Step 1: Model path
|
38 |
+
# BERT_PATH = "lmtuners/experiments/disc_lm_small/electra-small/discriminator/final"
|
39 |
+
# #"lmtuners/experiments/disc_lm_small/albert-small/final"
|
40 |
+
|
41 |
+
# # Step 2: Vocab and Lowercase setting
|
42 |
+
# TOKENIZER = transformers.BertTokenizer.from_pretrained(
|
43 |
+
# "lmtuners/experiments/disc_lm_small/lvtwitterbwpt-vocab-lower_accent.txt",
|
44 |
+
# # "lmtuners/experiments/disc_lm_small/bert-base-multilingual-cased-vocab.txt",
|
45 |
+
# do_lower_case=True
|
46 |
+
# )
|
47 |
+
|
48 |
+
# ALBERT_CONFIG = transformers.AlbertConfig(
|
49 |
+
# vocab_size=len(TOKENIZER), #.get_vocab_size(),
|
50 |
+
# hidden_size=256,
|
51 |
+
# embedding_size=128,
|
52 |
+
# num_hidden_layers=12,
|
53 |
+
# num_attention_heads=4,
|
54 |
+
# intermediate_size=1024,
|
55 |
+
# max_position_embeddings=128)
|
dataset.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import config
|
2 |
+
import torch
|
3 |
+
|
4 |
+
|
5 |
+
class BERTDataset:
    """Map-style dataset that turns raw texts into fixed-length BERT inputs.

    Every item is a dict of tensors:
    ``ids``, ``mask`` and ``token_type_ids`` (each of length ``max_len``,
    dtype long) and ``targets`` (the label for that text, dtype float).
    """

    def __init__(self, review, target):
        """Store the texts and labels; tokenizer and length come from config.

        Args:
            review: indexable sequence of texts.
            target: indexable sequence of labels, aligned with ``review``.
        """
        self.review = review
        self.target = target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        """Return the number of texts."""
        return len(self.review)

    def _clip(self, values):
        """Cut ``values`` down to ``max_len`` items, keeping the last one.

        The last item is kept because ``add_special_tokens=True`` puts the
        closing special token at the end of the encoding.
        """
        if len(values) <= self.max_len:
            return values
        return values[:self.max_len - 1] + values[-1:]

    def __getitem__(self, item):
        """Encode text number ``item`` and pad it to ``max_len``."""
        review = str(self.review[item])
        # Collapse every run of whitespace into a single space.
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len
        )

        # Do not rely on the tokenizer having truncated: without this, an
        # over-long encoding yields a negative padding length (no padding at
        # all) and tensors of differing sizes that cannot be batched.
        ids = self._clip(inputs["input_ids"])
        mask = self._clip(inputs["attention_mask"])
        token_type_ids = self._clip(inputs["token_type_ids"])

        # Right-pad with zeros up to the fixed length.
        padding = [0] * (self.max_len - len(ids))
        ids = ids + padding
        mask = mask + padding
        token_type_ids = token_type_ids + padding

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.target[item], dtype=torch.float)
        }
|
engine.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from tqdm import tqdm
|
4 |
+
from utils import categorical_accuracy
|
5 |
+
|
6 |
+
|
7 |
+
def loss_fn(outputs, targets):
    """Return the cross-entropy loss of class logits against integer labels."""
    criterion = nn.CrossEntropyLoss()
    return criterion(outputs, targets)
|
9 |
+
|
10 |
+
|
11 |
+
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is a dict with ``ids``, ``mask``, ``token_type_ids`` and
    ``targets``.  The optimizer and the scheduler are both stepped once per
    batch.

    Returns:
        ``(train_loss, train_acc)``: the per-batch loss and accuracy averaged
        over the number of batches; ``(0.0, 0.0)`` for an empty loader.
    """
    model.train()
    num_batches = len(data_loader)
    if num_batches == 0:
        # Nothing to train on; avoid dividing by zero below.
        return 0.0, 0.0

    train_loss, train_acc = 0.0, 0.0
    for batch in tqdm(data_loader, total=num_batches):
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        # Targets are cast to long for the cross-entropy loss.
        targets = batch["targets"].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        loss = loss_fn(outputs, targets)
        loss.backward()

        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        train_acc += categorical_accuracy(outputs, targets).item()

    train_loss /= num_batches
    train_acc /= num_batches
    return train_loss, train_acc
|
46 |
+
|
47 |
+
|
48 |
+
def eval_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Returns:
        ``(fin_outputs, fin_targets, eval_loss, eval_acc)`` where
        ``fin_outputs`` is the list of predicted class indices,
        ``fin_targets`` the list of true class indices, and the loss and
        accuracy are per-batch values averaged over the number of batches
        (``0.0`` for an empty loader).
    """
    model.eval()
    eval_loss, eval_acc = 0.0, 0.0
    fin_targets = []
    fin_outputs = []
    num_batches = len(data_loader)

    with torch.no_grad():
        for batch in tqdm(data_loader, total=num_batches):
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.long)

            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            eval_loss += loss_fn(outputs, targets).item()
            eval_acc += categorical_accuracy(outputs, targets).item()

            # The arg-max over the class dimension is computed once per batch.
            pred_labels = torch.argmax(outputs, dim=1)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(pred_labels.cpu().detach().numpy().tolist())

    if num_batches:
        eval_loss /= num_batches
        eval_acc /= num_batches
    return fin_outputs, fin_targets, eval_loss, eval_acc
|
81 |
+
|
82 |
+
|
83 |
+
|
84 |
+
def predict_fn(data_loader, model, device, extract_features=False):
    """Predict a class index for every example in ``data_loader``.

    Args:
        data_loader: yields dicts with ``ids``, ``mask``, ``token_type_ids``.
        model: called with those three keyword arguments; must also provide
            ``extract_features`` when ``extract_features`` is true.
        device: device the input tensors are moved to.
        extract_features: when true, also collect ``model.extract_features``
            output for every example.

    Returns:
        ``(predictions, features)``: a list of predicted class indices and a
        list of feature vectors (empty unless ``extract_features`` is true).
    """
    model.eval()

    predictions = []
    features = []
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            inputs = {
                "ids": batch["ids"].to(device, dtype=torch.long),
                "mask": batch["mask"].to(device, dtype=torch.long),
                "token_type_ids": batch["token_type_ids"].to(
                    device, dtype=torch.long),
            }

            logits = model(**inputs)

            if extract_features:
                pooled = model.extract_features(**inputs)
                features.extend(pooled.cpu().detach().numpy().tolist())

            labels = torch.argmax(logits, dim=1)
            predictions.extend(labels.cpu().detach().numpy().tolist())

    return predictions, features
|
116 |
+
|
main.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import flask
|
2 |
+
import torch
|
3 |
+
from flask import Flask, render_template, request
|
4 |
+
from utils import label_full_decoder
|
5 |
+
import sys
|
6 |
+
import config
|
7 |
+
import dataset
|
8 |
+
import engine
|
9 |
+
from model import BERTBaseUncased
|
10 |
+
from tokenizer import tokenizer
|
11 |
+
from werkzeug.serving import run_simple
|
12 |
+
# from werkzeug.wsgi import DispatcherMiddleware
|
13 |
+
|
14 |
+
|
15 |
+
# Shared tweet tokenizer used by preprocess().
T = tokenizer.TweetTokenizer(
    preserve_handles=True,
    preserve_hashes=True,
    preserve_case=False,
    preserve_url=False,
)

# Flask application; static files and templates live under app_resources/.
app = Flask(
    __name__,
    static_folder='app_resources/static',
    static_url_path='/sentimentanalyzer',
    instance_relative_config=True,
    template_folder='app_resources/templates/public',
)

# MODEL is filled in when the file is run as a script.
MODEL = None
DEVICE = config.device
|
27 |
+
|
28 |
+
|
29 |
+
def preprocess(text):
    """Tokenize ``text`` and anonymise mentions.

    Every token containing ``@`` is replaced by ``mention_0``; a run of
    consecutive such tokens collapses into a single ``mention_0``.  The raw
    and the processed token lists are echoed to stderr.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = T.tokenize(text)
    print(tokens, file=sys.stderr)

    ptokens = []
    previous_was_mention = False
    for token in tokens:
        is_mention = "@" in token
        if not is_mention:
            ptokens.append(token)
        elif not previous_was_mention:
            # First mention of a run: emit the placeholder once.
            ptokens.append("mention_0")
        previous_was_mention = is_mention

    print(ptokens, file=sys.stderr)
    return " ".join(ptokens)
|
48 |
+
|
49 |
+
|
50 |
+
def sentence_prediction(sentence):
    """Return the predicted class index for one raw sentence.

    The sentence is run through ``preprocess``, wrapped in a one-item
    ``BERTDataset`` and classified with ``engine.predict_fn``.

    The classifier is kept in the module-level ``MODEL``: it is built and its
    weights read from ``config.MODEL_PATH`` only if nothing has loaded it
    yet, instead of re-reading the weights from disk on every call.
    """
    global MODEL

    sentence = preprocess(sentence)

    test_dataset = dataset.BERTDataset(
        review=[sentence],
        target=[0]  # dummy label; predict_fn never reads the targets
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        # A single example does not justify spawning worker processes.
        num_workers=0
    )

    device = config.device

    if MODEL is None:
        model = BERTBaseUncased()
        model.load_state_dict(torch.load(
            config.MODEL_PATH, map_location=torch.device(device)))
        MODEL = model
    MODEL.to(device)

    # Features are not requested, so the second return value is unused.
    outputs, _ = engine.predict_fn(test_data_loader, MODEL, device)
    print(outputs)
    return outputs[0]
|
75 |
+
|
76 |
+
|
77 |
+
@app.route("/sentimentanalyzer/predict", methods=['POST'])
def predict():
    """Classify the ``sentence`` form field and return the result as JSON.

    Returns ``{"response": {"sentence": ..., "prediction": ...}}`` on
    success, or ``{"error": "empty text"}`` when the field is missing or
    empty.
    """
    print(request.form, file=sys.stderr)
    # .get() so that a missing field takes the same "empty text" path as an
    # empty one instead of aborting the request with a 400 error.
    sentence = request.form.get('sentence', '')
    if not sentence:
        return flask.jsonify({"error": "empty text"})

    print(sentence, file=sys.stderr)
    prediction = sentence_prediction(sentence)
    response = {
        "response": {
            'sentence': sentence,
            'prediction': label_full_decoder(prediction),
        }
    }
    return flask.jsonify(response)
|
94 |
+
|
95 |
+
|
96 |
+
@app.route("/sentimentanalyzer/")
def index():
    """Render the landing page template."""
    page = render_template("index.html")
    return page


@app.route("/sentimentanalyzer/demo")
def demo():
    """Render the demo page template."""
    page = render_template("demo.html")
    return page


@app.route("/sentimentanalyzer/models")
def models():
    """Render the models page template."""
    page = render_template("models.html")
    return page


@app.route("/sentimentanalyzer/about")
def about():
    """Render the about page template."""
    page = render_template("about.html")
    return page
|
114 |
+
|
115 |
+
|
116 |
+
if __name__ == "__main__":
    # Build the classifier and load its fine-tuned weights once at start-up.
    MODEL = BERTBaseUncased()
    state_dict = torch.load(
        config.MODEL_PATH, map_location=torch.device(DEVICE))
    MODEL.load_state_dict(state_dict)
    MODEL.eval()

    # NOTE(review): debug=True enables the interactive debugger; keep the
    # server bound to localhost while it is on.
    app.run("127.0.0.1", port=1095, debug=True)
    # host="http://cleopatra.ijs.si/sentimentanalyzer"
|
metrics.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
import sys
|
5 |
+
|
6 |
+
file_path = sys.argv[1]
|
7 |
+
|
8 |
+
metric = {}
|
9 |
+
stastics = {}
|
10 |
+
|
11 |
+
with open(file_path) as input_file:
|
12 |
+
current_epoch=None
|
13 |
+
for line in input_file:
|
14 |
+
line= line.strip()
|
15 |
+
|
16 |
+
if line.find("Bert Model") >-1:
|
17 |
+
stastics["Bert Model"] =line
|
18 |
+
if line.find("Current date and time") >-1:
|
19 |
+
stastics["Current date and time"] =line
|
20 |
+
if line.find("Train file") >-1:
|
21 |
+
stastics["Train file"] =line
|
22 |
+
if line.find("Valid file") >-1:
|
23 |
+
stastics["Valid file"] =line
|
24 |
+
if line.find("Test file") >-1:
|
25 |
+
stastics["Test file"] =line
|
26 |
+
if line.find("Train size") >-1:
|
27 |
+
stastics["Train size"] =line
|
28 |
+
if line.find("Valid size") >-1:
|
29 |
+
stastics["Valid size"] =line
|
30 |
+
if line.find("Test size") >-1:
|
31 |
+
stastics["Test size"] =line
|
32 |
+
tokens = line.split()
|
33 |
+
for token in tokens:
|
34 |
+
if token.find("epoch")==0:
|
35 |
+
metric[token]=[]
|
36 |
+
current_epoch=token
|
37 |
+
continue
|
38 |
+
if token.find("train_loss")>-1:
|
39 |
+
metric[current_epoch].append(token)
|
40 |
+
if token.find("val_loss")>-1:
|
41 |
+
metric[current_epoch].append(token)
|
42 |
+
if token.find("test_loss")>-1:
|
43 |
+
metric[current_epoch].append(token)
|
44 |
+
if token.find("train_acc")>-1:
|
45 |
+
metric[current_epoch].append(token)
|
46 |
+
if token.find("val_acc")>-1:
|
47 |
+
metric[current_epoch].append(token)
|
48 |
+
if token.find("test_acc")>-1:
|
49 |
+
metric[current_epoch].append(token)
|
50 |
+
results =[]
|
51 |
+
for item in metric.items():
|
52 |
+
result=[]
|
53 |
+
result.append(item[0].replace('epoch=',""))
|
54 |
+
for fig in item[1]:
|
55 |
+
result.append(fig.split("=")[-1].replace(",",""))
|
56 |
+
results.append(result)
|
57 |
+
|
58 |
+
for item in stastics.items():
|
59 |
+
print(item[0],item[1].split()[-1])
|
60 |
+
|
61 |
+
#lets convert that to numpy array as np.array
|
62 |
+
num = np.array(results)
|
63 |
+
|
64 |
+
#now construct a beautiful table
|
65 |
+
df = pd.DataFrame(num, columns=["EPOCH","Trn loss","Val Acc" ,"Tst loss","Trn Acc","Val loss","Tst Acc"]) #
|
66 |
+
dash = 62
|
67 |
+
print("-"*dash)
|
68 |
+
print("| ".join(df.columns), "|")
|
69 |
+
for index,row in df.iterrows():
|
70 |
+
print("-"*dash)
|
71 |
+
print("|",row["EPOCH"]," |", row["Trn loss"]," |", row["Val loss"]," |",row["Tst loss"], " |", row["Trn Acc"]," |",row["Val Acc"]," |",row["Tst Acc"]," |")
|
72 |
+
|
73 |
+
print("-"*dash)
|
74 |
+
|
75 |
+
|
76 |
+
#
|
model.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import config
|
2 |
+
import transformers
|
3 |
+
import torch.nn as nn
|
4 |
+
|
5 |
+
|
6 |
+
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a 3-way linear classifier."""

    def __init__(self):
        """Load the encoder named by ``config.BERT_PATH`` and build the head."""
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH)

        self.bert_drop = nn.Dropout(0.3)

        # Size the head from the encoder configuration instead of a literal
        # 768 so that smaller or larger encoders work without editing this.
        self.out = nn.Linear(self.bert.config.hidden_size, 3)

        nn.init.xavier_uniform_(self.out.weight)

    def _pooled(self, ids, mask, token_type_ids):
        """Return the encoder's pooled output after dropout."""
        encoded = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # Index rather than tuple-unpack: position 1 is the pooled output
        # both when the encoder returns a plain tuple and when it returns a
        # ModelOutput (unpacking a ModelOutput would yield its keys).
        return self.bert_drop(encoded[1])

    def forward(self, ids, mask, token_type_ids):
        """Return the class logits for a batch of encoded inputs."""
        return self.out(self._pooled(ids, mask, token_type_ids))

    def extract_features(self, ids, mask, token_type_ids):
        """Return the pooled representation that feeds the classifier head."""
        return self._pooled(ids, mask, token_type_ids)
|
predict.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import random
|
3 |
+
|
4 |
+
import matplotlib
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
import seaborn as sns
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
from absl import app, flags, logging
|
12 |
+
from loguru import logger
|
13 |
+
from scipy import stats
|
14 |
+
from sklearn import metrics, model_selection
|
15 |
+
from sklearn.decomposition import PCA
|
16 |
+
from sklearn.manifold import TSNE
|
17 |
+
from torch.utils.tensorboard import SummaryWriter
|
18 |
+
|
19 |
+
import config
|
20 |
+
import dataset
|
21 |
+
import engine
|
22 |
+
from model import BERTBaseUncased
|
23 |
+
from utils import categorical_accuracy, label_decoder, label_encoder
|
24 |
+
|
25 |
+
# Assignment, not comparison: "==" evaluated to a boolean that was thrown
# away, so interactive mode was never actually switched on.
matplotlib.rcParams['interactive'] = True
|
26 |
+
|
27 |
+
SEED = 42
|
28 |
+
random.seed(SEED)
|
29 |
+
np.random.seed(SEED)
|
30 |
+
torch.manual_seed(SEED)
|
31 |
+
torch.cuda.manual_seed(SEED)
|
32 |
+
torch.backends.cudnn.deterministic = True
|
33 |
+
|
34 |
+
writer = SummaryWriter()
|
35 |
+
logger.add("experiment.log")
|
36 |
+
|
37 |
+
flags.DEFINE_boolean('features', True, "")
|
38 |
+
flags.DEFINE_string('test_file', None, "")
|
39 |
+
flags.DEFINE_string('model_path', None, "")
|
40 |
+
|
41 |
+
FLAGS = flags.FLAGS
|
42 |
+
|
43 |
+
|
44 |
+
def main(_):
    """Predict labels for a test CSV and optionally plot the feature space.

    The test file and model path default to the values derived from
    ``config`` and can be overridden with ``--test_file`` / ``--model_path``.
    Predictions are written to ``<run>.csv`` where ``<run>`` is the
    second-to-last ``/``-separated component of the model path.  With
    ``--features`` the extracted features are reduced with PCA then t-SNE,
    plotted with one text label per example, and saved to
    ``<run>-figure.svg``.
    """
    test_file = FLAGS.test_file or (
        config.DATASET_LOCATION + "eval.prep.test.csv")
    model_path = FLAGS.model_path or config.MODEL_PATH
    df_test = pd.read_csv(test_file).fillna("none")

    # Commenting as there are no labels
    # NOTE(review): the ``label`` column is still read below when --features
    # is off - confirm how unlabeled files are meant to be handled.
    if FLAGS.features:
        df_test.label = df_test.label.apply(label_encoder)

    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(
        f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ")
    logger.info(f"Test file: {test_file}")
    logger.info(f"Test size : {len(df_test):.4f}")

    test_dataset = dataset.BERTDataset(
        review=df_test.text.values,
        target=df_test.label.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=3
    )

    device = config.device

    model = BERTBaseUncased()
    model.load_state_dict(torch.load(
        model_path, map_location=torch.device(device)))
    model.to(device)

    outputs, extracted_features = engine.predict_fn(
        test_data_loader, model, device, extract_features=FLAGS.features)
    df_test["predicted"] = outputs

    # Output files are named after the directory that holds the model file.
    run_name = model_path.split("/")[-2]
    # save file
    df_test.to_csv(run_name + '.csv', header=None, index=False)

    if not FLAGS.features:
        return

    pca = PCA(n_components=50, random_state=7)
    X1 = pca.fit_transform(extracted_features)
    tsne = TSNE(n_components=2, perplexity=10, random_state=6,
                learning_rate=1000, n_iter=1500)
    X1 = tsne.fit_transform(X1)

    X = pd.DataFrame(X1, columns=["x1", "y1"])
    X = X.astype({"x1": float, "y1": float})

    # Plot for layer -1
    plt.figure(figsize=(20, 15))
    p1 = sns.scatterplot(x=X["x1"], y=X["y1"], palette="coolwarm")

    # Correct predictions are tagged "@<letter><label>" so they can be
    # coloured by class below; wrong ones read "<true>-<predicted>".
    x_texts = []
    for output, value in zip(outputs, df_test.label.values):
        if output == value:
            x_texts.append("@" + label_decoder(output)[0] + label_decoder(output))
        else:
            x_texts.append(label_decoder(value) + "-" + label_decoder(output))
    X["texts"] = x_texts

    # Checked in this order; the first marker found decides the colour.
    marker_colors = (("@U", 'blue'), ("@P", 'green'), ("@N", 'red'))

    # Label each datapoint with the text it corresponds to
    for line in X.index:
        text = X.loc[line, "texts"] + "-" + str(line)
        for marker, marker_color in marker_colors:
            if marker in text:
                # Drop the two-character "@<letter>" tag from the label.
                label, color = text[2:], marker_color
                break
        else:
            label, color = text, 'black'
        p1.text(X.loc[line, "x1"] + 0.2, X.loc[line, "y1"], label,
                horizontalalignment='left',
                size='medium', color=color, weight='semibold')

    # Save before showing: once a blocking show() returns, the figure may
    # already be closed and savefig would write an empty image.
    plt.savefig(run_name + '-figure.svg', format="svg")
    plt.show()
|
137 |
+
# loocv = model_selection.LeaveOneOut()
|
138 |
+
# model = KNeighborsClassifier(n_neighbors=8)
|
139 |
+
# results = model_selection.cross_val_score(model, X, Y, cv=loocv)
|
140 |
+
# for i, j in outputs, extracted_features:
|
141 |
+
# utils.write_embeddings_to_file(extracted_features, outputs)
|
142 |
+
|
143 |
+
|
144 |
+
if __name__ == "__main__":
|
145 |
+
app.run(main)
|
utils.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import config
|
3 |
+
|
4 |
+
|
5 |
+
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: tensor of per-class scores; the arg-max over dimension 1 is
            the predicted class.
        y: tensor of true class indices, one per row of ``preds``.

    Returns:
        A float tensor of shape ``(1,)`` on the same device as ``preds``.
    """
    max_preds = preds.argmax(dim=1)  # get the index of the max probability
    correct = max_preds.eq(y)
    # Divide by a plain Python int rather than a CPU FloatTensor so the
    # computation also works when ``preds`` and ``y`` live on the GPU.
    accuracy = correct.sum().to(torch.float) / y.shape[0]
    return accuracy.reshape(1)
|
13 |
+
|
14 |
+
def label_encoder(x):
    """Map a raw label to its class index: 0 -> 0, 1 -> 1, -1 -> 2.

    Accepts the label as a string, optionally prefixed with ``__label__``,
    or as an integer (as produced when a CSV column is parsed as numbers).

    Raises:
        KeyError: if the label is not one of 0, 1 or -1.
    """
    label_vec = {"0": 0, "1": 1, "-1": 2}
    # str() first: integer labels have no .replace().
    return label_vec[str(x).replace("__label__", "").strip()]
|
17 |
+
|
18 |
+
def label_decoder(x):
    """Return the one-letter code for class index ``x``: 0 U, 1 P, 2 N."""
    short_names = dict(zip((0, 1, 2), ("U", "P", "N")))
    code = short_names[x]
    return code
|
21 |
+
|
22 |
+
def label_full_decoder(x):
    """Return the full sentiment name for class index ``x``."""
    full_names = dict(zip((0, 1, 2), ("Neutral", "Positive", "Negative")))
    name = full_names[x]
    return name
|
25 |
+
|