thak123 committed
Commit cdb159e · Parent: e710478

Upload 8 files

Files changed (8)
  1. config.py +55 -0
  2. dataset.py +40 -0
  3. engine.py +116 -0
  4. main.py +123 -0
  5. metrics.py +76 -0
  6. model.py +36 -0
  7. predict.py +145 -0
  8. utils.py +25 -0
config.py ADDED
@@ -0,0 +1,55 @@
+ import transformers
+ import os
+ import torch
+
+ MAX_LEN = 150  # 256
+ TRAIN_BATCH_SIZE = 8
+ VALID_BATCH_SIZE = 4
+ EPOCHS = 5
+
+ # Folder that contains all the datasets
+ DATASET_LOCATION = ""
+ MODEL_PATH = "/mnt/data/group3/gaurish/SentimentAnalyserLVTwitter/bert-sentiment/src/trained_models/mbert-7epoch-lower/model.bin"
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ # mBERT raw version
+ # BERT_PATH = "bert-base-multilingual-cased"
+
+ # 2-epoch version
+ # BERT_PATH = "bert-twitter-fine-tunning/LatvianTwittermBERT-v1"
+
+ # 7-epoch version
+ BERT_PATH = "FFZG-cleopatra/bert-emoji-latvian-twitter"
+
+ # 7-epoch version + emoticons
+ # BERT_PATH = "bert-twitter-language-pretraining/models/LatvianTwittermBERT-v2/checkpoint-106000"
+
+ # TODO: check if lower casing is required
+ # BertTokenizer
+ TOKENIZER = transformers.BertTokenizer.from_pretrained(
+     BERT_PATH,
+     do_lower_case=True
+ )
+
+ #####################################################################################################################################
+ # Electra
+ # Step 1: Model path
+ # BERT_PATH = "lmtuners/experiments/disc_lm_small/electra-small/discriminator/final"
+ # # "lmtuners/experiments/disc_lm_small/albert-small/final"
+
+ # # Step 2: Vocab and lowercase setting
+ # TOKENIZER = transformers.BertTokenizer.from_pretrained(
+ #     "lmtuners/experiments/disc_lm_small/lvtwitterbwpt-vocab-lower_accent.txt",
+ #     # "lmtuners/experiments/disc_lm_small/bert-base-multilingual-cased-vocab.txt",
+ #     do_lower_case=True
+ # )
+
+ # ALBERT_CONFIG = transformers.AlbertConfig(
+ #     vocab_size=len(TOKENIZER),  # .get_vocab_size(),
+ #     hidden_size=256,
+ #     embedding_size=128,
+ #     num_hidden_layers=12,
+ #     num_attention_heads=4,
+ #     intermediate_size=1024,
+ #     max_position_embeddings=128)
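
A minimal sketch (not part of the commit) of how these settings are consumed downstream; the sample tweet is made up:

    import config

    enc = config.TOKENIZER.encode_plus(
        "šodien ir lieliska diena :)",   # hypothetical input tweet
        add_special_tokens=True,
        max_length=config.MAX_LEN,
    )
    print(len(enc["input_ids"]))  # at most MAX_LEN once truncated/padded by the caller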
dataset.py ADDED
@@ -0,0 +1,40 @@
+ import config
+ import torch
+
+
+ class BERTDataset:
+     def __init__(self, review, target):
+         self.review = review
+         self.target = target
+         self.tokenizer = config.TOKENIZER
+         self.max_len = config.MAX_LEN
+
+     def __len__(self):
+         return len(self.review)
+
+     def __getitem__(self, item):
+         review = str(self.review[item])
+         review = " ".join(review.split())
+
+         inputs = self.tokenizer.encode_plus(
+             review,
+             None,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             truncation=True  # needed on transformers >= 3 to actually cut to max_len;
+                              # otherwise padding_length below can go negative
+         )
+
+         ids = inputs["input_ids"]
+         mask = inputs["attention_mask"]
+         token_type_ids = inputs["token_type_ids"]
+
+         # right-pad everything up to max_len
+         padding_length = self.max_len - len(ids)
+         ids = ids + ([0] * padding_length)
+         mask = mask + ([0] * padding_length)
+         token_type_ids = token_type_ids + ([0] * padding_length)
+
+         return {
+             'ids': torch.tensor(ids, dtype=torch.long),
+             'mask': torch.tensor(mask, dtype=torch.long),
+             'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
+             'targets': torch.tensor(self.target[item], dtype=torch.float)
+         }
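
A minimal usage sketch (not part of the commit); the texts and labels below are placeholders:

    import torch
    import dataset

    ds = dataset.BERTDataset(review=["laba diena", "slikta diena"], target=[1, 2])
    loader = torch.utils.data.DataLoader(ds, batch_size=2)
    batch = next(iter(loader))
    print(batch["ids"].shape)   # torch.Size([2, 150]) with MAX_LEN = 150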
engine.py ADDED
@@ -0,0 +1,116 @@
+ import torch
+ import torch.nn as nn
+ from tqdm import tqdm
+ from utils import categorical_accuracy
+
+
+ def loss_fn(outputs, targets):
+     return nn.CrossEntropyLoss()(outputs, targets)
+
+
+ def train_fn(data_loader, model, optimizer, device, scheduler):
+     model.train()
+     train_loss, train_acc = 0.0, 0.0
+
+     for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
+         ids = d["ids"]
+         token_type_ids = d["token_type_ids"]
+         mask = d["mask"]
+         targets = d["targets"]
+
+         ids = ids.to(device, dtype=torch.long)
+         token_type_ids = token_type_ids.to(device, dtype=torch.long)
+         mask = mask.to(device, dtype=torch.long)
+         targets = targets.to(device, dtype=torch.long)
+
+         optimizer.zero_grad()
+         outputs = model(
+             ids=ids,
+             mask=mask,
+             token_type_ids=token_type_ids
+         )
+
+         loss = loss_fn(outputs, targets)
+         loss.backward()
+
+         optimizer.step()
+         scheduler.step()
+         train_loss += loss.item()
+         train_acc += categorical_accuracy(outputs, targets).item()
+
+     train_loss /= len(data_loader)
+     train_acc /= len(data_loader)
+     return train_loss, train_acc
+
+
+ def eval_fn(data_loader, model, device):
+     model.eval()
+     eval_loss, eval_acc = 0.0, 0.0
+     fin_targets = []
+     fin_outputs = []
+     with torch.no_grad():
+         for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
+             ids = d["ids"]
+             token_type_ids = d["token_type_ids"]
+             mask = d["mask"]
+             targets = d["targets"]
+
+             ids = ids.to(device, dtype=torch.long)
+             token_type_ids = token_type_ids.to(device, dtype=torch.long)
+             mask = mask.to(device, dtype=torch.long)
+             targets = targets.to(device, dtype=torch.long)
+
+             outputs = model(
+                 ids=ids,
+                 mask=mask,
+                 token_type_ids=token_type_ids
+             )
+             loss = loss_fn(outputs, targets)
+             eval_loss += loss.item()
+             eval_acc += categorical_accuracy(outputs, targets).item()
+             fin_targets.extend(targets.cpu().detach().numpy().tolist())
+             fin_outputs.extend(torch.argmax(
+                 outputs, dim=1).cpu().detach().numpy().tolist())
+     eval_loss /= len(data_loader)
+     eval_acc /= len(data_loader)
+     return fin_outputs, fin_targets, eval_loss, eval_acc
+
+
+ def predict_fn(data_loader, model, device, extract_features=False):
+     model.eval()
+
+     fin_outputs = []
+     extracted_features = []
+     with torch.no_grad():
+         for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
+             ids = d["ids"]
+             token_type_ids = d["token_type_ids"]
+             mask = d["mask"]
+             # targets = d["targets"]
+
+             ids = ids.to(device, dtype=torch.long)
+             token_type_ids = token_type_ids.to(device, dtype=torch.long)
+             mask = mask.to(device, dtype=torch.long)
+
+             outputs = model(
+                 ids=ids,
+                 mask=mask,
+                 token_type_ids=token_type_ids
+             )
+             if extract_features:
+                 extracted_features.extend(model.extract_features(
+                     ids=ids,
+                     mask=mask,
+                     token_type_ids=token_type_ids
+                 ).cpu().detach().numpy().tolist())
+
+             fin_outputs.extend(torch.argmax(
+                 outputs, dim=1).cpu().detach().numpy().tolist())
+
+     return fin_outputs, extracted_features
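
The training driver itself is not part of this upload; below is a minimal sketch of how train_fn is typically wired up (the optimizer, learning rate, scheduler, and data file are assumptions, not the author's exact setup):

    import pandas as pd
    import torch
    from transformers import AdamW, get_linear_schedule_with_warmup

    import config
    import dataset
    import engine
    from model import BERTBaseUncased
    from utils import label_encoder

    df = pd.read_csv("train.csv").fillna("none")   # placeholder path
    df.label = df.label.apply(label_encoder)
    train_loader = torch.utils.data.DataLoader(
        dataset.BERTDataset(review=df.text.values, target=df.label.values),
        batch_size=config.TRAIN_BATCH_SIZE)

    model = BERTBaseUncased().to(config.device)
    optimizer = AdamW(model.parameters(), lr=3e-5)   # assumed hyperparameters
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0,
        num_training_steps=len(train_loader) * config.EPOCHS)

    for epoch in range(config.EPOCHS):
        loss, acc = engine.train_fn(train_loader, model, optimizer, config.device, scheduler)
        print(f"epoch={epoch} train_loss={loss:.4f} train_acc={acc:.4f}")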
main.py ADDED
@@ -0,0 +1,123 @@
+ import flask
+ import torch
+ from flask import Flask, render_template, request
+ from utils import label_full_decoder
+ import sys
+ import config
+ import dataset
+ import engine
+ from model import BERTBaseUncased
+ from tokenizer import tokenizer
+ from werkzeug.serving import run_simple
+ # from werkzeug.wsgi import DispatcherMiddleware
+
+
+ T = tokenizer.TweetTokenizer(
+     preserve_handles=True, preserve_hashes=True, preserve_case=False, preserve_url=False)
+
+ app = Flask(__name__,
+             static_folder='app_resources/static',
+             static_url_path='/sentimentanalyzer',
+             instance_relative_config=True,
+             template_folder='app_resources/templates/public')
+
+
+ MODEL = None
+ DEVICE = config.device
+
+
+ def preprocess(text):
+     tokens = T.tokenize(text)
+     print(tokens, file=sys.stderr)
+     ptokens = []
+     for index, token in enumerate(tokens):
+         if "@" in token:
+             # collapse runs of mentions into a single placeholder
+             if index > 0:
+                 # check if the previous token was also a mention
+                 if "@" in tokens[index - 1]:
+                     pass
+                 else:
+                     ptokens.append("mention_0")
+             else:
+                 ptokens.append("mention_0")
+         else:
+             ptokens.append(token)
+
+     print(ptokens, file=sys.stderr)
+     return " ".join(ptokens)
+
+
+ def sentence_prediction(sentence):
+     sentence = preprocess(sentence)
+     model_path = config.MODEL_PATH
+
+     test_dataset = dataset.BERTDataset(
+         review=[sentence],
+         target=[0]
+     )
+
+     test_data_loader = torch.utils.data.DataLoader(
+         test_dataset,
+         batch_size=config.VALID_BATCH_SIZE,
+         num_workers=3
+     )
+
+     device = config.device
+
+     # note: the model is re-loaded on every request; reusing the global MODEL would avoid this
+     model = BERTBaseUncased()
+     model.load_state_dict(torch.load(
+         model_path, map_location=torch.device(device)))
+     model.to(device)
+
+     outputs, _ = engine.predict_fn(test_data_loader, model, device)
+     print(outputs)
+     return outputs[0]
+
+
+ @app.route("/sentimentanalyzer/predict", methods=['POST'])
+ def predict():
+     print(request.form, file=sys.stderr)
+     # print([(x) for x in request.get_json()], file=sys.stderr)
+     # sentence = request.get_json().get("sentence", "")
+     sentence = request.form['sentence']
+     if sentence:
+         print(sentence, file=sys.stderr)
+         prediction = sentence_prediction(sentence)
+         response = {}
+         response["response"] = {
+             'sentence': sentence,
+             'prediction': label_full_decoder(prediction),
+         }
+         return flask.jsonify(response)
+     else:
+         return flask.jsonify({"error": "empty text"})
+
+
+ @app.route("/sentimentanalyzer/")
+ def index():
+     return render_template("index.html")
+
+
+ @app.route("/sentimentanalyzer/demo")
+ def demo():
+     return render_template("demo.html")
+
+
+ @app.route("/sentimentanalyzer/models")
+ def models():
+     return render_template("models.html")
+
+
+ @app.route("/sentimentanalyzer/about")
+ def about():
+     return render_template("about.html")
+
+
+ if __name__ == "__main__":
+     MODEL = BERTBaseUncased()
+     MODEL.load_state_dict(torch.load(
+         config.MODEL_PATH, map_location=torch.device(DEVICE)))
+     MODEL.eval()
+
+     app.run("127.0.0.1", port=1095, debug=True)
+     # host="http://cleopatra.ijs.si/sentimentanalyzer"
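
Once the app is running, the predict endpoint can be exercised like this (a sketch, not part of the commit; the sentence is made up):

    import requests

    r = requests.post(
        "http://127.0.0.1:1095/sentimentanalyzer/predict",
        data={"sentence": "šodien ir lieliska diena :)"},
    )
    print(r.json())   # {"response": {"sentence": ..., "prediction": "Positive"/"Negative"/"Neutral"}}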
metrics.py ADDED
@@ -0,0 +1,76 @@
+ import numpy as np
+ import pandas as pd
+
+ import sys
+
+ # Parse an experiment log (path given on the command line) and print a results table
+ file_path = sys.argv[1]
+
+ metric = {}
+ statistics = {}
+
+ with open(file_path) as input_file:
+     current_epoch = None
+     for line in input_file:
+         line = line.strip()
+
+         # capture one-off run statistics
+         for key in ("Bert Model", "Current date and time", "Train file",
+                     "Valid file", "Test file", "Train size", "Valid size",
+                     "Test size"):
+             if line.find(key) > -1:
+                 statistics[key] = line
+
+         # capture per-epoch metrics
+         tokens = line.split()
+         for token in tokens:
+             if token.find("epoch") == 0:
+                 metric[token] = []
+                 current_epoch = token
+                 continue
+             for name in ("train_loss", "val_loss", "test_loss",
+                          "train_acc", "val_acc", "test_acc"):
+                 if token.find(name) > -1:
+                     metric[current_epoch].append(token)
+
+ results = []
+ for item in metric.items():
+     result = []
+     result.append(item[0].replace('epoch=', ""))
+     for fig in item[1]:
+         result.append(fig.split("=")[-1].replace(",", ""))
+     results.append(result)
+
+ for item in statistics.items():
+     print(item[0], item[1].split()[-1])
+
+ # convert the collected rows to a numpy array
+ num = np.array(results)
+
+ # build a table
+ df = pd.DataFrame(num, columns=["EPOCH", "Trn loss", "Val Acc", "Tst loss",
+                                 "Trn Acc", "Val loss", "Tst Acc"])
+ dash = 62
+ print("-" * dash)
+ print("| ".join(df.columns), "|")
+ for index, row in df.iterrows():
+     print("-" * dash)
+     print("|", row["EPOCH"], " |", row["Trn loss"], " |", row["Val loss"],
+           " |", row["Tst loss"], " |", row["Trn Acc"], " |", row["Val Acc"],
+           " |", row["Tst Acc"], " |")
+
+ print("-" * dash)
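
For reference, a sketch of the log lines the parser expects, inferred from the code above rather than documented anywhere in the commit (values are placeholders):

    # invocation:
    #   python metrics.py experiment.log
    # expected lines (format inferred, not guaranteed):
    #   Bert Model: FFZG-cleopatra/bert-emoji-latvian-twitter
    #   Train size : 12345
    #   epoch=1 train_loss=0.512, val_loss=0.431, test_loss=0.446, train_acc=0.79, val_acc=0.81, test_acc=0.80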
model.py ADDED
@@ -0,0 +1,36 @@
+ import config
+ import transformers
+ import torch.nn as nn
+
+
+ class BERTBaseUncased(nn.Module):
+     def __init__(self):
+         super(BERTBaseUncased, self).__init__()
+         self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH)
+
+         self.bert_drop = nn.Dropout(0.3)
+
+         # 768 = BERT hidden size; 3 classes (neutral / positive / negative)
+         self.out = nn.Linear(768, 3)
+         # self.out = nn.Linear(256, 3)
+
+         nn.init.xavier_uniform_(self.out.weight)
+
+     def forward(self, ids, mask, token_type_ids):
+         # tuple unpacking (`_, o2`) assumes an older transformers release;
+         # on transformers >= 4 pass return_dict=False or use .pooler_output
+         _, o2 = self.bert(
+             ids,
+             attention_mask=mask,
+             token_type_ids=token_type_ids
+         )
+         bo = self.bert_drop(o2)
+         # bo = self.tanh(self.fc(bo))  # to be commented if original
+         output = self.out(bo)
+         return output
+
+     def extract_features(self, ids, mask, token_type_ids):
+         _, o2 = self.bert(
+             ids,
+             attention_mask=mask,
+             token_type_ids=token_type_ids
+         )
+         bo = self.bert_drop(o2)
+         return bo
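
A quick shape sanity check for the classifier head (a sketch, not part of the commit; it assumes a transformers version where BertModel returns a tuple, as the forward above does):

    import torch
    from model import BERTBaseUncased

    model = BERTBaseUncased().eval()
    ids = torch.zeros((2, 150), dtype=torch.long)   # batch of 2, MAX_LEN tokens
    mask = torch.ones_like(ids)
    token_type_ids = torch.zeros_like(ids)
    with torch.no_grad():
        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    print(logits.shape)   # torch.Size([2, 3])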
predict.py ADDED
@@ -0,0 +1,145 @@
+ import datetime
+ import random
+
+ import matplotlib
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+ import torch
+ import torch.nn as nn
+ from absl import app, flags, logging
+ from loguru import logger
+ from scipy import stats
+ from sklearn import metrics, model_selection
+ from sklearn.decomposition import PCA
+ from sklearn.manifold import TSNE
+ from torch.utils.tensorboard import SummaryWriter
+
+ import config
+ import dataset
+ import engine
+ from model import BERTBaseUncased
+ from utils import categorical_accuracy, label_decoder, label_encoder
+
+ matplotlib.rcParams['interactive'] = True  # was `==`, a comparison with no effect
+
+ SEED = 42
+ random.seed(SEED)
+ np.random.seed(SEED)
+ torch.manual_seed(SEED)
+ torch.cuda.manual_seed(SEED)
+ torch.backends.cudnn.deterministic = True
+
+ writer = SummaryWriter()
+ logger.add("experiment.log")
+
+ flags.DEFINE_boolean('features', True, "")
+ flags.DEFINE_string('test_file', None, "")
+ flags.DEFINE_string('model_path', None, "")
+
+ FLAGS = flags.FLAGS
+
+
+ def main(_):
+     test_file = config.DATASET_LOCATION + "eval.prep.test.csv"
+     model_path = config.MODEL_PATH
+     if FLAGS.test_file:
+         test_file = FLAGS.test_file
+     if FLAGS.model_path:
+         model_path = FLAGS.model_path
+     df_test = pd.read_csv(test_file).fillna("none")
+
+     # skipped when --nofeatures is set, as there are no labels in that case
+     if FLAGS.features:
+         df_test.label = df_test.label.apply(label_encoder)
+
+     logger.info(f"Bert Model: {config.BERT_PATH}")
+     logger.info(
+         f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ")
+     logger.info(f"Test file: {test_file}")
+     logger.info(f"Test size : {len(df_test)}")
+
+     test_dataset = dataset.BERTDataset(
+         review=df_test.text.values,
+         target=df_test.label.values
+     )
+
+     test_data_loader = torch.utils.data.DataLoader(
+         test_dataset,
+         batch_size=config.VALID_BATCH_SIZE,
+         num_workers=3
+     )
+
+     device = config.device
+
+     model = BERTBaseUncased()
+     model.load_state_dict(torch.load(
+         model_path, map_location=torch.device(device)))
+     model.to(device)
+
+     outputs, extracted_features = engine.predict_fn(
+         test_data_loader, model, device, extract_features=FLAGS.features)
+     df_test["predicted"] = outputs
+     # save predictions
+     df_test.to_csv(model_path.split(
+         "/")[-2] + '.csv', header=None, index=False)
+
+     if FLAGS.features:
+         # reduce the pooled BERT features to 2-D: PCA first, then t-SNE
+         pca = PCA(n_components=50, random_state=7)
+         X1 = pca.fit_transform(extracted_features)
+         tsne = TSNE(n_components=2, perplexity=10, random_state=6,
+                     learning_rate=1000, n_iter=1500)
+         X1 = tsne.fit_transform(X1)
+         # if row == 0: print("Shape after t-SNE: ", X1.shape)
+
+         X = pd.DataFrame(np.concatenate([X1], axis=1),
+                          columns=["x1", "y1"])
+         X = X.astype({"x1": float, "y1": float})
+
+         # Plot for layer -1
+         plt.figure(figsize=(20, 15))
+         p1 = sns.scatterplot(x=X["x1"], y=X["y1"], palette="coolwarm")
+         # p1.set_title("development-"+str(row+1)+", layer -1")
+         x_texts = []
+         for output, value in zip(outputs, df_test.label.values):
+             if output == value:
+                 x_texts.append("@" + label_decoder(output)[0] + label_decoder(output))
+             else:
+                 x_texts.append(label_decoder(value) +
+                                "-" + label_decoder(output))
+
+         X["texts"] = x_texts
+         # X["texts"] = ["@G" + label_decoder(output) if output == value else "@R-" + label_decoder(value) + "-" + label_decoder(output)
+         #               for output, value in zip(outputs, df_test.label.values)]
+
+         # df_test.label.astype(str)
+         # ([str(output)+"-" + str(value)] for output, value in zip(outputs, df_test.label.values))
+         # Label each datapoint with the word it corresponds to
+         for line in X.index:
+             text = X.loc[line, "texts"] + "-" + str(line)
+             if "@U" in text:
+                 p1.text(X.loc[line, "x1"] + 0.2, X.loc[line, "y1"], text[2:], horizontalalignment='left',
+                         size='medium', color='blue', weight='semibold')
+             elif "@P" in text:
+                 p1.text(X.loc[line, "x1"] + 0.2, X.loc[line, "y1"], text[2:], horizontalalignment='left',
+                         size='medium', color='green', weight='semibold')
+             elif "@N" in text:
+                 p1.text(X.loc[line, "x1"] + 0.2, X.loc[line, "y1"], text[2:], horizontalalignment='left',
+                         size='medium', color='red', weight='semibold')
+             else:
+                 p1.text(X.loc[line, "x1"] + 0.2, X.loc[line, "y1"], text, horizontalalignment='left',
+                         size='medium', color='black', weight='semibold')
+         # save before show(): show() can close the figure, leaving savefig a blank file
+         plt.savefig(model_path.split(
+             "/")[-2] + '-figure.svg', format="svg")
+         plt.show()
+     # loocv = model_selection.LeaveOneOut()
+     # model = KNeighborsClassifier(n_neighbors=8)
+     # results = model_selection.cross_val_score(model, X, Y, cv=loocv)
+     # for i, j in outputs, extracted_features:
+     # utils.write_embeddings_to_file(extracted_features, outputs)
+
+
+ if __name__ == "__main__":
+     app.run(main)
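
Typical invocations, using the flags defined above (the paths are placeholders):

    # predictions + feature extraction and the t-SNE plot (the default, --features=True)
    python predict.py --test_file=eval.prep.test.csv --model_path=trained_models/mbert-7epoch-lower/model.bin

    # predictions only, skipping label encoding and the plot
    python predict.py --nofeatures --test_file=eval.prep.test.csv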
utils.py ADDED
@@ -0,0 +1,25 @@
+ import torch
+ import config
+
+
+ def categorical_accuracy(preds, y):
+     """
+     Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
+     """
+     max_preds = preds.argmax(
+         dim=1, keepdim=True)  # get the index of the max probability
+     correct = max_preds.squeeze(1).eq(y)
+     # divide on the same device as `correct`; a CPU FloatTensor here would
+     # raise a device mismatch when the model runs on CUDA
+     return correct.sum().float() / y.shape[0]
+
+
+ def label_encoder(x):
+     label_vec = {"0": 0, "1": 1, "-1": 2}
+     return label_vec[x.replace("__label__", "")]
+
+
+ def label_decoder(x):
+     label_vec = {0: "U", 1: "P", 2: "N"}
+     return label_vec[x]
+
+
+ def label_full_decoder(x):
+     label_vec = {0: "Neutral", 1: "Positive", 2: "Negative"}
+     return label_vec[x]
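
The label round-trip used across the repo, shown end to end (a sketch, not part of the commit):

    from utils import label_encoder, label_decoder, label_full_decoder

    x = label_encoder("__label__1")   # fastText-style label -> class index: 1
    print(label_decoder(x))           # short code: 'P'
    print(label_full_decoder(x))      # full name: 'Positive'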