IvaElen committed on
Commit
497b3ad
1 Parent(s): 191119e

Upload 7 files

biLSTM1.py ADDED
@@ -0,0 +1,50 @@
import torch
import torch.nn as nn


class biLSTM(nn.Module):
    """
    A bidirectional LSTM model used to perform sentiment analysis.
    """

    def __init__(self,
                 # vocabulary size: input size for the Embedding layer
                 vocab_size: int,
                 # output embedding size: each element of the sequence
                 # is represented by a vector of this dimensionality
                 embedding_dim: int,
                 # dimensionality of the LSTM hidden state
                 hidden_dim: int,
                 # number of stacked LSTM layers
                 n_layers: int,
                 drop_prob: float = 0.5,
                 seq_len: int = 128) -> None:

        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.seq_len = seq_len
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            n_layers,
                            dropout=drop_prob,
                            batch_first=True,
                            bidirectional=True
                            )

        self.do = nn.Dropout(drop_prob)

        # the LSTM is bidirectional, so every timestep contributes
        # 2 * hidden_dim features to the flattened classifier input
        self.fc1 = nn.Linear(2 * hidden_dim * self.seq_len, 256)
        self.fc2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embeds = self.embedding(x)       # (batch, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(embeds)  # (batch, seq_len, 2 * hidden_dim)
        out = self.fc2(torch.tanh(self.do(self.fc1(lstm_out.flatten(1)))))
        sig_out = self.sigmoid(out)      # probability of the positive class

        return sig_out
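
A minimal usage sketch for the class above (not part of the upload); the hyperparameter values are illustrative assumptions, not the values used to train the checkpoint:

import torch
from biLSTM1 import biLSTM

# illustrative hyperparameters -- assumed, must match the trained checkpoint
model = biLSTM(vocab_size=20_000, embedding_dim=64,
               hidden_dim=64, n_layers=2, seq_len=128)
model.eval()

dummy_batch = torch.randint(0, 20_000, (4, 128))  # (batch, seq_len) token ids
with torch.no_grad():
    probs = model(dummy_batch)  # shape (4, 1), values in [0, 1]
print(probs.shape)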
biLSTM_model_do_05_lr001_best.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2637ba70306b15159cd0b6fa890c2277390edc5ea93cdc63e580a4dd9ed92a6d
size 19147847
history_do_05_lr001_best.txt ADDED
@@ -0,0 +1 @@
{'train_losses': [0.5731778456687927, 0.4595582458972931, 0.3687778123021126, 0.35563799259662626, 0.3177712473154068, 0.29712616628408434, 0.2833286724328995, 0.2641707232952118, 0.25079714640378953],
 'valid_losses': [0.5975278757321529, 0.5386148532613729, 0.5021108987812812, 0.5125980156545455, 0.4879682982961337, 0.5328947740296522, 0.4984460395211593, 0.5014419947297145, 0.4964689599015774],
 'train_metric': [0.687675, 0.794925, 0.843925, 0.85195, 0.870875, 0.8851, 0.890425, 0.898875, 0.9035],
 'valid_metric': [0.7419871794871795, 0.8210136217948718, 0.8392427884615384, 0.8481570512820513, 0.8505608974358975, 0.8615785256410257, 0.860176282051282, 0.858573717948718, 0.8645833333333334]}
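
The history file stores a Python dict literal (single-quoted keys), not valid JSON, so ast.literal_eval is one way to read it back. A small sketch, assuming the 'valid_metric' entries are the per-epoch validation scores to rank by:

import ast

with open('history_do_05_lr001_best.txt') as f:
    history = ast.literal_eval(f.read())  # dict literal, not JSON

# epoch with the best validation metric (epoch 9 for the values above)
best = max(range(len(history['valid_metric'])),
           key=lambda i: history['valid_metric'][i])
print(f"best epoch: {best + 1}, valid metric: {history['valid_metric'][best]:.4f}")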
logistic_regression_model.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d021f47f5848ca4112983aaa05ee2f2165b420fa21fa91b52af23563cfd3458b
size 1669478
lstm_preprocessing.py ADDED
@@ -0,0 +1,78 @@
import re
import string

import numpy as np
import torch
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


def data_preprocessing(text: str) -> str:
    """Preprocess a string: lowercase it and remove HTML tags, usernames,
    hashtags, digits, punctuation and stopwords.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: preprocessed string
    """
    text = text.lower()
    text = re.sub('<.*?>', '', text)   # remove HTML tags
    text = re.sub(r'@\w+', ' ', text)  # remove usernames
    text = re.sub(r'#\w+', ' ', text)  # remove hashtags
    text = re.sub(r'\d+', ' ', text)   # remove digits
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    return text


def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep only the (word, count) pairs whose count is greater than n."""
    return list(filter(lambda x: x[1] > n, sorted_words))


def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad the input lists of tokens with zeros.

    Args:
        review_int (list): input lists of tokens
        seq_len (int): max sequence length; if len(review_int[i]) > seq_len
            the sequence is trimmed, otherwise it is left-padded with zeros

    Returns:
        np.ndarray: padded sequences
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)

    return features


def preprocess_single_string(
        input_string: str,
        seq_len: int,
        vocab_to_int: dict,
) -> torch.Tensor:
    """Run all preprocessing steps on a single string.

    Args:
        input_string (str): input string for preprocessing
        seq_len (int): max sequence length; longer sequences are trimmed,
            shorter ones are left-padded with zeros
        vocab_to_int (dict): word corpus mapping {'word': int index}

    Returns:
        torch.Tensor: 1-D tensor of token indices of length seq_len
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            print(f'{e}: not in dictionary!')  # out-of-vocabulary words are skipped
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
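
An end-to-end inference sketch tying the uploaded pieces together. The model dimensions are assumptions that must match the trained checkpoint, and the sketch assumes biLSTM_model_do_05_lr001_best.pt holds a state_dict (it may instead be a fully pickled module):

import json
import torch
from biLSTM1 import biLSTM
from lstm_preprocessing import preprocess_single_string

with open('vocab_to_int.json') as f:
    vocab_to_int = json.load(f)

# assumed dimensions; +1 reserves index 0 for padding
model = biLSTM(vocab_size=len(vocab_to_int) + 1, embedding_dim=64,
               hidden_dim=64, n_layers=2, seq_len=128)
model.load_state_dict(torch.load('biLSTM_model_do_05_lr001_best.pt',
                                 map_location='cpu'))
model.eval()

sample = preprocess_single_string('What a great movie!', 128, vocab_to_int)
with torch.no_grad():
    prob = model(sample.unsqueeze(0))  # add the batch dimension
print(f'positive probability: {prob.item():.3f}')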
tfidf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4a578ab761a1e2b767ae846945b2ee5c59d2d1b34a14d2fba922ddcc110c2883
size 6834168
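
The two pickles form the classical baseline alongside the LSTM. A loading sketch, assuming both were saved from scikit-learn (a fitted TfidfVectorizer and LogisticRegression; the exact classes are not visible from the LFS pointers alone):

import pickle

with open('tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)  # assumed sklearn TfidfVectorizer
with open('logistic_regression_model.pkl', 'rb') as f:
    clf = pickle.load(f)         # assumed sklearn LogisticRegression

features = vectorizer.transform(['What a great movie!'])
print(clf.predict(features), clf.predict_proba(features))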
vocab_to_int.json ADDED
The diff for this file is too large to render.