from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
import numpy as np
import gc


def train_word2vec(documents, embedding_dim):
    """
    Train a word2vec model over the training documents.

    Args:
        documents (list): list of tokenized documents
        embedding_dim (int): output word vector size

    Returns:
        word_vectors (gensim.models.KeyedVectors): mapping from words to their vectors
    """
    # gensim < 4.0 uses `size`; in gensim >= 4.0 this argument was renamed to `vector_size`
    model = Word2Vec(documents, min_count=1, size=embedding_dim)
    word_vectors = model.wv
    del model
    return word_vectors


def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
    """
    Create an embedding matrix mapping each word index in the tokenizer to its word vector.

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object containing word indexes
        word_vectors (gensim.models.KeyedVectors): mapping from words to their vectors
        embedding_dim (int): dimension of the word vectors

    Returns:
        embedding_matrix (np.array): matrix of shape (vocabulary size + 1, embedding_dim)
    """
    nb_words = len(tokenizer.word_index) + 1
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    print("Embedding matrix shape: %s" % str(embedding_matrix.shape))
    for word, i in word_index.items():
        try:
            embedding_vector = word_vectors[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        except KeyError:
            print("vector not found for word - %s" % word)
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix


def word_embed_meta_data(documents, embedding_dim):
    """
    Fit a tokenizer on the given documents and build the corresponding embedding matrix.

    Args:
        documents (list): list of documents
        embedding_dim (int): embedding dimension

    Returns:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        embedding_matrix (np.array): embedding matrix mapping word indexes to word vectors
    """
    documents = [str(x).lower().split() for x in documents]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    word_vector = train_word2vec(documents, embedding_dim)
    embedding_matrix = create_embedding_matrix(tokenizer, word_vector, embedding_dim)
    del word_vector
    gc.collect()
    return tokenizer, embedding_matrix
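

# --- Illustrative sketch (not part of the original module): how the tokenizer and
# embedding_matrix returned by word_embed_meta_data are typically wired into a Keras
# Embedding layer. The helper name and the trainable=False choice are assumptions
# made here for demonstration only.
def build_embedding_layer_example(embedding_matrix, embedding_dim, max_sequence_length):
    """Hypothetical helper: build a frozen Embedding layer from the pre-trained matrix."""
    from keras.layers import Embedding

    # embedding_matrix.shape[0] is vocabulary size + 1, as built in create_embedding_matrix
    return Embedding(input_dim=embedding_matrix.shape[0],
                     output_dim=embedding_dim,
                     weights=[embedding_matrix],
                     input_length=max_sequence_length,
                     trainable=False)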


def create_train_dev_set(tokenizer, sentences_pair, is_similar, max_sequence_length, validation_split_ratio):
    """
    Create the training and validation datasets.

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        sentences_pair (list): list of tuples of sentence pairs
        is_similar (list): labels indicating whether the respective sentences in sentence1 and
                           sentence2 are similar (1 if similar else 0)
        max_sequence_length (int): max sequence length of sentences to apply padding
        validation_split_ratio (float): ratio used to split the training data into a validation set

    Returns:
        train_data_1 (np.array): padded input features for the training set from sentences1
        train_data_2 (np.array): padded input features for the training set from sentences2
        labels_train (np.array): similarity labels for the training data
        leaks_train (np.array): leaks features for the training data
        val_data_1 (np.array): padded input features for the validation set from sentences1
        val_data_2 (np.array): padded input features for the validation set from sentences2
        labels_val (np.array): similarity labels for the validation data
        leaks_val (np.array): leaks features for the validation data
    """
    sentences1 = [x[0].lower() for x in sentences_pair]
    sentences2 = [x[1].lower() for x in sentences_pair]
    train_sequences_1 = tokenizer.texts_to_sequences(sentences1)
    train_sequences_2 = tokenizer.texts_to_sequences(sentences2)
    # leaks features: number of unique tokens in each sentence and the size of their intersection
    leaks = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
             for x1, x2 in zip(train_sequences_1, train_sequences_2)]

    train_padded_data_1 = pad_sequences(train_sequences_1, maxlen=max_sequence_length)
    train_padded_data_2 = pad_sequences(train_sequences_2, maxlen=max_sequence_length)
    train_labels = np.array(is_similar)
    leaks = np.array(leaks)

    # shuffle before splitting so the validation set is drawn at random
    shuffle_indices = np.random.permutation(np.arange(len(train_labels)))
    train_data_1_shuffled = train_padded_data_1[shuffle_indices]
    train_data_2_shuffled = train_padded_data_2[shuffle_indices]
    train_labels_shuffled = train_labels[shuffle_indices]
    leaks_shuffled = leaks[shuffle_indices]

    dev_idx = max(1, int(len(train_labels_shuffled) * validation_split_ratio))

    del train_padded_data_1
    del train_padded_data_2
    gc.collect()

    train_data_1, val_data_1 = train_data_1_shuffled[:-dev_idx], train_data_1_shuffled[-dev_idx:]
    train_data_2, val_data_2 = train_data_2_shuffled[:-dev_idx], train_data_2_shuffled[-dev_idx:]
    labels_train, labels_val = train_labels_shuffled[:-dev_idx], train_labels_shuffled[-dev_idx:]
    leaks_train, leaks_val = leaks_shuffled[:-dev_idx], leaks_shuffled[-dev_idx:]

    return train_data_1, train_data_2, labels_train, leaks_train, val_data_1, val_data_2, labels_val, leaks_val


def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """
    Create the test dataset.

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuples of sentence pairs
        max_sequence_length (int): max sequence length of sentences to apply padding

    Returns:
        test_data_1 (np.array): padded input features for the test set from sentences1
        test_data_2 (np.array): padded input features for the test set from sentences2
        leaks_test (np.array): leaks features for the test data
    """
    test_sentences1 = [str(x[0]).lower() for x in test_sentences_pair]
    test_sentences2 = [str(x[1]).lower() for x in test_sentences_pair]
    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test
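

# --- Hypothetical usage sketch (not part of the original file): end-to-end run of this
# input pipeline on a toy corpus, assuming gensim < 4.0 as implied by the `size` argument
# used above. All sentences, labels and hyperparameters below are made up for illustration.
if __name__ == "__main__":
    sentence_pairs = [
        ("how old are you", "what is your age"),
        ("where do you live", "what do you do for a living"),
        ("is it raining", "is it raining outside"),
        ("what time is it", "do you like coffee"),
    ]
    labels = [1, 0, 1, 0]
    embedding_dim = 50
    max_sequence_length = 10
    validation_split_ratio = 0.25

    # Fit the tokenizer and word2vec model on all sentences from both sides of the pairs.
    all_sentences = [s for pair in sentence_pairs for s in pair]
    tokenizer, embedding_matrix = word_embed_meta_data(all_sentences, embedding_dim)

    # Build shuffled, padded train/validation splits plus the leaks features.
    (train_data_1, train_data_2, labels_train, leaks_train,
     val_data_1, val_data_2, labels_val, leaks_val) = create_train_dev_set(
        tokenizer, sentence_pairs, labels, max_sequence_length, validation_split_ratio)

    # Prepare unseen pairs with the same tokenizer and padding.
    test_pairs = [("is it snowing", "is it snowing outside")]
    test_data_1, test_data_2, leaks_test = create_test_data(tokenizer, test_pairs, max_sequence_length)

    print("train:", train_data_1.shape, "val:", val_data_1.shape, "test:", test_data_1.shape)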