File size: 5,267 Bytes

f5586d9

"""
This file manages the data preparation
"""
import numpy as np


def get_word2vec_matrix(total_words, index2word, word2vec, vector_size):
    """
    This function creates a matrix where the rows are the words and the columns represents the embedding vector.
    We will use this matrix in the embedding layer
    :param total_words: Number of words in our word2vec dictionary.
    :param index2word: dictionary maps between index and word
    :param word2vec: dictionary maps between a word and a vector
    :param vector_size: the size of the embedding vector size
    :return: embedding layer
    """
    word2vec_matrix = np.zeros((total_words, vector_size))
    for index_word, word in index2word.items():
        if word not in word2vec:
            print(f'Can not find the word "{word}" in the word2vec dictionary')
            continue
        else:
            vec = word2vec[word]
            word2vec_matrix[index_word] = vec
    return word2vec_matrix


def _create_sequences(encoded_lyrics_list, total_words, seq_length):
    """
    This function creates sequences from the lyrics
    :param encoded_lyrics_list: A list representing all the songs in the dataset (615 songs). Each cell contains a list
    of ints, where each int corresponds to the lyrics in that song. "I'm a barbie girl" --> [23, 52, 189, 792] etc.
    :param total_words: Number of words in our word2vec dictionary.
    :param seq_length: Number of words predating the word to be predicted.
    :return: (1) A numpy array containing all the sequences seen, concatenated.
             (2) A 2d numpy array where each row represents a word and the columns are the possible words in the
             vocabulary. There is a '1' in the corresponding word (e.g, word number '20,392' in the dataset is word
              number '39' in the vocab.
    """
    input_sequences = []
    next_words = []
    for song_sequence in encoded_lyrics_list:  # iterate over songs
        for i in range(seq_length, len(song_sequence)):  # iterate from minimal sequence length (number of words) to
            start_index = i - seq_length  # number of words in the song
            end_index = i
            # Slice the list into the desired sequence length
            sequence = song_sequence[start_index:end_index]
            input_sequences.append(sequence)
            next_word = song_sequence[end_index]
            next_words.append(next_word)
    input_sequences = np.array(input_sequences)
    one_hot_encoding_next_words = convert_to_one_hot_encoding(input_sequences, next_words, total_words)
    return input_sequences, one_hot_encoding_next_words


def convert_to_one_hot_encoding(input_sequences, next_words, total_words):
    """
    This function converts input to one hot encoding
    """
    one_hot_encoding_next_words = np.zeros((len(input_sequences), total_words), dtype=np.int8)
    for word_index, word in enumerate(next_words):
        one_hot_encoding_next_words[word_index, word] = 1
    return one_hot_encoding_next_words


def create_sets(train_encoded_lyrics_list, test_encoded_lyrics_list, total_words, seq_length, validation_set_size,
                seed):
    """
    This function splits training set to smaller training set and new validation set
    :param train_encoded_lyrics_list: list of sequences in the training set
    :param test_encoded_lyrics_list: list of sequences in the testing set
    :param total_words: total words in the lyrics
    :param seq_length: length of the sequence
    :param validation_set_size: percentage of the validation set
    :param seed: random state for the split
    :return: training/testing/validation set values and labels
    """
    x_train, y_train = _create_sequences(encoded_lyrics_list=train_encoded_lyrics_list,
                                         total_words=total_words, seq_length=seq_length)

    x_train, x_val = create_validation_set(data_to_split=x_train,
                                           val_data_percentage=validation_set_size,
                                           seed=seed)
    y_train, y_val = create_validation_set(data_to_split=y_train,
                                           val_data_percentage=validation_set_size,
                                           seed=seed)

    x_test, y_test = _create_sequences(encoded_lyrics_list=test_encoded_lyrics_list,
                                       total_words=total_words, seq_length=seq_length)

    return {'train': (x_train, y_train), 'validation': (x_val, y_val), 'test': (x_test, y_test)}


def create_validation_set(data_to_split, val_data_percentage, seed):
    """
    This function splits to training and validation set
    :param data_to_split: matrix where the rows are the sequences and the columns are the word indices
    :param val_data_percentage: percentage of the validation set
    :param seed: random state for the split
    :return: training and validation set
    """
    np.random.seed(seed=seed)
    np.random.shuffle(data_to_split)

    validation_ending_index = int(len(data_to_split) * val_data_percentage)
    validation_set = data_to_split[:validation_ending_index]
    data_to_split = data_to_split[validation_ending_index:]

    return data_to_split, validation_set


print('Loaded Successfully')