rnn-embeddings-songs / project /models /rnn.py

Igor Santana

rnn model sent from github to huggingface

9c58361 about 2 years ago

6.57 kB

	from os.path import exists
	from keras.utils import to_categorical
	from keras.models import Model
	from keras.layers import Embedding, LSTM, Dense, CuDNNGRU, LSTM, Input, Bidirectional, Dropout, Concatenate, Bidirectional
	from keras.models import Sequential, load_model
	from keras.callbacks import EarlyStopping, ModelCheckpoint
	from keras.preprocessing.sequence import TimeseriesGenerator
	import concurrent.futures as fut
	import os
	import gc
	import keras
	import pickle
	import time
	import numpy as np
	import pickle as pk
	import pandas as pd
	import tensorflow as tf
	import matplotlib.pyplot as plt
	from math import floor


	tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
	os.environ['TF_CPP_MIN_LOG_LEVEL'] = '4'

	def get_window(playlist, ix, window):
	el = playlist[ix]

	# This is the perfect case:
	if (ix - window >= 0) and (ix + window + 1) < len(playlist):
	window = playlist[ix - window:ix] + playlist[ix + 1:(ix + 1) + window]
	return window

	# Not running into the perfect case, will turn into the damage reduction clause:
	b4 = []
	after = []
	# If the problem is in the before clause, prepend the song until it mets the window size.
	if (ix - window < 0):
	b4 = (abs(ix - window) * ['0']) + playlist[0:ix]
	else:
	b4 = playlist[ix - window:ix]
	# If the problem is in the after clause, append the song until it mets the window size.

	if (ix + window + 1) > len(playlist):
	num = (ix + window + 1) - len(playlist)
	after = playlist[ix + 1:ix + window + 1] + (num * ['0'])
	else:
	after = playlist[ix + 1:(ix + 1) + window]

	return b4 + after


	def window_seqs(sequence, w_size):
	ix = 0
	max_ix = (len(sequence) - 1) - w_size
	x = []
	y = []
	while ix < max_ix:
	x.append(sequence[ix:ix+w_size])
	y.append([sequence[ix+w_size]])
	ix+=1
	return x, y

	def rnn(df, DS, MODEL, W_SIZE, EPOCHS, BATCH_SIZE, EMBEDDING_DIM, NUM_UNITS, BIDIRECTIONAL):
	pwd = 'dataset/{}/'.format(DS)
	WINDOW = W_SIZE * 2

	vocab = sorted(set(df.song.unique().tolist()))
	vocab_size = len(vocab) +1
	song2ix = {u:i for i, u in enumerate(vocab, 1)}
	pickle.dump(song2ix, open('{}_song2ix.pickle'.format(DS), 'wb'), pickle.HIGHEST_PROTOCOL)


	if not exists(pwd + 'song_context_{}.txt'.format(W_SIZE)):
	df['song'] = df.song.apply(lambda song: song2ix[song])
	u_playlists = df[['user', 'song']].groupby('user').agg(tuple)['song'].values
	u_playlists = [list(p) for p in u_playlists]
	s_playlists = df[['session', 'song']].groupby('session').agg(tuple)['song'].values
	s_playlists = [list(p) for p in s_playlists]

	nou_playlists = len(u_playlists)
	nos_playlists = len(s_playlists)

	user_windows = dict()
	session_windows = dict()


	for song in vocab:
	user_windows[song2ix[song]] = []
	session_windows[song2ix[song]] = []

	k4 = 1
	for pl in u_playlists:
	print('[{}/{}] [USER] Playlist'.format(k4, nou_playlists), flush=False, end='\r')
	k4+=1
	ixes = range(0, len(pl))
	s_windows = [(pl[ix], get_window(pl, ix, W_SIZE)) for ix in ixes]
	for song, window in s_windows:
	user_windows[song].append(window)
	print()
	k4 = 1
	for pl in s_playlists:
	print('[{}/{}] [SESSION] Playlist'.format(k4, nos_playlists), flush=False, end='\r')
	k4+=1
	ixes = range(0, len(pl))
	s_windows = [(pl[ix], get_window(pl, ix, W_SIZE)) for ix in ixes]
	for song, window in s_windows:
	session_windows[song].append(window)
	print()

	f = open(pwd + 'song_context_{}.txt'.format(W_SIZE), 'w')
	for song in vocab:
	u_occurrences = user_windows[song2ix[song]]
	s_occurrences = session_windows[song2ix[song]]
	for u_o, s_o in zip(u_occurrences, s_occurrences):
	print('{}\t{}\t{}'.format(','.join([str(i) for i in u_o]), ','.join([str(i) for i in s_o]), str(song2ix[song])), file=f)
	f.close()

	f = open(pwd + 'song_context_{}.txt'.format(W_SIZE), mode='r')

	data = []
	for line in f:
	line = line.replace('\n', '')
	input_user, input_session, target = line.split('\t')
	line = [np.array([int(x) for x in input_user.split(',')]), np.array([int(x) for x in input_session.split(',')]), int(target)]
	data.append(line)

	data = np.vstack(data)

	np.random.shuffle(data)

	def batch(data, bs):
	while True:
	for ix in range(0, len(data), bs):
	u_input = data[ix:ix+bs,0]
	s_input = data[ix:ix+bs,1]
	target = data[ix:ix+bs,2]
	yield [np.vstack(u_input), np.vstack(s_input)], to_categorical(target, num_classes=vocab_size)


	train, test = data[int(len(data) .20):], data[:int(len(data) .20)]

	input_session = Input(batch_shape=(None, WINDOW))
	embedding_session = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, name='Session_Embeddings', mask_zero=True)(input_session)
	drop_session = Dropout(0.2)(embedding_session)
	rec_session = LSTM(NUM_UNITS, name='Session_LSTM')(drop_session)
	drop_session = Dropout(0.2)(rec_session)

	input_user = Input(batch_shape=(None, WINDOW))
	embedding_user = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, name='User_Embeddings', mask_zero=True)(input_user)
	drop_user = Dropout(0.2)(embedding_user)
	rec_user = LSTM(NUM_UNITS, name='User_LSTM')(drop_user)
	drop_user = Dropout(0.2)(rec_user)
	combination = Concatenate()([drop_session, drop_user])
	dense = Dense(vocab_size, activation='softmax', name='Densa')(combination)
	model = Model(inputs=[input_session, input_user], outputs=dense)
	checkpoint = ModelCheckpoint('{}_model_checkpoint.h5'.format(DS), monitor='loss', verbose=0, save_best_only=False, period=1)
	es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)

	model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
	model.summary()

	if exists('{}_model_checkpoint.h5'.format(DS)):
	model = load_model('{}_model_checkpoint.h5'.format(DS))

	model.fit_generator(generator=batch(train, BATCH_SIZE), steps_per_epoch=len(train) // BATCH_SIZE, epochs=EPOCHS,
	validation_data=batch(test, BATCH_SIZE), validation_steps=len(test) // BATCH_SIZE, callbacks=[es, checkpoint])

	session_embeddings = model.get_layer('Session_Embeddings').get_weights()[0]
	user_embeddings = model.get_layer('User_Embeddings').get_weights()[0]

	u_emb = {}
	s_emb = {}

	for song in vocab:
	u_emb[song] = user_embeddings[song2ix[song]]
	s_emb[song] = session_embeddings[song2ix[song]]

	del model
	gc.collect()
	return u_emb, s_emb