Spaces:
Sleeping
Sleeping
File size: 9,894 Bytes
52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 3b47f92 52160a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
import numpy as np #to help us use numerical functions
import pandas as pd #to help us use functions for dealing with dataframe
import os #provides functions for creating and removing a directory
import random
from collections import defaultdict #data colector
import surprise
from surprise.reader import Reader
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import NMF
from sklearn.feature_extraction.text import TfidfVectorizer #for TF-IDF
from sklearn.metrics.pairwise import linear_kernel
# Per-feature weights applied to the individual similarity matrices when they
# are blended into the final content-based score (`cosin_sim_final`).
# The eighteen weights below add up to 1.00, so the blended score stays on the
# same scale as each individual similarity.
W_belongs_to_collection = 0.16
W_genres = 0.10
W_original_language = 0.01
W_title = 0.11
W_overview = 0.08
W_production_countries = 0.01
W_production_companies = 0.02
W_tagline = 0.10
W_keywords = 0.10
W_Director = 0.03
W_Writer = 0.02
W_Cast = 0.02
W_Top_Cast = 0.03
W_budget_categorized = 0.01
W_length = 0.02
W_average_vote_categorized = 0.08
W_count_vote_categorized = 0.07
W_era = 0.03
##################################################################
# TF-IDF vectorizer shared by every text feature; English stop words such as
# 'the', 'or', 'in' are dropped before vectorizing.
tfidf = TfidfVectorizer(stop_words='english')

# Raw inputs. Each frame is bound both to its historical "*_filename" name and
# to the working alias that the rest of the script filters and reads.
movies_filename = movie_md = pd.read_csv('movies_metadata.csv', low_memory=False)
ratings_filename = ratings = pd.read_csv('ratings_small.csv', low_memory=False)

# Pre-computed tables: the two popularity rankings and the content-based
# feature table.
df_popular_popularity = pd.read_csv('df_popular_popularity.csv', low_memory=False)
df_popular_WR_Q = pd.read_csv('df_popular_WR_Q.csv', low_memory=False)
df_cbf_Q = pd.read_csv('df_cbf_Q.csv', low_memory=False)

# Replace missing values in these text columns with empty strings.
for _blank_column in (
    'belongs_to_collection',
    'overview',
    'spoken_languages',
    'tagline',
    'Director',
    'Writer',
):
    df_cbf_Q[_blank_column] = df_cbf_Q[_blank_column].fillna("")

# Both names refer to the very same frame as df_cbf_Q (no copy is made).
df_cbf1 = df_cbf2 = df_cbf_Q
######################################################################
# Keep only the movies whose vote_count is greater than 100.
movie_md = movie_md.loc[movie_md['vote_count'] > 100]

# Drop every user that has fewer than 10 ratings: count the rows per userId
# and keep the rows whose user reaches the threshold.
ratings = ratings[ratings.groupby("userId")["userId"].transform("count") >= 10]

# Ids of the retained movies, converted to int so they can be compared with
# the movieId column of the ratings.
movie_ids = list(map(int, movie_md['id'].values))

# Keep only ratings of the retained movies and renumber the rows from 0.
# (When the full ratings file is used instead of ratings_small, the ratings
# can additionally be capped here, e.g. ratings = ratings[:1000000].)
ratings = ratings[ratings['movieId'].isin(movie_ids)].reset_index(drop=True)
#############################################################################################
# (column of df_cbf1, weight) for every feature that contributes to the blended
# content-based similarity. The order is the order in which the weighted
# matrices are summed.
_WEIGHTED_FEATURES = (
    ('belongs_to_collection', W_belongs_to_collection),
    ('genres', W_genres),
    ('original_language', W_original_language),
    ('title', W_title),
    ('overview', W_overview),
    ('production_countries', W_production_countries),
    ('production_companies', W_production_companies),
    ('tagline', W_tagline),
    ('keywords', W_keywords),
    ('Director', W_Director),
    ('Writer', W_Writer),
    ('Cast', W_Cast),
    ('Top Cast', W_Top_Cast),
    ('budget_categorized', W_budget_categorized),
    ('Length', W_length),
    ('average_vote_categorized', W_average_vote_categorized),
    ('count_vote_categorized', W_count_vote_categorized),
    ('era', W_era),
)


def _weighted_similarity(frame, weighted_features, vectorizer):
    """Return the weighted sum of per-column TF-IDF similarity matrices.

    Args:
        frame: DataFrame holding one text column per feature.
        weighted_features: Iterable of ``(column_name, weight)`` pairs.
        vectorizer: TF-IDF vectorizer; it is re-fitted on every column.

    Returns:
        A dense square matrix with one row/column per row of ``frame``, or
        ``None`` when ``weighted_features`` is empty.

    Only one per-feature matrix is alive at a time: each one is scaled and
    added into the running total in place instead of keeping all of them in
    memory until the final sum.
    """
    total = None
    for column, weight in weighted_features:
        # Missing values would make the vectorizer fail, so treat them as
        # empty documents (a no-op for columns that were already blanked).
        vectors = vectorizer.fit_transform(frame[column].fillna(""))
        # TfidfVectorizer L2-normalises rows by default, so the linear kernel
        # of the vectors is their cosine similarity.
        similarity = linear_kernel(vectors, vectors)
        similarity *= weight
        if total is None:
            total = similarity
        else:
            total += similarity
    return total


# Blended content-based similarity between every pair of movies in df_cbf1.
cosin_sim_final = _weighted_similarity(df_cbf1, _WEIGHTED_FEATURES, tfidf)

# Lookup from movie title to the row label of that movie in df_cbf2.
df_cbf2_indices = pd.Series(df_cbf2.index, index=df_cbf2['title'])
#################################################################
#recommend based on popularity
def final_recommender_hot_picks_now(Watched_list):
    """Return the titles in the first ten rows of the popularity table.

    Args:
        Watched_list: Not used by this recommender; the parameter is kept so
            the function has the same signature as the other recommenders.

    Returns:
        list: Up to ten titles from ``df_popular_popularity`` (presumably
        already sorted by popularity -- confirm against the code that writes
        the CSV). Fewer than ten are returned when the table is shorter.
    """
    # head() is positional, so a short table yields a short list instead of
    # the KeyError that label lookups 0..9 would raise.
    return df_popular_popularity['title'].head(10).tolist()
#recommend based on weighted ratings
def final_recommender_hot_picks_of_all_time(Watched_list):
    """Return the titles in the first ten rows of the weighted-rating table.

    Args:
        Watched_list: Not used by this recommender; the parameter is kept so
            the function has the same signature as the other recommenders.

    Returns:
        list: Up to ten titles from ``df_popular_WR_Q`` (presumably already
        sorted by weighted rating -- confirm against the code that writes the
        CSV). Fewer than ten are returned when the table is shorter.
    """
    # head() is positional, so a short table yields a short list instead of
    # the KeyError that label lookups 0..9 would raise.
    return df_popular_WR_Q['title'].head(10).tolist()
# recommend using content-based similarity to the most recently watched titles
def final_recommender_for_you(Watched_list):
    """Recommend titles similar to the three most recently watched movies.

    Args:
        Watched_list: Sequence of watched movie titles, oldest first.

    Returns:
        list: With fewer than three watched titles, up to ten titles from the
        first rows of ``df_popular_WR_Q``. Otherwise up to fifteen titles, in
        random order, drawn from the fifteen nearest neighbours (by
        ``cosin_sim_final``) of each of the last three watched titles, with
        every watched title excluded.

    Watched titles that are not present in the catalogue are skipped.
    """
    # Too little history for content-based recommendations: fall back to the
    # weighted-rating ranking.
    if len(Watched_list) < 3:
        return df_popular_WR_Q['title'].head(10).tolist()

    watched = set(Watched_list)
    candidates = set()
    for title in Watched_list[-3:]:
        if title not in df_cbf2_indices.index:
            continue
        position = df_cbf2_indices[title]
        # Several catalogue rows can share a title; the lookup then returns a
        # Series, so use the first matching row.
        if isinstance(position, pd.Series):
            position = position.iloc[0]
        scores = cosin_sim_final[position]
        # Rank all movies by similarity, best first (ties keep row order).
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        # The top entry is normally the movie itself, so take ranks 1..15.
        for neighbour in ranked[1:16]:
            candidates.add(df_cbf2.loc[neighbour, 'title'])

    # Filter instead of list.remove(): older or repeated watched titles that
    # are not among the candidates must not raise ValueError.
    recommended_list = [title for title in candidates if title not in watched]
    random.shuffle(recommended_list)
    return recommended_list[:15]
def recommender_svd(watch_list):
    """Recommend movies for a new user with an SVD collaborative-filtering model.

    The watched titles are added to the ratings as 5-star ratings from a new
    user, an SVD model is trained on the combined ratings, and the movies the
    new user has not rated are ranked by predicted rating.

    Args:
        watch_list: Iterable of watched movie titles. Titles not found in
            ``movie_md`` and repeated titles are ignored.

    Returns:
        list: Titles of up to fifteen movies with the highest predicted rating.

    Note:
        A fresh model is trained on every call.
    """
    # A user id that cannot collide with an existing one.
    new_user_id = int(ratings['userId'].max()) + 1

    # One 5-star rating per distinct watched movie that exists in the catalogue.
    new_rows = []
    rated_ids = set()
    for title in watch_list:
        matches = movie_md.loc[movie_md['title'] == title, 'id']
        if matches.empty:
            continue
        # Several catalogue rows can share a title; use the first one.
        movie_id = int(matches.iloc[0])
        if movie_id in rated_ids:
            continue
        rated_ids.add(movie_id)
        new_rows.append(
            {'userId': new_user_id, 'movieId': movie_id, 'rating': 5, 'timestamp': 0}
        )

    df1 = ratings
    if new_rows:
        df1 = pd.concat([ratings, pd.DataFrame(new_rows)], ignore_index=True)

    # Train on the combined ratings so the new user's ratings influence the
    # model. Only rating_scale matters when loading from a DataFrame.
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(df1[['userId', 'movieId', 'rating']], reader=reader)
    # The whole dataset is used for training (no hold-out split).
    trainset = data.build_full_trainset()
    svd = SVD()
    svd.fit(trainset)

    # movie_md ids are compared as strings; build the id -> title lookup once,
    # keeping the first title for a repeated id.
    title_by_id = {}
    for movie_id, title in zip(movie_md['id'], movie_md['title']):
        title_by_id.setdefault(str(movie_id), title)

    # Every rated movie that the new user has not rated, in ascending id order.
    candidate_ids = sorted(set(df1['movieId'].tolist()) - rated_ids)

    recommendations = []
    for item_id in candidate_ids:
        movie_name = title_by_id.get(str(item_id))
        if movie_name is None:
            continue
        est = svd.predict(new_user_id, item_id).est
        recommendations.append((movie_name, est))

    recommendations.sort(key=lambda pair: pair[1], reverse=True)
    return [movie_name for movie_name, _ in recommendations[:15]]
####### #######################################################
|