amirakhlaghiqqq committed: Update recomender.py
Commit 52160a9 · 1 parent: 68fec35

recomender.py  CHANGED  (+91 -11)
@@ -1,9 +1,21 @@
-import numpy as np
-import pandas as pd
+import numpy as np #to help us use numerical functions
+import pandas as pd #to help us use functions for dealing with dataframe
+import os #provides functions for creating and removing a directory
 import random
-
+import matplotlib.pyplot as plt
+from collections import defaultdict #data colector
+import surprise
+from surprise.reader import Reader
+from surprise import Dataset
+from surprise.model_selection import GridSearchCV
+from surprise.model_selection import cross_validate
+from surprise import SVD
+from surprise import NMF
 from sklearn.feature_extraction.text import TfidfVectorizer #for TF-IDF
 from sklearn.metrics.pairwise import linear_kernel
+
+
+
 W_belongs_to_collection = 0.16
 W_genres = 0.10
 W_original_language = 0.01

@@ -22,13 +34,15 @@ W_length = 0.02
 W_average_vote_categorized = 0.08
 W_count_vote_categorized = 0.07
 W_era = 0.03
+##################################################################
 tfidf = TfidfVectorizer(stop_words='english') #defining tfidf model which removes additional words such as 'the', 'or', 'in'

+movies_filename = pd.read_csv('movies_metadata.csv', low_memory = False)
+ratings_filename = pd.read_csv('ratings_small.csv', low_memory = False)
 df_popular_popularity = pd.read_csv('df_popular_popularity.csv', low_memory = False)
 df_popular_WR_Q = pd.read_csv('df_popular_WR_Q.csv', low_memory = False)
 df_cbf_Q = pd.read_csv('df_cbf_Q.csv', low_memory = False)

-
 df_cbf_Q['belongs_to_collection'] = df_cbf_Q['belongs_to_collection'].fillna("")
 df_cbf_Q['overview'] = df_cbf_Q['overview'].fillna("")
 df_cbf_Q['spoken_languages'] = df_cbf_Q['spoken_languages'].fillna("")

@@ -36,10 +50,33 @@ df_cbf_Q['tagline'] = df_cbf_Q['tagline'].fillna("")
 df_cbf_Q['Director'] = df_cbf_Q['Director'].fillna("")
 df_cbf_Q['Writer'] = df_cbf_Q['Writer'].fillna("")

-
 df_cbf1 = df_cbf_Q
 df_cbf2 = df_cbf_Q

+ratings = ratings_filename
+movie_md = movies_filename
+######################################################################
+
+# movie dataframe with votes more than 100
+movie_md = movie_md[movie_md['vote_count']>100]
+
+# removing user with below 10 votes
+ratings = ratings.groupby("userId").filter(lambda x: x['userId'].count() >= 10)
+
+# IDs of movies with count more than 100
+movie_ids = [int(x) for x in movie_md['id'].values]
+
+# Select ratings of movies with more than 100 counts
+ratings = ratings[ratings['movieId'].isin(movie_ids)]
+
+#holding only 1 millions of ratings
+### in case of not using ratings_small
+#ratings = ratings[:1000000]
+
+# Reset Index
+ratings.reset_index(inplace=True, drop=True)
+
+#############################################################################################

 df_cbf_tfidf_belongs_to_collection = tfidf.fit_transform(df_cbf1['belongs_to_collection'])
 cosine_sim_belongs_to_collection = linear_kernel(df_cbf_tfidf_belongs_to_collection, df_cbf_tfidf_belongs_to_collection)
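An aside on the pattern used throughout recomender.py: every cosine_sim_* matrix is built the same way, TF-IDF on one text column followed by linear_kernel on the result. Because TfidfVectorizer L2-normalises its rows by default (norm='l2'), those dot products are exactly cosine similarities, which is why linear_kernel can stand in for cosine_similarity here. A minimal, self-contained sketch of the pattern on toy strings (illustration only, not code from this commit):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import linear_kernel

    docs = ["toy story pixar", "pixar animation", "crime drama heat"]   # toy documents
    tfidf_demo = TfidfVectorizer(stop_words='english')
    matrix = tfidf_demo.fit_transform(docs)    # rows are unit-length (norm='l2' by default)
    sim = linear_kernel(matrix, matrix)        # dot products == cosine similarities here
    print(sim.round(2))                        # 3x3 symmetric matrix with 1.0 on the diagonal

The same reasoning applies to the weighted sum of similarity matrices in the next hunk.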
@@ -77,34 +114,39 @@ df_cbf_tfidf_count_vote_categorized = tfidf.fit_transform(df_cbf1['count_vote_categorized'])
 cosine_sim_count_vote_categorized = linear_kernel(df_cbf_tfidf_count_vote_categorized, df_cbf_tfidf_count_vote_categorized)
 df_cbf_tfidf_era = tfidf.fit_transform(df_cbf1['era'])
 cosine_sim_era = linear_kernel(df_cbf_tfidf_era, df_cbf_tfidf_era)
-
+####################################################################################################################################


 cosin_sim_final = np.multiply(cosine_sim_belongs_to_collection, W_belongs_to_collection) + np.multiply(cosine_sim_genres, W_genres) + np.multiply(cosine_sim_original_language, W_original_language) + np.multiply(cosine_sim_title, W_title) + np.multiply(cosine_sim_overview, W_overview) + np.multiply(cosine_sim_pruduction_countries, W_production_countries) + np.multiply(cosine_sim_pruduction_companies, W_production_companies) + np.multiply(cosine_sim_tagline, W_tagline) + np.multiply(cosine_sim_keywords, W_keywords) + np.multiply(cosine_sim_Director, W_Director) + np.multiply(cosine_sim_Writer, W_Writer) + np.multiply(cosine_sim_Cast, W_Cast) + np.multiply(cosine_sim_Top_Cast, W_Top_Cast) + np.multiply(cosine_sim_budget_categorized, W_budget_categorized) + np.multiply(cosine_sim_Length, W_length) + np.multiply(cosine_sim_average_vote_categorized, W_average_vote_categorized) + np.multiply(cosine_sim_count_vote_categorized, W_count_vote_categorized) + np.multiply(cosine_sim_era, W_era)
-df_cbf2_indices = pd.Series(df_cbf2.index, index=df_cbf2['title'])


+df_cbf2_indices = pd.Series(df_cbf2.index, index=df_cbf2['title'])
+#################################################################

-def final_recommender_hot_picks_now(Watched_movies_list):
+#recommend based on popularity
+def final_recommender_hot_picks_now(Watched_list):
     recommended_list = []
     for i in range(10):
         recommended_list.append(df_popular_popularity.loc[i, 'title'])
     return recommended_list

+
 #recommend based on weighted ratings
-def final_recommender_hot_picks_of_all_time(Watched_movies_list):
+def final_recommender_hot_picks_of_all_time(Watched_list):
     recommended_list = []
     for i in range(10):
         recommended_list.append(df_popular_WR_Q.loc[i, 'title'])
     return recommended_list

+
 #recommend based on content based
-def final_recommender_for_you(Watched_movies_list):
+def final_recommender_for_you(Watched_list):
     recommended_list = []
-    if len(Watched_movies_list) < 3:
+    if len(Watched_list) < 3:
         for i in range(10):
             recommended_list.append(df_popular_WR_Q.loc[i, 'title'])
     else:
+        Watched_movies_list = Watched_list[-3:]
         recently_watched = Watched_movies_list[-3:]
         for i in range(len(recently_watched)):
             y = df_cbf2_indices[recently_watched[i]]

@@ -114,8 +156,46 @@ def final_recommender_for_you(Watched_movies_list):
         k = [i[0] for i in z]
         for j in k:
             recommended_list.append(df_cbf2.loc[j, 'title'])
+        for i in range(len(Watched_movies_list)):
+            recommended_list.append(Watched_movies_list[i])
     recommended_list = list(set(recommended_list))
+    for i in Watched_list:
+        recommended_list.remove(i)
     random.shuffle(recommended_list)
     recommended_list = recommended_list[:15]
     return recommended_list

+
+
+
+def recommender_svd(watch_list):
+    df1 = ratings
+    for i in range(len(watch_list)):
+        df1 = df1.append({'userId' : int(ratings.loc[26123,'userId'])+1, 'movieId' : int(movie_md.loc[movie_md['title'] == watch_list[i], 'id']), 'rating' : 5, 'timestamp' : 0},
+                         ignore_index = True)
+
+    # Initialize a surprise reader object
+    reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5), skip_lines=1)
+    # Load the data
+    data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader=reader)
+    # Build trainset object(perform this only when you are using whole dataset to train)
+    trainset = data.build_full_trainset()
+    # Initialize model
+    svd = SVD()
+    # cross-validate
+    svd.fit(trainset)
+
+    recommendations = []
+    user_movie_interactions_matrix = df1.pivot(index='userId', columns='movieId', values='rating')
+
+    non_interacted_movies = user_movie_interactions_matrix.loc[int(ratings.loc[26123,'userId'])+1][user_movie_interactions_matrix.loc[int(ratings.loc[26123,'userId'])+1].isnull()].index.tolist()
+
+    for item_id in non_interacted_movies:
+        est = svd.predict(int(ratings.loc[26123,'userId'])+1, item_id).est
+        movie_name = movie_md[movie_md['id']==str(item_id)]['title'].values[0]
+        recommendations.append((movie_name, est))
+
+    recommendations.sort(key=lambda x: x[1], reverse=True)
+    recommendations = [x[0] for x in recommendations]
+    return recommendations[:15]
+####### #######################################################
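For orientation, a hypothetical usage sketch of the four entry points the updated recomender.py exposes. The import style and the example titles are assumptions, not something shown in this commit; importing the module runs all of the CSV loading and similarity computation above, and the titles must exist in the loaded data.

    import recomender                                    # the module updated by this commit

    watched = ["Toy Story", "Jumanji", "Heat"]           # hypothetical watch list of known titles

    print(recomender.final_recommender_hot_picks_now(watched))          # 10 most popular titles
    print(recomender.final_recommender_hot_picks_of_all_time(watched))  # 10 highest weighted-rating titles
    print(recomender.final_recommender_for_you(watched))                # up to 15 content-based picks
    print(recomender.recommender_svd(watched))                          # top 15 SVD collaborative-filtering picks

All four take the watch list for a uniform interface, although the two hot-picks functions do not use it.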
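A portability note on recommender_svd: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the loop that injects the synthetic user's ratings only runs on older pandas. A sketch of an equivalent construction with pd.concat, under the same assumptions as the function above (a watch_list of known titles and the same hard-coded user id offset); new_user_id and new_rows are illustrative names, not part of the commit:

    new_user_id = int(ratings.loc[26123, 'userId']) + 1          # same synthetic user id as in the function
    new_rows = pd.DataFrame([
        {'userId': new_user_id,
         'movieId': int(movie_md.loc[movie_md['title'] == title, 'id'].iloc[0]),
         'rating': 5,
         'timestamp': 0}
        for title in watch_list
    ])
    df1 = pd.concat([ratings, new_rows], ignore_index=True)      # replaces the df1.append(...) loop

Two smaller observations, offered tentatively: with Dataset.load_from_df only the reader's rating_scale is used (line_format, sep and skip_lines matter for load_from_file), and the SVD is fit on ratings rather than on df1, so the injected user is unknown to the model and its predictions fall back to un-personalised baseline estimates; fitting on df1[['userId', 'movieId', 'rating']] would let the watch list influence the learned factors.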