amirakhlaghiqqq committed
Commit 52160a9
1 Parent(s): 68fec35

Update recomender.py

Files changed (1)
  1. recomender.py +91 -11
recomender.py CHANGED
@@ -1,9 +1,21 @@
- import numpy as np
- import pandas as pd
+ import numpy as np #to help us use numerical functions
+ import pandas as pd #to help us use functions for dealing with dataframes
+ import os #provides functions for creating and removing a directory
  import random
-
+ import matplotlib.pyplot as plt
+ from collections import defaultdict #data collector
+ import surprise
+ from surprise.reader import Reader
+ from surprise import Dataset
+ from surprise.model_selection import GridSearchCV
+ from surprise.model_selection import cross_validate
+ from surprise import SVD
+ from surprise import NMF
  from sklearn.feature_extraction.text import TfidfVectorizer #for TF-IDF
  from sklearn.metrics.pairwise import linear_kernel
+
+
+
  W_belongs_to_collection = 0.16
  W_genres = 0.10
  W_original_language = 0.01
@@ -22,13 +34,15 @@ W_length = 0.02
  W_average_vote_categorized = 0.08
  W_count_vote_categorized = 0.07
  W_era = 0.03
+ ##################################################################
  tfidf = TfidfVectorizer(stop_words='english') #defining tfidf model which removes additional words such as 'the', 'or', 'in'

+ movies_filename = pd.read_csv('movies_metadata.csv', low_memory = False)
+ ratings_filename = pd.read_csv('ratings_small.csv', low_memory = False)
  df_popular_popularity = pd.read_csv('df_popular_popularity.csv', low_memory = False)
  df_popular_WR_Q = pd.read_csv('df_popular_WR_Q.csv', low_memory = False)
  df_cbf_Q = pd.read_csv('df_cbf_Q.csv', low_memory = False)

-
  df_cbf_Q['belongs_to_collection'] = df_cbf_Q['belongs_to_collection'].fillna("")
  df_cbf_Q['overview'] = df_cbf_Q['overview'].fillna("")
  df_cbf_Q['spoken_languages'] = df_cbf_Q['spoken_languages'].fillna("")
@@ -36,10 +50,33 @@ df_cbf_Q['tagline'] = df_cbf_Q['tagline'].fillna("")
  df_cbf_Q['Director'] = df_cbf_Q['Director'].fillna("")
  df_cbf_Q['Writer'] = df_cbf_Q['Writer'].fillna("")

-
  df_cbf1 = df_cbf_Q
  df_cbf2 = df_cbf_Q

+ ratings = ratings_filename
+ movie_md = movies_filename
+ ######################################################################
+
+ # movie dataframe with votes more than 100
+ movie_md = movie_md[movie_md['vote_count']>100]
+
+ # removing users with fewer than 10 ratings
+ ratings = ratings.groupby("userId").filter(lambda x: x['userId'].count() >= 10)
+
+ # IDs of movies with count more than 100
+ movie_ids = [int(x) for x in movie_md['id'].values]
+
+ # Select ratings of movies with more than 100 counts
+ ratings = ratings[ratings['movieId'].isin(movie_ids)]
+
+ # keeping only the first 1 million ratings
+ ### in case of not using ratings_small
+ #ratings = ratings[:1000000]
+
+ # Reset index
+ ratings.reset_index(inplace=True, drop=True)
+
+ #############################################################################################

  df_cbf_tfidf_belongs_to_collection = tfidf.fit_transform(df_cbf1['belongs_to_collection'])
  cosine_sim_belongs_to_collection = linear_kernel(df_cbf_tfidf_belongs_to_collection, df_cbf_tfidf_belongs_to_collection)
@@ -77,34 +114,39 @@ df_cbf_tfidf_count_vote_categorized = tfidf.fit_transform(df_cbf1['count_vote_ca
  cosine_sim_count_vote_categorized = linear_kernel(df_cbf_tfidf_count_vote_categorized, df_cbf_tfidf_count_vote_categorized)
  df_cbf_tfidf_era = tfidf.fit_transform(df_cbf1['era'])
  cosine_sim_era = linear_kernel(df_cbf_tfidf_era, df_cbf_tfidf_era)
-
+ ####################################################################################################################################


  cosin_sim_final = np.multiply(cosine_sim_belongs_to_collection, W_belongs_to_collection) + np.multiply(cosine_sim_genres, W_genres) + np.multiply(cosine_sim_original_language, W_original_language) + np.multiply(cosine_sim_title, W_title) + np.multiply(cosine_sim_overview, W_overview) + np.multiply(cosine_sim_pruduction_countries, W_production_countries) + np.multiply(cosine_sim_pruduction_companies, W_production_companies) + np.multiply(cosine_sim_tagline, W_tagline) + np.multiply(cosine_sim_keywords, W_keywords) + np.multiply(cosine_sim_Director, W_Director) + np.multiply(cosine_sim_Writer, W_Writer) + np.multiply(cosine_sim_Cast, W_Cast) + np.multiply(cosine_sim_Top_Cast, W_Top_Cast) + np.multiply(cosine_sim_budget_categorized, W_budget_categorized) + np.multiply(cosine_sim_Length, W_length) + np.multiply(cosine_sim_average_vote_categorized, W_average_vote_categorized) + np.multiply(cosine_sim_count_vote_categorized, W_count_vote_categorized) + np.multiply(cosine_sim_era, W_era)
- df_cbf2_indices = pd.Series(df_cbf2.index, index=df_cbf2['title'])


+ df_cbf2_indices = pd.Series(df_cbf2.index, index=df_cbf2['title'])
+ #################################################################

- def final_recommender_hot_picks_now(Watched_movies_list):
+ #recommend based on popularity
+ def final_recommender_hot_picks_now(Watched_list):
      recommended_list = []
      for i in range(10):
          recommended_list.append(df_popular_popularity.loc[i, 'title'])
      return recommended_list

+
  #recommend based on weighted ratings
- def final_recommender_hot_picks_of_all_time(Watched_movies_list):
+ def final_recommender_hot_picks_of_all_time(Watched_list):
      recommended_list = []
      for i in range(10):
          recommended_list.append(df_popular_WR_Q.loc[i, 'title'])
      return recommended_list

+
  #recommend based on content based
- def final_recommender_for_you(Watched_movies_list):
+ def final_recommender_for_you(Watched_list):
      recommended_list = []
-     if len(Watched_movies_list) < 3:
+     if len(Watched_list) < 3:
          for i in range(10):
              recommended_list.append(df_popular_WR_Q.loc[i, 'title'])
      else:
+         Watched_movies_list = Watched_list[-3:]
          recently_watched = Watched_movies_list[-3:]
          for i in range(len(recently_watched)):
              y = df_cbf2_indices[recently_watched[i]]
@@ -114,8 +156,46 @@ def final_recommender_for_you(Watched_movies_list):
              k = [i[0] for i in z]
              for j in k:
                  recommended_list.append(df_cbf2.loc[j, 'title'])
+         for i in range(len(Watched_movies_list)):
+             recommended_list.append(Watched_movies_list[i])
          recommended_list = list(set(recommended_list))
+         for i in Watched_list:
+             recommended_list.remove(i)
          random.shuffle(recommended_list)
          recommended_list = recommended_list[:15]
      return recommended_list

+
+
+
+ def recommender_svd(watch_list):
+     df1 = ratings
+     for i in range(len(watch_list)):
+         df1 = df1.append({'userId' : int(ratings.loc[26123,'userId'])+1, 'movieId' : int(movie_md.loc[movie_md['title'] == watch_list[i], 'id']), 'rating' : 5, 'timestamp' : 0},
+                          ignore_index = True)
+
+     # Initialize a surprise reader object
+     reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5), skip_lines=1)
+     # Load the data
+     data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader=reader)
+     # Build trainset object (perform this only when you are using the whole dataset to train)
+     trainset = data.build_full_trainset()
+     # Initialize model
+     svd = SVD()
+     # Fit the model on the full trainset
+     svd.fit(trainset)
+
+     recommendations = []
+     user_movie_interactions_matrix = df1.pivot(index='userId', columns='movieId', values='rating')
+
+     non_interacted_movies = user_movie_interactions_matrix.loc[int(ratings.loc[26123,'userId'])+1][user_movie_interactions_matrix.loc[int(ratings.loc[26123,'userId'])+1].isnull()].index.tolist()
+
+     for item_id in non_interacted_movies:
+         est = svd.predict(int(ratings.loc[26123,'userId'])+1, item_id).est
+         movie_name = movie_md[movie_md['id']==str(item_id)]['title'].values[0]
+         recommendations.append((movie_name, est))
+
+     recommendations.sort(key=lambda x: x[1], reverse=True)
+     recommendations = [x[0] for x in recommendations]
+     return recommendations[:15]
+ ####### #######################################################
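
The content-based part of this script builds one TF-IDF/cosine-similarity matrix per feature and combines them with the W_* weights into cosin_sim_final. Below is a minimal sketch of that pattern; the toy dataframe, its columns, and the weights are hypothetical and stand in for the real df_cbf_Q.

# Minimal sketch of the per-feature TF-IDF similarity weighting behind cosin_sim_final.
# The toy dataframe, columns and weights here are illustrative, not the committed data.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

toy = pd.DataFrame({
    'title':    ['Toy Story', 'Toy Story 2', 'Heat'],
    'overview': ['toys come to life', 'the toys return', 'a heist in Los Angeles'],
    'genres':   ['Animation Family', 'Animation Family', 'Crime Thriller'],
})

tfidf = TfidfVectorizer(stop_words='english')

def feature_similarity(column):
    # TF-IDF matrix for one text column, then pairwise cosine similarity
    matrix = tfidf.fit_transform(toy[column])
    return linear_kernel(matrix, matrix)

# Weighted sum of the per-feature similarity matrices, as done for cosin_sim_final
W_overview, W_genres = 0.5, 0.5  # illustrative weights
sim_final = np.multiply(feature_similarity('overview'), W_overview) \
          + np.multiply(feature_similarity('genres'), W_genres)

print(np.round(sim_final, 2))  # 3x3 matrix; the two Toy Story rows score highest together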
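
Inside final_recommender_for_you, the committed lines that turn y (a title's row index) into the ranked list z are not part of this diff, so the exact lookup is not shown. The following is only a hedged sketch of the usual pattern against cosin_sim_final and df_cbf2_indices; similar_titles and its internals are assumptions, with the z/k names mirroring the function.

# Hedged sketch: mapping a title to its most similar rows via df_cbf2_indices and
# cosin_sim_final. This is an assumed pattern, not the code committed in this diff.
def similar_titles(title, top_n=5):
    y = df_cbf2_indices[title]                      # row index of the watched title
    scores = list(enumerate(cosin_sim_final[y]))    # (row index, similarity) pairs
    scores.sort(key=lambda pair: pair[1], reverse=True)
    z = scores[1:top_n + 1]                         # skip the title itself
    k = [i[0] for i in z]                           # same extraction as in the function
    return [df_cbf2.loc[j, 'title'] for j in k]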
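
recommender_svd relies on the scikit-surprise flow Reader, Dataset.load_from_df, build_full_trainset, SVD().fit, and predict. A minimal, self-contained sketch of that flow on a tiny hand-made ratings frame follows; the data is made up, while the committed function trains on the ratings loaded from ratings_small.csv.

# Minimal sketch of the scikit-surprise flow used by recommender_svd, on toy data.
import pandas as pd
from surprise import Dataset, SVD
from surprise.reader import Reader

tiny = pd.DataFrame({
    'userId':  [1, 1, 2, 2, 3],
    'movieId': [10, 20, 10, 30, 20],
    'rating':  [5.0, 3.0, 4.0, 2.0, 5.0],
})

reader = Reader(rating_scale=(0, 5))      # for load_from_df only the rating scale is needed
data = Dataset.load_from_df(tiny[['userId', 'movieId', 'rating']], reader=reader)
trainset = data.build_full_trainset()     # train on everything, as recommender_svd does

svd = SVD()
svd.fit(trainset)

# Estimate a rating for a (user, movie) pair, the same per-movie call
# recommender_svd makes before ranking candidates by .est
print(svd.predict(3, 10).est)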
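
A possible call pattern for the four recommenders after this change; the titles below are hypothetical and must exist in the underlying dataframes for the title-to-index and title-to-id lookups to succeed.

# Hypothetical watch history; each function returns a list of recommended titles.
watched = ['Toy Story', 'Jumanji', 'Heat']

print(final_recommender_hot_picks_now(watched))          # top 10 by popularity
print(final_recommender_hot_picks_of_all_time(watched))  # top 10 by weighted rating
print(final_recommender_for_you(watched))                # content-based, up to 15 titles
print(recommender_svd(watched))                          # SVD collaborative filtering, top 15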