amirakhlaghiqqq committed: Update recomender.py
Commit 52160a9 · 1 parent: 68fec35

recomender.py  CHANGED  (+91 -11)
@@ -1,9 +1,21 @@
-import numpy as np
-import pandas as pd
+import numpy as np #to help us use numerical functions
+import pandas as pd #to help us use functions for dealing with dataframe
+import os #provides functions for creating and removing a directory
 import random
-
+import matplotlib.pyplot as plt
+from collections import defaultdict #data colector
+import surprise
+from surprise.reader import Reader
+from surprise import Dataset
+from surprise.model_selection import GridSearchCV
+from surprise.model_selection import cross_validate
+from surprise import SVD
+from surprise import NMF
 from sklearn.feature_extraction.text import TfidfVectorizer #for TF-IDF
 from sklearn.metrics.pairwise import linear_kernel
+
+
+
 W_belongs_to_collection = 0.16
 W_genres = 0.10
 W_original_language = 0.01

@@ -22,13 +34,15 @@ W_length = 0.02
 W_average_vote_categorized = 0.08
 W_count_vote_categorized = 0.07
 W_era = 0.03
+##################################################################
 tfidf = TfidfVectorizer(stop_words='english') #defining tfidf model which removes additional words such as 'the', 'or', 'in'

+movies_filename = pd.read_csv('movies_metadata.csv', low_memory = False)
+ratings_filename = pd.read_csv('ratings_small.csv', low_memory = False)
 df_popular_popularity = pd.read_csv('df_popular_popularity.csv', low_memory = False)
 df_popular_WR_Q = pd.read_csv('df_popular_WR_Q.csv', low_memory = False)
 df_cbf_Q = pd.read_csv('df_cbf_Q.csv', low_memory = False)

-
 df_cbf_Q['belongs_to_collection'] = df_cbf_Q['belongs_to_collection'].fillna("")
 df_cbf_Q['overview'] = df_cbf_Q['overview'].fillna("")
 df_cbf_Q['spoken_languages'] = df_cbf_Q['spoken_languages'].fillna("")

@@ -36,10 +50,33 @@ df_cbf_Q['tagline'] = df_cbf_Q['tagline'].fillna("")
 df_cbf_Q['Director'] = df_cbf_Q['Director'].fillna("")
 df_cbf_Q['Writer'] = df_cbf_Q['Writer'].fillna("")

-
 df_cbf1 = df_cbf_Q
 df_cbf2 = df_cbf_Q

+ratings = ratings_filename
+movie_md = movies_filename
+######################################################################
+
+# movie dataframe with votes more than 100
+movie_md = movie_md[movie_md['vote_count']>100]
+
+# removing user with below 10 votes
+ratings = ratings.groupby("userId").filter(lambda x: x['userId'].count() >= 10)
+
+# IDs of movies with count more than 100
+movie_ids = [int(x) for x in movie_md['id'].values]
+
+# Select ratings of movies with more than 100 counts
+ratings = ratings[ratings['movieId'].isin(movie_ids)]
+
+#holding only 1 millions of ratings
+### in case of not using ratings_small
+#ratings = ratings[:1000000]
+
+# Reset Index
+ratings.reset_index(inplace=True, drop=True)
+
+#############################################################################################

 df_cbf_tfidf_belongs_to_collection = tfidf.fit_transform(df_cbf1['belongs_to_collection'])
 cosine_sim_belongs_to_collection = linear_kernel(df_cbf_tfidf_belongs_to_collection, df_cbf_tfidf_belongs_to_collection)
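An aside on the pattern used throughout recomender.py: every cosine_sim_* matrix is built the same way, TF-IDF on one text column followed by linear_kernel on the result. Because TfidfVectorizer L2-normalises its rows by default (norm='l2'), those dot products are exactly cosine similarities, which is why linear_kernel can stand in for cosine_similarity here. A minimal, self-contained sketch of the pattern on toy strings (illustration only, not code from this commit):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import linear_kernel

    docs = ["toy story pixar", "pixar animation", "crime drama heat"]   # toy documents
    tfidf_demo = TfidfVectorizer(stop_words='english')
    matrix = tfidf_demo.fit_transform(docs)    # rows are unit-length (norm='l2' by default)
    sim = linear_kernel(matrix, matrix)        # dot products == cosine similarities here
    print(sim.round(2))                        # 3x3 symmetric matrix with 1.0 on the diagonal

The same reasoning applies to the weighted sum of similarity matrices in the next hunk.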
@@ -77,34 +114,39 @@ df_cbf_tfidf_count_vote_categorized = tfidf.fit_transform(df_cbf1['count_vote_categorized'])
 cosine_sim_count_vote_categorized = linear_kernel(df_cbf_tfidf_count_vote_categorized, df_cbf_tfidf_count_vote_categorized)
 df_cbf_tfidf_era = tfidf.fit_transform(df_cbf1['era'])
 cosine_sim_era = linear_kernel(df_cbf_tfidf_era, df_cbf_tfidf_era)
-
+####################################################################################################################################


 cosin_sim_final = np.multiply(cosine_sim_belongs_to_collection, W_belongs_to_collection) + np.multiply(cosine_sim_genres, W_genres) + np.multiply(cosine_sim_original_language, W_original_language) + np.multiply(cosine_sim_title, W_title) + np.multiply(cosine_sim_overview, W_overview) + np.multiply(cosine_sim_pruduction_countries, W_production_countries) + np.multiply(cosine_sim_pruduction_companies, W_production_companies) + np.multiply(cosine_sim_tagline, W_tagline) + np.multiply(cosine_sim_keywords, W_keywords) + np.multiply(cosine_sim_Director, W_Director) + np.multiply(cosine_sim_Writer, W_Writer) + np.multiply(cosine_sim_Cast, W_Cast) + np.multiply(cosine_sim_Top_Cast, W_Top_Cast) + np.multiply(cosine_sim_budget_categorized, W_budget_categorized) + np.multiply(cosine_sim_Length, W_length) + np.multiply(cosine_sim_average_vote_categorized, W_average_vote_categorized) + np.multiply(cosine_sim_count_vote_categorized, W_count_vote_categorized) + np.multiply(cosine_sim_era, W_era)
-df_cbf2_indices = pd.Series(df_cbf2.index, index=df_cbf2['title'])


+df_cbf2_indices = pd.Series(df_cbf2.index, index=df_cbf2['title'])
+#################################################################

-def final_recommender_hot_picks_now(Watched_movies_list):
+#recommend based on popularity
+def final_recommender_hot_picks_now(Watched_list):
     recommended_list = []
     for i in range(10):
         recommended_list.append(df_popular_popularity.loc[i, 'title'])
     return recommended_list

+
 #recommend based on weighted ratings
-def final_recommender_hot_picks_of_all_time(Watched_movies_list):
+def final_recommender_hot_picks_of_all_time(Watched_list):
     recommended_list = []
     for i in range(10):
         recommended_list.append(df_popular_WR_Q.loc[i, 'title'])
     return recommended_list

+
 #recommend based on content based
-def final_recommender_for_you(Watched_movies_list):
+def final_recommender_for_you(Watched_list):
     recommended_list = []
-    if len(Watched_movies_list) < 3:
+    if len(Watched_list) < 3:
         for i in range(10):
             recommended_list.append(df_popular_WR_Q.loc[i, 'title'])
     else:
+        Watched_movies_list = Watched_list[-3:]
         recently_watched = Watched_movies_list[-3:]
         for i in range(len(recently_watched)):
             y = df_cbf2_indices[recently_watched[i]]

@@ -114,8 +156,46 @@ def final_recommender_for_you(Watched_movies_list):
         k = [i[0] for i in z]
         for j in k:
             recommended_list.append(df_cbf2.loc[j, 'title'])
+        for i in range(len(Watched_movies_list)):
+            recommended_list.append(Watched_movies_list[i])
     recommended_list = list(set(recommended_list))
+    for i in Watched_list:
+        recommended_list.remove(i)
     random.shuffle(recommended_list)
     recommended_list = recommended_list[:15]
     return recommended_list

+
+
+
+def recommender_svd(watch_list):
+    df1 = ratings
+    for i in range(len(watch_list)):
+        df1 = df1.append({'userId' : int(ratings.loc[26123,'userId'])+1, 'movieId' : int(movie_md.loc[movie_md['title'] == watch_list[i], 'id']), 'rating' : 5, 'timestamp' : 0},
+                         ignore_index = True)
+
+    # Initialize a surprise reader object
+    reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5), skip_lines=1)
+    # Load the data
+    data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader=reader)
+    # Build trainset object(perform this only when you are using whole dataset to train)
+    trainset = data.build_full_trainset()
+    # Initialize model
+    svd = SVD()
+    # cross-validate
+    svd.fit(trainset)
+
+    recommendations = []
+    user_movie_interactions_matrix = df1.pivot(index='userId', columns='movieId', values='rating')
+
+    non_interacted_movies = user_movie_interactions_matrix.loc[int(ratings.loc[26123,'userId'])+1][user_movie_interactions_matrix.loc[int(ratings.loc[26123,'userId'])+1].isnull()].index.tolist()
+
+    for item_id in non_interacted_movies:
+        est = svd.predict(int(ratings.loc[26123,'userId'])+1, item_id).est
+        movie_name = movie_md[movie_md['id']==str(item_id)]['title'].values[0]
+        recommendations.append((movie_name, est))
+
+    recommendations.sort(key=lambda x: x[1], reverse=True)
+    recommendations = [x[0] for x in recommendations]
+    return recommendations[:15]
+####### #######################################################
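For orientation, a hypothetical usage sketch of the four entry points the updated recomender.py exposes. The import style and the example titles are assumptions, not something shown in this commit; importing the module runs all of the CSV loading and similarity computation above, and the titles must exist in the loaded data.

    import recomender                                    # the module updated by this commit

    watched = ["Toy Story", "Jumanji", "Heat"]           # hypothetical watch list of known titles

    print(recomender.final_recommender_hot_picks_now(watched))          # 10 most popular titles
    print(recomender.final_recommender_hot_picks_of_all_time(watched))  # 10 highest weighted-rating titles
    print(recomender.final_recommender_for_you(watched))                # up to 15 content-based picks
    print(recomender.recommender_svd(watched))                          # top 15 SVD collaborative-filtering picks

All four take the watch list for a uniform interface, although the two hot-picks functions do not use it.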
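A portability note on recommender_svd: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the loop that injects the synthetic user's ratings only runs on older pandas. A sketch of an equivalent construction with pd.concat, under the same assumptions as the function above (a watch_list of known titles and the same hard-coded user id offset); new_user_id and new_rows are illustrative names, not part of the commit:

    new_user_id = int(ratings.loc[26123, 'userId']) + 1          # same synthetic user id as in the function
    new_rows = pd.DataFrame([
        {'userId': new_user_id,
         'movieId': int(movie_md.loc[movie_md['title'] == title, 'id'].iloc[0]),
         'rating': 5,
         'timestamp': 0}
        for title in watch_list
    ])
    df1 = pd.concat([ratings, new_rows], ignore_index=True)      # replaces the df1.append(...) loop

Two smaller observations, offered tentatively: with Dataset.load_from_df only the reader's rating_scale is used (line_format, sep and skip_lines matter for load_from_file), and the SVD is fit on ratings rather than on df1, so the injected user is unknown to the model and its predictions fall back to un-personalised baseline estimates; fitting on df1[['userId', 'movieId', 'rating']] would let the watch list influence the learned factors.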