File size: 6,561 Bytes
3b47f92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy as np 
import pandas as pd 
import random

from sklearn.feature_extraction.text import TfidfVectorizer #for TF-IDF
from sklearn.metrics.pairwise import linear_kernel
# Per-feature weights used when blending the individual similarity matrices
# into ``cosin_sim_final``.  The eighteen weights below sum to 1.00.
W_belongs_to_collection = 0.16
W_genres = 0.10
W_original_language = 0.01
W_title = 0.11
W_overview = 0.08
W_production_countries = 0.01
W_production_companies = 0.02
W_tagline = 0.10
W_keywords = 0.10
W_Director = 0.03
W_Writer = 0.02
W_Cast = 0.02
W_Top_Cast = 0.03
W_budget_categorized = 0.01
W_length = 0.02
W_average_vote_categorized = 0.08
W_count_vote_categorized = 0.07
W_era = 0.03

# (column in df_cbf_Q, weight) pairs, listed in the order in which they are
# vectorised and summed.  Keeping the order fixed keeps the floating-point
# summation order (and the final fitted state of ``tfidf``) stable.
FEATURE_WEIGHTS = (
  ('belongs_to_collection', W_belongs_to_collection),
  ('genres', W_genres),
  ('original_language', W_original_language),
  ('title', W_title),
  ('overview', W_overview),
  ('production_countries', W_production_countries),
  ('production_companies', W_production_companies),
  ('tagline', W_tagline),
  ('keywords', W_keywords),
  ('Director', W_Director),
  ('Writer', W_Writer),
  ('Cast', W_Cast),
  ('Top Cast', W_Top_Cast),
  ('budget_categorized', W_budget_categorized),
  ('Length', W_length),
  ('average_vote_categorized', W_average_vote_categorized),
  ('count_vote_categorized', W_count_vote_categorized),
  ('era', W_era),
)

# TF-IDF model that drops English stop words such as 'the', 'or', 'in'.
# The same instance is re-fitted for every feature column.
tfidf = TfidfVectorizer(stop_words='english')

df_popular_popularity = pd.read_csv('df_popular_popularity.csv', low_memory=False)
df_popular_WR_Q = pd.read_csv('df_popular_WR_Q.csv', low_memory=False)
df_cbf_Q = pd.read_csv('df_cbf_Q.csv', low_memory=False)

# TfidfVectorizer rejects NaN documents, so blank out missing values in every
# column that gets vectorised (previously only a handful were filled).
# 'spoken_languages' is not vectorised but is still filled, as before.
for _column in ('spoken_languages',) + tuple(name for name, _ in FEATURE_WEIGHTS):
  df_cbf_Q[_column] = df_cbf_Q[_column].fillna("")

# NOTE: these are aliases of df_cbf_Q, not copies - all three names refer to
# the same DataFrame object.
df_cbf1 = df_cbf_Q
df_cbf2 = df_cbf_Q


def _weighted_similarity(frame, feature_weights, vectorizer):
  """Return the weighted sum of per-column TF-IDF similarity matrices.

  For each ``(column, weight)`` pair the column is vectorised with
  ``vectorizer`` (re-fitted every time), the pairwise linear kernel of the
  resulting rows is computed, scaled by ``weight`` and added to a running
  total.  Only one per-feature matrix is alive at a time, instead of keeping
  all of them in memory until the final sum.

  Args:
    frame: DataFrame holding one text column per feature.
    feature_weights: iterable of ``(column_name, weight)`` pairs.
    vectorizer: object exposing ``fit_transform`` (here a TfidfVectorizer).

  Returns:
    A dense square array with one row/column per row of ``frame``, or
    ``None`` when ``feature_weights`` is empty.
  """
  combined = None
  for column, weight in feature_weights:
    vectors = vectorizer.fit_transform(frame[column])
    similarity = linear_kernel(vectors, vectors)
    similarity *= weight  # scale in place; the array is a fresh temporary
    if combined is None:
      combined = similarity
    else:
      combined += similarity
  return combined


# Blended similarity between every pair of rows of df_cbf1.
cosin_sim_final = _weighted_similarity(df_cbf1, FEATURE_WEIGHTS, tfidf)

# Maps a movie title to its row label in df_cbf2 (titles may repeat).
df_cbf2_indices = pd.Series(df_cbf2.index, index=df_cbf2['title'])



def final_recommender_hot_picks_now(Watched_movies_list, top_n=10):
  """Return the titles of the first ``top_n`` rows of ``df_popular_popularity``.

  The table is read as-is; its row order is presumably by current popularity,
  decided when the CSV was produced - verify upstream.

  Args:
    Watched_movies_list: accepted for interface parity with the other
      recommenders; it is not used here.
    top_n: maximum number of titles to return (default 10).

  Returns:
    A list of at most ``top_n`` titles.  Fewer are returned when the table
    is shorter, instead of raising ``KeyError`` on the missing rows.
  """
  return df_popular_popularity['title'].head(max(top_n, 0)).tolist()

# Recommend the top titles from the weighted-rating table.
def final_recommender_hot_picks_of_all_time(Watched_movies_list, top_n=10):
  """Return the titles of the first ``top_n`` rows of ``df_popular_WR_Q``.

  The table is read as-is; its row order is presumably by weighted rating,
  decided when the CSV was produced - verify upstream.

  Args:
    Watched_movies_list: accepted for interface parity with the other
      recommenders; it is not used here.
    top_n: maximum number of titles to return (default 10).

  Returns:
    A list of at most ``top_n`` titles.  Fewer are returned when the table
    is shorter, instead of raising ``KeyError`` on the missing rows.
  """
  return df_popular_WR_Q['title'].head(max(top_n, 0)).tolist()

# Recommend titles by content-based similarity to the most recently watched movies.
def final_recommender_for_you(Watched_movies_list):
  """Recommend up to 15 titles similar to the three most recently watched.

  With fewer than three watched titles, the first ten titles of
  ``df_popular_WR_Q`` are used instead.  Otherwise, for each of the last
  three watched titles, the 15 rows with the highest ``cosin_sim_final``
  score (the watched row itself excluded) are collected.  The pooled titles
  are de-duplicated, shuffled and truncated to 15.

  Args:
    Watched_movies_list: titles the user has watched, oldest first.

  Returns:
    A shuffled list of at most 15 distinct titles.

  Notes:
    * Titles missing from ``df_cbf2_indices`` are skipped rather than
      raising ``KeyError``.  If none of the recent titles can be resolved,
      the popular fallback is used.
    * When a title occurs more than once in the catalogue, its first
      occurrence is used as the seed row.
  """
  pooled = []
  if len(Watched_movies_list) >= 3:
    for watched_title in Watched_movies_list[-3:]:
      match = df_cbf2_indices.get(watched_title)
      if match is None:
        continue  # title not in the catalogue
      # A duplicated title yields a Series of row labels; take the first.
      seed = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)
      scores = np.asarray(cosin_sim_final[seed])
      # Stable sort on the negated scores: highest first, ties by row order.
      ranked = np.argsort(-scores, kind='stable')
      # Drop the seed by index instead of assuming it is ranked first.
      neighbours = ranked[ranked != seed][:15]
      pooled.extend(df_cbf2.loc[neighbours, 'title'].tolist())
  if not pooled:
    pooled = df_popular_WR_Q['title'].head(10).tolist()
  # dict.fromkeys de-duplicates while keeping a deterministic order, so the
  # shuffle below is reproducible under a seeded ``random``.
  distinct = list(dict.fromkeys(pooled))
  random.shuffle(distinct)
  return distinct[:15]