Yew Chong committed
Commit 15cf29b · 1 Parent(s): 7748238
README.md CHANGED
@@ -16,7 +16,21 @@ Hello, and welcome to our books recommendation project for BT5153!
 
 # Project Directory
 ## Front-end UI
- **To Add Frontend Here**
+ ### Book Recommendation Ensemble Model Interface
+
+ This interface generates recommendations, but only for a list of randomly sampled test users from our dataset.
+
+ The interface was built with Python 3.11.4; requirements are listed in `requirements.txt`.
+ Some requirements may be missing, so please install additional packages as needed.
+
+ All sub-models and the final ensemble classifier were trained in advance. They are included in the Data
+ folder.
+
+ All data used for live recommendation is in the Data folder. Since the Data folder is too large to be submitted,
+ we will submit a representative subset of the data.
+
+ Start the interface with `python -m flask run`.
+
 
 ## Source Code
 Code is stored under `./Books` as `.ipynb` files, named according to the order in which they should be run.
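
The paths the interface reads at startup are hard-coded in `main.py` and `model.py`. As a convenience, a pre-flight check along the following lines could confirm that the expected `Data` artifacts are present before launching Flask. This is only an illustrative sketch and not part of the commit; the path list is copied from the code below, so adjust it if your Data subset differs.

```python
# check_data.py - illustrative pre-flight check, not part of this commit.
# Paths mirror those read by main.py/model.py.
from pathlib import Path

EXPECTED = [
    "Data/books.csv",
    "Data/final_model.pkl",
    "Data/final_dataset/reviews_test.csv",
    "Data/final_dataset/users_test.csv",
    "Data/final_dataset/reviews_sub.csv",
    "Data/final_dataset/books_test.csv",
    "Data/Recommended Storage/cluster_books.npy",
    "Data/Recommended Storage/genres_books.npy",
    "Data/Recommended Storage/description_books.npy",
    "Data/Recommended Storage/reviews_books_new.npy",
    "Data/faiss_store",
]

missing = [p for p in EXPECTED if not Path(p).exists()]
if missing:
    print("Missing artifacts:")
    print("\n".join(f"  {p}" for p in missing))
else:
    print("All expected Data artifacts found; `python -m flask run` should work.")
```
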
main.py ADDED
@@ -0,0 +1,135 @@
+ import logging
+ from flask import Flask, render_template, request
+ from model import Model
+ import plotly.graph_objects as go
+
+ model = Model()
+ app = Flask(__name__)
+ logging.basicConfig(level=logging.DEBUG)
+ PRED_CACHE = dict()
+
+
+ @app.route('/')
+ def index():
+     return render_template('index.html')
+
+
+ @app.route('/test_users')
+ def test_users():
+     model.run_predictions_on_full_test()
+     model.prepare_user_details()
+
+     # Options for the dropdown menu
+     user_details = model.user_details['top_books'].to_dict()
+     return render_template('test_users.html', user_details=user_details)
+
+
+ @app.route('/test_users/<chosen_user>')
+ def process(chosen_user):
+     # Get book recommendations
+     if chosen_user in PRED_CACHE:
+         preds_df = PRED_CACHE[chosen_user]
+     else:
+         preds_df = model.get_user_predictions(chosen_user)
+         PRED_CACHE[chosen_user] = preds_df
+
+     if preds_df is None:
+         return "No predictions hit!"
+
+     # Get Pandas series of recommended books
+     recommended_books = preds_df.set_index('book_id')[['title_without_series', 'target', 'final_score']]
+     recommended_books['is_recommended'] = recommended_books['final_score'] >= 0.45
+
+     # Use Bootstrap's List to make a list of recommended books and a button for each book, routing to '/explain/book_id'
+     # Render the page with recommended books
+     return render_template(
+         'recommended_books.html',
+         chosen_user=chosen_user,
+         recommended_books=recommended_books
+     )
+
+
+ @app.route('/test_users/<chosen_user>/<int:chosen_book>')
+ def explain(chosen_user, chosen_book):
+     # Get book recommendations
+     # This should be a cache hit since we're coming from `process`, but we include the else path just in case
+     if chosen_user in PRED_CACHE:
+         preds_df = PRED_CACHE[chosen_user]
+     else:
+         preds_df = model.get_user_predictions(chosen_user)
+         PRED_CACHE[chosen_user] = preds_df
+
+     # Get Pandas series of recommended books
+     recommended_books = preds_df.set_index('book_id')[['title_without_series', 'target', 'final_score']]
+     recommended_books['is_recommended'] = recommended_books['final_score'] >= 0.45
+
+     # book_details = model.all_books[model.all_books['book_id'] == book_id]
+     logging.info(f"Generating explanation for user:{chosen_user}, book:{chosen_book}")
+
+     book_df = preds_df.set_index('book_id').loc[chosen_book]
+     waterfall_cols = [
+         'intercept',
+         'clus_score',
+         'gen_score',
+         'desc_score',
+         'rev_score',
+         'user_score',
+         'tit_score',
+         'final_score'
+     ]
+     waterfall_display_cols = [
+         'Intercept',
+         'Book Clustering Similarity',
+         'Genre Similarity',
+         'Description Topic Similarity',
+         'Review Vector Similarity',
+         'User Clustering Similarity',
+         'Title Vector Similarity',
+         'Sum of Sub-Model Scores'
+     ]
+     waterfall_data = book_df[waterfall_cols].tolist()
+     fig = go.Figure(
+         go.Waterfall(
+             name='Recommendation explanation',
+             orientation='h',
+             measure=['relative', 'relative', 'relative', 'relative', 'relative', 'relative', 'relative', 'total'],
+             y=waterfall_display_cols,
+             x=waterfall_data
+         )
+     )
+     fig_html = fig.to_html(full_html=False)
+
+     top_model_idx = waterfall_cols.index(book_df[waterfall_cols[:-1]].astype(float).idxmax())
+     top_model = waterfall_display_cols[top_model_idx]
+     explanation_str = f"The highest contributing model was {top_model}. "
+     if book_df['final_score'] >= 0.45:
+         reasons = [
+             '-',  # intercept
+             'it is similar to books you enjoyed in terms of book statistics like popularity and page count.',
+             'it is similar to books you enjoyed in terms of overlapping genres.',
+             'it is similar to books you enjoyed in terms of description similarity.',
+             'it is similar to books you enjoyed in terms of review similarity.',
+             'other users similar to you in taste enjoyed this book.',
+             'it is similar to books you enjoyed in terms of title similarity.',
+         ]
+         explanation_str += "This means that this book was recommended since "
+         explanation_str += reasons[top_model_idx]
+     else:
+         explanation_str += "However, the confidence score is below the threshold of 0.45, so it is not recommended."
+
+     score_sum = f"{sum(waterfall_data[:-1]):.5f}"
+     final_score = f"{book_df['final_score']:.5f}"
+     return render_template(
+         'recommended_books.html',
+         chosen_user=chosen_user,
+         recommended_books=recommended_books,
+         render_explanation='true',
+         fig=fig_html,
+         score_sum=score_sum,
+         final_score=final_score,
+         explanation_str=explanation_str
+     )
+
+
+ if __name__ == '__main__':
+     app.run(debug=True)
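
The routes above can be smoke-tested without a browser using Flask's built-in test client. A minimal sketch, assuming the `Data` artifacts are in place; the user ID below is a placeholder and should be replaced with one shown on the `/test_users` page:

```python
# smoke_test.py - illustrative only, not part of this commit.
from main import app

with app.test_client() as client:
    # Landing page should always render.
    assert client.get("/").status_code == 200

    # Triggers run_predictions_on_full_test(), which is slow on the first
    # run and cached to Data/cache.pkl afterwards.
    assert client.get("/test_users").status_code == 200

    # Placeholder ID: substitute a real user ID from the /test_users page.
    response = client.get("/test_users/some_user_id")
    print(response.status_code, len(response.data))
```
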
model.py ADDED
@@ -0,0 +1,309 @@
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+ import pickle
+ import os
+ from collections import defaultdict
+ import random
+ import warnings
+ import logging
+
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ from langchain_community.vectorstores import FAISS
+
+ warnings.filterwarnings("ignore")
+ random.seed(5153)
+ logging.basicConfig(level=logging.DEBUG)
+
+
+ class Model:
+     def __init__(self):
+         self.cache_path = "Data/cache.pkl"
+         self.is_loaded = False
+         self.dataset = None
+         self.predictions = None
+         self.user_details = None
+         self.temp_store = None
+         self.pipeline = None
+         self.chosen_books_per_user = None
+         self.all_books = pd.read_csv("Data/books.csv")
+         logging.info("Initialized model")
+
+     def run_predictions_on_full_test(self):
+         if self.is_loaded:
+             logging.info("Model is already loaded")
+             return
+         if self.does_cache_exist():
+             logging.info("Retrieving cached full-test predictions")
+             self.retrieve_cache()
+             logging.info("Completed full-test")
+             return
+         logging.info("Generating full-test predictions")
+         reviews_df = pd.read_csv("Data/final_dataset/reviews_test.csv")
+         good_reviews = reviews_df[reviews_df['rating'] > 3]
+         good_user_books_dict = good_reviews.groupby('user_id')['book_id'].unique().apply(list).to_dict()
+
+         # to further minimize compute time, we only use 20 (randomly sampled) users
+         num_random_users = 20
+         randomly_sampled_users = random.sample(list(good_user_books_dict.keys()), num_random_users)
+         sampled_good_user_books_dict = {user_id: good_user_books_dict[user_id] for user_id in randomly_sampled_users}
+
+         # to minimize compute time, we take only 150 random (good) books per user
+         # prepare it in the form of user_id -> list[book_id]
+         num_rand_books_per_user = 150
+         chosen_books_per_user = {
+             user_id: random.sample(books, min(len(books), num_rand_books_per_user))
+             for user_id, books in sampled_good_user_books_dict.items()
+         }
+
+         # save this for reference
+         self.chosen_books_per_user = chosen_books_per_user
+
+         # run predictions on all of the above users
+         self.prepare_predictions(chosen_books_per_user)
+         logging.info("Caching full-test predictions")
+         self.cache_results()
+         logging.info("Completed full-test")
+
+     def run_prediction_on_adhoc_user(self, chosen_book_ids):
+         self.prepare_predictions(
+             {'current_user': chosen_book_ids}
+         )
+
+     def prepare_predictions(self, target_users_and_books: dict[str, list[str]]):
+         """
+         Given a dictionary mapping user_id to list[book_id], where the book IDs are the books favored by the
+         associated user, compute the recommended books for each user provided in the dictionary.
+
+         :param target_users_and_books: Dictionary of user ID to favored books (as book IDs)
+         :return: None; results are stored in self.dataset and self.predictions, including individual model scores
+         """
+         target_user_list = list(target_users_and_books.keys())
+
+         file_dict = {}
+         for filename in ['reviews_test', 'users_test', 'reviews_sub']:
+             file_dict[filename] = pd.read_csv(f'Data/final_dataset/{filename}.csv')
+
+         file_dict['users'] = file_dict['users_test']
+         file_dict['reviews'] = file_dict['reviews_test']
+
+         file_dict['good_reviews'] = file_dict['reviews'][file_dict['reviews']['rating'] > 3]
+         file_dict['books'] = pd.read_csv('Data/books.csv')
+
+         #################################################################################
+         # GENRE MODEL; DESCRIPTION MODEL; TITLE MODEL; BOOK STATS CLUSTER MODEL
+         #################################################################################
+
+         clusterbooks = pd.DataFrame(
+             np.load('Data/Recommended Storage/cluster_books.npy', allow_pickle=True),
+             columns=['target_book', 'recco_book_id', 'similarity_score']).astype(float)  # wasn't saved as float
+         genrebooks = pd.DataFrame(
+             np.load('Data/Recommended Storage/genres_books.npy', allow_pickle=True),
+             columns=['target_book', 'recco_book_id', 'similarity_score'])
+         descbooks = pd.DataFrame(
+             np.load('Data/Recommended Storage/description_books.npy', allow_pickle=True),
+             columns=['target_book', 'recco_book_id', 'similarity_score'])
+         revbooks = pd.DataFrame(
+             np.load('Data/Recommended Storage/reviews_books_new.npy', allow_pickle=True),
+             columns=['target_book', 'recco_book_id', 'similarity_score'])
+
+         def optimized_converter(simbooks, user_id_list, name, prog_bar_description):
+             user_ratings_list = pd.DataFrame(columns=['user_id', 'recco_book_id', 'similarity_score'])
+             for curr_user_id in tqdm(user_id_list, desc=prog_bar_description):
+                 curr_user_books = pd.Series(target_users_and_books[curr_user_id])
+                 relevant_simbooks = simbooks[simbooks['target_book'].isin(curr_user_books)]
+                 summed_scores = relevant_simbooks.groupby('recco_book_id')['similarity_score'].sum().reset_index()
+                 summed_scores['user_id'] = curr_user_id
+                 if not curr_user_books.empty:
+                     summed_scores = summed_scores[~summed_scores['recco_book_id'].isin(curr_user_books)]
+                     # TODO: Think about how to adjust this for small numbers of books
+                     summed_scores['similarity_score'] /= len(curr_user_books)
+                 top_30 = summed_scores.nlargest(30, 'similarity_score')
+                 user_ratings_list = pd.concat([user_ratings_list, top_30], ignore_index=True)
+             return user_ratings_list.rename(columns={'recco_book_id': 'book_id', 'similarity_score': name})
+
+         genre_users = optimized_converter(genrebooks, target_user_list, 'gen_score', "Generating recs (genre)")
+         cluster_users = optimized_converter(clusterbooks, target_user_list, 'clus_score',
+                                             "Generating recs (book stats cluster)")
+         description_users = optimized_converter(descbooks, target_user_list, 'desc_score',
+                                                 "Generating recs (description)")
+         reviews_users = optimized_converter(revbooks, target_user_list, 'rev_score', "Generating recs (reviews)")
+
+         #################################################################################
+         # USER SIMILARITY CLUSTERING MODEL
+         #################################################################################
+
+         def jaccard_similarity_pandas(target_user, reviews_sub, n):
+             target_user_books = target_users_and_books[target_user]
+             relevant_reviews = reviews_sub[reviews_sub['book_id'].isin(target_user_books)]
+             intersections = relevant_reviews.groupby('user_id').size()
+             # all_books = pd.concat(
+             #     [df[df['user_id'] == target_user]['book_id'], reviews_sub['book_id']]).drop_duplicates()
+             user_book_counts = reviews_sub.groupby('user_id')['book_id'].nunique()
+             unions = len(target_user_books) + user_book_counts - intersections
+             jaccard_index = intersections / unions
+             top_n_users = jaccard_index.nlargest(n)
+             return top_n_users.reset_index().values.tolist()
+
+         def recommend_books(target_user_id, reviews_sub, num_books):
+             # df = reviews_sub[(reviews_sub['rating'].isin([4, 5]))]
+             top_n_similar_users = jaccard_similarity_pandas(target_user_id, reviews_sub, n=20)
+             target_user_books = target_users_and_books[target_user_id]
+             similar_users_reviews = reviews_sub[reviews_sub['user_id'].isin([user[0] for user in top_n_similar_users])]
+
+             recommended_books = defaultdict(float)
+             for curr_user_id, similarity_score in top_n_similar_users:
+                 user_reviews = similar_users_reviews[similar_users_reviews['user_id'] == curr_user_id]
+                 for _, row in user_reviews.iterrows():
+                     if row['book_id'] not in target_user_books:
+                         recommended_books[row['book_id']] += similarity_score
+
+             # Return top recommended books sorted by score
+             sorted_recommended_books = sorted(recommended_books.items(), key=lambda x: x[1], reverse=True)
+             return [(target_user_id, book_id, book_score) for book_id, book_score in
+                     sorted_recommended_books[:num_books]]
+
+         all_recommendations = []
+
+         for each_user_id in tqdm(target_user_list, desc="Generating recs (users)"):
+             recommendations = recommend_books(each_user_id, file_dict['reviews_sub'], 30)
+             all_recommendations.extend(recommendations)
+         user_users = pd.DataFrame(all_recommendations, columns=['user_id', 'book_id', 'user_score'])
+         user_users.head()
+
+         #################################################################################
+         # TITLE SIMILARITY MODEL
+         #################################################################################
+
+         store = FAISS.load_local(
+             "Data/faiss_store",
+             HuggingFaceBgeEmbeddings(
+                 model_kwargs={"device": "cpu"},
+                 encode_kwargs={"normalize_embeddings": True}
+             ),
+             allow_dangerous_deserialization=True
+         )
+
+         title_output = []
+         for user_id, books in tqdm(target_users_and_books.items(), desc="Generating recs (title)"):
+             user_book_id = target_users_and_books[user_id]
+             user_books = file_dict['books'][(file_dict['books']['book_id'].isin(user_book_id))]
+             titles = '\n'.join(user_books['title_without_series'])  # Using titles without series for queries
+             results = store.similarity_search_with_score(titles, k=80)
+             for result, score in results:
+                 # exclude books the user has already read (compare against the ID list, not the DataFrame)
+                 if result.metadata.get('book_id') not in user_book_id:
+                     title_output.append([user_id, result.metadata.get('book_id'), 1 - score])
+
+         # Save formatted
+         title_users = pd.DataFrame(title_output, columns=['user_id', 'book_id', 'tit_score'])
+
+         #################################################################################
+         # COMBINING MODEL OUTPUTS
+         #################################################################################
+
+         self.temp_store = {
+             'cluster': cluster_users,
+             'genre': genre_users,
+             'desc': description_users,
+             'reviews': reviews_users,
+             'users': user_users,
+             'title': title_users,
+         }
+
+         combined_df = pd.merge(cluster_users, genre_users, on=['user_id', 'book_id'], how='outer')
+         combined_df = pd.merge(combined_df, description_users, on=['user_id', 'book_id'], how='outer')
+         combined_df = pd.merge(combined_df, reviews_users, on=['user_id', 'book_id'], how='outer')
+         combined_df = pd.merge(combined_df, user_users, on=['user_id', 'book_id'], how='outer')
+         combined_df = pd.merge(combined_df, title_users, on=['user_id', 'book_id'], how='outer')
+
+         combined_df.fillna(0, inplace=True)
+         combined_df['book_id'] = combined_df['book_id'].astype(int)
+         combined_df['tit_score'] = combined_df['tit_score'].astype(float)
+
+         reviews_df = file_dict['reviews'][file_dict['reviews']['rating'].isin([1, 2, 3, 4, 5])]
+         reviews_filtered = reviews_df[['user_id', 'book_id', 'rating']]
+         combined_df = combined_df.merge(reviews_filtered, on=['user_id', 'book_id'], how='left')
+         combined_df.rename(columns={'rating': 'target'}, inplace=True)
+         combined_df['binary'] = np.where(combined_df['target'] >= 4, 1, 0)
+
+         # remove books which are not recommended at all
+         combined_df = combined_df[
+             (combined_df[['clus_score', 'gen_score', 'desc_score', 'rev_score', 'user_score', 'tit_score']] != 0).any(
+                 axis=1)]
+
+         with open("Data/final_model.pkl", 'rb') as file:
+             self.pipeline = pickle.load(file)
+
+         X_test = combined_df.drop(columns=['user_id', 'book_id', 'target', 'binary'])
+         predictions_df = combined_df[
+             ['user_id', 'book_id', 'clus_score', 'gen_score', 'desc_score', 'rev_score', 'user_score',
+              'tit_score', 'target', 'binary']].copy()
+         predictions_df['final_score'] = self.pipeline.predict_proba(X_test).T[1]
+         predictions_df['would_recommend'] = predictions_df['final_score'] >= 0.45  # peak F2 score at this threshold
+         predictions_df = predictions_df.sort_values(['user_id', 'final_score'], ascending=[True, False])
+
+         self.dataset = combined_df
+         self.predictions = predictions_df
+
+     def prepare_user_details(self):
+         users_list = self.dataset['user_id'].unique()
+
+         users_df = pd.read_csv("Data/final_dataset/users_test.csv")
+         books_df = pd.read_csv("Data/final_dataset/books_test.csv")
+
+         # filter to keep only relevant users
+         users_df = users_df[users_df['user_id'].isin(users_list)]
+         # merge to get book and review data
+         full_df = users_df.merge(books_df, on="user_id")
+
+         user_details = pd.DataFrame()
+         top_books_per_user = full_df.groupby("user_id").apply(
+             lambda x: x.sort_values('rating').nlargest(n=5, columns='rating')['title_without_series'].tolist())
+         user_details['top_books'] = top_books_per_user
+
+         self.user_details = user_details
+
+     def get_user_predictions(self, chosen_user):
+         logging.info(f"Generating predictions for user: {chosen_user}")
+         user_predictions = self.predictions[self.predictions['user_id'] == chosen_user]
+         user_predictions = user_predictions.dropna(subset=['target'])
+         if len(user_predictions) == 0:
+             logging.info("No predictions hit! Exiting early")
+             return None
+
+         # transform model scores using the pipeline (scaler + logistic regression coefficients)
+         # specifically, apply the scaler, then the linear layer of the logistic regression
+         model_score_cols = [c for c in user_predictions.columns if c.endswith('_score') and c != 'final_score']
+         scaled_model_scores = self.pipeline['scaler'].transform(user_predictions[model_score_cols])
+         multed_model_scores = scaled_model_scores * self.pipeline['classifier'].coef_[0]
+         final_model_scores = pd.DataFrame(multed_model_scores, columns=model_score_cols)
+         final_model_scores['intercept'] = self.pipeline['classifier'].intercept_[0]
+
+         columns = ['book_id', 'target', 'final_score', 'would_recommend']
+         predictions_and_score = pd.concat(
+             [user_predictions[columns].reset_index(drop=True), final_model_scores],
+             axis=1
+         )
+         return predictions_and_score.merge(self.all_books[['book_id', 'title_without_series']], on='book_id')
+
+     def cache_results(self):
+         with open(self.cache_path, 'wb+') as f:
+             to_pickle = dict()
+             to_pickle['dataset'] = self.dataset
+             to_pickle['predictions'] = self.predictions
+             to_pickle['temp_store'] = self.temp_store
+             to_pickle['pipeline'] = self.pipeline
+             to_pickle['chosen_books'] = self.chosen_books_per_user
+             # to_pickle['user_details'] = self.user_details
+             pickle.dump(to_pickle, f)
+         self.is_loaded = True
+
+     def does_cache_exist(self):
+         return os.path.exists(self.cache_path)
+
+     def retrieve_cache(self):
+         with open(self.cache_path, 'rb') as f:
+             unpickled = pickle.load(f)
+         for key, val in unpickled.items():
+             setattr(self, key, val)  # idiomatic replacement for exec(f"self.{key} = val")
+         self.is_loaded = True
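
For reference, `get_user_predictions` decomposes the ensemble output into per-model contributions: each sub-model score is scaled, multiplied by its logistic-regression coefficient, and combined with the intercept, so the confidence score is the sigmoid of their sum. A minimal sketch of that relationship, assuming `final_model.pkl` is a scikit-learn pipeline with steps named `scaler` and `classifier` as the code above expects (the function and variable names here are illustrative):

```python
import numpy as np

def confidence_from_scores(pipeline, raw_scores):
    """Reconstruct the ensemble confidence from the six raw sub-model scores.

    raw_scores: 1-D array in the order the pipeline was fitted on
    (clus_score, gen_score, desc_score, rev_score, user_score, tit_score).
    Returns (per-model contributions, intercept, confidence); for a linear
    scaler the confidence should match pipeline.predict_proba(...)[0, 1].
    """
    scaled = pipeline['scaler'].transform(np.asarray(raw_scores).reshape(1, -1))[0]
    contributions = scaled * pipeline['classifier'].coef_[0]
    intercept = pipeline['classifier'].intercept_[0]
    logit = contributions.sum() + intercept
    confidence = 1.0 / (1.0 + np.exp(-logit))  # sigmoid
    return contributions, intercept, confidence
```

The 0.45 threshold used throughout (`would_recommend`, `is_recommended`) is then applied to this confidence value.
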
templates/index.html ADDED
@@ -0,0 +1,17 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Book Recommender</title>
+     <!-- Bootstrap CSS -->
+     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
+ </head>
+ <body>
+     <div class="container">
+         <h1>Book Recommendation System</h1>
+         <p>Welcome! You can access the options below:</p>
+         <a href="/test_users" class="btn btn-primary">Go to Test User Set</a>
+     </div>
+     <!-- Bootstrap JS (Optional) -->
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
+ </body>
+ </html>
templates/recommended_books.html ADDED
@@ -0,0 +1,88 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Book Recommender</title>
+     <!-- The loading of KaTeX is deferred to speed up page rendering -->
+     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-wcIxkf4k558AjM3Yz3BBFQUbk/zgIYC2R0QpeeYb+TwlBVMrlgLqwRjRtGZiK7ww" crossorigin="anonymous">
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-hIoBPJpTUs74ddyc4bFZSM1TVlQDA60VBbJS0oA934VSz82sBx1X7kSx2ATBDIyd" crossorigin="anonymous"></script>
+     <!-- Bootstrap CSS -->
+     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
+ </head>
+ <body>
+     <div class="container">
+         <h1>User books breakdown</h1>
+         <h2>What is on this page?</h2>
+         <p>
+             The books below are books that have been recommended by at least one sub-model in our ensemble model for
+             the current user.
+             They are sorted by our ensemble model's confidence score. If our model is confident that the user will
+             enjoy the book, then the book is sorted towards the top.
+             <br/><br/>
+             These books have also already been read by this user. We can compare the user's actual rating to the
+             confidence score of our ensemble model to see how accurate our ensemble model's predictions are.
+             <br/><br/>
+             Click on the Explain button in blue to look at a breakdown of each sub-model's scores and how they
+             contributed to the ensemble model's final confidence score, and for a brief explanation of why this book
+             was recommended to the user.
+             <br/><br/>
+             <a href="/test_users">Click here to return to the list of users.</a>
+         </p>
+         <h2>Current user ID: {{ chosen_user }}</h2>
+         <table class="table table-striped" style="margin-top: 3em">
+             <thead class="thead-dark">
+                 <tr>
+                     <th>Title</th>
+                     <th>User's actual rating</th>
+                     <th>Prediction confidence score</th>
+                     <th>Recommended?</th>
+                     <th>Show explanation</th>
+                 </tr>
+             </thead>
+             <tbody>
+                 {% for book_id, book_data in recommended_books.iterrows() %}
+                 <tr>
+                     <td>{{ book_data['title_without_series'] }}</td>
+                     <td>{{ book_data['target'] }}</td>
+                     <td>{{ book_data['final_score'] }}</td>
+                     <td>{{ book_data['is_recommended'] }}</td>
+                     <td><a href="/test_users/{{ chosen_user }}/{{ book_id }}" class="btn btn-primary">Explain</a></td>
+                 </tr>
+                 {% endfor %}
+             </tbody>
+         </table>
+         {{ fig|safe }}
+         <div style="display: flex; flex-direction: column; justify-content: space-around; align-items: center">
+             <div id="ScoreSum" style="font-size: 1.5em"></div>
+             <div id="Formula" style="font-size: 1.5em"></div>
+             <p style="text-align: center; width: 50%; margin-top: 1em">
+                 {{ explanation_str }}
+             </p>
+         </div>
+     </div>
+     <script>
+         scoreSumDiv = document.getElementById("ScoreSum");
+         formulaDiv = document.getElementById("Formula");
+         // disgusting string-concatenating-Jinja-templating monster
+         // default() filters keep the script valid when the explanation variables are not passed in
+         if ({{ render_explanation|default('false') }}) {
+             const score_sum = {{ score_sum|default('0') }};
+             katex.render("\\text{Sum of sub-model scores}=" + score_sum, scoreSumDiv);
+
+             const start = "\\text{Confidence Score} = \\frac{1}{1+e^{-(";
+             const end = ")}} = ";
+             const threshold = "0.45";
+             const final_score = {{ final_score|default('0') }};
+             let conclusion;
+             if (parseFloat(final_score) >= parseFloat(threshold)) {
+                 conclusion = "\\ge " + threshold + " \\text{ (Recommended)}";
+             } else {
+                 conclusion = "\\lt " + threshold + " \\text{ (Not recommended)}";
+             }
+             const render_str = start + score_sum + end + final_score + conclusion;
+             katex.render(render_str, formulaDiv);
+         }
+     </script>
+     <!-- Bootstrap JS (Optional) -->
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
+
+ </body>
+ </html>
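
For reference, the relationship this template renders with KaTeX, writing s for the sum of the sub-model contributions shown in the waterfall (intercept included), is:

```latex
\text{Confidence Score} = \frac{1}{1 + e^{-s}},
\qquad \text{recommended} \iff \frac{1}{1 + e^{-s}} \ge 0.45
```
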
templates/test_users.html ADDED
@@ -0,0 +1,35 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Book Recommender</title>
+     <!-- Bootstrap CSS -->
+     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
+ </head>
+ <body>
+     <div class="container">
+         <h1>Select a User</h1>
+         <p>
+             We generated recommendations for each of the 20 test users below. In each card, you can see that user's
+             favourite titles (top 5, sorted by the user's rating) to give an idea of their taste profile. Click
+             on the button to see what we have recommended for this user.
+         </p>
+         <div class="row">
+             {% for user_id, top_books in user_details.items() %}
+             <div class="col-md-4 mb-4">
+                 <div class="card">
+                     <div class="card-body">
+                         <h5 class="card-title">User</h5>
+                         {% for title in top_books %}
+                         <h6 class="card-text">{{ title }}</h6>
+                         {% endfor %}
+                         <a href="/test_users/{{ user_id }}" class="btn btn-primary">Select</a>
+                     </div>
+                 </div>
+             </div>
+             {% endfor %}
+         </div>
+     </div>
+     <!-- Bootstrap JS (Optional) -->
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
+ </body>
+ </html>