Spaces:
Running
Running
Yew Chong
commited on
Commit
·
15cf29b
1
Parent(s):
7748238
frontend
Browse files- README.md +15 -1
- main.py +135 -0
- model.py +309 -0
- templates/index.html +17 -0
- templates/recommended_books.html +88 -0
- templates/test_users.html +35 -0
README.md
CHANGED
@@ -16,7 +16,21 @@ Hello, and welcome to our books recommendation project for BT5153!
|
|
16 |
|
17 |
# Project Directory
|
18 |
## Front-end UI
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
## Source Code
|
22 |
Codes are stored under `./Books` as `.ipynb` files, and named according to the order they should be run.
|
|
|
16 |
|
17 |
# Project Directory
|
18 |
## Front-end UI
|
19 |
+
### Book Recommendation Ensemble Model Interface
|
20 |
+
|
21 |
+
This interface generates recommendations, but only for a list of randomly sampled test users from our dataset.
|
22 |
+
|
23 |
+
This interface was created on Python version 3.11.4, with requirements listed in `requirements.txt`.
|
24 |
+
Some requirements may be missing from that list; please install any additional packages as needed.
|
25 |
+
|
26 |
+
All sub-models and the final ensemble classifier model were trained in advance. They are included inside the Data
|
27 |
+
folder.
|
28 |
+
|
29 |
+
All data used for live recommendation is in the Data folder. Since the Data folder is too large to be submitted,
|
30 |
+
we will submit a representative subset of the data.
|
31 |
+
|
32 |
+
Start the interface with `python -m flask run`.
|
33 |
+
|
34 |
|
35 |
## Source Code
|
36 |
The code is stored under `./Books` as `.ipynb` notebooks, named according to the order in which they should be run.
|
main.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from flask import Flask, render_template, request
|
3 |
+
from model import Model
|
4 |
+
import plotly.graph_objects as go
|
5 |
+
|
6 |
+
# Single shared recommender instance; its constructor reads Data/books.csv.
model = Model()
# Flask application object that the route decorators below attach to.
app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG)
# Per-process memo of user id -> predictions frame (or None when nothing hit).
PRED_CACHE = {}
|
10 |
+
|
11 |
+
|
12 |
+
@app.route('/')
def index():
    """Serve the landing page, which links to the test-user list."""
    landing_page = render_template('index.html')
    return landing_page
|
15 |
+
|
16 |
+
|
17 |
+
@app.route('/test_users')
def test_users():
    """Render one card per sampled test user, showing that user's top books.

    The full-test predictions are loaded (or generated) on first use. The
    per-user details are derived from CSV files, so they are computed once
    and reused on later requests instead of being rebuilt on every page load.
    """
    # No-op after the first successful call (the model tracks `is_loaded`).
    model.run_predictions_on_full_test()
    if model.user_details is None:
        model.prepare_user_details()

    # Mapping of user id -> list of that user's top-rated titles for the cards
    user_details = model.user_details['top_books'].to_dict()
    return render_template('test_users.html', user_details=user_details)
|
25 |
+
|
26 |
+
|
27 |
+
@app.route('/test_users/<chosen_user>')
def process(chosen_user):
    """List the books recommended for ``chosen_user`` with their scores.

    :param chosen_user: user id taken from the URL
    :return: the rendered recommendations table, or a plain message when the
        model produced no predictions for this user
    """
    # Get book recommendations (memoised per user for the process lifetime)
    if chosen_user in PRED_CACHE:
        preds_df = PRED_CACHE[chosen_user]
    else:
        # This URL can be opened directly, before '/test_users' has loaded
        # the predictions; make sure they exist first (no-op once loaded).
        model.run_predictions_on_full_test()
        preds_df = model.get_user_predictions(chosen_user)
        PRED_CACHE[chosen_user] = preds_df

    if preds_df is None:
        return "No predictions hit!"

    # Table of recommended books, indexed by book id. Copy so that adding the
    # derived column never writes into the cached frame.
    recommended_books = preds_df.set_index('book_id')[
        ['title_without_series', 'target', 'final_score']
    ].copy()
    recommended_books['is_recommended'] = recommended_books['final_score'] >= 0.45

    # The template renders one row per book with an "Explain" button that
    # routes to '/test_users/<chosen_user>/<book_id>'.
    return render_template(
        'recommended_books.html',
        chosen_user=chosen_user,
        recommended_books=recommended_books
    )
|
50 |
+
|
51 |
+
|
52 |
+
@app.route('/test_users/<chosen_user>/<int:chosen_book>')
def explain(chosen_user, chosen_book):
    """Render the recommendations table plus an explanation for one book.

    The explanation is a waterfall chart of each sub-model's contribution
    (as returned by ``model.get_user_predictions``) and a sentence naming the
    sub-model with the largest contribution.

    :param chosen_user: user id taken from the URL
    :param chosen_book: integer book id taken from the URL
    :return: the rendered page; a plain message when the user has no
        predictions; or a 404 response when the book is not among them
    """
    # This should be a cache hit since we normally arrive from `process`,
    # but the URL can also be opened directly.
    if chosen_user in PRED_CACHE:
        preds_df = PRED_CACHE[chosen_user]
    else:
        model.run_predictions_on_full_test()
        preds_df = model.get_user_predictions(chosen_user)
        PRED_CACHE[chosen_user] = preds_df

    # Same guard as `process`: the model returns None when nothing hit.
    if preds_df is None:
        return "No predictions hit!"

    preds_by_book = preds_df.set_index('book_id')
    if chosen_book not in preds_by_book.index:
        return f"No prediction found for book {chosen_book}.", 404

    recommended_books = preds_by_book[
        ['title_without_series', 'target', 'final_score']
    ].copy()
    recommended_books['is_recommended'] = recommended_books['final_score'] >= 0.45

    logging.info(f"Generating explanation for user:{chosen_user}, book:{chosen_book}")

    book_df = preds_by_book.loc[chosen_book]
    if book_df.ndim > 1:
        # Duplicate book ids make `.loc` return a frame; use the first row.
        book_df = book_df.iloc[0]

    waterfall_cols = [
        'intercept',
        'clus_score',
        'gen_score',
        'desc_score',
        'rev_score',
        'user_score',
        'tit_score',
        'final_score'
    ]
    waterfall_display_cols = [
        'Intercept',
        'Book Clustering Similarity',
        'Genre Similarity',
        'Description Topic Similarity',
        'Review Vector Similarity',
        'User Clustering Similarity',
        'Title Vector Similarity',
        'Sum of Sub-Model Scores'
    ]
    waterfall_data = book_df[waterfall_cols].tolist()
    fig = go.Figure(
        go.Waterfall(
            name='Recommendation explanation',
            orientation='h',
            measure=['relative'] * 7 + ['total'],
            y=waterfall_display_cols,
            x=waterfall_data
        )
    )
    fig_html = fig.to_html(full_html=False)

    # Pick the top contributor among the sub-models only: the intercept is a
    # constant offset, not a model, and has no human-readable reason.
    sub_model_cols = waterfall_cols[1:-1]
    top_col = book_df[sub_model_cols].astype(float).idxmax()
    top_model = waterfall_display_cols[waterfall_cols.index(top_col)]
    explanation_str = f"The highest contributing model was {top_model}. "
    if book_df['final_score'] >= 0.45:
        reasons = {
            'clus_score': 'it is similar to books you enjoyed in terms of book statistics like popularity and page count.',
            'gen_score': 'it is similar to books you enjoyed in terms of overlapping genres.',
            'desc_score': 'it is similar to books you enjoyed in terms of description similarity.',
            'rev_score': 'it is similar to books you enjoyed in terms of review similarity.',
            'user_score': 'other users similar to you in taste enjoyed this book.',
            'tit_score': 'it is similar to books you enjoyed in terms of title similarity.',
        }
        explanation_str += "This means that this book was recommended since "
        explanation_str += reasons[top_col]
    else:
        explanation_str += "However, the confidence score is below the threshold of 0.45, so it is not recommended."

    # Everything except the last entry (final_score) is summed for display.
    score_sum = f"{sum(waterfall_data[:-1]):.5f}"
    final_score = f"{book_df['final_score']:.5f}"
    return render_template(
        'recommended_books.html',
        chosen_user=chosen_user,
        recommended_books=recommended_books,
        render_explanation='true',
        fig=fig_html,
        score_sum=score_sum,
        final_score=final_score,
        explanation_str=explanation_str
    )
|
132 |
+
|
133 |
+
|
134 |
+
if __name__ == '__main__':
    # Direct-execution entry point (`python main.py`); the README starts the
    # app with `python -m flask run` instead.
    # NOTE(review): debug=True turns on Flask's debug mode - confirm this
    # path is never used for a publicly reachable deployment.
    app.run(debug=True)
|
model.py
ADDED
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
from tqdm import tqdm
|
4 |
+
import pickle
|
5 |
+
import os
|
6 |
+
from collections import defaultdict
|
7 |
+
import random
|
8 |
+
import warnings
|
9 |
+
import logging
|
10 |
+
|
11 |
+
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
12 |
+
from langchain_community.vectorstores import FAISS
|
13 |
+
|
14 |
+
# Silence every warning emitted by the libraries used below.
warnings.filterwarnings("ignore")
# Fixed seed so the random user/book sampling in Model is repeatable.
random.seed(5153)
logging.basicConfig(level=logging.DEBUG)
|
17 |
+
|
18 |
+
|
19 |
+
class Model:
    """Ensemble book recommender used by the Flask front-end.

    Six sub-model scores (book-stats cluster, genre, description, review,
    user similarity and title similarity) are combined per (user, book) pair
    and fed to a pickled pipeline that outputs a final confidence score.
    Results of the full test run are cached on disk at ``cache_path``.
    """

    def __init__(self):
        # Pickle file holding the outputs of a previous full-test run
        self.cache_path = "Data/cache.pkl"
        # True once predictions were generated or restored from the cache
        self.is_loaded = False
        self.dataset = None
        self.predictions = None
        self.user_details = None
        self.temp_store = None
        self.pipeline = None
        self.chosen_books_per_user = None
        self.all_books = pd.read_csv("Data/books.csv")
        logging.info("Initialized model")

    def run_predictions_on_full_test(self):
        """Load cached full-test predictions, or generate and cache them.

        Does nothing when predictions are already loaded in this instance.
        """
        if self.is_loaded:
            logging.info("Model is already loaded")
            return
        if self.does_cache_exist():
            logging.info("Retrieving cached full-test predictions")
            self.retrieve_cache()
            logging.info("Completed full-test")
            return
        logging.info("Generating full-test predictions")
        reviews_df = pd.read_csv("Data/final_dataset/reviews_test.csv")
        good_reviews = reviews_df[reviews_df['rating'] > 3]
        good_user_books_dict = good_reviews.groupby('user_id')['book_id'].unique().apply(list).to_dict()

        # to further minimize compute time, we only use 20 (randomly sampled)
        # users; cap at the number available so a small dataset does not make
        # random.sample raise ValueError
        all_user_ids = list(good_user_books_dict)
        num_random_users = min(20, len(all_user_ids))
        randomly_sampled_users = random.sample(all_user_ids, num_random_users)

        # to minimize compute time, we take only 150 random (good) books per
        # user, in the form of user_id -> list[book_id]
        num_rand_books_per_user = 150
        chosen_books_per_user = {}
        for user_id in randomly_sampled_users:
            books = good_user_books_dict[user_id]
            chosen_books_per_user[user_id] = random.sample(
                books, min(len(books), num_rand_books_per_user))

        # save this for reference
        self.chosen_books_per_user = chosen_books_per_user

        # run predictions on all of the above users
        self.prepare_predictions(chosen_books_per_user)
        logging.info("Caching full-test predictions")
        self.cache_results()
        logging.info("Completed full-test")

    def run_prediction_on_adhoc_user(self, chosen_book_ids):
        """Run the prediction pipeline for one ad-hoc user keyed 'current_user'.

        :param chosen_book_ids: book IDs favored by the ad-hoc user
        """
        self.prepare_predictions(
            {'current_user': chosen_book_ids}
        )

    def prepare_predictions(self, target_users_and_books: dict[str, list[str]]):
        """
        Given a dictionary of user_id to list[book_id], where the list of book IDs are the books favored by
        the associated user, score candidate books for each user with every sub-model and the final pipeline.

        Nothing is returned: the merged sub-model scores are stored in ``self.dataset``, the scored and sorted
        predictions in ``self.predictions``, the per-model frames in ``self.temp_store`` and the unpickled
        pipeline in ``self.pipeline``.

        :param target_users_and_books: Dictionary of user ID to favored books (as book IDs)
        """
        target_user_list = list(target_users_and_books.keys())

        file_dict = {}
        for filename in ['reviews_test', 'users_test', 'reviews_sub']:
            file_dict[filename] = pd.read_csv(f'Data/final_dataset/{filename}.csv')

        file_dict['users'] = file_dict['users_test']
        file_dict['reviews'] = file_dict['reviews_test']

        file_dict['good_reviews'] = file_dict['reviews'][file_dict['reviews']['rating'] > 3]
        file_dict['books'] = pd.read_csv('Data/books.csv')

        #################################################################################
        # BOOK STATS CLUSTER MODEL; GENRE MODEL; DESCRIPTION MODEL; REVIEW MODEL
        #################################################################################

        simbook_columns = ['target_book', 'recco_book_id', 'similarity_score']
        clusterbooks = pd.DataFrame(
            np.load('Data/Recommended Storage/cluster_books.npy', allow_pickle=True),
            columns=simbook_columns).astype(float)  # wasn't saved as float
        genrebooks = pd.DataFrame(
            np.load('Data/Recommended Storage/genres_books.npy', allow_pickle=True),
            columns=simbook_columns)
        descbooks = pd.DataFrame(
            np.load('Data/Recommended Storage/description_books.npy', allow_pickle=True),
            columns=simbook_columns)
        revbooks = pd.DataFrame(
            np.load('Data/Recommended Storage/reviews_books_new.npy', allow_pickle=True),
            columns=simbook_columns)

        def optimized_converter(simbooks, user_id_list, name, prog_bar_description):
            # For each user: sum the similarity of every candidate book to the
            # user's favored books, average it, and keep the 30 best candidates.
            user_ratings_list = pd.DataFrame(columns=['user_id', 'recco_book_id', 'similarity_score'])
            for curr_user_id in tqdm(user_id_list, desc=prog_bar_description):
                curr_user_books = pd.Series(target_users_and_books[curr_user_id])
                relevant_simbooks = simbooks[simbooks['target_book'].isin(curr_user_books)]
                summed_scores = relevant_simbooks.groupby('recco_book_id')['similarity_score'].sum().reset_index()
                summed_scores['user_id'] = curr_user_id
                if not curr_user_books.empty:
                    # never recommend a book the user already favored
                    summed_scores = summed_scores[~summed_scores['recco_book_id'].isin(curr_user_books)]
                    # TODO: Think about how to adjust this for small number of books
                    summed_scores['similarity_score'] /= len(curr_user_books)
                top_30 = summed_scores.nlargest(30, 'similarity_score')
                user_ratings_list = pd.concat([user_ratings_list, top_30], ignore_index=True)
            return user_ratings_list.rename(columns={'recco_book_id': 'book_id', 'similarity_score': name})

        genre_users = optimized_converter(genrebooks, target_user_list, 'gen_score', "Generating recs (genre)")
        cluster_users = optimized_converter(clusterbooks, target_user_list, 'clus_score',
                                            "Generating recs (book stats cluster)")
        description_users = optimized_converter(descbooks, target_user_list, 'desc_score',
                                                "Generating recs (description)")
        reviews_users = optimized_converter(revbooks, target_user_list, 'rev_score', "Generating recs (reviews)")

        #################################################################################
        # USER SIMILARITY CLUSTERING MODEL
        #################################################################################

        def jaccard_similarity_pandas(target_user, reviews_sub, n):
            # Jaccard index between the target user's favored books and the
            # books reviewed by every user in `reviews_sub`; returns the top
            # n as [user_id, score] pairs.
            target_user_books = target_users_and_books[target_user]
            relevant_reviews = reviews_sub[reviews_sub['book_id'].isin(target_user_books)]
            intersections = relevant_reviews.groupby('user_id').size()
            user_book_counts = reviews_sub.groupby('user_id')['book_id'].nunique()
            unions = len(target_user_books) + user_book_counts - intersections
            jaccard_index = intersections / unions
            top_n_users = jaccard_index.nlargest(n)
            return top_n_users.reset_index().values.tolist()

        def recommend_books(target_user_id, reviews_sub, num_books):
            # Score each book by the summed similarity of the similar users
            # who reviewed it, skipping books the target user already favored.
            top_n_similar_users = jaccard_similarity_pandas(target_user_id, reviews_sub, n=20)
            target_user_books = target_users_and_books[target_user_id]
            similar_users_reviews = reviews_sub[reviews_sub['user_id'].isin([user[0] for user in top_n_similar_users])]

            recommended_books = defaultdict(float)
            for curr_user_id, similarity_score in top_n_similar_users:
                user_reviews = similar_users_reviews[similar_users_reviews['user_id'] == curr_user_id]
                for _, row in user_reviews.iterrows():
                    if row['book_id'] not in target_user_books:
                        recommended_books[row['book_id']] += similarity_score

            # Return top recommended books sorted by score
            sorted_recommended_books = sorted(recommended_books.items(), key=lambda x: x[1], reverse=True)
            return [(target_user_id, book_id, book_score) for book_id, book_score in
                    sorted_recommended_books[:num_books]]

        all_recommendations = []
        for each_user_id in tqdm(target_user_list, desc="Generating recs (users)"):
            all_recommendations.extend(recommend_books(each_user_id, file_dict['reviews_sub'], 30))
        user_users = pd.DataFrame(all_recommendations, columns=['user_id', 'book_id', 'user_score'])

        #################################################################################
        # TITLE SIMILARITY MODEL
        #################################################################################

        store = FAISS.load_local(
            "Data/faiss_store",
            HuggingFaceBgeEmbeddings(
                model_kwargs={"device": "cpu"},
                encode_kwargs={"normalize_embeddings": True}
            ),
            allow_dangerous_deserialization=True
        )

        title_output = []
        for user_id, books in tqdm(target_users_and_books.items(), desc="Generating recs (title)"):
            user_books = file_dict['books'][file_dict['books']['book_id'].isin(books)]
            titles = '\n'.join(user_books['title_without_series'])  # Using titles without series for queries
            results = store.similarity_search_with_score(titles, k=80)
            # Compare against the user's book IDs. The previous test,
            # `book_id not in user_books`, checked the DataFrame's column
            # labels and therefore never excluded anything.
            already_favored = set(books)
            for result, score in results:
                candidate_id = result.metadata.get('book_id')
                if candidate_id not in already_favored:
                    title_output.append([user_id, candidate_id, 1 - score])

        # Save formatted
        title_users = pd.DataFrame(title_output, columns=['user_id', 'book_id', 'tit_score'])

        #################################################################################
        # COMBINING MODEL OUTPUTS
        #################################################################################

        self.temp_store = {
            'cluster': cluster_users,
            'genre': genre_users,
            'desc': description_users,
            'reviews': reviews_users,
            'users': user_users,
            'title': title_users,
        }

        # Outer-join every sub-model on (user, book); the merge order fixes
        # the feature column order handed to the pipeline below.
        combined_df = cluster_users
        for sub_model_df in (genre_users, description_users, reviews_users, user_users, title_users):
            combined_df = pd.merge(combined_df, sub_model_df, on=['user_id', 'book_id'], how='outer')

        combined_df.fillna(0, inplace=True)
        combined_df['book_id'] = combined_df['book_id'].astype(int)
        combined_df['tit_score'] = combined_df['tit_score'].astype(float)

        # Attach the user's actual rating (when one exists) as the target
        reviews_df = file_dict['reviews'][file_dict['reviews']['rating'].isin([1, 2, 3, 4, 5])]
        reviews_filtered = reviews_df[['user_id', 'book_id', 'rating']]
        combined_df = combined_df.merge(reviews_filtered, on=['user_id', 'book_id'], how='left')
        combined_df.rename(columns={'rating': 'target'}, inplace=True)
        combined_df['binary'] = np.where(combined_df['target'] >= 4, 1, 0)

        # remove books which are not recommended at all
        score_cols = ['clus_score', 'gen_score', 'desc_score', 'rev_score', 'user_score', 'tit_score']
        combined_df = combined_df[(combined_df[score_cols] != 0).any(axis=1)]

        with open("Data/final_model.pkl", 'rb') as file:
            self.pipeline = pickle.load(file)

        X_test = combined_df.drop(columns=['user_id', 'book_id', 'target', 'binary'])
        predictions_df = combined_df[['user_id', 'book_id'] + score_cols + ['target', 'binary']].copy()
        predictions_df['final_score'] = self.pipeline.predict_proba(X_test).T[1]
        predictions_df['would_recommend'] = predictions_df['final_score'] >= 0.45  # peak f2 score at this threshold
        predictions_df = predictions_df.sort_values(['user_id', 'final_score'], ascending=[True, False])

        self.dataset = combined_df
        self.predictions = predictions_df

    def prepare_user_details(self):
        """Build ``self.user_details``: each predicted user's top-5 titles by rating.

        Requires ``self.dataset`` to be populated by a prediction run.
        """
        users_list = self.dataset['user_id'].unique()

        users_df = pd.read_csv("Data/final_dataset/users_test.csv")
        books_df = pd.read_csv("Data/final_dataset/books_test.csv")

        # filter to keep only relevant users
        users_df = users_df[users_df['user_id'].isin(users_list)]
        # merge to get book and review data
        full_df = users_df.merge(books_df, on="user_id")

        user_details = pd.DataFrame()
        top_books_per_user = full_df.groupby("user_id").apply(
            lambda x: x.sort_values('rating').nlargest(n=5, columns='rating')['title_without_series'].tolist())
        user_details['top_books'] = top_books_per_user

        self.user_details = user_details

    def get_user_predictions(self, chosen_user):
        """Return one user's rated predictions with per-sub-model contributions.

        Each ``*_score`` column in the result is the scaled sub-model score
        multiplied by its classifier coefficient; ``intercept`` holds the
        classifier intercept.

        :param chosen_user: user ID to look up in ``self.predictions``
        :return: DataFrame with one row per book, or None when the user has no
            prediction with a known target rating
        """
        if self.predictions is None:
            # Called before any prediction run: load (or generate) them first
            self.run_predictions_on_full_test()

        logging.info(f"Generating predictions for user: {chosen_user}")
        user_predictions = self.predictions[self.predictions['user_id'] == chosen_user]
        user_predictions = user_predictions.dropna(subset=['target'])
        if user_predictions.empty:
            logging.info("No predictions hit! Exiting early")
            return None

        # transform model scores using the pipeline (scaler + logistic regression coefficients)
        # specifically, apply scaler then apply linear layer of logistic regression
        model_score_cols = [c for c in user_predictions.columns if c.endswith('_score') and c != 'final_score']
        scaled_model_scores = self.pipeline['scaler'].transform(user_predictions[model_score_cols])
        multed_model_scores = scaled_model_scores * self.pipeline['classifier'].coef_[0]
        final_model_scores = pd.DataFrame(multed_model_scores, columns=model_score_cols)
        final_model_scores['intercept'] = self.pipeline['classifier'].intercept_[0]

        columns = ['book_id', 'target', 'final_score', 'would_recommend']
        predictions_and_score = pd.concat(
            [user_predictions[columns].reset_index(drop=True), final_model_scores],
            axis=1
        )
        return predictions_and_score.merge(self.all_books[['book_id', 'title_without_series']], on='book_id')

    def cache_results(self):
        """Pickle the current prediction state to ``cache_path`` and mark it loaded."""
        to_pickle = {
            'dataset': self.dataset,
            'predictions': self.predictions,
            'temp_store': self.temp_store,
            'pipeline': self.pipeline,
            'chosen_books': self.chosen_books_per_user,
        }
        with open(self.cache_path, 'wb') as f:
            pickle.dump(to_pickle, f)
        self.is_loaded = True

    def does_cache_exist(self):
        """Return True when the cache file exists on disk."""
        return os.path.exists(self.cache_path)

    def retrieve_cache(self):
        """Restore the state written by ``cache_results`` and mark it loaded."""
        # NOTE(review): pickle.load executes code embedded in the file; only
        # load cache files this application produced itself.
        with open(self.cache_path, 'rb') as f:
            unpickled = pickle.load(f)
        for key, val in unpickled.items():
            # setattr instead of exec: no code is built from the cache keys
            setattr(self, key, val)
        # The sampled books are stored under 'chosen_books'; mirror them onto
        # the attribute that __init__ declares so it is restored as well.
        if 'chosen_books' in unpickled:
            self.chosen_books_per_user = unpickled['chosen_books']
        self.is_loaded = True
|
templates/index.html
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html>
|
3 |
+
<head>
|
4 |
+
<title>Book Recommender</title>
|
5 |
+
<!-- Bootstrap CSS -->
|
6 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
|
7 |
+
</head>
|
8 |
+
<body>
|
9 |
+
<div class="container">
|
10 |
+
<h1>Book Recommendation System</h1>
|
11 |
+
<p>Welcome! You can access the options below:</p>
|
12 |
+
<a href="/test_users" class="btn btn-primary">Go to Test User Set</a>
|
13 |
+
</div>
|
14 |
+
<!-- Bootstrap JS (Optional) -->
|
15 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
|
16 |
+
</body>
|
17 |
+
</html>
|
templates/recommended_books.html
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html>
|
3 |
+
<head>
|
4 |
+
<title>Book Recommender</title>
|
5 |
+
<!-- The loading of KaTeX is deferred to speed up page rendering -->
|
6 |
+
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.css" integrity="sha384-wcIxkf4k558AjM3Yz3BBFQUbk/zgIYC2R0QpeeYb+TwlBVMrlgLqwRjRtGZiK7ww" crossorigin="anonymous">
|
7 |
+
<script src="https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.js" integrity="sha384-hIoBPJpTUs74ddyc4bFZSM1TVlQDA60VBbJS0oA934VSz82sBx1X7kSx2ATBDIyd" crossorigin="anonymous"></script>
|
8 |
+
<!-- Bootstrap CSS -->
|
9 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
|
10 |
+
</head>
|
11 |
+
<body>
|
12 |
+
<div class="container">
|
13 |
+
<h1>User books breakdown</h1>
|
14 |
+
<h2>What is on this page?</h2>
|
15 |
+
<p>
|
16 |
+
The books below are books that have been recommended by at least one sub-model in our ensemble model for
|
17 |
+
the current user.
|
18 |
+
They are sorted by our ensemble model's confidence score. If our model is confident that the user will
|
19 |
+
enjoy the book, then the book is sorted towards the top.
|
20 |
+
<br/><br/>
|
21 |
+
These books have also already been read by this user. We can compare the user's actual rating to the
|
22 |
+
confidence score of our ensemble model to see how accurate our ensemble model's predictions are.
|
23 |
+
<br/><br/>
|
24 |
+
Click on the Explain button in blue to look at a breakdown of each sub-model's scores and how they
|
25 |
+
contributed to the ensemble model's final confidence score, and for a brief explanation of why this book
|
26 |
+
was recommended to the user.
|
27 |
+
<br/><br/>
|
28 |
+
<a href="/test_users">Click here to return to the list of users.</a>
|
29 |
+
</p>
|
30 |
+
<h2>Current user ID: {{ chosen_user }}</h2>
|
31 |
+
<table class="table table-striped" style="margin-top: 3em">
|
32 |
+
<thead class="thead-dark">
|
33 |
+
<tr>
|
34 |
+
<th>Title</th>
|
35 |
+
<th>User's actual rating</th>
|
36 |
+
<th>Prediction confidence score</th>
|
37 |
+
<th>Recommended?</th>
|
38 |
+
<th>Show explanation</th>
|
39 |
+
</tr>
|
40 |
+
</thead>
|
41 |
+
<tbody>
|
42 |
+
{% for book_id, book_data in recommended_books.iterrows() %}
|
43 |
+
<tr>
|
44 |
+
<td>{{ book_data['title_without_series'] }}</td>
|
45 |
+
<td>{{ book_data['target'] }}</td>
|
46 |
+
<td>{{ book_data['final_score'] }}</td>
|
47 |
+
<td>{{ book_data['is_recommended'] }}</td>
|
48 |
+
<td><a href="/test_users/{{ chosen_user }}/{{ book_id }}" class="btn btn-primary">Explain</a></td>
|
49 |
+
</tr>
|
50 |
+
{% endfor %}
|
51 |
+
</tbody>
|
52 |
+
</table>
|
53 |
+
{{ fig|safe }}
|
54 |
+
<div style="display: flex; flex-direction: column; justify-content: space-around; align-items: center">
|
55 |
+
<div id="ScoreSum" style="font-size: 1.5em"></div>
|
56 |
+
<div id="Formula" style="font-size: 1.5em"></div>
|
57 |
+
<p style="text-align: center; width: 50%; margin-top: 1em">
|
58 |
+
{{ explanation_str }}
|
59 |
+
</p>
|
60 |
+
</div>
|
61 |
+
</div>
|
62 |
+
<script>
    {# Emit the KaTeX rendering code only when the view supplied an explanation.
       On the plain listing page `render_explanation` is undefined, and the
       previous `if ({{ render_explanation }})` rendered as `if ()`, which is
       a JavaScript syntax error. #}
    {% if render_explanation %}
    const scoreSumDiv = document.getElementById("ScoreSum");
    const formulaDiv = document.getElementById("Formula");

    // Both values arrive from the view as pre-formatted numeric strings.
    const score_sum = {{ score_sum }};
    katex.render("\\text{Sum of sub-model scores}=" + score_sum, scoreSumDiv);

    const start = "\\text{Confidence Score} = \\frac{1}{1+e^{-(";
    const end = ")}} = ";
    const threshold = "0.45";
    const final_score = {{ final_score }};
    let conclusion;
    if (parseFloat(final_score) >= parseFloat(threshold)) {
        conclusion = "\\ge " + threshold + " \\text{ (Recommended)}";
    } else {
        conclusion = "\\lt " + threshold + " \\text{ (Not recommended)}";
    }
    const render_str = start + score_sum + end + final_score + conclusion;
    katex.render(render_str, formulaDiv);
    {% endif %}
</script>
|
84 |
+
<!-- Bootstrap JS (Optional) -->
|
85 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
|
86 |
+
|
87 |
+
</body>
|
88 |
+
</html>
|
templates/test_users.html
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html>
|
3 |
+
<head>
|
4 |
+
<title>Book Recommender</title>
|
5 |
+
<!-- Bootstrap CSS -->
|
6 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
|
7 |
+
</head>
|
8 |
+
<body>
|
9 |
+
<div class="container">
|
10 |
+
<h1>Select a User</h1>
|
11 |
+
<p>
|
12 |
+
We generated recommendations for each of the 20 test users below. In each card, you can see that user's
|
13 |
+
favourite titles (top 5, sorted by user's rating) to give an idea of the user's taste profile. Click
|
14 |
+
on the button to see what we have recommended for this user.
|
15 |
+
</p>
|
16 |
+
<div class="row">
|
17 |
+
{% for user_id, top_books in user_details.items() %}
|
18 |
+
<div class="col-md-4 mb-4">
|
19 |
+
<div class="card">
|
20 |
+
<div class="card-body">
|
21 |
+
<h5 class="card-title">User</h5>
|
22 |
+
{% for title in top_books %}
|
23 |
+
<h6 class="card-text">{{ title }}</h6>
|
24 |
+
{% endfor %}
|
25 |
+
<a href="/test_users/{{ user_id }}" class="btn btn-primary">Select</a>
|
26 |
+
</div>
|
27 |
+
</div>
|
28 |
+
</div>
|
29 |
+
{% endfor %}
|
30 |
+
</div>
|
31 |
+
</div>
|
32 |
+
<!-- Bootstrap JS (Optional) -->
|
33 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
|
34 |
+
</body>
|
35 |
+
</html>
|