import pandas as pd | |
import json | |
import streamlit as st | |
# Upper bound applied to the shared-book threshold that update_user_ratings
# computes when deciding which users count as "similar".
MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER = 10

# Thumbnail used for books that have no metadata entry.
# NOTE(review): this value does not look like a complete URL -- confirm it
# was not truncated.
DEFAULT_BOOK_COVER_URL = ",1000_QL80_.jpg"
def get_dataframes():
    """Load the books table, the ratings table and the per-book metadata.

    All three files are read from the ``./goodreads`` directory, relative to
    the current working directory.

    Returns:
        A ``(books, ratings, metadata)`` tuple: the books DataFrame restricted
        to the columns listed below, the ratings DataFrame as stored on disk,
        and the object parsed from ``book_metadata.json``.
    """
    booksDf = pd.read_csv(
        "./goodreads/books.csv",
        usecols=[
            "book_id",
            "original_publication_year",
            "isbn",
            "authors",
            "title",
            "average_rating",
        ],
    )
    # The ids stored in the CSV are replaced by the 1-based row position.
    booksDf["book_id"] = range(1, len(booksDf) + 1)

    baseRatingsDf = pd.read_csv("./goodreads/ratings.csv")

    # Context manager so the file handle is closed deterministically; JSON is
    # read as UTF-8 regardless of the platform's default encoding.
    with open("./goodreads/book_metadata.json", encoding="utf-8") as metadata_file:
        bookMetadataJSON = json.load(metadata_file)

    return booksDf, baseRatingsDf, bookMetadataJSON
# The datasets are loaded once, when this module is imported.
books, baseRatings, bookMetadata = get_dataframes()

# Id used for the interactive user: one above the largest user_id found in
# the ratings data.
targetUserId = baseRatings["user_id"].max() + 1
def get_book_metadata_by_id(book_id):
    """Return the metadata entry stored for ``book_id``.

    The id is converted to ``str`` before the lookup in ``bookMetadata``.
    When no entry exists, a placeholder with an "n/a" description and the
    default cover thumbnail is returned instead.
    """
    placeholder = {
        "description": "n/a",
        "thumbnail": DEFAULT_BOOK_COVER_URL,
    }
    return bookMetadata.get(str(book_id), placeholder)
def get_book_ids_by_title(book_titles):
    """Return the ``book_id`` values (NumPy array) of every book whose title is in ``book_titles``."""
    title_matches = books["title"].isin(book_titles)
    return books.loc[title_matches, "book_id"].values
def get_all_book_titles():
    """Return the ``title`` column of the books table as a NumPy array."""
    title_column = books["title"]
    return title_column.values
def get_book_data_by_id(book_id):
    """Return the first books row whose ``book_id`` matches, as a dict.

    Raises:
        IndexError: if no row has the given ``book_id``.
    """
    matching_rows = books[books["book_id"] == book_id]
    row_dicts = matching_rows.to_dict(orient="records")
    return row_dicts[0]
def get_book_title(book_id):
    """Return the title of the first book whose ``book_id`` matches.

    Raises:
        IndexError: if no row has the given ``book_id``.
    """
    id_matches = books["book_id"] == book_id
    matching_titles = books.loc[id_matches, "title"].values
    return matching_titles[0]
def get_book_titles(book_ids):
    """Return the titles (NumPy array) of every book whose ``book_id`` is in ``book_ids``."""
    id_matches = books["book_id"].isin(book_ids)
    return books.loc[id_matches, "title"].values
def find_closest_read_title(bookByRatingData, upvotedBookIds, targetBookId):
    """Return the upvoted book whose ratings correlate best with the target book.

    A users x books table of mean ratings is built from ``bookByRatingData``
    (which must provide ``user_id``, ``book_id`` and ``rating`` columns).
    Missing ratings count as 0, and the upvoted book with the highest
    correlation to ``targetBookId`` is selected.

    Args:
        bookByRatingData: DataFrame of ratings.
        upvotedBookIds: iterable of book ids (a list or a NumPy array).
        targetBookId: id of the book to compare against.

    Returns:
        The dict produced by ``get_book_data_by_id`` for the selected book.

    Raises:
        KeyError: if one of the ids has no ratings in ``bookByRatingData``.
        ValueError: if no upvoted book has a defined correlation with the
            target (for example the upvoted list is empty or only contains
            the target itself).
    """
    # Build a real list: `array + [x]` on a NumPy array adds x to every
    # element instead of appending. The target is left out of the candidates
    # (and duplicates removed) so the selected columns are unique.
    candidate_ids = list(dict.fromkeys(
        book_id for book_id in upvotedBookIds if book_id != targetBookId
    ))

    user_df = bookByRatingData.groupby(["user_id", "book_id"])["rating"].mean().unstack()
    # keep only the candidate columns and the target column; unrated -> 0
    book_read_df = user_df[candidate_ids + [targetBookId]].fillna(0)

    # Correlation of every candidate with the target. The target's own entry
    # is removed by label rather than by comparing the correlation with 1, so
    # a perfectly correlated candidate remains eligible.
    target_corr = book_read_df.corr()[targetBookId].drop(targetBookId).dropna()
    if target_corr.empty:
        raise ValueError(
            "no upvoted book has a defined correlation with the target book"
        )

    closestBookId = target_corr.idxmax()
    return get_book_data_by_id(closestBookId)
def _find_top_readers(user_df):
    """Return ``(number of similar users, top readers)`` for the target user.

    ``user_df`` is the users x titles table of mean ratings and must contain
    a row for ``targetUserId``. The returned frame is indexed by user id, has
    a single ``corr`` column and never contains the target user.
    """
    # titles the target user has rated
    target_titles = user_df.loc[targetUserId].dropna().index.tolist()
    book_read_df = user_df[target_titles]

    # per user: how many of the target's titles they have rated
    userBookCount = book_read_df.notnull().sum(axis=1)
    print('\n\n\n\n\n\n\n\n')
    print('--' * 10)
    print(userBookCount)

    # required overlap: a fraction of the target's titles, capped
    minBookCount = int(book_read_df.shape[1] * SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS)
    minBookCount = min(minBookCount, MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER)
    print(f'Min book count for Similar User: {minBookCount}')
    # strictly more than minBookCount shared titles
    usersSameBooks = userBookCount[userBookCount > minBookCount].index

    # unrated titles count as 0 when correlating users
    filted_df = book_read_df.loc[usersSameBooks].fillna(0)
    target_corr = filted_df.T.corr()[targetUserId]

    # Relax the threshold in 0.05 steps until somebody besides the target
    # qualifies. Correlations are never below -1 and are NaN when a user's
    # ratings are constant (e.g. the target only upvoted), so once the
    # threshold is under -1 lowering it further cannot add anyone: stop
    # instead of looping forever.
    threshold = SIMILAR_USER_MIN_CORRELATION
    while True:
        top_readers = target_corr[target_corr > threshold].to_frame("corr")
        if len(top_readers) > 1 or threshold < -1:
            break
        threshold -= 0.05
    print(top_readers)

    top_readers = top_readers.drop(targetUserId, errors="ignore")
    return len(usersSameBooks), top_readers


def _recommend_books(top_readers, book_by_rating_df, excluded_book_ids):
    """Return up to 20 recommended books as row dicts, best first.

    Each reader's ratings are weighted by that reader's ``corr`` value and
    averaged per book; books in ``excluded_book_ids`` and books whose
    weighted mean is not above 1 are left out.
    """
    top_readers_ratings = pd.merge(
        top_readers,
        book_by_rating_df[["user_id", "book_id", "rating"]],
        how="inner",
        on="user_id",
    )
    # weight each rating by how correlated the reader is with the target user
    top_readers_ratings["weighted_rating"] = (
        top_readers_ratings["corr"] * top_readers_ratings["rating"]
    )
    recommendation_df = top_readers_ratings.pivot_table(
        values="weighted_rating", index="book_id", aggfunc="mean"
    )
    # never recommend a book the user has already voted on
    recommendation_df = recommendation_df[
        ~recommendation_df.index.isin(excluded_book_ids)
    ]
    books_recommend = (
        recommendation_df[recommendation_df["weighted_rating"] > 1]
        .sort_values(by="weighted_rating", ascending=False)
        .head(20)
    )
    # attach the book columns and order by the weighted rating
    recommendedBooks = books.merge(books_recommend, on="book_id").sort_values(
        by="weighted_rating", ascending=False
    )
    # the average_rating column is not part of the returned rows
    recommendedBooks = recommendedBooks.drop(columns=["average_rating"])
    return recommendedBooks.to_dict(orient="records")


def update_user_ratings(upvotedBookIds, downvotedBookIds):
    """Recompute recommendations for the target user from their votes.

    The votes are appended to the base ratings as ratings by ``targetUserId``
    (``RATING_FOR_UPVOTE`` / ``RATING_FOR_DOWNVOTE``). Users who rated enough
    of the same titles are correlated with the target user, and the
    best-correlated readers' ratings are turned into a ranked list of books.
    Diagnostics are printed to stdout.

    NOTE(review): ``RATING_FOR_UPVOTE``, ``RATING_FOR_DOWNVOTE``,
    ``SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS`` and
    ``SIMILAR_USER_MIN_CORRELATION`` are read as module-level names but are
    not defined in the part of the file visible to this review -- confirm
    they exist.

    Args:
        upvotedBookIds: iterable of book ids the user liked.
        downvotedBookIds: iterable of book ids the user disliked.

    Returns:
        dict with keys ``upvotedBookIds``, ``downvotedBookIds``,
        ``numSimilarUsers``, ``recommendedBooksData`` (list of row dicts),
        ``bookByRatingData`` (books merged with all ratings) and
        ``topCorrelatedReadersData`` (frame with a ``corr`` column, sorted
        descending). When the votes match no known book, or no other reader
        has a usable correlation, the recommendation list is empty.
    """
    upvoted = list(upvotedBookIds)
    downvoted = list(downvotedBookIds)
    voted = upvoted + downvoted

    newRows = pd.DataFrame({
        'book_id': voted,
        'user_id': [targetUserId] * len(voted),
        'rating': (
            [RATING_FOR_UPVOTE] * len(upvoted)
            + [RATING_FOR_DOWNVOTE] * len(downvoted)
        ),
    })
    # Skip the concat when there is nothing to add, so an empty frame cannot
    # disturb the dtypes of the base ratings.
    ratings = pd.concat([baseRatings, newRows], ignore_index=True) if voted else baseRatings

    book_by_rating_df = pd.merge(books, ratings, on="book_id", how="inner")
    user_df = book_by_rating_df.groupby(["user_id", "title"])["rating"].mean().unstack()

    numSimilarUsers = 0
    top_readers = pd.DataFrame(columns=["corr"])
    recommendedBooksRowsAsDicts = []

    # The target user is missing from the table when there were no votes or
    # none of the voted ids exists in `books` (the inner merge drops them).
    if targetUserId in user_df.index:
        numSimilarUsers, top_readers = _find_top_readers(user_df)
    if not top_readers.empty:
        recommendedBooksRowsAsDicts = _recommend_books(
            top_readers, book_by_rating_df, voted
        )

    return {
        "upvotedBookIds": upvotedBookIds,
        "downvotedBookIds": downvotedBookIds,
        "numSimilarUsers": numSimilarUsers,
        "recommendedBooksData": recommendedBooksRowsAsDicts,
        "bookByRatingData": book_by_rating_df,
        # sort by correlation
        "topCorrelatedReadersData": top_readers.sort_values(by="corr", ascending=False),
    }