# book-rater / bookdb.py
# Last commit: "Update files from private repo" (7957649)
import pandas as pd
import json
import streamlit as st
# Fraction of the target user's rated books that another user must exceed
# (after truncation to an int) to be treated as a "similar user".
SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS = 0.30
# Starting correlation threshold for picking top readers; update_user_ratings
# lowers it in 0.05 steps when too few readers clear it.
SIMILAR_USER_MIN_CORRELATION = 0.25
# Upper cap on the shared-book requirement, so users who rate many books do
# not make the similar-user bar grow without bound.
MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER = 10
# Thumbnail used when a book has no entry in the metadata JSON.
DEFAULT_BOOK_COVER_URL = "https://m.media-amazon.com/images/I/81QPHl7zgbL._AC_UF1000,1000_QL80_.jpg"
# NOTE(review): newer Streamlit releases replace ``st.cache`` with
# ``st.cache_data`` / ``st.cache_resource`` -- confirm against the pinned version.
@st.cache
def get_dataframes():
    """Load the books table, the ratings table and the per-book metadata.

    All three files are read from the ``./goodreads`` directory relative to
    the current working directory.

    Returns:
        tuple: ``(booksDf, baseRatingsDf, bookMetadataJSON)`` where the first
        two are DataFrames and the third is the parsed content of
        ``book_metadata.json``.

    Raises:
        FileNotFoundError: if any of the three files is missing.
    """
    booksDf = pd.read_csv(
        "./goodreads/books.csv",
        usecols=[
            "book_id",
            "original_publication_year",
            "isbn",
            "authors",
            "title",
            "average_rating",
        ],
    )
    # Replace the ids from the CSV with the 1-based row position.
    # NOTE(review): presumably ratings.csv refers to books by row order -- confirm.
    booksDf["book_id"] = range(1, len(booksDf) + 1)

    baseRatingsDf = pd.read_csv("./goodreads/ratings.csv")

    # A context manager guarantees the file handle is closed after parsing.
    with open("./goodreads/book_metadata.json") as metadata_file:
        bookMetadataJSON = json.load(metadata_file)

    return booksDf, baseRatingsDf, bookMetadataJSON
# Module-level data shared by every helper below; loaded when the module is imported.
books, baseRatings, bookMetadata = get_dataframes()
# Id used for the app's own user: one past the largest user_id in the ratings
# table, so it cannot collide with an existing reader.
targetUserId = baseRatings['user_id'].max() + 1
def get_book_metadata_by_id(book_id):
    """Look up the metadata entry for a book, falling back to a placeholder.

    The id is converted to ``str`` before the lookup in ``bookMetadata``.
    When no entry exists, a dict with an ``"n/a"`` description and the
    default cover thumbnail is returned instead.
    """
    metadata_key = str(book_id)
    placeholder = {
        "description" : "n/a",
        "thumbnail" : DEFAULT_BOOK_COVER_URL
    }
    return bookMetadata[metadata_key] if metadata_key in bookMetadata else placeholder
def get_book_ids_by_title(book_titles):
    """Return an array of the ``book_id`` values whose title is in ``book_titles``."""
    title_matches = books["title"].isin(book_titles)
    return books.loc[title_matches, "book_id"].values
def get_all_book_titles():
    """Return an array holding the title of every row in ``books``."""
    title_column = books["title"]
    return title_column.values
def get_book_data_by_id(book_id):
    """Return the first ``books`` row matching ``book_id`` as a column -> value dict.

    Raises:
        IndexError: if no row has that ``book_id``.
    """
    matching_rows = books.loc[books["book_id"] == book_id]
    row_records = matching_rows.to_dict(orient="records")
    return row_records[0]
def get_book_title(book_id):
    """Return the title of the first ``books`` row matching ``book_id``.

    Raises:
        IndexError: if no row has that ``book_id``.
    """
    id_matches = books["book_id"] == book_id
    matching_titles = books.loc[id_matches, "title"].values
    return matching_titles[0]
def get_book_titles(book_ids):
    """Return an array of the titles of all ``books`` rows whose id is in ``book_ids``."""
    id_matches = books["book_id"].isin(book_ids)
    return books.loc[id_matches, "title"].values
def find_closest_read_title(bookByRatingData, upvotedBookIds, targetBookId):
    """Return the upvoted book whose ratings correlate best with the target book.

    A user x book matrix of mean ratings is built from ``bookByRatingData``,
    missing ratings are treated as 0, and the upvoted book whose rating
    column has the highest correlation with the target book's column wins.

    Args:
        bookByRatingData: DataFrame with ``user_id``, ``book_id`` and
            ``rating`` columns.
        upvotedBookIds: Iterable of candidate book ids. Lists and NumPy
            arrays (such as the result of ``get_book_ids_by_title``) are
            both accepted.
        targetBookId: Id of the book to compare the candidates against.

    Returns:
        dict: The ``books`` row of the closest candidate, as produced by
        ``get_book_data_by_id``.

    Raises:
        KeyError: if a requested book id has no ratings in
            ``bookByRatingData``.
        ValueError: if no candidate other than the target has a defined
            correlation with it.
    """
    # Build a de-duplicated plain list. ``array + [x]`` on a NumPy array would
    # add element-wise instead of appending, and a target that is also
    # upvoted would otherwise yield duplicate columns.
    candidate_ids = [
        book_id
        for book_id in dict.fromkeys(upvotedBookIds)
        if book_id != targetBookId
    ]

    user_df = bookByRatingData.groupby(["user_id", "book_id"])["rating"].mean().unstack()
    # Keep only the candidate columns plus the target; unrated cells count as 0.
    book_read_df = user_df[candidate_ids + [targetBookId]].fillna(0)

    # Select the candidates by label rather than filtering on ``corr < 1``:
    # a genuinely perfectly-correlated candidate must stay eligible, and the
    # target must be excluded even if its self-correlation is not exactly 1.
    correlations = book_read_df.corr().loc[candidate_ids, targetBookId].dropna()
    if correlations.empty:
        raise ValueError(
            f"no upvoted book has a defined correlation with book {targetBookId}"
        )

    closestBookId = correlations.idxmax()
    return get_book_data_by_id(closestBookId)
def _select_top_readers(target_correlations):
    """Return the readers whose correlation with the target clears the threshold.

    The threshold starts at ``SIMILAR_USER_MIN_CORRELATION`` and is lowered
    in 0.05 steps until at least one reader qualifies. Correlations cannot be
    below -1, so once the threshold has dropped past that the search stops;
    the result is then empty (every correlation is NaN, or there are no
    other readers at all).

    Args:
        target_correlations: Series of correlations with the target user,
            indexed by user id, with the target user already removed.

    Returns:
        DataFrame indexed by ``user_id`` with a single ``corr`` column.
    """
    threshold = SIMILAR_USER_MIN_CORRELATION
    while True:
        # NaN compares False, so readers with undefined correlation never qualify.
        selected = target_correlations[target_correlations > threshold]
        if not selected.empty or threshold < -1.0:
            break
        threshold -= 0.05
    return selected.to_frame("corr").rename_axis("user_id")


def _recommend_books(top_readers, book_by_rating_df, rated_book_ids):
    """Return up to 20 recommended book records, best weighted rating first.

    Each top reader's ratings are multiplied by that reader's correlation
    with the target user and averaged per book. Books the target user has
    already voted on, and books whose weighted rating is not above 1, are
    left out.

    Args:
        top_readers: DataFrame indexed by ``user_id`` with a ``corr`` column.
        book_by_rating_df: Ratings joined with book data (needs ``user_id``,
            ``book_id`` and ``rating`` columns).
        rated_book_ids: Book ids the target user has upvoted or downvoted.

    Returns:
        list[dict]: One dict per recommended ``books`` row, without the
        ``average_rating`` column and with ``weighted_rating`` added.
    """
    MIN_WEIGHTED_RATING = 1
    MAX_RECOMMENDATIONS = 20

    top_readers_ratings = pd.merge(
        top_readers.reset_index(),
        book_by_rating_df[["user_id", "book_id", "rating"]],
        how="inner",
        on="user_id",
    )
    # Weight each rating by how strongly its reader correlates with the user.
    top_readers_ratings["weighted_rating"] = (
        top_readers_ratings["corr"] * top_readers_ratings["rating"]
    )
    recommendation_df = top_readers_ratings.pivot_table(
        values="weighted_rating", index="book_id", aggfunc="mean"
    )
    # Never recommend a book the user has already voted on.
    recommendation_df = recommendation_df[~recommendation_df.index.isin(rated_book_ids)]

    books_recommend = (
        recommendation_df[recommendation_df["weighted_rating"] > MIN_WEIGHTED_RATING]
        .sort_values(by="weighted_rating", ascending=False)
        .head(MAX_RECOMMENDATIONS)
    )
    recommendedBooks = (
        books.merge(books_recommend.reset_index(), on="book_id")
        .sort_values(by="weighted_rating", ascending=False)
        .drop(columns=["average_rating"])
    )
    return recommendedBooks.to_dict(orient="records")


def update_user_ratings(upvotedBookIds, downvotedBookIds):
    """Compute book recommendations from the user's upvotes and downvotes.

    The votes are added to the base ratings under ``targetUserId`` (upvote =
    5, downvote = 1). Users who rated more than a minimum number of the same
    books are correlated with the target user, and the best-correlated
    readers' ratings drive the recommendations.

    Args:
        upvotedBookIds: Iterable of book ids the user liked.
        downvotedBookIds: Iterable of book ids the user disliked.

    Returns:
        dict with the keys:
            ``upvotedBookIds`` / ``downvotedBookIds``: the arguments, unchanged.
            ``numSimilarUsers``: number of users (the target user included)
                sharing more than the minimum number of rated books.
            ``recommendedBooksData``: list of recommended book dicts; empty
                when no recommendation can be derived.
            ``bookByRatingData``: ratings (including the new votes) joined
                with the book data.
            ``topCorrelatedReadersData``: DataFrame of the selected readers
                and their ``corr`` value, highest first.
    """
    RATING_FOR_UPVOTE = 5
    RATING_FOR_DOWNVOTE = 1

    upvoted = list(upvotedBookIds)
    downvoted = list(downvotedBookIds)
    rated_book_ids = upvoted + downvoted

    newRows = pd.DataFrame({
        "book_id": rated_book_ids,
        "user_id": [targetUserId] * len(rated_book_ids),
        "rating": (
            [RATING_FOR_UPVOTE] * len(upvoted)
            + [RATING_FOR_DOWNVOTE] * len(downvoted)
        ),
    })
    # Skip the concat when there is nothing to add, so an empty frame cannot
    # disturb the dtypes of the base ratings.
    ratings = (
        pd.concat([baseRatings, newRows], ignore_index=True)
        if rated_book_ids
        else baseRatings
    )
    book_by_rating_df = pd.merge(books, ratings, on="book_id", how="inner")

    # Defaults describe the "nothing to recommend" outcome.
    recommendedBooksRowsAsDicts = []
    numSimilarUsers = 0
    top_readers = pd.DataFrame({"corr": pd.Series(dtype="float64")}).rename_axis("user_id")

    # Titles the target user has rated and that exist in the catalogue.
    is_target_row = book_by_rating_df["user_id"] == targetUserId
    targetBooksRead = book_by_rating_df.loc[is_target_row, "title"].dropna().unique()

    if len(targetBooksRead) > 0:
        # Pivot only the ratings for those titles. Users who rated none of
        # them could never pass the shared-book filter below, so the full
        # user x title matrix is not needed.
        shared_ratings = book_by_rating_df[book_by_rating_df["title"].isin(targetBooksRead)]
        book_read_df = shared_ratings.groupby(["user_id", "title"])["rating"].mean().unstack()

        # How many of the target user's books each user has rated.
        userBookCount = book_read_df.notnull().sum(axis=1)

        # Required overlap: a share of the target user's books, capped.
        minBookCount = int(book_read_df.shape[1] * SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS)
        minBookCount = min(minBookCount, MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER)
        usersSameBooks = userBookCount[userBookCount > minBookCount].index
        numSimilarUsers = len(usersSameBooks)

        # Unrated books count as 0 when correlating users with each other.
        filted_df = book_read_df.loc[usersSameBooks].fillna(0)
        target_correlations = filted_df.T.corr()[targetUserId].drop(
            targetUserId, errors="ignore"
        )

        # Bounded search: ends with an empty frame instead of looping forever
        # when no reader has a defined correlation (for example when all of
        # the user's votes are identical, or only one book was rated).
        top_readers = _select_top_readers(target_correlations)
        if not top_readers.empty:
            recommendedBooksRowsAsDicts = _recommend_books(
                top_readers, book_by_rating_df, rated_book_ids
            )

    return {
        "upvotedBookIds": upvotedBookIds,
        "downvotedBookIds": downvotedBookIds,
        "numSimilarUsers" : numSimilarUsers,
        "recommendedBooksData": recommendedBooksRowsAsDicts,
        "bookByRatingData": book_by_rating_df,
        # sort by correlation
        "topCorrelatedReadersData" : top_readers.sort_values(by="corr", ascending=False)
    }