"""Book recommendation helpers backed by the Goodreads CSV/JSON exports.

The module loads the book catalogue, the base ratings and the per-book
metadata once at import time, then exposes small lookup helpers plus
``update_user_ratings``, which derives recommendations for a synthetic
"target user" from the books that user up- or down-voted.
"""
import json

import pandas as pd
import streamlit as st

# A user must share more than this fraction of the target user's books
# (capped by MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER) to count as "similar".
SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS = 0.30
# Initial correlation threshold for "top readers"; it is lowered step by step
# when too few users pass it.
SIMILAR_USER_MIN_CORRELATION = 0.25
# the maximum amount of books required for a user to be considered a similar user
MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER = 10
DEFAULT_BOOK_COVER_URL = "https://m.media-amazon.com/images/I/81QPHl7zgbL._AC_UF1000,1000_QL80_.jpg"


# NOTE(review): st.cache is deprecated in recent Streamlit releases in favour
# of st.cache_data / st.cache_resource - confirm the installed version before
# switching decorators.
@st.cache
def get_dataframes():
    """Load the book catalogue, the base ratings and the book metadata.

    Returns:
        A ``(books, ratings, metadata)`` tuple: two DataFrames read from the
        CSV files and the object parsed from ``book_metadata.json``.
    """
    booksDf = pd.read_csv(
        "./goodreads/books.csv",
        usecols=[
            "book_id",
            "original_publication_year",
            "isbn",
            "authors",
            "title",
            "average_rating",
        ],
    )
    # Overwrite the ids from the file with a dense 1..N sequence in row order.
    booksDf['book_id'] = range(1, len(booksDf) + 1)
    baseRatingsDf = pd.read_csv("./goodreads/ratings.csv")
    # Use a context manager so the file handle is closed deterministically.
    with open("./goodreads/book_metadata.json", encoding="utf-8") as metadata_file:
        bookMetadataJSON = json.load(metadata_file)
    return booksDf, baseRatingsDf, bookMetadataJSON


books, baseRatings, bookMetadata = get_dataframes()
# The interactive user gets an id one past the largest id in the base ratings.
targetUserId = baseRatings['user_id'].max() + 1


def get_book_metadata_by_id(book_id):
    """Return the metadata entry for *book_id*.

    The id is converted to ``str`` before the lookup. When no entry exists, a
    placeholder with an ``"n/a"`` description and the default cover URL is
    returned instead.
    """
    fallback = {
        "description": "n/a",
        "thumbnail": DEFAULT_BOOK_COVER_URL,
    }
    return bookMetadata.get(str(book_id), fallback)


def get_book_ids_by_title(book_titles):
    """Return an array with the ids of all books whose title is in *book_titles*."""
    return books[books["title"].isin(book_titles)]["book_id"].values


def get_all_book_titles():
    """Return an array with every title in the catalogue."""
    return books["title"].values


def get_book_data_by_id(book_id):
    """Return the catalogue row for *book_id* as a dict.

    Raises:
        IndexError: if no book has that id.
    """
    return books[books["book_id"] == book_id].to_dict(orient="records")[0]


def get_book_title(book_id):
    """Return the title of the book with id *book_id*.

    Raises:
        IndexError: if no book has that id.
    """
    return books[books["book_id"] == book_id]["title"].values[0]


def get_book_titles(book_ids):
    """Return an array with the titles of all books whose id is in *book_ids*."""
    return books[books["book_id"].isin(book_ids)]["title"].values


def find_closest_read_title(bookByRatingData, upvotedBookIds, targetBookId):
    """Return the upvoted book whose ratings correlate best with the target.

    Args:
        bookByRatingData: DataFrame with at least ``user_id``, ``book_id`` and
            ``rating`` columns.
        upvotedBookIds: iterable of book ids (a list or a NumPy array).
        targetBookId: id of the book to compare the upvoted books against.

    Returns:
        The catalogue row (as a dict) of the best-correlated upvoted book.

    Raises:
        ValueError: if there is no upvoted book other than the target.
        KeyError: if one of the ids has no ratings in *bookByRatingData*.
    """
    # Build a plain, de-duplicated list: `array + [x]` on a NumPy array would
    # add x to every element instead of appending it.  The target is excluded
    # here so it cannot show up twice as a column.
    candidate_ids = [
        book_id
        for book_id in dict.fromkeys(upvotedBookIds)
        if book_id != targetBookId
    ]
    if not candidate_ids:
        raise ValueError(
            "no upvoted book other than the target book to compare against"
        )

    user_df = bookByRatingData.groupby(["user_id", "book_id"])["rating"].mean().unstack()
    # Keep only the candidate columns plus the target; unrated cells count as 0.
    book_read_df = user_df[candidate_ids + [targetBookId]].fillna(0)

    # Drop the target by label rather than by "correlation < 1", so another
    # book that correlates perfectly with the target is still eligible.
    correlations = book_read_df.corr()[targetBookId].drop(targetBookId)
    closestBookId = correlations.idxmax()
    return get_book_data_by_id(closestBookId)


def _recommendation_result(upvotedBookIds, downvotedBookIds, num_similar_users,
                           recommended_rows, book_by_rating_df, top_readers):
    """Assemble the dict returned by ``update_user_ratings``."""
    return {
        "upvotedBookIds": upvotedBookIds,
        "downvotedBookIds": downvotedBookIds,
        "numSimilarUsers": num_similar_users,
        "recommendedBooksData": recommended_rows,
        "bookByRatingData": book_by_rating_df,
        # sort by correlation
        "topCorrelatedReadersData": top_readers.sort_values(by="corr", ascending=False),
    }


def update_user_ratings(upvotedBookIds, downvotedBookIds):
    """Recommend books for the target user from their up- and down-votes.

    Upvotes are recorded as rating 5 and downvotes as rating 1 for
    ``targetUserId``. Users who rated enough of the same titles are correlated
    with the target user; the ratings of the best-correlated users, weighted
    by that correlation, rank the recommendations.

    Args:
        upvotedBookIds: iterable of book ids the user liked.
        downvotedBookIds: iterable of book ids the user disliked.

    Returns:
        A dict with the keys ``upvotedBookIds``, ``downvotedBookIds``,
        ``numSimilarUsers``, ``recommendedBooksData`` (list of row dicts),
        ``bookByRatingData`` (books merged with all ratings) and
        ``topCorrelatedReadersData`` (DataFrame with a ``corr`` column, sorted
        descending). When the user has no usable votes, or no other user can
        be correlated with them, the recommendation list is empty.
    """
    RATING_FOR_UPVOTE = 5
    RATING_FOR_DOWNVOTE = 1
    CORRELATION_STEP = 0.05
    # Pearson correlations cannot go below -1, so lowering the threshold past
    # this value can never admit more users.
    CORRELATION_FLOOR = -1.0

    upvoted = list(upvotedBookIds)
    downvoted = list(downvotedBookIds)
    voted_ids = upvoted + downvoted

    ratings = baseRatings
    if voted_ids:
        newRows = pd.DataFrame({
            'book_id': voted_ids,
            'user_id': [targetUserId] * len(voted_ids),
            'rating': (
                [RATING_FOR_UPVOTE] * len(upvoted)
                + [RATING_FOR_DOWNVOTE] * len(downvoted)
            ),
        })
        ratings = pd.concat([baseRatings, newRows], ignore_index=True)

    book_by_rating_df = pd.merge(books, ratings, on="book_id", how="inner")

    # Titles the target user has rated (votes for ids missing from the
    # catalogue disappear in the inner merge above).
    is_target = book_by_rating_df["user_id"] == targetUserId
    target_titles = book_by_rating_df.loc[is_target, "title"].dropna().unique()
    no_readers = pd.DataFrame({"corr": pd.Series(dtype="float64")})
    if len(target_titles) == 0:
        # Nothing to correlate against: return an empty result instead of
        # failing on a missing target row further down.
        return _recommendation_result(
            upvotedBookIds, downvotedBookIds, 0, [], book_by_rating_df, no_readers
        )

    # Pivot only the titles the target user rated.  Users who rated none of
    # them would have a shared count of 0 and be filtered out below anyway, so
    # pivoting the full user x title matrix is unnecessary.
    shared = book_by_rating_df[book_by_rating_df["title"].isin(target_titles)]
    book_read_df = shared.groupby(["user_id", "title"])["rating"].mean().unstack()

    # number of the target user's books each user has rated
    userBookCount = book_read_df.notnull().sum(axis=1)
    print('\n\n\n\n\n\n\n\n')
    print('--' * 10)
    print(userBookCount)

    # Required overlap: a fraction of the target's books, capped at a maximum.
    minBookCount = int(book_read_df.shape[1] * SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS)
    minBookCount = min(minBookCount, MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER)
    print(f'Min book count for Similar User: {minBookCount}')
    # NOTE: the comparison is strict, so a user needs *more than* minBookCount
    # shared books.
    usersSameBooks = userBookCount[userBookCount > minBookCount].index

    # Keep only sufficiently overlapping users; unrated cells count as 0.
    filtered_df = book_read_df[book_read_df.index.isin(usersSameBooks)].fillna(0)

    # Correlate every remaining user with the target user's row directly
    # instead of building the full user x user correlation matrix.
    target_corr = filtered_df.T.corrwith(filtered_df.loc[targetUserId])
    target_corr = target_corr.rename_axis("user_id")

    # Lower the threshold until more than one user (the target user counts as
    # one) passes it.  Stop at the floor: when every correlation is NaN (for
    # example the user only upvoted, so their rating vector is constant) no
    # threshold can ever succeed and the loop would otherwise never end.
    threshold = SIMILAR_USER_MIN_CORRELATION
    while True:
        top_readers = target_corr[target_corr > threshold].to_frame("corr")
        if len(top_readers) > 1 or threshold < CORRELATION_FLOOR:
            break
        threshold -= CORRELATION_STEP
    print(top_readers)

    if targetUserId in top_readers.index:
        top_readers = top_readers.drop(targetUserId)

    # get the ratings for the top readers
    top_readers_ratings = pd.merge(
        top_readers,
        book_by_rating_df[["user_id", "book_id", "rating"]],
        how='inner',
        on="user_id",
    )
    # weight their ratings by how correlated they are with the user
    top_readers_ratings['weighted_rating'] = (
        top_readers_ratings['corr'] * top_readers_ratings['rating']
    )
    # Mean weighted rating per book.  A groupby keeps the "weighted_rating"
    # column even when there are no rows to aggregate.
    recommendation_df = top_readers_ratings.groupby("book_id")[["weighted_rating"]].mean()

    # Never recommend a book the user already voted on, and require a mean
    # weighted rating above 1.
    eligible = (
        (recommendation_df["weighted_rating"] > 1)
        & ~recommendation_df.index.isin(voted_ids)
    )
    books_recommend = (
        recommendation_df[eligible]
        .sort_values(by="weighted_rating", ascending=False)
        .head(20)
    )

    # Attach the catalogue data and order by weighted rating, best first.
    recommendedBooks = books.merge(books_recommend, on="book_id").sort_values(
        by="weighted_rating", ascending=False
    )
    # drop the average_rating column
    recommendedBooks = recommendedBooks.drop(columns=["average_rating"])

    return _recommendation_result(
        upvotedBookIds,
        downvotedBookIds,
        len(usersSameBooks),
        recommendedBooks.to_dict(orient="records"),
        book_by_rating_df,
        top_readers,
    )