# Streamlit book-recommendation app backed by the goodreads dataset.
import pandas as pd
import json
import streamlit as st
# Fraction of the target user's read books another user must share to count as "similar".
SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS = 0.30
# Starting correlation threshold when selecting the top correlated readers.
SIMILAR_USER_MIN_CORRELATION = 0.25
MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER = 10 # the maximum amount of books required for a user to be considered a similar user
# Placeholder cover image used when a book has no metadata entry.
DEFAULT_BOOK_COVER_URL = "https://m.media-amazon.com/images/I/81QPHl7zgbL._AC_UF1000,1000_QL80_.jpg"
# NOTE(review): st.cache is deprecated in newer Streamlit releases; consider
# st.cache_data once the app's Streamlit version is confirmed.
@st.cache
def get_dataframes():
    """Load the books table, ratings table, and book-metadata JSON from disk.

    Returns:
        Tuple of (booksDf, baseRatingsDf, bookMetadataJSON). booksDf gets a
        fresh sequential book_id (1..N), replacing the CSV's own id column.
    """
    booksDf = pd.read_csv("./goodreads/books.csv",
                          usecols=["book_id",
                                   "original_publication_year",
                                   "isbn",
                                   "authors",
                                   "title",
                                   "average_rating"])
    # Re-number ids sequentially; the CSV's original book_id values are discarded.
    booksDf['book_id'] = range(1, len(booksDf) + 1)
    baseRatingsDf = pd.read_csv("./goodreads/ratings.csv")
    # Context manager closes the JSON file handle (the original leaked it).
    with open("./goodreads/book_metadata.json") as metadata_file:
        bookMetadataJSON = json.load(metadata_file)
    return booksDf, baseRatingsDf, bookMetadataJSON
# Load the shared datasets once at import time (cached by Streamlit).
books, baseRatings, bookMetadata = get_dataframes()
# Synthetic user id for the current visitor: one past the highest existing id.
targetUserId = baseRatings['user_id'].max() + 1
def get_book_metadata_by_id(book_id):
    """Return the metadata dict for a book, or placeholder values if absent.

    The metadata JSON is keyed by stringified book ids, so the id is
    converted before the lookup.
    """
    fallback = {
        "description" : "n/a",
        "thumbnail" : DEFAULT_BOOK_COVER_URL
    }
    return bookMetadata.get(str(book_id), fallback)
def get_book_ids_by_title(book_titles):
    """Map a collection of titles to their book_id values (books-table order)."""
    title_mask = books["title"].isin(book_titles)
    return books.loc[title_mask, "book_id"].values
def get_all_book_titles():
    """Return every title in the books table as an array."""
    return books.loc[:, "title"].values
def get_book_data_by_id(book_id):
    """Return the books-table row for book_id as a plain dict.

    Raises IndexError if the id is not present (unchanged from before).
    """
    matching = books[books["book_id"] == book_id]
    records = matching.to_dict(orient="records")
    return records[0]
def get_book_title(book_id):
    """Return the title string for the given book_id."""
    id_mask = books["book_id"] == book_id
    return books.loc[id_mask, "title"].values[0]
def get_book_titles(book_ids):
    """Return titles for every book_id found in the books table."""
    id_mask = books["book_id"].isin(book_ids)
    return books.loc[id_mask, "title"].values
def find_closest_read_title(bookByRatingData, upvotedBookIds, targetBookId):
    """Find which of the user's upvoted books correlates most with targetBookId.

    Args:
        bookByRatingData: long-form DataFrame with user_id, book_id and rating
            columns (the merged frame produced in update_user_ratings).
        upvotedBookIds: ids of the books the user upvoted.
            NOTE(review): `upvotedBookIds + [targetBookId]` assumes this is a
            Python list — a numpy array would broadcast-add instead; confirm
            at call sites.
        targetBookId: the book to compare the upvoted books against.

    Returns:
        The books-table row dict (as from get_book_data_by_id) of the closest
        upvoted book.
    """
    # Pivot to one row per user, one column per book, mean rating in the cells.
    user_df = bookByRatingData.groupby(["user_id","book_id"])["rating"].mean().unstack()
    # drop all columns except the upvotedBookIds and the targetBookId
    book_read_df = user_df[upvotedBookIds + [targetBookId]]
    # replace NaNs with 0
    book_read_df = book_read_df.fillna(0)
    # find the correlation between the targetBookId and the upvotedBookIds
    corr_df = book_read_df.corr().unstack()
    # find the closest book to the targetBookId that is NOT the targetBookId
    # (self-correlation is exactly 1, so "< 1" excludes it)
    closestBookId = corr_df[targetBookId][corr_df[targetBookId] < 1].idxmax()
    # get the title of the closest book
    closestReadBookData = get_book_data_by_id(closestBookId)
    return closestReadBookData
def update_user_ratings(upvotedBookIds, downvotedBookIds):
    """Rebuild recommendations after the target user's up/down votes change.

    Appends synthetic ratings for the target user (5 per upvote, 1 per
    downvote), finds other users who share enough of the same books, keeps
    those whose rating vectors correlate with the target user, and scores
    books by correlation-weighted mean rating.

    Args:
        upvotedBookIds: list of book ids the user upvoted.
        downvotedBookIds: list of book ids the user downvoted.

    Returns:
        dict with the vote id lists, the similar-user count, the recommended
        book rows, the merged book/rating DataFrame, and the top correlated
        readers sorted by correlation.
    """
    RATING_FOR_UPVOTE = 5
    RATING_FOR_DOWNVOTE = 1
    # Synthetic rating rows for the target user, one per vote.
    votes = ([(book_id, RATING_FOR_UPVOTE) for book_id in upvotedBookIds] +
             [(book_id, RATING_FOR_DOWNVOTE) for book_id in downvotedBookIds])
    newRows = pd.DataFrame({
        'book_id': [book_id for book_id, _ in votes],
        'user_id': [targetUserId] * len(votes),
        'rating': [rating for _, rating in votes]
    })
    ratings = pd.concat([baseRatings, newRows], ignore_index=True)
    book_by_rating_df = pd.merge(books, ratings, on="book_id", how="inner")
    # One row per user, one column per title, mean rating in the cells.
    user_df = book_by_rating_df.groupby(["user_id","title"])["rating"].mean().unstack()
    targetUserDf = user_df[user_df.index == targetUserId]
    targetBooksRead = targetUserDf.dropna(axis=1).columns.tolist()
    # Restrict to the books the target user has read...
    book_read_df = user_df[targetBooksRead]
    # ...and count how many of those each user has rated.
    userBookCount = book_read_df.notnull().sum(axis=1)
    # A "similar" user must share at least X percent of the target's books,
    # capped so heavy readers don't make the bar unreachable.
    minBookCount = int(book_read_df.shape[1] * SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS)
    minBookCount = min(minBookCount, MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER)
    usersSameBooks = userBookCount[userBookCount > minBookCount].index
    # Keep only those users' rows; unrated books count as 0 for correlation.
    filtered_df = book_read_df[book_read_df.index.isin(usersSameBooks)]
    filtered_df = filtered_df.fillna(0)
    corr_df = filtered_df.T.corr().unstack()
    # Select readers correlated with the target user, relaxing the threshold
    # when too few qualify. The floor at -1.0 (the minimum possible Pearson
    # correlation) guarantees termination; the original loop could spin
    # forever when no other reader ever qualified.
    minTopReaderCorrelation = SIMILAR_USER_MIN_CORRELATION
    while True:
        top_readers = pd.DataFrame(
            corr_df[targetUserId][corr_df[targetUserId] > minTopReaderCorrelation],
            columns=["corr"])
        # The target user always matches themselves (corr == 1), so more than
        # one row means at least one *other* reader qualified.
        if len(top_readers) > 1 or minTopReaderCorrelation <= -1.0:
            break
        minTopReaderCorrelation -= 0.05
    if targetUserId in top_readers.index:
        top_readers = top_readers.drop(targetUserId)
    # Join the similar readers' ratings and weight them by correlation.
    top_readers_ratings = pd.merge(top_readers,
                                   book_by_rating_df[["user_id", "book_id", "rating"]],
                                   how='inner', on="user_id")
    top_readers_ratings['weighted_rating'] = top_readers_ratings['corr'] * top_readers_ratings['rating']
    # Mean weighted rating per book across the similar readers.
    recommendation_df = top_readers_ratings.pivot_table(values="weighted_rating", index="book_id", aggfunc="mean")
    # Zero out books the user already voted on so they are never recommended.
    recommendation_df.loc[recommendation_df.index.isin(upvotedBookIds)] = 0
    recommendation_df.loc[recommendation_df.index.isin(downvotedBookIds)] = 0
    # Keep the 20 best-scoring books above the minimum weighted rating.
    books_recommend = recommendation_df[recommendation_df["weighted_rating"] > 1] \
        .sort_values(by="weighted_rating", ascending=False).head(20)
    recommendedBooks = books[books["book_id"].isin(books_recommend.index)]
    # Order the rows by their weighted rating from books_recommend.
    recommendedBooks = recommendedBooks.merge(books_recommend, on="book_id") \
        .sort_values(by="weighted_rating", ascending=False)
    # average_rating was only loaded for other views; drop it from the payload.
    recommendedBooks = recommendedBooks.drop(columns=["average_rating"])
    recommendedBooksRowsAsDicts = recommendedBooks.to_dict(orient="records")
    return {
        "upvotedBookIds": upvotedBookIds,
        "downvotedBookIds": downvotedBookIds,
        "numSimilarUsers" : len(usersSameBooks),
        "recommendedBooksData": recommendedBooksRowsAsDicts,
        "bookByRatingData": book_by_rating_df,
        # sorted so the most correlated reader comes first
        "topCorrelatedReadersData" : top_readers.sort_values(by="corr", ascending=False)
    }