Spaces:

conlan
/

book-rater

Sleeping

File size: 7,222 Bytes

import pandas as pd
import json
import streamlit as st

# Fraction of the target user's rated books that is turned into the shared-book
# threshold in update_user_ratings (threshold = int(rated_count * this value));
# another user must have rated MORE than that many of the same books.
SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS = 0.30
# Starting minimum correlation with the target user for a reader to count as a
# "top reader"; update_user_ratings lowers it when too few readers qualify.
SIMILAR_USER_MIN_CORRELATION = 0.25

# Upper bound applied to the shared-book threshold above, so the requirement
# stops growing once the target user has rated many books.
MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER = 10 # cap on the shared-book count threshold used to select similar users

# Thumbnail used by get_book_metadata_by_id when a book has no metadata entry.
DEFAULT_BOOK_COVER_URL = "https://m.media-amazon.com/images/I/81QPHl7zgbL._AC_UF1000,1000_QL80_.jpg"

@st.cache
def get_dataframes():
    """Load the book catalogue, the base ratings and the per-book metadata.

    Returns:
        A 3-tuple ``(booksDf, baseRatingsDf, bookMetadataJSON)``:
        the selected columns of ``./goodreads/books.csv`` (with ``book_id``
        renumbered, see below), the full ``./goodreads/ratings.csv`` frame,
        and the parsed contents of ``./goodreads/book_metadata.json``.

    Raises:
        FileNotFoundError: if any of the three data files is missing.
    """
    booksDf = pd.read_csv(
        "./goodreads/books.csv",
        usecols=[
            "book_id",
            "original_publication_year",
            "isbn",
            "authors",
            "title",
            "average_rating",
        ],
    )
    # Overwrite book_id with the 1-based row position in the CSV.
    # NOTE(review): presumably this makes the ids line up with the book_id
    # values used in ratings.csv -- confirm against the dataset.
    booksDf['book_id'] = range(1, len(booksDf) + 1)

    baseRatingsDf = pd.read_csv("./goodreads/ratings.csv")

    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open("./goodreads/book_metadata.json", encoding="utf-8") as metadataFile:
        bookMetadataJSON = json.load(metadataFile)

    return booksDf, baseRatingsDf, bookMetadataJSON

# Load the shared datasets once when the module is imported; get_dataframes
# is wrapped in st.cache.
books, baseRatings, bookMetadata = get_dataframes()

# Id used for the interactive user's votes: one past the largest user_id in
# the base ratings, so it does not collide with any existing user.
targetUserId = baseRatings['user_id'].max() + 1

def get_book_metadata_by_id(book_id):
    """Return the metadata entry for ``book_id``, or a placeholder entry.

    The id is converted to ``str`` before the lookup in ``bookMetadata``.
    When no entry exists, a dict with description ``"n/a"`` and the default
    cover thumbnail is returned instead.
    """
    key = str(book_id)

    if key not in bookMetadata:
        # No stored metadata: hand back a fresh placeholder record.
        return {
            "description" : "n/a",
            "thumbnail" : DEFAULT_BOOK_COVER_URL
        }

    return bookMetadata[key]

def get_book_ids_by_title(book_titles):
    """Return an array of the ``book_id`` values whose title is in ``book_titles``."""
    title_matches = books["title"].isin(book_titles)
    return books.loc[title_matches, "book_id"].values

def get_all_book_titles():
    """Return an array holding the ``title`` column of the book catalogue."""
    title_column = books["title"]
    return title_column.values

def get_book_data_by_id(book_id):
    """Return the catalogue row for ``book_id`` as a column -> value dict.

    Raises:
        IndexError: if no row in ``books`` has that ``book_id``.
    """
    matching_rows = books[books["book_id"] == book_id]
    records = matching_rows.to_dict(orient="records")
    # Only the first match is returned.
    return records[0]

def get_book_title(book_id):
    """Return the title of the first catalogue row whose ``book_id`` matches.

    Raises:
        IndexError: if no row in ``books`` has that ``book_id``.
    """
    id_matches = books["book_id"] == book_id
    matching_titles = books.loc[id_matches, "title"].values
    return matching_titles[0]

def get_book_titles(book_ids):
    """Return an array of the titles of all catalogue rows whose id is in ``book_ids``."""
    id_matches = books["book_id"].isin(book_ids)
    return books.loc[id_matches, "title"].values

def find_closest_read_title(bookByRatingData, upvotedBookIds, targetBookId):
    """Return the upvoted book whose ratings correlate best with the target book.

    Args:
        bookByRatingData: DataFrame with at least ``user_id``, ``book_id`` and
            ``rating`` columns.
        upvotedBookIds: iterable (list, NumPy array, ...) of candidate book ids.
        targetBookId: id of the book to compare the candidates against.

    Returns:
        The catalogue row (as returned by ``get_book_data_by_id``) of the
        candidate with the highest rating correlation to ``targetBookId``.

    Raises:
        ValueError: if there is no candidate other than the target itself.
        KeyError: if a candidate or the target has no ratings in
            ``bookByRatingData``.
    """
    # Build a plain, de-duplicated list of candidates. Using "+" directly on a
    # NumPy array would add the target id to every element instead of
    # appending it, and duplicate ids would create duplicate columns.
    candidateIds = [
        bookId for bookId in dict.fromkeys(upvotedBookIds)
        if bookId != targetBookId
    ]
    if not candidateIds:
        raise ValueError(
            "find_closest_read_title needs at least one upvoted book "
            "other than the target book"
        )

    # users x books matrix of (mean) ratings
    user_df = bookByRatingData.groupby(["user_id","book_id"])["rating"].mean().unstack()

    # keep only the candidate columns plus the target; unrated -> 0
    book_read_df = user_df[candidateIds + [targetBookId]].fillna(0)

    # Correlation of every candidate with the target. The target is removed
    # by label rather than with a "corr < 1" filter, so a candidate that is
    # perfectly correlated with the target is still eligible and the result
    # does not depend on the self-correlation being exactly 1.0.
    targetCorr = book_read_df.corr()[targetBookId].drop(targetBookId)

    closestBookId = targetCorr.idxmax()

    return get_book_data_by_id(closestBookId)

def _build_vote_rows(upvotedBookIds, downvotedBookIds):
    """Turn the target user's votes into rows shaped like the ratings table."""
    RATING_FOR_UPVOTE = 5
    RATING_FOR_DOWNVOTE = 1

    upvoted = list(upvotedBookIds)
    downvoted = list(downvotedBookIds)

    return pd.DataFrame({
        'book_id': upvoted + downvoted,
        'user_id': [targetUserId] * (len(upvoted) + len(downvoted)),
        'rating': [RATING_FOR_UPVOTE] * len(upvoted)
                  + [RATING_FOR_DOWNVOTE] * len(downvoted),
    })


def _find_top_readers(book_by_rating_df):
    """Return ``(numSimilarUsers, top_readers)`` for the target user.

    ``top_readers`` is a DataFrame indexed by ``user_id`` with a single
    ``corr`` column; the target user is never part of it.
    """
    isTarget = book_by_rating_df["user_id"] == targetUserId
    targetTitles = book_by_rating_df.loc[isTarget, "title"].dropna().unique()

    if len(targetTitles) == 0:
        # None of the votes matched a known book: nobody can be compared.
        return 0, pd.DataFrame({"corr": pd.Series(dtype="float64")})

    # users x titles matrix restricted to the titles the target user rated.
    # Users who rated none of them can never pass the shared-book threshold
    # below, so leaving them out does not change the selection.
    sharedRatings = book_by_rating_df[book_by_rating_df["title"].isin(targetTitles)]
    book_read_df = sharedRatings.groupby(["user_id","title"])["rating"].mean().unstack()

    # how many of the target user's books each user has rated
    userBookCount = book_read_df.notnull().sum(axis=1)

    # users must have rated MORE than this many of those books
    minBookCount = min(
        int(book_read_df.shape[1] * SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS),
        MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER,
    )
    usersSameBooks = userBookCount[userBookCount > minBookCount].index

    # ratings of the qualifying users, unrated -> 0
    filted_df = book_read_df[book_read_df.index.isin(usersSameBooks)].fillna(0)

    # Only the target user's column of the user-user correlation matrix is
    # needed. Entries are NaN for users whose ratings have zero variance,
    # which includes the target user when all of their votes are identical.
    targetCorr = filted_df.T.corr()[targetUserId]

    # Relax the threshold in 0.05 steps until more than one user (the target
    # itself normally counts as one) qualifies. Stop once the threshold is
    # below -1: every non-NaN correlation is already included at that point,
    # so lowering it further could never add anyone.
    minTopReaderCorrelation = SIMILAR_USER_MIN_CORRELATION
    selected = targetCorr[targetCorr > minTopReaderCorrelation]
    while len(selected) <= 1 and minTopReaderCorrelation >= -1:
        minTopReaderCorrelation -= 0.05
        selected = targetCorr[targetCorr > minTopReaderCorrelation]

    top_readers = selected.to_frame("corr")
    if targetUserId in top_readers.index:
        top_readers = top_readers.drop(targetUserId)

    return len(usersSameBooks), top_readers


def _rank_recommendations(top_readers, book_by_rating_df, votedBookIds):
    """Return up to 20 recommended catalogue rows (as dicts), best first."""
    if top_readers.empty:
        return []

    # get the ratings for the top readers
    top_readers_ratings = pd.merge(
        top_readers,
        book_by_rating_df[["user_id", "book_id", "rating"]],
        how='inner',
        on="user_id",
    )

    # weight their ratings by how correlated they are with the user
    top_readers_ratings['weighted_rating'] = top_readers_ratings['corr'] * top_readers_ratings['rating']

    # mean weighted rating per book
    recommendation_df = top_readers_ratings.pivot_table(values="weighted_rating", index="book_id", aggfunc="mean")

    # never recommend a book the user has already voted on
    recommendation_df = recommendation_df[~recommendation_df.index.isin(votedBookIds)]

    # keep the 20 best books with a weighted rating above 1
    books_recommend = (
        recommendation_df[recommendation_df["weighted_rating"] > 1]
        .sort_values(by="weighted_rating", ascending=False)
        .head(20)
    )

    # attach the catalogue columns and order by weighted rating
    recommendedBooks = (
        books[books["book_id"].isin(books_recommend.index)]
        .merge(books_recommend, on="book_id")
        .sort_values(by="weighted_rating", ascending=False)
    )

    # drop the average_rating column
    recommendedBooks = recommendedBooks.drop(columns=["average_rating"])

    return recommendedBooks.to_dict(orient="records")


def update_user_ratings(upvotedBookIds, downvotedBookIds):
    """Recommend books for the target user based on their up/down votes.

    Upvotes are recorded as rating 5 and downvotes as rating 1 for
    ``targetUserId``. Users who rated enough of the same books and whose
    ratings correlate with the target user's are used as "top readers";
    their ratings, weighted by correlation, rank the recommendations.

    Args:
        upvotedBookIds: iterable of book ids the user upvoted.
        downvotedBookIds: iterable of book ids the user downvoted.

    Returns:
        dict with the keys
        ``upvotedBookIds`` / ``downvotedBookIds`` (the inputs, unchanged),
        ``numSimilarUsers`` (number of users passing the shared-book
        threshold; 0 when no vote matches a known book),
        ``recommendedBooksData`` (list of dicts, at most 20, possibly empty),
        ``bookByRatingData`` (catalogue merged with all ratings, including
        the new votes) and
        ``topCorrelatedReadersData`` (DataFrame with a ``corr`` column,
        sorted descending).

        When no other reader has a usable correlation with the target user
        (for example because all votes are identical, which makes every
        correlation NaN) the recommendation list is empty.
    """
    newRows = _build_vote_rows(upvotedBookIds, downvotedBookIds)

    ratings = pd.concat([baseRatings, newRows], ignore_index=True)

    book_by_rating_df = pd.merge(books, ratings, on="book_id", how="inner")

    numSimilarUsers, top_readers = _find_top_readers(book_by_rating_df)

    votedBookIds = list(upvotedBookIds) + list(downvotedBookIds)
    recommendedBooksRowsAsDicts = _rank_recommendations(
        top_readers, book_by_rating_df, votedBookIds
    )

    return {
        "upvotedBookIds": upvotedBookIds,
        "downvotedBookIds": downvotedBookIds,
        "numSimilarUsers" : numSimilarUsers,
        "recommendedBooksData": recommendedBooksRowsAsDicts,
        "bookByRatingData": book_by_rating_df,

        # sort by correlation
        "topCorrelatedReadersData" : top_readers.sort_values(by="corr", ascending=False)
    }