Spaces:

conlan
/

book-rater

Sleeping

App Files Files Community

book-rater / bookdb.py

conlan

Update files from private repo

7957649 11 months ago

raw

history blame contribute delete

7.22 kB

	import json
	import logging

	import pandas as pd
	import streamlit as st

	# Fraction of the target user's rated titles used by update_user_ratings to
	# derive the shared-book count another user must exceed to count as similar.
	SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS = 0.30
	# Correlation threshold update_user_ratings starts from when picking top
	# readers; it is lowered step by step if too few users exceed it.
	SIMILAR_USER_MIN_CORRELATION = 0.25

	# Cap on the shared-book count threshold, so users who rated many books do
	# not demand an ever larger overlap from candidate similar users.
	MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER = 10

	# Cover image used as the "thumbnail" when a book has no metadata entry.
	DEFAULT_BOOK_COVER_URL = "https://m.media-amazon.com/images/I/81QPHl7zgbL._AC_UF1000,1000_QL80_.jpg"

	# NOTE(review): st.cache is deprecated in newer Streamlit releases in favour
	# of st.cache_data / st.cache_resource - confirm the pinned Streamlit version
	# before switching decorators.
	@st.cache
	def get_dataframes():
	    """Load the book table, the ratings table and the book metadata from disk.

	    Returns:
	        A ``(booksDf, baseRatingsDf, bookMetadataJSON)`` tuple:
	        the selected columns of ``./goodreads/books.csv`` with ``book_id``
	        renumbered ``1..N`` in row order, the full ``./goodreads/ratings.csv``
	        table, and the parsed content of ``./goodreads/book_metadata.json``.

	    Raises:
	        FileNotFoundError: if any of the three data files is missing.
	    """
	    booksDf = pd.read_csv(
	        "./goodreads/books.csv",
	        usecols=[
	            "book_id",
	            "original_publication_year",
	            "isbn",
	            "authors",
	            "title",
	            "average_rating",
	        ],
	    )
	    # Overwrite the ids from the CSV with the 1-based row position.
	    # NOTE(review): presumably this lines the ids up with ratings.csv - confirm.
	    booksDf['book_id'] = range(1, len(booksDf) + 1)

	    baseRatingsDf = pd.read_csv("./goodreads/ratings.csv")

	    # Context manager closes the handle deterministically; JSON is UTF-8 by
	    # specification, so do not depend on the platform's locale encoding.
	    with open("./goodreads/book_metadata.json", encoding="utf-8") as metadataFile:
	        bookMetadataJSON = json.load(metadataFile)

	    return booksDf, baseRatingsDf, bookMetadataJSON

	# Load the books table, the base ratings table and the per-book metadata
	# once, when this module is imported.
	books, baseRatings, bookMetadata = get_dataframes()

	# Id used for the app's own user: one past the largest user_id present in
	# the ratings data, so it cannot collide with an existing rater.
	targetUserId = baseRatings['user_id'].max() + 1

	def get_book_metadata_by_id(book_id):
	    """Look up the metadata entry for a book.

	    ``book_id`` is converted to ``str`` before the lookup in ``bookMetadata``.
	    When no entry exists, a placeholder with description ``"n/a"`` and the
	    default cover URL as thumbnail is returned instead.
	    """
	    placeholder = {
	        "description" : "n/a",
	        "thumbnail" : DEFAULT_BOOK_COVER_URL
	    }
	    return bookMetadata.get(str(book_id), placeholder)

	def get_book_ids_by_title(book_titles):
	    """Return an array with the ``book_id`` of every book whose title is in ``book_titles``."""
	    title_matches = books["title"].isin(book_titles)
	    return books.loc[title_matches, "book_id"].values

	def get_all_book_titles():
	    """Return an array with the title of every row in ``books``."""
	    title_column = books["title"]
	    return title_column.values

	def get_book_data_by_id(book_id):
	    """Return the first ``books`` row with this ``book_id`` as a column-to-value dict.

	    Raises:
	        IndexError: if no row has the given ``book_id``.
	    """
	    matching_rows = books.loc[books["book_id"] == book_id]
	    records = matching_rows.to_dict(orient="records")
	    return records[0]

	def get_book_title(book_id):
	    """Return the title of the first ``books`` row with this ``book_id``.

	    Raises:
	        IndexError: if no row has the given ``book_id``.
	    """
	    id_matches = books["book_id"] == book_id
	    matching_titles = books.loc[id_matches, "title"].values
	    return matching_titles[0]

	def get_book_titles(book_ids):
	    """Return an array with the title of every book whose ``book_id`` is in ``book_ids``."""
	    id_matches = books["book_id"].isin(book_ids)
	    return books.loc[id_matches, "title"].values

	def find_closest_read_title(bookByRatingData, upvotedBookIds, targetBookId):
	    """Return the upvoted book whose ratings correlate best with ``targetBookId``.

	    Ratings are pivoted to one row per user and one column per book, missing
	    ratings are treated as 0, and the Pearson correlation between the target
	    book's column and each upvoted book's column is computed.

	    Args:
	        bookByRatingData: DataFrame with ``user_id``, ``book_id`` and
	            ``rating`` columns.
	        upvotedBookIds: iterable (list or array) of candidate book ids.
	        targetBookId: id of the book to find the closest upvoted book for.

	    Returns:
	        The ``get_book_data_by_id`` record of the best-correlated upvoted book.

	    Raises:
	        KeyError: if a requested book id has no ratings in ``bookByRatingData``.
	        ValueError: if no upvoted book has a defined correlation with the
	            target (no candidates, or only constant rating columns).
	    """
	    # Build a real list: "+" on a numpy array would add element-wise instead
	    # of concatenating. dict.fromkeys removes duplicates but keeps order, and
	    # the target itself is never a candidate.
	    candidateBookIds = [
	        bookId for bookId in dict.fromkeys(upvotedBookIds) if bookId != targetBookId
	    ]

	    # NOTE(review): this pivots every user against every book before the
	    # column selection below; the intermediate frame can be very large.
	    user_df = bookByRatingData.groupby(["user_id", "book_id"])["rating"].mean().unstack()

	    # Keep only the candidate columns plus the target; unrated counts as 0.
	    book_read_df = user_df[candidateBookIds + [targetBookId]].fillna(0)

	    # Correlation of every candidate with the target. The target is removed
	    # by label rather than by "corr < 1", which is fragile under floating
	    # point and would also discard a perfectly correlated candidate.
	    target_corr = book_read_df.corr()[targetBookId].drop(labels=targetBookId).dropna()

	    if target_corr.empty:
	        raise ValueError(
	            f"no upvoted book has a defined correlation with book {targetBookId}"
	        )

	    closestBookId = target_corr.idxmax()

	    return get_book_data_by_id(closestBookId)

	def update_user_ratings(upvotedBookIds, downvotedBookIds):
	    """Recommend books for the app's user from their up- and down-votes.

	    The votes become ratings of a synthetic user (``targetUserId``). Users
	    who rated more than a minimum number of the same titles are scored by
	    Pearson correlation with that user (unrated titles count as 0), and
	    books are ranked by the mean of ``correlation * rating`` over the users
	    whose correlation exceeds a threshold. The threshold starts at
	    ``SIMILAR_USER_MIN_CORRELATION`` and is lowered in 0.05 steps until at
	    least one other user qualifies.

	    If no correlation can be computed - no votes at all, or a constant
	    rating vector such as only upvotes - the result contains no
	    recommendations.

	    Args:
	        upvotedBookIds: iterable of book ids the user liked (rated 5).
	        downvotedBookIds: iterable of book ids the user disliked (rated 1).

	    Returns:
	        dict with the keys ``upvotedBookIds`` and ``downvotedBookIds`` (the
	        inputs, unchanged), ``numSimilarUsers`` (number of users above the
	        shared-book threshold, the target user included),
	        ``recommendedBooksData`` (list of book records with a
	        ``weighted_rating``, best first, at most 20), ``bookByRatingData``
	        (books joined with all ratings including the new votes) and
	        ``topCorrelatedReadersData`` (DataFrame indexed by user id with a
	        ``corr`` column, highest correlation first).
	    """
	    RATING_FOR_UPVOTE = 5
	    RATING_FOR_DOWNVOTE = 1
	    CORRELATION_STEP = 0.05
	    MIN_WEIGHTED_RATING = 1
	    MAX_RECOMMENDATIONS = 20

	    logger = logging.getLogger(__name__)

	    upvoted = list(upvotedBookIds)
	    downvoted = list(downvotedBookIds)
	    votedBookIds = upvoted + downvoted

	    # Append the votes as ratings of the target user. With no votes the base
	    # ratings are used as they are, which avoids concatenating an empty frame.
	    ratings = baseRatings
	    if votedBookIds:
	        newRows = pd.DataFrame({
	            'book_id': votedBookIds,
	            'user_id': [targetUserId] * len(votedBookIds),
	            'rating': [RATING_FOR_UPVOTE] * len(upvoted) + [RATING_FOR_DOWNVOTE] * len(downvoted),
	        })
	        ratings = pd.concat([baseRatings, newRows], ignore_index=True)

	    book_by_rating_df = pd.merge(books, ratings, on="book_id", how="inner")

	    def build_result(numSimilarUsers, recommendedRows, topReaders):
	        # Single place that defines the shape of the returned dict.
	        return {
	            "upvotedBookIds": upvotedBookIds,
	            "downvotedBookIds": downvotedBookIds,
	            "numSimilarUsers" : numSimilarUsers,
	            "recommendedBooksData": recommendedRows,
	            "bookByRatingData": book_by_rating_df,

	            # sort by correlation
	            "topCorrelatedReadersData" : topReaders.sort_values(by="corr", ascending=False)
	        }

	    noReaders = pd.DataFrame({"corr": pd.Series(dtype="float64")})

	    # Titles the target user has rated.
	    targetRows = book_by_rating_df[book_by_rating_df["user_id"] == targetUserId]
	    targetBooksRead = targetRows["title"].dropna().unique().tolist()
	    if not targetBooksRead:
	        return build_result(0, [], noReaders)

	    # Pivot only the ratings of those titles (users x titles). Users who rated
	    # none of them could never pass the "> minBookCount" filter below, so
	    # leaving them out does not change the outcome but keeps the pivot small.
	    sharedRatings = book_by_rating_df[book_by_rating_df["title"].isin(targetBooksRead)]
	    book_read_df = sharedRatings.groupby(["user_id", "title"])["rating"].mean().unstack()

	    # Number of the target's titles each user has rated.
	    userBookCount = book_read_df.notnull().sum(axis=1)
	    logger.debug("Shared book count per user:\n%s", userBookCount)

	    # A similar user must have rated MORE than this many of the target's
	    # titles: a fraction of the target's title count, capped by a constant.
	    minBookCount = min(
	        int(book_read_df.shape[1] * SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS),
	        MAX_MIN_BOOK_COUNT_FOR_SIMILAR_USER,
	    )
	    logger.debug("Min book count for Similar User: %s", minBookCount)

	    usersSameBooks = userBookCount[userBookCount > minBookCount].index

	    # Ratings of the similar users only; unrated titles count as 0.
	    filted_df = book_read_df.loc[usersSameBooks].fillna(0)
	    if targetUserId not in filted_df.index:
	        return build_result(len(usersSameBooks), [], noReaders)

	    # Correlation of every similar user with the target user. Only this one
	    # vector is needed, so the full users x users matrix is never built.
	    user_corr = filted_df.corrwith(filted_df.loc[targetUserId], axis=1)

	    # Lower the threshold until somebody besides the target user exceeds it.
	    # Correlations never go below -1, so once the threshold is below that
	    # nothing more can qualify and the search stops; NaN correlations (e.g.
	    # a constant rating vector) never qualify at all.
	    minTopReaderCorrelation = SIMILAR_USER_MIN_CORRELATION
	    while True:
	        selected = user_corr[user_corr > minTopReaderCorrelation]
	        if len(selected) > 1 or minTopReaderCorrelation < -1:
	            break
	        minTopReaderCorrelation -= CORRELATION_STEP

	    top_readers = selected.to_frame(name="corr").drop(index=targetUserId, errors="ignore")
	    logger.debug("Top correlated readers:\n%s", top_readers)

	    if top_readers.empty:
	        return build_result(len(usersSameBooks), [], top_readers)

	    # get the ratings for the top readers
	    readerWeights = top_readers.rename_axis("user_id").reset_index()
	    top_readers_ratings = pd.merge(
	        readerWeights,
	        book_by_rating_df[["user_id", "book_id", "rating"]],
	        how='inner',
	        on="user_id",
	    )

	    # weight their ratings by how correlated they are with the user
	    top_readers_ratings['weighted_rating'] = top_readers_ratings['corr'] * top_readers_ratings['rating']

	    # mean weighted rating per book
	    recommendation_df = top_readers_ratings.pivot_table(values="weighted_rating", index="book_id", aggfunc="mean")

	    # never recommend a book the user has already voted on
	    alreadyRated = recommendation_df.index.isin(votedBookIds)
	    recommendation_df = recommendation_df[~alreadyRated]

	    # keep the best-scoring books above the minimum weighted rating
	    books_recommend = (
	        recommendation_df[recommendation_df["weighted_rating"] > MIN_WEIGHTED_RATING]
	        .sort_values(by="weighted_rating", ascending=False)
	        .head(MAX_RECOMMENDATIONS)
	    )

	    # attach the book details, order by weighted rating and drop the
	    # average_rating column from the output
	    recommendedBooks = (
	        books.merge(books_recommend.reset_index(), on="book_id")
	        .sort_values(by="weighted_rating", ascending=False)
	        .drop(columns=["average_rating"])
	    )

	    return build_result(
	        len(usersSameBooks),
	        recommendedBooks.to_dict(orient="records"),
	        top_readers,
	    )