from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import os
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from optimum.onnxruntime import ORTModelForSeq2SeqLM, ORTModelForSequenceClassification
from sentence_transformers import SentenceTransformer

# Define the FastAPI app
app = FastAPI(docs_url="/")

# Add the CORS middleware to the app
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Define the Google Books API key
key = os.environ.get("GOOGLE_BOOKS_API_KEY")

# Define summarization models
summary_tokenizer_normal = AutoTokenizer.from_pretrained("lidiya/bart-base-samsum")
summary_model_normal = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")
summary_tokenizer_onnx = AutoTokenizer.from_pretrained("optimum/t5-small")
summary_model_onnx = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small")

# Define classification models
classification_tokenizer_normal = AutoTokenizer.from_pretrained(
    "sileod/deberta-v3-base-tasksource-nli"
)
classification_model_normal = AutoModelForSequenceClassification.from_pretrained(
    "sileod/deberta-v3-base-tasksource-nli"
)
classification_tokenizer_onnx = AutoTokenizer.from_pretrained(
    "optimum/distilbert-base-uncased-mnli"
)
classification_model_onnx = ORTModelForSequenceClassification.from_pretrained(
    "optimum/distilbert-base-uncased-mnli"
)

# Define similarity model
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")
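
# The models above load once at import time, so individual requests do not
# pay the initialization cost. One way to serve the app locally (assuming
# this file is named main.py; the port follows the Hugging Face Spaces
# convention):
#
#     uvicorn main:app --host 0.0.0.0 --port 7860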


async def search(
    query: str,
    add_chatgpt_results: bool = False,
    n_results: int = 10,
):
    """
    Get the results from the Google Books API, OpenAlex, and optionally OpenAI.
    """
    import time
    import requests

    start_time = time.time()

    # Initialize the lists to store the results
    titles = []
    authors = []
    publishers = []
    descriptions = []
    images = []
    def gbooks_search(query, n_results=30):
        """
        Access the Google Books API and return the results.
        """
        # Set the API endpoint and query parameters
        url = "https://www.googleapis.com/books/v1/volumes"
        params = {
            "q": str(query),
            "printType": "books",
            "maxResults": n_results,
            "key": key,
        }

        # Send a GET request to the API with the specified parameters
        response = requests.get(url, params=params, timeout=30)

        # Parse the response JSON
        data = response.json()

        # Initialize the lists to store the results
        titles = []
        authors = []
        publishers = []
        descriptions = []
        images = []

        # "items" is missing from the response when the API finds no matches
        for item in data.get("items", []):
            volume_info = item["volumeInfo"]
            try:
                titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
            except KeyError:
                titles.append(volume_info["title"])
            try:
                descriptions.append(volume_info["description"])
            except KeyError:
                descriptions.append("Null")
            try:
                publishers.append(volume_info["publisher"])
            except KeyError:
                publishers.append("Null")
            try:
                authors.append(volume_info["authors"][0])
            except (KeyError, IndexError):
                authors.append("Null")
            try:
                images.append(volume_info["imageLinks"]["thumbnail"])
            except KeyError:
                images.append(
                    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
                )
        return titles, authors, publishers, descriptions, images
    # Run the gbooks_search function and append its results
    (
        titles_placeholder,
        authors_placeholder,
        publishers_placeholder,
        descriptions_placeholder,
        images_placeholder,
    ) = gbooks_search(query, n_results=n_results)
    titles.extend(titles_placeholder)
    authors.extend(authors_placeholder)
    publishers.extend(publishers_placeholder)
    descriptions.extend(descriptions_placeholder)
    images.extend(images_placeholder)

    # Get the time since the start
    first_checkpoint = time.time()
    first_checkpoint_time = int(first_checkpoint - start_time)
    def openalex_search(query, n_results=10):
        """
        Run a search on OpenAlex and return the results.
        """
        import pyalex
        from pyalex import Works

        # Add email to the config to join OpenAlex's polite pool
        pyalex.config.email = "[email protected]"

        # Define a pager object with the same query
        pager = Works().search(str(query)).paginate(per_page=n_results, n_max=n_results)

        # Generate a list of the result pages
        openalex_results = list(pager)

        # Initialize the lists to store the results
        titles = []
        authors = []
        publishers = []
        descriptions = []
        images = []

        # Get the titles, descriptions, and publishers and append them to the lists
        try:
            for result in openalex_results[0]:
                try:
                    titles.append(result["title"])
                except KeyError:
                    titles.append("Null")
                try:
                    descriptions.append(result["abstract"])
                except KeyError:
                    descriptions.append("Null")
                try:
                    publishers.append(result["host_venue"]["publisher"])
                except KeyError:
                    publishers.append("Null")
                try:
                    authors.append(result["authorships"][0]["author"]["display_name"])
                except (KeyError, IndexError):
                    authors.append("Null")
                images.append(
                    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
                )
        # An IndexError here means the search returned no result pages at all
        except IndexError:
            titles.append("Null")
            descriptions.append("Null")
            publishers.append("Null")
            authors.append("Null")
            images.append(
                "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
            )
        return titles, authors, publishers, descriptions, images
    # Run the openalex_search function and append its results
    (
        titles_placeholder,
        authors_placeholder,
        publishers_placeholder,
        descriptions_placeholder,
        images_placeholder,
    ) = openalex_search(query, n_results=n_results)
    titles.extend(titles_placeholder)
    authors.extend(authors_placeholder)
    publishers.extend(publishers_placeholder)
    descriptions.extend(descriptions_placeholder)
    images.extend(images_placeholder)

    # Calculate the elapsed time between the first and second checkpoints
    second_checkpoint = time.time()
    second_checkpoint_time = int(second_checkpoint - first_checkpoint)
    def openai_search(query, n_results=10):
        """
        Create a query to the OpenAI ChatGPT API and return the results.
        """
        import openai

        # Initialize the lists to store the results
        titles = []
        authors = []
        publishers = []
        descriptions = []
        images = []

        # Set the OpenAI API key
        openai.api_key = os.environ.get("OPENAI_API_KEY")

        # Create the ChatGPT query; the requested format must match the
        # capitalized keys that parse_result() splits on below
        chatgpt_response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": "You are a librarian. You are helping a patron find a book.",
                },
                {
                    "role": "user",
                    "content": f"Recommend me {n_results} books about {query}. Your response should be like: 'Title: <title>, Author: <author>, Publisher: <publisher>, Summary: <summary>'",
                },
            ],
        )

        # Split the response into a list of results; the [2::2] slice assumes
        # the model emits an intro line followed by blank-line-separated items
        chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split(
            "\n"
        )[2::2]

        # Define a function to parse a single "Key: value, ..." result line
        def parse_result(
            result, ordered_keys=["Title", "Author", "Publisher", "Summary"]
        ):
            # Create a dict to store the key-value pairs
            parsed_result = {}
            for key in ordered_keys:
                # Split the result string by the key and keep the value
                if key != ordered_keys[-1]:
                    parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
                else:
                    parsed_result[key] = result.split(f"{key}: ")[1]
            return parsed_result

        ordered_keys = ["Title", "Author", "Publisher", "Summary"]

        for result in chatgpt_results:
            try:
                # Parse the result
                parsed_result = parse_result(result, ordered_keys=ordered_keys)
                # Append the parsed result to the lists
                titles.append(parsed_result["Title"])
                authors.append(parsed_result["Author"])
                publishers.append(parsed_result["Publisher"])
                descriptions.append(parsed_result["Summary"])
                images.append(
                    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
                )
            # Stop if a line does not follow the expected format, e.g. when
            # the response was truncated
            except IndexError:
                break
        return titles, authors, publishers, descriptions, images
    if add_chatgpt_results:
        # Run the openai_search function and append its results
        (
            titles_placeholder,
            authors_placeholder,
            publishers_placeholder,
            descriptions_placeholder,
            images_placeholder,
        ) = openai_search(query)
        titles.extend(titles_placeholder)
        authors.extend(authors_placeholder)
        publishers.extend(publishers_placeholder)
        descriptions.extend(descriptions_placeholder)
        images.extend(images_placeholder)

    # Calculate the elapsed time between the second and third checkpoints
    third_checkpoint = time.time()
    third_checkpoint_time = int(third_checkpoint - second_checkpoint)
    # Assemble the merged results into a list of dicts
    results = [
        {
            "id": i,
            "title": title,
            "author": author,
            "publisher": publisher,
            "description": description,
            "image_link": image,
        }
        for i, (title, author, publisher, description, image) in enumerate(
            zip(titles, authors, publishers, descriptions, images)
        )
    ]
    return results
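
# Each element of the list returned by search() has this shape (the values
# below are illustrative, not from a real API response):
#
#     {
#         "id": 0,
#         "title": "Deep Learning: Foundations and Concepts",
#         "author": "Christopher M. Bishop",
#         "publisher": "Springer",
#         "description": "An introductory text on ...",
#         "image_link": "https://books.google.com/...",
#     }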


async def classify(data: list, runtime: str = "normal"):
    """
    Create classifier pipeline and return the results.
    """
    titles = [book["title"] for book in data]
    descriptions = [book["description"] for book in data]
    publishers = [book["publisher"] for book in data]

    # Combine title, description, and publisher into a single string
    combined_data = [
        f"The book's title is {title}. It is published by {publisher}. This book is about {description}"
        for title, description, publisher in zip(titles, descriptions, publishers)
    ]

    from transformers import pipeline
    if runtime == "normal":
        # Use the PyTorch zero-shot classifier
        tokenizer = classification_tokenizer_normal
        model = classification_model_normal
    elif runtime == "onnxruntime":
        # Use the ONNX Runtime zero-shot classifier
        tokenizer = classification_tokenizer_onnx
        model = classification_model_onnx
    else:
        raise ValueError(f"Unknown runtime: {runtime}")

    classifier_pipe = pipeline(
        "zero-shot-classification",
        model=model,
        tokenizer=tokenizer,
        hypothesis_template="This book is {}.",
        batch_size=1,
        device=-1,
        multi_label=False,
    )
    # Define the candidate labels
    level = [
        "Introductory",
        "Advanced",
    ]
    audience = ["Academic", "Not Academic", "Manual"]

    # Run each label set once per document instead of once per output field
    classes = []
    for doc in combined_data:
        audience_result = classifier_pipe(doc, audience)
        level_result = classifier_pipe(doc, level)
        classes.append(
            {
                "audience": audience_result["labels"][0],
                "audience_confidence": audience_result["scores"][0],
                "level": level_result["labels"][0],
                "level_confidence": level_result["scores"][0],
            }
        )
    return classes
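
# Each element of the list returned by classify() pairs a predicted label
# with its confidence score (the values below are illustrative):
#
#     {
#         "audience": "Academic",
#         "audience_confidence": 0.91,
#         "level": "Introductory",
#         "level_confidence": 0.78,
#     }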


async def find_similar(data: list, top_k: int = 5):
    """
    Calculate the similarity between the selected book and the corpus. Return the top_k results.
    """
    from sentence_transformers import util

    titles = [book["title"] for book in data]
    descriptions = [book["description"] for book in data]
    publishers = [book["publisher"] for book in data]

    # Combine title, description, and publisher into a single string
    combined_data = [
        f"The book's title is {title}. It is published by {publisher}. This book is about {description}"
        for title, description, publisher in zip(titles, descriptions, publishers)
    ]

    sentence_transformer = similarity_model
    book_embeddings = sentence_transformer.encode(combined_data, convert_to_tensor=True)

    # Make sure that the top_k value is not greater than the number of books
    top_k = min(top_k, len(combined_data))

    similar_books = []
    for i in range(len(combined_data)):
        # Get the embedding for the ith book
        current_embedding = book_embeddings[i]

        # Rank all books by similarity to the ith book, then drop the first
        # hit, which is the book itself
        similarity_sorted = util.semantic_search(
            current_embedding, book_embeddings, top_k=top_k
        )
        similar_books.append(
            {
                "sorted_by_similarity": similarity_sorted[0][1:],
            }
        )
    return similar_books
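
# util.semantic_search returns one ranked hit list per query embedding, where
# each hit is a {"corpus_id": int, "score": float} dict sorted by decreasing
# cosine similarity, so each element of find_similar()'s output looks like
# (values illustrative):
#
#     {"sorted_by_similarity": [{"corpus_id": 3, "score": 0.71}, ...]}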


async def summarize(descriptions: list, runtime="normal"):
    """
    Summarize the descriptions and return the results.
    """
    from transformers import pipeline
    from optimum.bettertransformer import BetterTransformer

    # Define the summarizer model and tokenizer
    if runtime == "normal":
        tokenizer = summary_tokenizer_normal
        normal_model = summary_model_normal
        # Convert the PyTorch model to a BetterTransformer for faster inference
        model = BetterTransformer.transform(normal_model)
    elif runtime == "onnxruntime":
        tokenizer = summary_tokenizer_onnx
        model = summary_model_onnx
    else:
        raise ValueError(f"Unknown runtime: {runtime}")

    # Create the summarizer pipeline
    summarizer_pipe = pipeline("summarization", model=model, tokenizer=tokenizer)

    # Summarize the descriptions, skipping entries without usable text
    summaries = [
        summarizer_pipe(description)
        if (description != "Null" and description is not None)
        else [{"summary_text": "No summary text is available."}]
        for description in descriptions
    ]
    return summaries
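

# The helpers above are plain async functions; nothing registers them with
# the FastAPI app, so only the interactive docs at "/" are served. A minimal
# sketch of the wiring (the endpoint paths and HTTP methods are assumptions,
# not taken from the original app):
app.add_api_route("/search", search, methods=["GET"])
app.add_api_route("/classify", classify, methods=["POST"])
app.add_api_route("/find_similar", find_similar, methods=["POST"])
app.add_api_route("/summarize", summarize, methods=["POST"])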