Spaces:

bertugmirasyedi
/

aristotle-api

Sleeping

File size: 15,954 Bytes

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import os
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from optimum.onnxruntime import ORTModelForSeq2SeqLM, ORTModelForSequenceClassification
from sentence_transformers import SentenceTransformer

# Define the FastAPI app. docs_url="/" serves the interactive API docs at the
# root path instead of the default location.
app = FastAPI(docs_url="/")

# Add the CORS middleware to the app.
# NOTE(review): wildcard origins/methods/headers together with
# allow_credentials=True is a very permissive policy — confirm this is intended
# before exposing the service beyond a demo.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Define the Google Books API key. This is None when the environment variable
# is not set; the /search endpoint passes it straight through as a query
# parameter.
key = os.environ.get("GOOGLE_BOOKS_API_KEY")

# Everything below is loaded once, at import time, so the endpoints can reuse
# the same tokenizer/model objects on every request. Each task has a "normal"
# (transformers) variant and an "onnx" (optimum ONNX Runtime) variant; the
# endpoints pick one based on their `runtime` parameter.

# Define summarization models (used by /summarize)
summary_tokenizer_normal = AutoTokenizer.from_pretrained("lidiya/bart-base-samsum")
summary_model_normal = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")
summary_tokenizer_onnx = AutoTokenizer.from_pretrained("optimum/t5-small")
summary_model_onnx = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small")

# Define classification models (used by /classify for zero-shot classification)
classification_tokenizer_normal = AutoTokenizer.from_pretrained(
    "sileod/deberta-v3-base-tasksource-nli"
)
classification_model_normal = AutoModelForSequenceClassification.from_pretrained(
    "sileod/deberta-v3-base-tasksource-nli"
)
classification_tokenizer_onnx = AutoTokenizer.from_pretrained(
    "optimum/distilbert-base-uncased-mnli"
)
classification_model_onnx = ORTModelForSequenceClassification.from_pretrained(
    "optimum/distilbert-base-uncased-mnli"
)

# Define similarity model (used by /find_similar to embed book texts)
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")


@app.get("/search")
async def search(
    query: str,
    add_chatgpt_results: bool = False,
    n_results: int = 10,
):
    """
    Get the results from the Google Books API, OpenAlex, and optionally OpenAI.

    Args:
        query: Free-text search query forwarded to every source.
        add_chatgpt_results: When true, also ask the OpenAI chat API for
            recommendations and append them after the other results.
        n_results: Number of results requested from each source. The value
            sent to Google Books is capped at 40, the API's documented maximum.

    Returns:
        A list of dicts with the keys "id", "title", "author", "publisher",
        "description" and "image_link". Results are ordered Google Books
        first, then OpenAlex, then OpenAI; "id" is the position in that list.
        Missing fields are reported as the string "Null" and missing covers
        as a placeholder image URL.

    Raises:
        requests.HTTPError: If the Google Books API answers with an error
            status.

    NOTE(review): every outbound call in here is synchronous although the
    handler is declared ``async``, so a slow upstream blocks the event loop
    for the duration of the request.
    """
    import re

    import requests

    placeholder_image = "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"

    def pick(record, *path, default="Null"):
        """
        Follow `path` through nested containers and return the value found,
        or `default` when any step is missing.

        Plain subscription is used on purpose (rather than ``dict.get``) so
        that record types which compute values on item access keep working.
        """
        value = record
        try:
            for step in path:
                value = value[step]
        except (KeyError, IndexError, TypeError):
            return default
        return value

    def gbooks_search(query, n_results=30):
        """
        Access the Google Books API and return the results as a list of
        (title, author, publisher, description, image) tuples.
        """
        # Set the API endpoint and query parameters
        url = "https://www.googleapis.com/books/v1/volumes"
        params = {
            "q": str(query),
            "printType": "books",
            # Google Books rejects maxResults values above 40.
            "maxResults": min(n_results, 40),
            "key": key,
        }

        # A timeout keeps a stalled upstream from hanging the request forever,
        # and an explicit status check surfaces API errors as such instead of
        # as a confusing KeyError further down.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        books = []
        # The payload may carry no "items" key at all (e.g. nothing matched).
        for item in data.get("items", []):
            volume_info = pick(item, "volumeInfo", default={})

            title = pick(volume_info, "title")
            if "title" in volume_info and "subtitle" in volume_info:
                title = f"{volume_info['title']}: {volume_info['subtitle']}"

            books.append(
                (
                    title,
                    # Only the first listed author is reported.
                    pick(volume_info, "authors", 0),
                    pick(volume_info, "publisher"),
                    pick(volume_info, "description"),
                    pick(
                        volume_info,
                        "imageLinks",
                        "thumbnail",
                        default=placeholder_image,
                    ),
                )
            )

        return books

    def openalex_search(query, n_results=10):
        """
        Run a search on OpenAlex and return the results as a list of
        (title, author, publisher, description, image) tuples.
        """
        import pyalex
        from pyalex import Works

        # Add email to the config
        # NOTE(review): this value looks like a redacted address rather than a
        # real one — confirm what should be configured here.
        pyalex.config.email = "[email protected]"

        # Define a pager object with the same query
        pager = Works().search(str(query)).paginate(per_page=n_results, n_max=n_results)

        # Generate a list of the result pages; only the first page is used.
        pages = list(pager)

        if not pages:
            # Kept for backward compatibility: when the pager yields no page
            # at all, a single all-"Null" record is reported.
            return [("Null", "Null", "Null", "Null", placeholder_image)]

        # Each field is looked up independently so that one malformed record
        # (for example a work without authorships) neither aborts the loop
        # nor leaves a record with only some of its fields filled in.
        return [
            (
                pick(result, "title"),
                pick(result, "authorships", 0, "author", "display_name"),
                pick(result, "host_venue", "publisher"),
                pick(result, "abstract"),
                placeholder_image,
            )
            for result in pages[0]
        ]

    def openai_search(query, n_results=10):
        """
        Create a query to the OpenAI ChatGPT API and return the results as a
        list of (title, author, publisher, description, image) tuples.
        """
        import openai

        # Set the OpenAI API key
        openai.api_key = os.environ.get("OPENAI_API_KEY")

        # Create ChatGPT query
        # NOTE(review): ChatCompletion appears to be the pre-1.0 openai
        # interface — confirm the pinned package version.
        chatgpt_response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": "You are a librarian. You are helping a patron find a book.",
                },
                {
                    "role": "user",
                    "content": f"Recommend me {n_results} books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
                },
            ],
        )
        content = chatgpt_response["choices"][0]["message"]["content"]

        # The prompt asks for lowercase keys while models often answer with
        # capitalised ones, so the match is case-insensitive. Anchoring each
        # field on the next key (instead of the first comma) keeps titles and
        # publishers that contain commas intact.
        entry_pattern = re.compile(
            r"title:\s*(?P<title>.+?),\s*author:\s*(?P<author>.+?),"
            r"\s*publisher:\s*(?P<publisher>.+?),\s*summary:\s*(?P<summary>.+)",
            re.IGNORECASE,
        )

        books = []
        # Every line is tried; lines that are not a complete entry (intro
        # text, blank separators, an entry cut off by the token limit) are
        # skipped rather than assumed to sit at fixed positions.
        for line in content.split("\n"):
            match = entry_pattern.search(line)
            if match is None:
                continue
            books.append(
                (
                    match["title"].strip(),
                    match["author"].strip(),
                    match["publisher"].strip(),
                    match["summary"].strip(),
                    placeholder_image,
                )
            )

        return books

    # Collect the records source by source, in a fixed order.
    books = gbooks_search(query, n_results=n_results)
    books.extend(openalex_search(query, n_results=n_results))

    if add_chatgpt_results:
        books.extend(openai_search(query, n_results=n_results))

    results = [
        {
            "id": i,
            "title": title,
            "author": author,
            "publisher": publisher,
            "description": description,
            "image_link": image,
        }
        for i, (title, author, publisher, description, image) in enumerate(books)
    ]

    return results


@app.post("/classify")
async def classify(data: list, runtime: str = "normal"):
    """
    Create classifier pipeline and return the results.

    Args:
        data: List of book dicts; each must provide the keys "title",
            "description" and "publisher".
        runtime: "normal" to use the transformers model, "onnxruntime" to use
            the ONNX Runtime model.

    Returns:
        One dict per book, in input order, with the keys "audience",
        "audience_confidence", "level" and "level_confidence" holding the
        top-scoring label and its score for each label set.

    Raises:
        HTTPException: 400 when `runtime` is not one of the supported values.
    """
    from fastapi import HTTPException
    from transformers import pipeline

    if runtime == "normal":
        tokenizer = classification_tokenizer_normal
        model = classification_model_normal
    elif runtime == "onnxruntime":
        tokenizer = classification_tokenizer_onnx
        model = classification_model_onnx
    else:
        # Without this branch an unknown value left `model` unbound and the
        # request failed with an opaque server error.
        raise HTTPException(
            status_code=400,
            detail=f"Unknown runtime {runtime!r}; expected 'normal' or 'onnxruntime'.",
        )

    # Combine title, description, and publisher into a single string
    combined_data = [
        f"The book's title is {book['title']}. It is published by {book['publisher']}. This book is about {book['description']}"
        for book in data
    ]

    # Nothing to classify: skip building the pipeline altogether.
    if not combined_data:
        return []

    # Define the zero-shot classifier
    classifier_pipe = pipeline(
        "zero-shot-classification",
        model=model,
        tokenizer=tokenizer,
        hypothesis_template="This book is {}.",
        batch_size=1,
        device=-1,
        multi_label=False,
    )

    # Define the candidate labels
    level = [
        "Introductory",
        "Advanced",
    ]

    audience = ["Academic", "Not Academic", "Manual"]

    classes = []
    for doc in combined_data:
        # Run each label set once and read both the label and its score from
        # the same result, instead of running the model again per field.
        audience_result = classifier_pipe(doc, audience)
        level_result = classifier_pipe(doc, level)

        classes.append(
            {
                "audience": audience_result["labels"][0],
                "audience_confidence": audience_result["scores"][0],
                "level": level_result["labels"][0],
                "level_confidence": level_result["scores"][0],
            }
        )

    return classes


@app.post("/find_similar")
async def find_similar(data: list, top_k: int = 5):
    """
    Calculate the similarity between each book and the rest of the corpus.

    Args:
        data: List of book dicts; each must provide the keys "title",
            "description" and "publisher".
        top_k: Number of nearest hits to retrieve per book. The book itself
            counts towards this budget, so at most ``top_k - 1`` other books
            are reported for each entry. Values larger than the corpus are
            reduced to the corpus size.

    Returns:
        One dict per book, in input order, of the form
        ``{"sorted_by_similarity": [...]}`` where the list holds the
        semantic-search hits for the other books, most similar first.
    """
    from sentence_transformers import util

    # Combine title, description, and publisher into a single string
    combined_data = [
        f"The book's title is {book['title']}. It is published by {book['publisher']}. This book is about {book['description']}"
        for book in data
    ]

    # Nothing to compare: avoid encoding an empty batch.
    if not combined_data:
        return []

    book_embeddings = similarity_model.encode(combined_data, convert_to_tensor=True)

    # Make sure that the top_k value is neither negative nor greater than the
    # number of books
    top_k = max(0, min(top_k, len(combined_data)))

    # One batched search: every book is a query against the whole corpus.
    all_hits = util.semantic_search(book_embeddings, book_embeddings, top_k=top_k)

    similar_books = []
    for i, hits in enumerate(all_hits):
        # Drop the book itself by corpus id rather than by position: with
        # duplicate entries the first hit is not guaranteed to be the query.
        others = [hit for hit in hits if hit["corpus_id"] != i]

        similar_books.append(
            {
                "sorted_by_similarity": others[: max(top_k - 1, 0)],
            }
        )

    return similar_books


@app.post("/summarize")
async def summarize(descriptions: list, runtime: str = "normal"):
    """
    Summarize the descriptions and return the results.

    Args:
        descriptions: List of description strings. Entries that are None or
            the string "Null" are not summarized.
        runtime: "normal" to use the transformers model, "onnxruntime" to use
            the ONNX Runtime model.

    Returns:
        One entry per description, in input order. Each entry is the
        summarization pipeline's output for that description, or
        ``[{"summary_text": "No summary text is available."}]`` when there
        was nothing to summarize.

    Raises:
        HTTPException: 400 when `runtime` is not one of the supported values.
    """
    from fastapi import HTTPException
    from optimum.bettertransformer import BetterTransformer
    from transformers import pipeline

    # Define the summarizer model and tokenizer
    if runtime == "normal":
        tokenizer = summary_tokenizer_normal
        model = summary_model_normal
        # The model object is shared across requests and the conversion
        # happens in place, so it must only be applied once: optimum flags
        # converted models with `use_bettertransformer` and refuses to
        # convert them a second time.
        if not getattr(model, "use_bettertransformer", False):
            model = BetterTransformer.transform(model)
    elif runtime == "onnxruntime":
        tokenizer = summary_tokenizer_onnx
        model = summary_model_onnx
    else:
        # Without this branch an unknown value left `model` unbound and the
        # request failed with an opaque server error.
        raise HTTPException(
            status_code=400,
            detail=f"Unknown runtime {runtime!r}; expected 'normal' or 'onnxruntime'.",
        )

    # Create the summarizer pipeline
    summarizer_pipe = pipeline("summarization", model=model, tokenizer=tokenizer)

    # Summarize the descriptions
    summaries = [
        summarizer_pipe(description)
        if (description is not None and description != "Null")
        else [{"summary_text": "No summary text is available."}]
        for description in descriptions
    ]

    return summaries