import openai
from google.cloud import vision
from google.oauth2 import service_account
import io
import google.generativeai as genai
from diffusers import AutoPipelineForText2Image
import torch
import os
import spaces

# Utilize the Google Cloud Vision API to recognize text in the
# input images (diary pages), https://cloud.google.com/vision.
def detect_text_in_image(image_path, credentials):
    # Create a Vision API client using the credentials
    client = vision.ImageAnnotatorClient(credentials=credentials)

    # Open the image file
    with io.open(image_path, 'rb') as image_file:
        content = image_file.read()

    # Create an image object for the Vision API
    image = vision.Image(content=content)

    # Use the Vision API to detect text
    response = client.text_detection(image=image)
    texts = response.text_annotations

    # Check for errors in the response
    if response.error.message:
        raise Exception(f'{response.error.message}')

    # Return the detected text or an empty string
    return texts[0].description if texts else ''
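
# Example usage (a sketch; the key file and image path are placeholders):
#
#   credentials = service_account.Credentials.from_service_account_file(
#       'path/to/service-account-key.json')
#   diary_text = detect_text_in_image('diary_page.png', credentials)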


# Utilize the OpenAI chat completions API (GPT-4) for NLP tasks such as
# summarizing and condensing the diary text,
# https://platform.openai.com/docs/api-reference/chat.
def summarize_diary_text(text, api_key):
    # Initialize the OpenAI client
    client = openai.Client(api_key=api_key)

    # Use the client to call the chat completion API
    response = client.chat.completions.create(
        model="gpt-4",  # Use GPT-4
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Summarize the following diary entry: {text}"}
        ],
        max_tokens=150,
        temperature=0.7,
        n=1  # Number of completions to generate
    )

    # Extract the summary from the response
    return response.choices[0].message.content


# Utilize the Gemini API (gemini-1.5-flash) to input an image of the diary
# writer and output a textual description of the image,
# https://ai.google.dev/gemini-api/docs/models/gemini.
def analyze_writer_image(image_path, api_key):
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")
    myfile = genai.upload_file(image_path)
    result = model.generate_content(
        [myfile, "\n\n", "Can you give a very short description of the person in the image?"]
    )
    return result.text


# Now that you have text from the diary and text describing the diary writer,
# you can utilize the SDXL-Turbo stable diffusion model to generate
# images, https://huggingface.co/stabilityai/sdxl-turbo.
# You can try to output several images for a diary entry. Analyze how
# accurate the results are, and think about what could be improved.
@spaces.GPU
def generate_comic_book(diary_text, writer_description, num_pages=4):
    pipe = AutoPipelineForText2Image.from_pretrained(
        "stabilityai/sdxl-turbo",
        torch_dtype=torch.float16,
        variant="fp16",
        cache_dir="./SDXL-Turbo"
    )

    # The @spaces.GPU decorator provisions a CUDA device on Hugging Face
    # Spaces, so the pipeline can be moved to the GPU directly.
    pipe.to('cuda')

    # Create a directory to store the comic book images
    os.makedirs("comic_book", exist_ok=True)

    # Split diary text into multiple segments/scenes for comic book pages
    diary_scenes = diary_text.split('.')[:num_pages]  # Split by periods, limiting to `num_pages`

    # Iterate over each scene, generating a page for each one
    for i, scene in enumerate(diary_scenes):
        prompt = (f'Comic Book Style: \n'
                  f'Actor Description: {writer_description} \n'
                  f'Diary Scene: {scene.strip()}\n'
                  f'Generate a cartoon image to represent this diary scene.')

        print(f"Generating comic page {i + 1} with prompt:\n{prompt}\n")

        # Generate the image. SDXL-Turbo is distilled for few-step inference;
        # its model card recommends 1-4 steps with guidance disabled.
        image = pipe(prompt=prompt, num_inference_steps=4, guidance_scale=0.0).images[0]

        # Save the generated image
        image_path = f"comic_book/page_{i + 1}.png"
        image.save(image_path)
        print(f"Page {i + 1} saved as {image_path}")

    print("Comic book generation complete!")