import openai
from google.cloud import vision
from google.oauth2 import service_account
import io
import google.generativeai as genai
from diffusers import AutoPipelineForText2Image
import torch
import os
import spaces

# Utilize the Google Cloud Vision API to recognize text in the
# input images (diary pages), https://cloud.google.com/vision.
def detect_text_in_image(image_path, credentials):
    # Create a Vision API client using the credentials
    client = vision.ImageAnnotatorClient(credentials=credentials)

    # Open the image file
    with io.open(image_path, 'rb') as image_file:
        content = image_file.read()

    # Create an image object for the Vision API
    image = vision.Image(content=content)

    # Use the Vision API to detect text
    response = client.text_detection(image=image)
    texts = response.text_annotations

    # Check for errors in the response
    if response.error.message:
        raise Exception(f'{response.error.message}')

    # Return the detected text or an empty string
    return texts[0].description if texts else ''
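
# Example usage (a sketch; the key file and image path are placeholders):
#
#   credentials = service_account.Credentials.from_service_account_file(
#       'path/to/service-account-key.json')
#   diary_text = detect_text_in_image('diary_page.png', credentials)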


# Utilize the OpenAI chat completions API (GPT-4) for NLP tasks such as
# summarizing and condensing the diary text,
# https://platform.openai.com/docs/api-reference/chat.
def summarize_diary_text(text, api_key):
    # Initialize the OpenAI client
    client = openai.Client(api_key=api_key)

    # Use the client to call the chat completion API
    response = client.chat.completions.create(
        model="gpt-4",  # Use GPT-4
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Summarize the following diary entry: {text}"}
        ],
        max_tokens=150,
        temperature=0.7,
        n=1  # Number of completions to generate
    )

    # Extract the summary from the response
    return response.choices[0].message.content


# Utilize the Gemini API (gemini-1.5-flash) to input an image of the diary
# writer and output a textual description of the image,
# https://ai.google.dev/gemini-api/docs/models/gemini.
def analyze_writer_image(image_path, api_key):
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")
    myfile = genai.upload_file(image_path)
    result = model.generate_content(
        [myfile, "\n\n", "Can you give a very short description of the person in the image?"]
    )
    return result.text


# Now that you have text from the diary and text describing the diary writer,
# you can utilize the SDXL-Turbo stable diffusion model to generate
# images, https://huggingface.co/stabilityai/sdxl-turbo.
# You can try to output several images for a diary entry. Analyze how
# accurate the results are, and think about what could be improved.
@spaces.GPU
def generate_comic_book(diary_text, writer_description, num_pages=4):
    pipe = AutoPipelineForText2Image.from_pretrained(
        "stabilityai/sdxl-turbo",
        torch_dtype=torch.float16,
        variant="fp16",
        cache_dir="./SDXL-Turbo"
    )

    # The @spaces.GPU decorator provisions a CUDA device on Hugging Face
    # Spaces, so the pipeline can be moved to the GPU directly.
    pipe.to('cuda')

    # Create a directory to store the comic book images
    os.makedirs("comic_book", exist_ok=True)

    # Split diary text into multiple segments/scenes for comic book pages
    diary_scenes = diary_text.split('.')[:num_pages]  # Split by periods, limiting to `num_pages`

    # Iterate over each scene, generating a page for each one
    for i, scene in enumerate(diary_scenes):
        prompt = (f'Comic Book Style: \n'
                  f'Actor Description: {writer_description} \n'
                  f'Diary Scene: {scene.strip()}\n'
                  f'Generate a cartoon image to represent this diary scene.')

        print(f"Generating comic page {i + 1} with prompt:\n{prompt}\n")

        # Generate the image. SDXL-Turbo is distilled for few-step inference;
        # its model card recommends 1-4 steps with guidance disabled.
        image = pipe(prompt=prompt, num_inference_steps=4, guidance_scale=0.0).images[0]

        # Save the generated image
        image_path = f"comic_book/page_{i + 1}.png"
        image.save(image_path)
        print(f"Page {i + 1} saved as {image_path}")

    print("Comic book generation complete!")