Update app.py
Browse files
app.py
CHANGED
@@ -15,144 +15,21 @@ CLIP_PATH = "google/siglip-so400m-patch14-384"
|
|
15 |
MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
|
16 |
CHECKPOINT_PATH = Path("9em124t2-499968")
|
17 |
CAPTION_TYPE_MAP = {
|
18 |
-
("descriptive", "formal", False, False): [
|
19 |
-
|
20 |
-
|
21 |
-
],
|
22 |
-
("descriptive", "
|
23 |
-
|
24 |
-
|
25 |
-
],
|
26 |
-
("
|
27 |
-
|
28 |
-
|
29 |
-
],
|
30 |
-
("
|
31 |
-
|
32 |
-
|
33 |
-
],
|
34 |
-
("descriptive", "informal", False, True): [
|
35 |
-
"In about {word_count} words, give a laid-back description of this image's vibe and key features.",
|
36 |
-
"Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
|
37 |
-
],
|
38 |
-
("descriptive", "informal", True, False): [
|
39 |
-
"Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
|
40 |
-
"Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
|
41 |
-
],
|
42 |
-
("training_prompt", "formal", False, False): [
|
43 |
-
"Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
|
44 |
-
"Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
|
45 |
-
],
|
46 |
-
("training_prompt", "formal", False, True): [
|
47 |
-
"Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
|
48 |
-
"Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
|
49 |
-
],
|
50 |
-
("training_prompt", "formal", True, False): [
|
51 |
-
"Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
|
52 |
-
"Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
|
53 |
-
],
|
54 |
-
("rng-tags", "formal", False, False): [
|
55 |
-
"Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
|
56 |
-
"Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
|
57 |
-
],
|
58 |
-
("rng-tags", "formal", False, True): [
|
59 |
-
"Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
|
60 |
-
"Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
|
61 |
-
],
|
62 |
-
("rng-tags", "formal", True, False): [
|
63 |
-
"Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
|
64 |
-
"Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
|
65 |
-
],
|
66 |
-
("artistic_inspiration", "formal", False, False): [
|
67 |
-
"Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
|
68 |
-
"Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
|
69 |
-
],
|
70 |
-
("artistic_inspiration", "informal", False, False): [
|
71 |
-
"Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
|
72 |
-
"Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
|
73 |
-
],
|
74 |
-
("technical_breakdown", "formal", False, False): [
|
75 |
-
"Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
|
76 |
-
"Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
|
77 |
-
],
|
78 |
-
("emotional_response", "informal", False, False): [
|
79 |
-
"Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
|
80 |
-
"Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
|
81 |
-
],
|
82 |
-
|
83 |
-
("thematic_analysis", "formal", False, False): [
|
84 |
-
"Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
|
85 |
-
"Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
|
86 |
-
],
|
87 |
-
("thematic_analysis", "formal", False, True): [
|
88 |
-
"Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
|
89 |
-
"Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
|
90 |
-
],
|
91 |
-
("thematic_analysis", "formal", True, False): [
|
92 |
-
"Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
|
93 |
-
"Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
|
94 |
-
],
|
95 |
-
("stylistic_comparison", "informal", False, False): [
|
96 |
-
"Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
|
97 |
-
"Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
|
98 |
-
],
|
99 |
-
("stylistic_comparison", "informal", False, True): [
|
100 |
-
"In about {word_count} words, compare this image's style with other known art styles or artists.",
|
101 |
-
"Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
|
102 |
-
],
|
103 |
-
("stylistic_comparison", "informal", True, False): [
|
104 |
-
"Write a {length} casual comparison of this image's style with other art movements or famous artists.",
|
105 |
-
"Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
|
106 |
-
],
|
107 |
-
("narrative_suggestion", "formal", False, False): [
|
108 |
-
"Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
|
109 |
-
"Develop a brief storyline that complements the themes and mood depicted in this artwork."
|
110 |
-
],
|
111 |
-
("narrative_suggestion", "formal", False, True): [
|
112 |
-
"Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
|
113 |
-
"Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
|
114 |
-
],
|
115 |
-
("narrative_suggestion", "formal", True, False): [
|
116 |
-
"Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
|
117 |
-
"Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
|
118 |
-
],
|
119 |
-
("contextual_storytelling", "informal", False, False): [
|
120 |
-
"Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
|
121 |
-
"Imagine a background story for this artwork, explaining what's happening and why."
|
122 |
-
],
|
123 |
-
("contextual_storytelling", "informal", False, True): [
|
124 |
-
"In about {word_count} words, create a backstory for the scene depicted in this image.",
|
125 |
-
"Summarize a possible background narrative for this artwork in {word_count} words."
|
126 |
-
],
|
127 |
-
("contextual_storytelling", "informal", True, False): [
|
128 |
-
"Write a {length} informal story that provides context to the scene portrayed in this image.",
|
129 |
-
"Give a {length} casual backstory explaining the events depicted in this artwork."
|
130 |
-
],
|
131 |
-
|
132 |
-
("style_prompt", "formal", False, False): [
|
133 |
-
"Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
|
134 |
-
"Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
|
135 |
-
],
|
136 |
-
("style_prompt", "formal", False, True): [
|
137 |
-
"Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
|
138 |
-
"Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
|
139 |
-
],
|
140 |
-
("style_prompt", "formal", True, False): [
|
141 |
-
"Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
|
142 |
-
"Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
|
143 |
-
],
|
144 |
-
("style_prompt", "informal", False, False): [
|
145 |
-
"Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
|
146 |
-
"Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
|
147 |
-
],
|
148 |
-
("style_prompt", "informal", False, True): [
|
149 |
-
"In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
|
150 |
-
"Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
|
151 |
-
],
|
152 |
-
("style_prompt", "informal", True, False): [
|
153 |
-
"Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
|
154 |
-
"Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
|
155 |
-
],
|
156 |
}
|
157 |
|
158 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
@@ -278,8 +155,8 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
|
|
278 |
except ValueError:
|
279 |
pass
|
280 |
|
281 |
-
# 'rng-tags' and '
|
282 |
-
if caption_type
|
283 |
caption_tone = "formal"
|
284 |
|
285 |
# Build prompt
|
@@ -289,12 +166,9 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
|
|
289 |
|
290 |
prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
|
291 |
|
292 |
-
# Add style prompt details if applicable
|
293 |
if caption_type == "style_prompt":
|
294 |
-
prompt_str += (f"
|
295 |
-
f"{film_stock} film stock, {composition} composition, and {lighting} lighting.
|
296 |
-
f"Format the output as a comma-separated list of descriptors and modifiers, "
|
297 |
-
f"suitable for direct input into a Stable Diffusion interface.")
|
298 |
|
299 |
print(f"Prompt: {prompt_str}")
|
300 |
|
@@ -317,15 +191,7 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
|
|
317 |
# Embed prompt
|
318 |
prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
|
319 |
assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
|
320 |
-
|
321 |
-
# Check if bos_token_id exists
|
322 |
-
if tokenizer.bos_token_id is None:
|
323 |
-
print("Warning: bos_token_id is None. Using default value of 1.")
|
324 |
-
bos_token_id = 1
|
325 |
-
else:
|
326 |
-
bos_token_id = tokenizer.bos_token_id
|
327 |
-
|
328 |
-
embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
|
329 |
eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
|
330 |
|
331 |
# Construct prompts
|
@@ -337,7 +203,7 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
|
|
337 |
], dim=1)
|
338 |
|
339 |
input_ids = torch.cat([
|
340 |
-
torch.tensor([[bos_token_id]], dtype=torch.long),
|
341 |
torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
|
342 |
prompt,
|
343 |
torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
|
@@ -490,17 +356,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
|
|
490 |
run_button = gr.Button("Make My Caption!")
|
491 |
|
492 |
with gr.Column():
|
493 |
-
output_caption = gr.Textbox(label="Your
|
494 |
-
|
495 |
-
gr.Markdown("""
|
496 |
-
## How to Use Your Generated Prompt:
|
497 |
-
1. For "Style Prompt" captions, the output is formatted for direct use in Stable Diffusion.
|
498 |
-
2. Simply copy the entire text from the output box.
|
499 |
-
3. Paste it into your preferred Stable Diffusion interface or any other AI image generation platform.
|
500 |
-
4. Adjust or add to the prompt as desired to fine-tune your image generation.
|
501 |
-
|
502 |
-
Remember, you can always regenerate or modify the prompt to get different results!
|
503 |
-
""")
|
504 |
|
505 |
run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition, lighting], outputs=[output_caption])
|
506 |
|
|
|
15 |
MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
|
16 |
CHECKPOINT_PATH = Path("9em124t2-499968")
|
17 |
# Prompt templates used to instruct the captioning model.
#
# Key: a 4-tuple ``(caption_type, caption_tone, flag_a, flag_b)``.
#   * In every entry below, ``flag_a`` is True exactly when the template
#     contains a ``{length}`` placeholder, and ``flag_b`` is True exactly when
#     it contains a ``{word_count}`` placeholder. The two are never both True.
#   * "rng-tags", "training_prompt" and "style_prompt" only have "formal"
#     entries; only "descriptive" also has "informal" entries.
# Value: a list of template strings (currently one per key). Templates are
# meant to be filled with ``str.format(length=..., word_count=...)``; passing
# both keyword arguments is safe because unused ones are ignored.
CAPTION_TYPE_MAP = {
    # --- descriptive captions, formal tone ---
    ("descriptive", "formal", False, False): [
        "Write a descriptive caption for this image in a formal tone.",
    ],
    ("descriptive", "formal", False, True): [
        "Write a descriptive caption for this image in a formal tone within {word_count} words.",
    ],
    ("descriptive", "formal", True, False): [
        "Write a {length} descriptive caption for this image in a formal tone.",
    ],
    # --- descriptive captions, informal tone ---
    ("descriptive", "informal", False, False): [
        "Write a descriptive caption for this image in a casual tone.",
    ],
    ("descriptive", "informal", False, True): [
        "Write a descriptive caption for this image in a casual tone within {word_count} words.",
    ],
    ("descriptive", "informal", True, False): [
        "Write a {length} descriptive caption for this image in a casual tone.",
    ],
    # --- stable diffusion training prompts ---
    ("training_prompt", "formal", False, False): [
        "Write a stable diffusion prompt for this image.",
    ],
    ("training_prompt", "formal", False, True): [
        "Write a stable diffusion prompt for this image within {word_count} words.",
    ],
    ("training_prompt", "formal", True, False): [
        "Write a {length} stable diffusion prompt for this image.",
    ],
    # --- Booru tag lists ---
    ("rng-tags", "formal", False, False): [
        "Write a list of Booru tags for this image.",
    ],
    ("rng-tags", "formal", False, True): [
        "Write a list of Booru tags for this image within {word_count} words.",
    ],
    ("rng-tags", "formal", True, False): [
        "Write a {length} list of Booru tags for this image.",
    ],
    # --- style prompts ---
    ("style_prompt", "formal", False, False): [
        "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
    ],
    ("style_prompt", "formal", False, True): [
        "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
    ],
    ("style_prompt", "formal", True, False): [
        "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
    ],
}
|
34 |
|
35 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
|
155 |
except ValueError:
|
156 |
pass
|
157 |
|
158 |
+
# 'rng-tags', 'training_prompt', and 'style_prompt' don't have formal/informal tones
|
159 |
+
if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
|
160 |
caption_tone = "formal"
|
161 |
|
162 |
# Build prompt
|
|
|
166 |
|
167 |
prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
|
168 |
|
|
|
169 |
if caption_type == "style_prompt":
|
170 |
+
prompt_str += (f" Include details about using a {lens_type} lens, "
|
171 |
+
f"{film_stock} film stock, {composition} composition, and {lighting} lighting.")
|
|
|
|
|
172 |
|
173 |
print(f"Prompt: {prompt_str}")
|
174 |
|
|
|
191 |
# Embed prompt
|
192 |
prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
|
193 |
assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
|
194 |
+
embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
|
196 |
|
197 |
# Construct prompts
|
|
|
203 |
], dim=1)
|
204 |
|
205 |
input_ids = torch.cat([
|
206 |
+
torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
|
207 |
torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
|
208 |
prompt,
|
209 |
torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
|
|
|
356 |
run_button = gr.Button("Make My Caption!")
|
357 |
|
358 |
with gr.Column():
|
359 |
+
output_caption = gr.Textbox(label="Your Amazing Caption Appears Here", lines=10)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
360 |
|
361 |
run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition, lighting], outputs=[output_caption])
|
362 |
|