Severian committed on
Commit
7c751fb
·
verified ·
1 Parent(s): 250653b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -166
app.py CHANGED
@@ -15,144 +15,21 @@ CLIP_PATH = "google/siglip-so400m-patch14-384"
15
  MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
16
  CHECKPOINT_PATH = Path("9em124t2-499968")
17
  CAPTION_TYPE_MAP = {
18
- ("descriptive", "formal", False, False): [
19
- "Write a detailed, formal description of this image, focusing on composition, style, and artistic elements.",
20
- "Provide a comprehensive, academic analysis of this artwork's visual characteristics and techniques."
21
- ],
22
- ("descriptive", "formal", False, True): [
23
- "Craft a formal, concise description of this image within {word_count} words, highlighting key visual elements.",
24
- "Summarize the artwork's main features and style in a formal tone, using no more than {word_count} words."
25
- ],
26
- ("descriptive", "formal", True, False): [
27
- "Compose a {length} formal critique of this image, discussing its artistic merits and visual impact.",
28
- "Create a {length} scholarly description of this artwork, analyzing its composition and aesthetic qualities."
29
- ],
30
- ("descriptive", "informal", False, False): [
31
- "Describe this image as if you're explaining it to a friend, focusing on what stands out to you.",
32
- "Give a casual, conversational rundown of what you see in this artwork and how it makes you feel."
33
- ],
34
- ("descriptive", "informal", False, True): [
35
- "In about {word_count} words, give a laid-back description of this image's vibe and key features.",
36
- "Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
37
- ],
38
- ("descriptive", "informal", True, False): [
39
- "Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
40
- "Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
41
- ],
42
- ("training_prompt", "formal", False, False): [
43
- "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
44
- "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
45
- ],
46
- ("training_prompt", "formal", False, True): [
47
- "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
48
- "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
49
- ],
50
- ("training_prompt", "formal", True, False): [
51
- "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
52
- "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
53
- ],
54
- ("rng-tags", "formal", False, False): [
55
- "Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
56
- "Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
57
- ],
58
- ("rng-tags", "formal", False, True): [
59
- "Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
60
- "Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
61
- ],
62
- ("rng-tags", "formal", True, False): [
63
- "Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
64
- "Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
65
- ],
66
- ("artistic_inspiration", "formal", False, False): [
67
- "Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
68
- "Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
69
- ],
70
- ("artistic_inspiration", "informal", False, False): [
71
- "Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
72
- "Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
73
- ],
74
- ("technical_breakdown", "formal", False, False): [
75
- "Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
76
- "Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
77
- ],
78
- ("emotional_response", "informal", False, False): [
79
- "Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
80
- "Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
81
- ],
82
-
83
- ("thematic_analysis", "formal", False, False): [
84
- "Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
85
- "Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
86
- ],
87
- ("thematic_analysis", "formal", False, True): [
88
- "Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
89
- "Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
90
- ],
91
- ("thematic_analysis", "formal", True, False): [
92
- "Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
93
- "Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
94
- ],
95
- ("stylistic_comparison", "informal", False, False): [
96
- "Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
97
- "Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
98
- ],
99
- ("stylistic_comparison", "informal", False, True): [
100
- "In about {word_count} words, compare this image's style with other known art styles or artists.",
101
- "Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
102
- ],
103
- ("stylistic_comparison", "informal", True, False): [
104
- "Write a {length} casual comparison of this image's style with other art movements or famous artists.",
105
- "Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
106
- ],
107
- ("narrative_suggestion", "formal", False, False): [
108
- "Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
109
- "Develop a brief storyline that complements the themes and mood depicted in this artwork."
110
- ],
111
- ("narrative_suggestion", "formal", False, True): [
112
- "Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
113
- "Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
114
- ],
115
- ("narrative_suggestion", "formal", True, False): [
116
- "Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
117
- "Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
118
- ],
119
- ("contextual_storytelling", "informal", False, False): [
120
- "Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
121
- "Imagine a background story for this artwork, explaining what's happening and why."
122
- ],
123
- ("contextual_storytelling", "informal", False, True): [
124
- "In about {word_count} words, create a backstory for the scene depicted in this image.",
125
- "Summarize a possible background narrative for this artwork in {word_count} words."
126
- ],
127
- ("contextual_storytelling", "informal", True, False): [
128
- "Write a {length} informal story that provides context to the scene portrayed in this image.",
129
- "Give a {length} casual backstory explaining the events depicted in this artwork."
130
- ],
131
-
132
- ("style_prompt", "formal", False, False): [
133
- "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
134
- "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
135
- ],
136
- ("style_prompt", "formal", False, True): [
137
- "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
138
- "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
139
- ],
140
- ("style_prompt", "formal", True, False): [
141
- "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
142
- "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
143
- ],
144
- ("style_prompt", "informal", False, False): [
145
- "Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
146
- "Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
147
- ],
148
- ("style_prompt", "informal", False, True): [
149
- "In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
150
- "Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
151
- ],
152
- ("style_prompt", "informal", True, False): [
153
- "Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
154
- "Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
155
- ],
156
  }
157
 
158
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
@@ -278,8 +155,8 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
278
  except ValueError:
279
  pass
280
 
281
- # 'rng-tags' and 'training_prompt' don't have formal/informal tones
282
- if caption_type == "rng-tags" or caption_type == "training_prompt":
283
  caption_tone = "formal"
284
 
285
  # Build prompt
@@ -289,12 +166,9 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
289
 
290
  prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
291
 
292
- # Add style prompt details if applicable
293
  if caption_type == "style_prompt":
294
- prompt_str += (f" The prompt should specifically include details about using a {lens_type} lens, "
295
- f"{film_stock} film stock, {composition} composition, and {lighting} lighting. "
296
- f"Format the output as a comma-separated list of descriptors and modifiers, "
297
- f"suitable for direct input into a Stable Diffusion interface.")
298
 
299
  print(f"Prompt: {prompt_str}")
300
 
@@ -317,15 +191,7 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
317
  # Embed prompt
318
  prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
319
  assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
320
-
321
- # Check if bos_token_id exists
322
- if tokenizer.bos_token_id is None:
323
- print("Warning: bos_token_id is None. Using default value of 1.")
324
- bos_token_id = 1
325
- else:
326
- bos_token_id = tokenizer.bos_token_id
327
-
328
- embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
329
  eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
330
 
331
  # Construct prompts
@@ -337,7 +203,7 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
337
  ], dim=1)
338
 
339
  input_ids = torch.cat([
340
- torch.tensor([[bos_token_id]], dtype=torch.long),
341
  torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
342
  prompt,
343
  torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
@@ -490,17 +356,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
490
  run_button = gr.Button("Make My Caption!")
491
 
492
  with gr.Column():
493
- output_caption = gr.Textbox(label="Your Image Generation Prompt (Copy this for Stable Diffusion)", lines=10)
494
-
495
- gr.Markdown("""
496
- ## How to Use Your Generated Prompt:
497
- 1. For "Style Prompt" captions, the output is formatted for direct use in Stable Diffusion.
498
- 2. Simply copy the entire text from the output box.
499
- 3. Paste it into your preferred Stable Diffusion interface or any other AI image generation platform.
500
- 4. Adjust or add to the prompt as desired to fine-tune your image generation.
501
-
502
- Remember, you can always regenerate or modify the prompt to get different results!
503
- """)
504
 
505
  run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition, lighting], outputs=[output_caption])
506
 
 
15
  MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
16
  CHECKPOINT_PATH = Path("9em124t2-499968")
17
  CAPTION_TYPE_MAP = {
18
+ ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
19
+ ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
20
+ ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
21
+ ("descriptive", "informal", False, False): ["Write a descriptive caption for this image in a casual tone."],
22
+ ("descriptive", "informal", False, True): ["Write a descriptive caption for this image in a casual tone within {word_count} words."],
23
+ ("descriptive", "informal", True, False): ["Write a {length} descriptive caption for this image in a casual tone."],
24
+ ("training_prompt", "formal", False, False): ["Write a stable diffusion prompt for this image."],
25
+ ("training_prompt", "formal", False, True): ["Write a stable diffusion prompt for this image within {word_count} words."],
26
+ ("training_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt for this image."],
27
+ ("rng-tags", "formal", False, False): ["Write a list of Booru tags for this image."],
28
+ ("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
29
+ ("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
30
+ ("style_prompt", "formal", False, False): ["Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements."],
31
+ ("style_prompt", "formal", False, True): ["Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image."],
32
+ ("style_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques."],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  }
34
 
35
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
155
  except ValueError:
156
  pass
157
 
158
+ # 'rng-tags', 'training_prompt', and 'style_prompt' don't have formal/informal tones
159
+ if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
160
  caption_tone = "formal"
161
 
162
  # Build prompt
 
166
 
167
  prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
168
 
 
169
  if caption_type == "style_prompt":
170
+ prompt_str += (f" Include details about using a {lens_type} lens, "
171
+ f"{film_stock} film stock, {composition} composition, and {lighting} lighting.")
 
 
172
 
173
  print(f"Prompt: {prompt_str}")
174
 
 
191
  # Embed prompt
192
  prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
193
  assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
194
+ embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
 
 
 
 
 
 
 
 
195
  eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
196
 
197
  # Construct prompts
 
203
  ], dim=1)
204
 
205
  input_ids = torch.cat([
206
+ torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
207
  torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
208
  prompt,
209
  torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
 
356
  run_button = gr.Button("Make My Caption!")
357
 
358
  with gr.Column():
359
+ output_caption = gr.Textbox(label="Your Amazing Caption Appears Here", lines=10)
 
 
 
 
 
 
 
 
 
 
360
 
361
  run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition, lighting], outputs=[output_caption])
362