Severian committed on
Commit
936fabc
·
verified ·
1 Parent(s): 368e071

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +358 -307
app.py CHANGED
@@ -12,28 +12,147 @@ import torchvision.transforms.functional as TVF
12
 
13
 
14
  CLIP_PATH = "google/siglip-so400m-patch14-384"
15
- MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
16
  CHECKPOINT_PATH = Path("9em124t2-499968")
17
- TITLE = "<h1><center>JoyCaption Alpha One (2024-09-20a)</center></h1>"
18
  CAPTION_TYPE_MAP = {
19
- ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
20
- ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
21
- ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
22
- ("descriptive", "informal", False, False): ["Write a descriptive caption for this image in a casual tone."],
23
- ("descriptive", "informal", False, True): ["Write a descriptive caption for this image in a casual tone within {word_count} words."],
24
- ("descriptive", "informal", True, False): ["Write a {length} descriptive caption for this image in a casual tone."],
25
-
26
- ("training_prompt", "formal", False, False): ["Write a stable diffusion prompt for this image."],
27
- ("training_prompt", "formal", False, True): ["Write a stable diffusion prompt for this image within {word_count} words."],
28
- ("training_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt for this image."],
29
-
30
- ("rng-tags", "formal", False, False): ["Write a list of Booru tags for this image."],
31
- ("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
32
- ("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
33
-
34
- ("style_prompt", "formal", False, False): ["Generate a detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
35
- ("style_prompt", "formal", False, True): ["Generate a detailed style prompt for this image within {word_count} words, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
36
- ("style_prompt", "formal", True, False): ["Generate a {length} detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  }
38
 
39
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
@@ -138,105 +257,122 @@ text_model.eval()
138
  # Image Adapter
139
  print("Loading image adapter")
140
  image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
141
- image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=True))
142
  image_adapter.eval()
143
  image_adapter.to("cuda")
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
- def preprocess_image(input_image: Image.Image) -> torch.Tensor:
147
- """
148
- Preprocess the input image for the CLIP model.
149
- """
150
  image = input_image.resize((384, 384), Image.LANCZOS)
151
  pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
152
  pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
153
- return pixel_values.to('cuda')
154
 
155
- def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max_new_tokens: int = 300) -> str:
156
- """
157
- Generate a caption based on the image features and prompt.
158
- """
159
  prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
 
 
 
 
 
 
 
 
 
160
  prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
161
- embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
 
 
 
 
 
 
 
 
162
  eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
163
 
 
164
  inputs_embeds = torch.cat([
165
- embedded_bos.expand(image_features.shape[0], -1, -1),
166
- image_features.to(dtype=embedded_bos.dtype),
167
- prompt_embeds.expand(image_features.shape[0], -1, -1),
168
- eot_embed.expand(image_features.shape[0], -1, -1),
169
  ], dim=1)
170
 
171
  input_ids = torch.cat([
172
- torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
173
- torch.zeros((1, image_features.shape[1]), dtype=torch.long),
174
  prompt,
175
  torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
176
  ], dim=1).to('cuda')
177
  attention_mask = torch.ones_like(input_ids)
178
 
179
- generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, suppress_tokens=None)
180
 
 
181
  generate_ids = generate_ids[:, input_ids.shape[1]:]
182
  if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
183
  generate_ids = generate_ids[:, :-1]
184
 
185
- return tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0].strip()
186
-
187
- @spaces.GPU()
188
- @torch.no_grad()
189
- def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "", lighting_aspect: str = "", special_technique: str = "", color_effect: str = "") -> str:
190
- """
191
- Generate a caption or style prompt based on the input image and parameters.
192
- """
193
- torch.cuda.empty_cache()
194
-
195
- try:
196
- length = None if caption_length == "any" else caption_length
197
- if isinstance(length, str):
198
- length = int(length)
199
- except ValueError:
200
- raise ValueError(f"Invalid caption length: {caption_length}")
201
-
202
- if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
203
- caption_tone = "formal"
204
-
205
- prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
206
- if prompt_key not in CAPTION_TYPE_MAP:
207
- raise ValueError(f"Invalid caption type: {prompt_key}")
208
-
209
- prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
210
-
211
- if caption_type == "style_prompt":
212
- prompt_str += f" Lens type: {lens_type} ({lens_types_info[lens_type]}). "
213
- prompt_str += f"Film stock: {film_stocks_info[film_stock]}). "
214
- prompt_str += f"Composition style: {composition_styles_info[composition_style]}). "
215
- prompt_str += f"Lighting aspect: {lighting_aspects_info[lighting_aspect]}). "
216
- prompt_str += f"Special technique: {special_techniques_info[special_technique]}). "
217
- prompt_str += f"Color effect: {color_effects_info[color_effect]})."
218
-
219
- # Debugging: Print the constructed prompt string
220
- print(f"Constructed Prompt: {prompt_str}")
221
-
222
- pixel_values = preprocess_image(input_image)
223
-
224
- with torch.amp.autocast_mode.autocast('cuda', enabled=True):
225
- vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
226
- image_features = vision_outputs.hidden_states
227
- embedded_images = image_adapter(image_features)
228
- embedded_images = embedded_images.to('cuda')
229
-
230
- # Load the model from MODEL_PATH
231
- text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
232
- text_model.eval()
233
-
234
- # Debugging: Print the prompt string before passing to generate_caption
235
- print(f"Prompt passed to generate_caption: {prompt_str}")
236
 
237
- caption = generate_caption(text_model, tokenizer, embedded_images, prompt_str)
238
-
239
- return caption
240
 
241
  css = """
242
  h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
@@ -256,110 +392,63 @@ ul, ol {
256
  }
257
  """
258
 
259
- # Add detailed descriptions for each option
260
- lens_types_info = {
261
- "Standard": "A versatile lens with a field of view similar to human vision.",
262
- "Wide-angle": "Captures a wider field of view, great for landscapes and architecture. Applies moderate to strong lens effect with image warp.",
263
- "Telephoto": "Used for distant subjects, gives an 'award-winning' or 'National Geographic' look. Creates interesting effects when prompted.",
264
- "Macro": "For extreme close-up photography, revealing tiny details.",
265
- "Fish-eye": "Ultra-wide-angle lens that creates a strong bubble-like distortion. Generates panoramic photos with the entire image warping into a bubble.",
266
- "Tilt-shift": "Allows adjusting the plane of focus, creating a 'miniature' effect. Known for the 'diorama miniature look'.",
267
- "Zoom lens": "Variable focal length lens. Often zooms in on the subject, perfect for creating a base for inpainting. Interesting effect on landscapes with motion blur.",
268
- "GoPro": "Wide-angle lens with clean digital look. Excludes film grain and most filter effects, resulting in natural colors and regular saturation.",
269
- "Pinhole camera": "Creates a unique, foggy, low-detail, historic photograph look. Used since the 1850s, with peak popularity in the 1930s."
270
- }
271
-
272
- film_stocks_info = {
273
- "Kodak Portra": "Professional color negative film known for its natural skin tones and low contrast.",
274
- "Fujifilm Velvia": "Slide film known for vibrant colors and high saturation, popular among landscape photographers.",
275
- "Ilford Delta": "Black and white film known for its fine grain and high sharpness.",
276
- "Kodak Tri-X": "Classic high-speed black and white film, known for its distinctive grain and wide exposure latitude.",
277
- "Fujifilm Provia": "Color reversal film known for its natural color reproduction and fine grain.",
278
- "Cinestill": "Color photos with fine/low grain and higher than average resolution. Colors are slightly oversaturated or slightly desaturated.",
279
- "Ektachrome": "Color photos with fine/low to moderate grain. Colors on the colder part of spectrum or regular, with normal or slightly higher saturation.",
280
- "Ektar": "Modern Kodak film. Color photos with little to no grain. Results look like regular modern photography with artistic angles.",
281
- "Film Washi": "Mostly black and white photos with fine/low to moderate grain. Occasionally gives colored photos with low saturation. Distinct style with high black contrast and soft camera lens effect.",
282
- "Fomapan": "Black and white photos with fine/low to moderate grain, highly artistic exposure and angles. Adds very soft lens effect without distortion, dark photo vignette.",
283
- "Fujicolor": "Color photos with fine/low to moderate grain. Colors are slightly or notably desaturated, with entire color hue shifted in a very distinct manner.",
284
- "Holga": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
285
- "Instax": "Instant color photos similar to Polaroid but clearer. Near perfect colors, regular saturation, fine/low to medium grain.",
286
- "Lomography": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
287
- "Kodachrome": "Color photos with moderate grain. Colors on either colder part of spectrum or regular, with normal or slightly higher saturation.",
288
- "Rollei": "Mostly black and white photos, sometimes color with fine/low grain. Can be sepia colored or have unusual hues and desaturation. Great for landscapes."
289
- }
290
-
291
- composition_styles_info = {
292
- "Rule of Thirds": "Divides the frame into a 3x3 grid, placing key elements along the lines or at their intersections.",
293
- "Golden Ratio": "Uses a spiral based on the golden ratio to create a balanced and aesthetically pleasing composition.",
294
- "Symmetry": "Creates a mirror-like balance in the image, often used for architectural or nature photography.",
295
- "Leading Lines": "Uses lines within the frame to draw the viewer's eye to the main subject or through the image.",
296
- "Framing": "Uses elements within the scene to create a frame around the main subject.",
297
- "Minimalism": "Simplifies the composition to its essential elements, often with a lot of negative space.",
298
- "Fill the Frame": "The main subject dominates the entire frame, leaving little to no background.",
299
- "Negative Space": "Uses empty space around the subject to create a sense of simplicity or isolation.",
300
- "Centered Composition": "Places the main subject in the center of the frame, creating a sense of stability or importance.",
301
- "Diagonal Lines": "Uses diagonal elements to create a sense of movement or dynamic tension in the image.",
302
- "Triangular Composition": "Arranges elements in the frame to form a triangle, creating a sense of stability and harmony.",
303
- "Radial Balance": "Arranges elements in a circular pattern around a central point, creating a sense of movement or completeness."
304
  }
305
 
306
- lighting_aspects_info = {
307
- "Natural light": "Uses available light from the sun or sky, often creating soft, even illumination.",
308
- "Studio lighting": "Controlled artificial lighting setup, allowing for precise manipulation of light and shadow.",
309
- "Back light": "Light source behind the subject, creating silhouettes or rim lighting effects.",
310
- "Split light": "Strong light source at 90-degree angle, lighting one half of the subject while leaving the other in shadow.",
311
- "Broad light": "Light source at an angle to the subject, producing well-lit photographs with soft to moderate shadows.",
312
- "Dim light": "Weak or distant light source, creating lower than average brightness and often dramatic images.",
313
- "Flash photography": "Uses a brief, intense burst of light. Can be fill flash (even lighting) or harsh flash (strong contrasts).",
314
- "Sunlight": "Direct light from the sun, often creating strong contrasts and warm tones.",
315
- "Moonlight": "Soft, cool light from the moon, often creating a mysterious or romantic atmosphere.",
316
- "Spotlight": "Focused beam of light illuminating a specific area, creating high contrast between light and shadow.",
317
- "High-key lighting": "Bright, even lighting with minimal shadows, creating a light and airy feel.",
318
- "Low-key lighting": "Predominantly dark tones with selective lighting, creating a moody or dramatic atmosphere.",
319
- "Rembrandt lighting": "Classic portrait lighting technique creating a triangle of light on the cheek of the subject."
 
 
 
 
 
 
 
320
  }
321
 
322
- special_techniques_info = {
323
- "Double exposure": "Superimposes two exposures to create a single image, often resulting in a dreamy or surreal effect.",
324
- "Long exposure": "Uses a long shutter speed to capture motion over time, often creating smooth, blurred effects for moving elements.",
325
- "Multiple exposure": "Superimposes multiple exposures, multiplying the subject or its key elements across the image.",
326
- "HDR": "High Dynamic Range imaging, combining multiple exposures to capture a wider range of light and dark tones.",
327
- "Bokeh effect": "Creates a soft, out-of-focus background, often with circular highlights.",
328
- "Silhouette": "Captures the outline of a subject against a brighter background, creating a dramatic contrast.",
329
- "Panning": "Follows a moving subject with the camera, creating a sharp subject with a blurred background.",
330
- "Light painting": "Uses long exposure and moving light sources to 'paint' with light in the image.",
331
- "Infrared photography": "Captures light in the infrared spectrum, often resulting in surreal, otherworldly images.",
332
- "Ultraviolet photography": "Captures light in the ultraviolet spectrum, often revealing hidden patterns or creating a strong violet glow.",
333
- "Kirlian photography": "High-voltage photographic technique that captures corona discharges around objects, creating a glowing effect.",
334
- "Thermography": "Captures infrared radiation to create images based on temperature differences, resulting in false-color heat maps.",
335
- "Astrophotography": "Specialized technique for capturing astronomical objects and celestial events, often resulting in stunning starry backgrounds.",
336
- "Underwater photography": "Captures images beneath the surface of water, often in pools, seas, or aquariums.",
337
- "Aerial photography": "Captures images from an elevated position, such as from drones, helicopters, or planes.",
338
- "Macro photography": "Extreme close-up photography, revealing tiny details not visible to the naked eye."
339
- }
340
-
341
- color_effects_info = {
342
- "Black and white": "Removes all color, leaving only shades of gray.",
343
- "Sepia": "Reddish-brown monochrome effect, often associated with vintage photography.",
344
- "Monochrome": "Uses variations of a single color.",
345
- "Vintage color": "Muted or faded color palette reminiscent of old photographs.",
346
- "Cross-processed": "Deliberate processing of film in the wrong chemicals, creating unusual color shifts.",
347
- "Desaturated": "Reduces the intensity of all colors in the image.",
348
- "Vivid colors": "Increases the saturation and intensity of colors.",
349
- "Pastel colors": "Soft, pale colors with a light and airy feel.",
350
- "High contrast": "Emphasizes the difference between light and dark areas in the image.",
351
- "Low contrast": "Reduces the difference between light and dark areas, creating a softer look.",
352
- "Color splash": "Converts most of the image to black and white while leaving one or more elements in color."
353
- }
354
-
355
- def get_dropdown_choices(info_dict):
356
- return [f"{key}: {value}" for key, value in info_dict.items()]
357
-
358
- # Gradio interface
359
  with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
360
  with gr.Tab("Welcome"):
361
  gr.Markdown(
362
- """
363
  <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
364
 
365
  # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
@@ -367,7 +456,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
367
  ## Accelerate Your Creative Workflow with Intelligent Image Analysis
368
 
369
  This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
370
- training prompts, and tags from existing artwork, fueling the creative process for GenAI models.
371
 
372
  ## 🚀 How It Works:
373
  1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
@@ -376,147 +465,109 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
376
  4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
377
  """
378
  )
379
-
380
  with gr.Tab("JoyCaption"):
381
- with gr.Accordion("How to Use JoyCaption", open=False):
382
- gr.Markdown("""
383
- # How to Use JoyCaption
384
-
385
- Hello, artist! Let's make some fun captions for your pictures. Here's how:
386
-
387
- 1. **Pick a Picture**: Find a cool picture you want to talk about and upload it.
388
-
389
- 2. **Choose What You Want**:
390
- - **Caption Type**:
391
- * "Descriptive" tells you what's in the picture
392
- * "Training Prompt" helps computers make similar pictures
393
- * "RNG-Tags" gives you short words about the picture
394
- * "Style Prompt" creates detailed prompts for image generation
395
 
396
- 3. **Pick a Style** (for "Descriptive" and "Style Prompt" only):
397
- - "Formal" sounds like a teacher talking
398
- - "Informal" sounds like a friend chatting
399
 
400
- 4. **Decide How Long**:
401
- - "Any" lets the computer decide
402
- - Or pick a size from "very short" to "very long"
403
- - You can even choose a specific number of words!
404
-
405
- 5. **Advanced Options** (for "Style Prompt" only):
406
- - Choose lens type, film stock, composition, and lighting details
407
-
408
- 6. **Make the Caption**: Click the "Make My Caption!" button and watch the magic happen!
409
-
410
- Remember, have fun and be creative with your captions!
411
-
412
- ## Tips for Great Captions:
413
- - Try different types to see what you like best
414
- - Experiment with formal and informal tones for fun variations
415
- - Adjust the length to get just the right amount of detail
416
- - For "Style Prompt", play with the advanced options for more specific results
417
- - If you don't like a caption, just click "Make My Caption!" again for a new one
418
-
419
- Have a great time captioning your art!
420
- """)
421
 
422
  with gr.Row():
423
- with gr.Column():
424
- input_image = gr.Image(type="pil", label="Input Image")
425
 
426
  caption_type = gr.Dropdown(
427
- choices=["descriptive", "training_prompt", "rng-tags", "style_prompt"],
428
- label="Caption Type",
 
 
 
 
 
 
 
 
 
429
  value="descriptive",
430
  )
431
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  caption_tone = gr.Dropdown(
433
  choices=["formal", "informal"],
434
- label="Caption Tone",
435
  value="formal",
436
  )
437
 
 
 
438
  caption_length = gr.Dropdown(
439
  choices=["any", "very short", "short", "medium-length", "long", "very long"] +
440
  [str(i) for i in range(20, 261, 10)],
441
- label="Caption Length",
442
  value="any",
443
  )
444
 
445
- lens_type = gr.Dropdown(
446
- choices=get_dropdown_choices(lens_types_info),
447
- label="Lens Type",
448
- visible=False,
449
- info="Select a lens type to define the perspective and field of view of the image."
450
- )
451
-
452
- film_stock = gr.Dropdown(
453
- choices=get_dropdown_choices(film_stocks_info),
454
- label="Film Stock",
455
- visible=False,
456
- info="Choose a film stock to determine the color, grain, and overall look of the image."
457
- )
458
-
459
- composition_style = gr.Dropdown(
460
- choices=get_dropdown_choices(composition_styles_info),
461
- label="Composition Style",
462
- visible=False,
463
- info="Select a composition style to guide the arrangement of elements in the image."
464
  )
465
 
466
- lighting_aspect = gr.Dropdown(
467
- choices=get_dropdown_choices(lighting_aspects_info),
468
- label="Lighting Aspect",
469
- visible=False,
470
- info="Choose a lighting style to define the mood and atmosphere of the image."
471
- )
472
-
473
- special_technique = gr.Dropdown(
474
- choices=get_dropdown_choices(special_techniques_info),
475
- label="Special Technique",
476
- visible=False,
477
- info="Select a special photographic technique to add unique effects to the image."
478
- )
479
-
480
- color_effect = gr.Dropdown(
481
- choices=get_dropdown_choices(color_effects_info),
482
- label="Color Effect",
483
- visible=False,
484
- info="Choose a color effect to alter the overall color palette of the image."
485
- )
486
-
487
- gr.Markdown("**Note:** Caption tone doesn't affect `rng-tags`, `training_prompt`, and `style_prompt`.")
488
-
489
- run_button = gr.Button("Make My Caption!")
490
-
491
- with gr.Column():
492
- output_caption = gr.Textbox(label="Generated Caption")
493
-
494
- # Container for advanced options
495
- advanced_options = gr.Column(visible=False)
496
- with advanced_options:
497
- gr.Markdown("### Advanced Options for Style Prompt")
498
- lens_type.render()
499
- film_stock.render()
500
- composition_style.render()
501
- lighting_aspect.render()
502
- special_technique.render()
503
- color_effect.render()
504
-
505
- def update_style_options(caption_type):
506
- return {
507
- lens_type: gr.update(visible=caption_type == "style_prompt"),
508
- film_stock: gr.update(visible=caption_type == "style_prompt"),
509
- composition_style: gr.update(visible=caption_type == "style_prompt"),
510
- lighting_aspect: gr.update(visible=caption_type == "style_prompt"),
511
- special_technique: gr.update(visible=caption_type == "style_prompt"),
512
- color_effect: gr.update(visible=caption_type == "style_prompt"),
513
- advanced_options: gr.update(visible=caption_type == "style_prompt"),
514
- }
515
-
516
- caption_type.change(update_style_options, inputs=[caption_type], outputs=[lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect, advanced_options])
517
-
518
- run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect], outputs=[output_caption])
519
 
 
 
 
 
 
520
 
521
  if __name__ == "__main__":
522
  demo.launch()
 
12
 
13
 
14
  CLIP_PATH = "google/siglip-so400m-patch14-384"
15
+ MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
16
  CHECKPOINT_PATH = Path("9em124t2-499968")
 
17
  CAPTION_TYPE_MAP = {
18
+ ("descriptive", "formal", False, False): [
19
+ "Write a detailed, formal description of this image, focusing on composition, style, and artistic elements.",
20
+ "Provide a comprehensive, academic analysis of this artwork's visual characteristics and techniques."
21
+ ],
22
+ ("descriptive", "formal", False, True): [
23
+ "Craft a formal, concise description of this image within {word_count} words, highlighting key visual elements.",
24
+ "Summarize the artwork's main features and style in a formal tone, using no more than {word_count} words."
25
+ ],
26
+ ("descriptive", "formal", True, False): [
27
+ "Compose a {length} formal critique of this image, discussing its artistic merits and visual impact.",
28
+ "Create a {length} scholarly description of this artwork, analyzing its composition and aesthetic qualities."
29
+ ],
30
+ ("descriptive", "informal", False, False): [
31
+ "Describe this image as if you're explaining it to a friend, focusing on what stands out to you.",
32
+ "Give a casual, conversational rundown of what you see in this artwork and how it makes you feel."
33
+ ],
34
+ ("descriptive", "informal", False, True): [
35
+ "In about {word_count} words, give a laid-back description of this image's vibe and key features.",
36
+ "Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
37
+ ],
38
+ ("descriptive", "informal", True, False): [
39
+ "Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
40
+ "Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
41
+ ],
42
+ ("training_prompt", "formal", False, False): [
43
+ "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
44
+ "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
45
+ ],
46
+ ("training_prompt", "formal", False, True): [
47
+ "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
48
+ "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
49
+ ],
50
+ ("training_prompt", "formal", True, False): [
51
+ "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
52
+ "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
53
+ ],
54
+ ("rng-tags", "formal", False, False): [
55
+ "Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
56
+ "Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
57
+ ],
58
+ ("rng-tags", "formal", False, True): [
59
+ "Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
60
+ "Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
61
+ ],
62
+ ("rng-tags", "formal", True, False): [
63
+ "Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
64
+ "Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
65
+ ],
66
+ ("artistic_inspiration", "formal", False, False): [
67
+ "Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
68
+ "Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
69
+ ],
70
+ ("artistic_inspiration", "informal", False, False): [
71
+ "Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
72
+ "Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
73
+ ],
74
+ ("technical_breakdown", "formal", False, False): [
75
+ "Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
76
+ "Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
77
+ ],
78
+ ("emotional_response", "informal", False, False): [
79
+ "Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
80
+ "Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
81
+ ],
82
+
83
+ ("thematic_analysis", "formal", False, False): [
84
+ "Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
85
+ "Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
86
+ ],
87
+ ("thematic_analysis", "formal", False, True): [
88
+ "Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
89
+ "Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
90
+ ],
91
+ ("thematic_analysis", "formal", True, False): [
92
+ "Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
93
+ "Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
94
+ ],
95
+ ("stylistic_comparison", "informal", False, False): [
96
+ "Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
97
+ "Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
98
+ ],
99
+ ("stylistic_comparison", "informal", False, True): [
100
+ "In about {word_count} words, compare this image's style with other known art styles or artists.",
101
+ "Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
102
+ ],
103
+ ("stylistic_comparison", "informal", True, False): [
104
+ "Write a {length} casual comparison of this image's style with other art movements or famous artists.",
105
+ "Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
106
+ ],
107
+ ("narrative_suggestion", "formal", False, False): [
108
+ "Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
109
+ "Develop a brief storyline that complements the themes and mood depicted in this artwork."
110
+ ],
111
+ ("narrative_suggestion", "formal", False, True): [
112
+ "Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
113
+ "Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
114
+ ],
115
+ ("narrative_suggestion", "formal", True, False): [
116
+ "Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
117
+ "Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
118
+ ],
119
+ ("contextual_storytelling", "informal", False, False): [
120
+ "Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
121
+ "Imagine a background story for this artwork, explaining what's happening and why."
122
+ ],
123
+ ("contextual_storytelling", "informal", False, True): [
124
+ "In about {word_count} words, create a backstory for the scene depicted in this image.",
125
+ "Summarize a possible background narrative for this artwork in {word_count} words."
126
+ ],
127
+ ("contextual_storytelling", "informal", True, False): [
128
+ "Write a {length} informal story that provides context to the scene portrayed in this image.",
129
+ "Give a {length} casual backstory explaining the events depicted in this artwork."
130
+ ],
131
+
132
+ ("style_prompt", "formal", False, False): [
133
+ "Analyze this image through the lens of {style} art. Describe how it aligns with or diverges from {style_characteristics}.",
134
+ "Examine this artwork in the context of the {style} movement, focusing on {style_focus} and how these elements are represented or reinterpreted in the image."
135
+ ],
136
+ ("style_prompt", "formal", False, True): [
137
+ "Within {word_count} words, compare this image to the {style} style, highlighting elements that reflect or contrast with its key characteristics.",
138
+ "Compose a concise {word_count}-word analysis of how this artwork relates to the {style} movement, noting its adherence to or departure from typical {style} elements."
139
+ ],
140
+ ("style_prompt", "formal", True, False): [
141
+ "Write a {length} critique of this image, exploring its relationship to the {style} movement. Discuss composition, technique, and thematic elements in this context.",
142
+ "Develop a {length} analysis of how this artwork incorporates or challenges the principles of {style}, considering its visual language and artistic approach."
143
+ ],
144
+ ("style_prompt", "informal", False, False): [
145
+ "Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
146
+ "Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
147
+ ],
148
+ ("style_prompt", "informal", False, True): [
149
+ "In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
150
+ "Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
151
+ ],
152
+ ("style_prompt", "informal", True, False): [
153
+ "Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
154
+ "Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
155
+ ],
156
  }
157
 
# Optional token read from the HF_TOKEN environment variable; None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
 
# Image Adapter
print("Loading image adapter")
# NOTE(review): the positional flags and the value 38 are ImageAdapter
# constructor options defined elsewhere in this file -- confirm their meaning
# against its signature before changing them.
image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
# NOTE(review): torch.load unpickles the checkpoint file; only load
# checkpoints that come from a trusted source.
image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
image_adapter.eval()
image_adapter.to("cuda")

# Diagnostics printed once the tokenizer and text model are available.
print(f"Tokenizer class: {type(tokenizer)}")
print(f"BOS token: {tokenizer.bos_token}")
print(f"BOS token ID: {tokenizer.bos_token_id}")
print(f"EOS token: {tokenizer.eos_token}")
print(f"EOS token ID: {tokenizer.eos_token_id}")
print(f"Text model device: {text_model.device}")

# Ensure the tokenizer has the necessary special tokens
if tokenizer.bos_token_id is None or tokenizer.eos_token_id is None:
    print("Warning: BOS or EOS token is missing. Adding default tokens.")
    special_tokens_dict = {}
    if tokenizer.bos_token_id is None:
        special_tokens_dict['bos_token'] = '<|endoftext|>'
    if tokenizer.eos_token_id is None:
        special_tokens_dict['eos_token'] = '<|endoftext|>'
    num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    print(f"Added {num_added_tokens} special tokens to the tokenizer.")

# Grow the embedding matrix only when the vocabulary no longer fits in it.
# Resizing unconditionally to len(tokenizer) would also run when the token
# string already existed in the vocabulary (zero tokens added) and could
# shrink an embedding table that is larger than the tokenizer vocabulary.
if len(tokenizer) > text_model.get_input_embeddings().weight.shape[0]:
    text_model.resize_token_embeddings(len(tokenizer))
285
+
@spaces.GPU()
@torch.no_grad()
def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
    """Generate text for ``input_image`` from the prompt selected by the UI options.

    Args:
        input_image: Image to caption. It is converted to RGB and resized to
            384x384 before being passed to the vision model.
        caption_type: First component of the ``CAPTION_TYPE_MAP`` key
            (for example ``"descriptive"`` or ``"style_prompt"``).
        caption_tone: ``"formal"`` or ``"informal"``. Forced to ``"formal"``
            for ``"rng-tags"`` and ``"training_prompt"``.
        caption_length: ``"any"`` for no constraint, an integer or numeric
            string for a word count, or any other string for a descriptive
            length such as ``"short"``.
        art_style: Style name substituted into templates that use
            ``{style}``, ``{style_characteristics}`` or ``{style_focus}``.

    Returns:
        The decoded generation with surrounding whitespace stripped.

    Raises:
        ValueError: If no image was supplied, or if ``CAPTION_TYPE_MAP`` has
            no template for the requested caption type in either tone.
    """
    # Fail with a clear message instead of an AttributeError on ``.resize``.
    if input_image is None:
        raise ValueError("No image provided. Please upload an image first.")

    torch.cuda.empty_cache()

    # Handle caption_length: None means "no constraint", int means a word
    # count, str means a descriptive length.
    length = None
    if caption_length != "any":
        if isinstance(caption_length, int):
            length = caption_length
        elif isinstance(caption_length, str):
            try:
                length = int(caption_length)
            except ValueError:
                # If it's not a number, treat it as a descriptive length
                length = caption_length

    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
    if caption_type in ["rng-tags", "training_prompt"]:
        caption_tone = "formal"

    # Build prompt
    has_descriptive_length = isinstance(length, str)
    has_word_count = isinstance(length, int)
    prompt_key = (caption_type, caption_tone, has_descriptive_length, has_word_count)
    if prompt_key not in CAPTION_TYPE_MAP:
        # NOTE(review): some caption types appear to be defined for a single
        # tone only, while the UI offers both tones for every type. Fall back
        # to the other tone rather than rejecting an otherwise valid request.
        other_tone = "informal" if caption_tone == "formal" else "formal"
        fallback_key = (caption_type, other_tone, has_descriptive_length, has_word_count)
        if fallback_key not in CAPTION_TYPE_MAP:
            raise ValueError(f"Invalid caption type: {prompt_key}")
        print(f"No '{caption_tone}' prompt for '{caption_type}'; using '{other_tone}' tone instead.")
        prompt_key = fallback_key

    # The first template of the list is always used; unused format fields
    # are ignored by str.format.
    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
        length=length,
        word_count=length,
        style=art_style,
        style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
        style_focus=STYLE_FOCUS.get(art_style, "its distinctive features"),
    )
    print(f"Prompt: {prompt_str}")

    # Preprocess image. Converting to RGB guarantees three channels for
    # uploads that are RGBA, greyscale or palette based.
    image = input_image.convert("RGB").resize((384, 384), Image.LANCZOS)
    pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
    pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
    pixel_values = pixel_values.to('cuda')

    # Tokenize the prompt
    prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)

    # Embed image
    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
        image_features = vision_outputs.hidden_states
        embedded_images = image_adapter(image_features)
        embedded_images = embedded_images.to('cuda')

    # Embed prompt
    prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
    assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"

    # Check for bos_token_id and provide a fallback
    bos_token_id = tokenizer.bos_token_id
    if bos_token_id is None:
        print("Warning: bos_token_id is None. Using default value of 1.")
        bos_token_id = 1  # Common default, but may need adjustment

    # Look the end-of-turn id up once; it is used for the placeholder
    # input_ids and for trimming the generated sequence.
    eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if eot_token_id is None:
        # NOTE(review): a tokenizer without "<|eot_id|>" and without an
        # unknown token can return None here; fall back to the EOS id so the
        # placeholder tensor can still be built -- confirm for this model.
        eot_token_id = tokenizer.eos_token_id

    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
    eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)

    # Construct prompts: BOS, image embeddings, prompt embeddings, EOT.
    batch_size = embedded_images.shape[0]
    inputs_embeds = torch.cat([
        embedded_bos.expand(batch_size, -1, -1),
        embedded_images.to(dtype=embedded_bos.dtype),
        prompt_embeds.expand(batch_size, -1, -1),
        eot_embed.expand(batch_size, -1, -1),
    ], dim=1)

    # Token ids laid out like inputs_embeds; the image positions are filled
    # with zeros as placeholders.
    input_ids = torch.cat([
        torch.tensor([[bos_token_id]], dtype=torch.long),
        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
        prompt,
        torch.tensor([[eot_token_id]], dtype=torch.long),
    ], dim=1).to('cuda')
    attention_mask = torch.ones_like(input_ids)

    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)

    # Trim off the prompt
    generate_ids = generate_ids[:, input_ids.shape[1]:]
    # Drop a trailing end marker; guard against an empty generation so the
    # [-1] lookup cannot raise IndexError.
    if generate_ids.shape[1] > 0:
        last_token_id = generate_ids[0, -1].item()
        if last_token_id == tokenizer.eos_token_id or last_token_id == eot_token_id:
            generate_ids = generate_ids[:, :-1]

    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]

    return caption.strip()
 
 
376
 
377
  css = """
378
  h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
 
392
  }
393
  """
394
 
# Art styles offered in the "Art Style" dropdown, in display order.
ART_STYLES = [
    "Impressionism",
    "Cubism",
    "Surrealism",
    "Abstract Expressionism",
    "Pop Art",
    "Minimalism",
    "Baroque",
    "Renaissance",
    "Art Nouveau",
    "Gothic",
    "Romanticism",
    "Realism",
    "Expressionism",
    "Fauvism",
    "Art Deco",
    "Futurism",
    "Dadaism",
    "Pointillism",
    "Rococo",
    "Neoclassicism",
]
401
+
# Short description of each art style, substituted for the
# {style_characteristics} placeholder when a prompt is formatted.
STYLE_CHARACTERISTICS = {
    "Impressionism": "loose brushstrokes, emphasis on light and color, everyday subjects",
    "Cubism": "geometric shapes, multiple perspectives, fragmented forms",
    "Surrealism": "dreamlike imagery, unexpected juxtapositions, subconscious exploration",
    "Abstract Expressionism": "expressive brushwork, emotional content, abstract forms",
    "Pop Art": "bright colors, popular culture references, satire",
    "Minimalism": "simple forms, limited color palette, emphasis on space",
    "Baroque": "dramatic lighting, elaborate detail, grandeur",
    "Renaissance": "realistic depictions, perspective, religious themes",
    "Art Nouveau": "stylized forms, organic shapes, decorative elements",
    "Gothic": "dark themes, dramatic lighting, architectural elements",
    "Romanticism": "emotional content, nature scenes, idealized figures",
    "Realism": "detailed depictions, realistic textures, everyday subjects",
    "Expressionism": "emotional content, distorted forms, abstract elements",
    "Fauvism": "bold colors, abstract forms, emotional content",
    "Art Deco": "geometric shapes, streamlined forms, modern aesthetics",
    "Futurism": "dynamic forms, speed, technology",
    "Dadaism": "anti-art, absurdity, subversion of traditional art",
    "Pointillism": "small dots of color, impressionistic style, emphasis on light",
    "Rococo": "ornate style, lighthearted themes, decorative elements",
    "Neoclassicism": "classical style, balance, symmetry",
}
424
 
# What each art style concentrates on, substituted for the {style_focus}
# placeholder when a prompt is formatted.
STYLE_FOCUS = {
    "Impressionism": "capturing fleeting moments and atmospheric effects",
    "Cubism": "deconstructing and reassembling forms from multiple viewpoints",
    "Surrealism": "creating a sense of the uncanny and exploring the subconscious mind",
    "Abstract Expressionism": "expressing emotional content through abstract forms",
    "Pop Art": "commenting on popular culture and satirizing consumerism",
    "Minimalism": "exploring the relationship between form and space",
    "Baroque": "creating dramatic and grandiose compositions",
    "Renaissance": "depicting realistic scenes and exploring perspective",
    "Art Nouveau": "incorporating organic and decorative elements",
    "Gothic": "exploring dark themes and dramatic lighting",
    "Romanticism": "depicting emotional scenes and idealized figures",
    "Realism": "capturing detailed and realistic textures",
    "Expressionism": "expressing emotional content through distorted forms",
    "Fauvism": "emphasizing bold colors and emotional content",
    "Art Deco": "incorporating geometric shapes and modern aesthetics",
    "Futurism": "depicting speed, technology, and dynamism",
    "Dadaism": "subverting traditional art and exploring absurdity",
    "Pointillism": "capturing light and color through small dots",
    "Rococo": "creating lighthearted and decorative compositions",
    "Neoclassicism": "achieving balance and symmetry in classical style",
}
447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
  with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
449
  with gr.Tab("Welcome"):
450
  gr.Markdown(
451
+ """
452
  <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
453
 
454
  # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
 
456
  ## Accelerate Your Creative Workflow with Intelligent Image Analysis
457
 
458
  This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
459
+ training prompts, or tags from existing artwork, fueling the creative process for GenAI models.
460
 
461
  ## 🚀 How It Works:
462
  1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
 
465
  4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
466
  """
467
  )
468
+
469
  with gr.Tab("JoyCaption"):
470
+ gr.Markdown("""
471
+ # JoyCaption: AI-Powered Image Analysis Tool
 
 
 
 
 
 
 
 
 
 
 
 
472
 
473
+ This tool helps you generate various types of text based on an uploaded image. Here's how to use it:
 
 
474
 
475
+ 1. Upload an image
476
+ 2. Choose your desired output type
477
+ 3. Adjust settings as needed
478
+ 4. Click 'Generate Caption' to get your result
479
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  with gr.Row():
482
+ with gr.Column(scale=1):
483
+ input_image = gr.Image(type="pil", label="Upload Your Image")
484
 
485
  caption_type = gr.Dropdown(
486
+ choices=[
487
+ "descriptive",
488
+ "training_prompt",
489
+ "rng-tags",
490
+ "thematic_analysis",
491
+ "stylistic_comparison",
492
+ "narrative_suggestion",
493
+ "contextual_storytelling",
494
+ "style_prompt"
495
+ ],
496
+ label="Output Type",
497
  value="descriptive",
498
  )
499
 
500
+ gr.Markdown("""
501
+ ### Output Types Explained:
502
+ - **Descriptive**: A general description of the image
503
+ - **Training Prompt**: A prompt for AI image generation
504
+ - **RNG-Tags**: Tags for categorizing the image
505
+ - **Thematic Analysis**: Exploration of themes in the image
506
+ - **Stylistic Comparison**: Compares the image to art styles
507
+ - **Narrative Suggestion**: A story idea based on the image
508
+ - **Contextual Storytelling**: A background story for the image
509
+ - **Style Prompt**: Analyzes the image in context of a specific art style
510
+ """)
511
+
512
  caption_tone = gr.Dropdown(
513
  choices=["formal", "informal"],
514
+ label="Tone",
515
  value="formal",
516
  )
517
 
518
+ gr.Markdown("Choose between a formal (professional) or informal (casual) tone for the output.")
519
+
520
  caption_length = gr.Dropdown(
521
  choices=["any", "very short", "short", "medium-length", "long", "very long"] +
522
  [str(i) for i in range(20, 261, 10)],
523
+ label="Length",
524
  value="any",
525
  )
526
 
527
+ gr.Markdown("""
528
+ Select the desired length of the output:
529
+ - 'any': No specific length
530
+ - Descriptive options: very short to very long
531
+ - Numeric options: Specify exact word count (20 to 260 words)
532
+ """)
533
+
534
+ art_style = gr.Dropdown(
535
+ choices=ART_STYLES,
536
+ label="Art Style (for Style Prompt)",
537
+ value="Impressionism",
538
+ visible=False
 
 
 
 
 
 
 
539
  )
540
 
541
+ gr.Markdown("Select an art style to analyze the image in that context. Only applicable for 'Style Prompt' output type.")
542
+
543
+ with gr.Column(scale=1):
544
+ output_caption = gr.Textbox(label="Generated Output", lines=10)
545
+ generate_button = gr.Button("Generate Caption")
546
+
547
+ gr.Markdown("""
548
+ ### Additional Notes:
549
+ - The 'Tone' setting doesn't affect 'RNG-Tags' and 'Training Prompt' outputs.
550
+ - 'Art Style' is only used when 'Style Prompt' is selected as the output type.
551
+ - The AI model analyzes the image and generates text based on your selections.
552
+ """)
553
+
def update_visibility(caption_type):
    """Return visibility updates for the art-style and tone dropdowns.

    The art-style dropdown is visible only when ``caption_type`` is
    ``"style_prompt"``; the tone dropdown is hidden for ``"rng-tags"`` and
    ``"training_prompt"``.
    """
    show_art_style = caption_type == "style_prompt"
    show_tone = caption_type not in ["rng-tags", "training_prompt"]
    return {
        art_style: gr.update(visible=show_art_style),
        caption_tone: gr.update(visible=show_tone),
    }
559
+
560
+ caption_type.change(
561
+ fn=update_visibility,
562
+ inputs=[caption_type],
563
+ outputs=[art_style, caption_tone]
564
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565
 
566
+ generate_button.click(
567
+ fn=stream_chat,
568
+ inputs=[input_image, caption_type, caption_tone, caption_length, art_style],
569
+ outputs=[output_caption]
570
+ )
571
 
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()