Severian committed on
Commit 98792a8 · verified · 1 parent: 936fabc

Update app.py

Files changed (1): app.py (+307, -358)
app.py CHANGED
@@ -12,147 +12,28 @@ import torchvision.transforms.functional as TVF
12
 
13
 
14
  CLIP_PATH = "google/siglip-so400m-patch14-384"
15
- MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
16
  CHECKPOINT_PATH = Path("9em124t2-499968")
 
17
  CAPTION_TYPE_MAP = {
18
- ("descriptive", "formal", False, False): [
19
- "Write a detailed, formal description of this image, focusing on composition, style, and artistic elements.",
20
- "Provide a comprehensive, academic analysis of this artwork's visual characteristics and techniques."
21
- ],
22
- ("descriptive", "formal", False, True): [
23
- "Craft a formal, concise description of this image within {word_count} words, highlighting key visual elements.",
24
- "Summarize the artwork's main features and style in a formal tone, using no more than {word_count} words."
25
- ],
26
- ("descriptive", "formal", True, False): [
27
- "Compose a {length} formal critique of this image, discussing its artistic merits and visual impact.",
28
- "Create a {length} scholarly description of this artwork, analyzing its composition and aesthetic qualities."
29
- ],
30
- ("descriptive", "informal", False, False): [
31
- "Describe this image as if you're explaining it to a friend, focusing on what stands out to you.",
32
- "Give a casual, conversational rundown of what you see in this artwork and how it makes you feel."
33
- ],
34
- ("descriptive", "informal", False, True): [
35
- "In about {word_count} words, give a laid-back description of this image's vibe and key features.",
36
- "Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
37
- ],
38
- ("descriptive", "informal", True, False): [
39
- "Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
40
- "Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
41
- ],
42
- ("training_prompt", "formal", False, False): [
43
- "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
44
- "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
45
- ],
46
- ("training_prompt", "formal", False, True): [
47
- "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
48
- "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
49
- ],
50
- ("training_prompt", "formal", True, False): [
51
- "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
52
- "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
53
- ],
54
- ("rng-tags", "formal", False, False): [
55
- "Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
56
- "Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
57
- ],
58
- ("rng-tags", "formal", False, True): [
59
- "Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
60
- "Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
61
- ],
62
- ("rng-tags", "formal", True, False): [
63
- "Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
64
- "Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
65
- ],
66
- ("artistic_inspiration", "formal", False, False): [
67
- "Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
68
- "Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
69
- ],
70
- ("artistic_inspiration", "informal", False, False): [
71
- "Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
72
- "Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
73
- ],
74
- ("technical_breakdown", "formal", False, False): [
75
- "Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
76
- "Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
77
- ],
78
- ("emotional_response", "informal", False, False): [
79
- "Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
80
- "Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
81
- ],
82
-
83
- ("thematic_analysis", "formal", False, False): [
84
- "Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
85
- "Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
86
- ],
87
- ("thematic_analysis", "formal", False, True): [
88
- "Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
89
- "Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
90
- ],
91
- ("thematic_analysis", "formal", True, False): [
92
- "Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
93
- "Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
94
- ],
95
- ("stylistic_comparison", "informal", False, False): [
96
- "Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
97
- "Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
98
- ],
99
- ("stylistic_comparison", "informal", False, True): [
100
- "In about {word_count} words, compare this image's style with other known art styles or artists.",
101
- "Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
102
- ],
103
- ("stylistic_comparison", "informal", True, False): [
104
- "Write a {length} casual comparison of this image's style with other art movements or famous artists.",
105
- "Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
106
- ],
107
- ("narrative_suggestion", "formal", False, False): [
108
- "Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
109
- "Develop a brief storyline that complements the themes and mood depicted in this artwork."
110
- ],
111
- ("narrative_suggestion", "formal", False, True): [
112
- "Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
113
- "Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
114
- ],
115
- ("narrative_suggestion", "formal", True, False): [
116
- "Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
117
- "Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
118
- ],
119
- ("contextual_storytelling", "informal", False, False): [
120
- "Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
121
- "Imagine a background story for this artwork, explaining what's happening and why."
122
- ],
123
- ("contextual_storytelling", "informal", False, True): [
124
- "In about {word_count} words, create a backstory for the scene depicted in this image.",
125
- "Summarize a possible background narrative for this artwork in {word_count} words."
126
- ],
127
- ("contextual_storytelling", "informal", True, False): [
128
- "Write a {length} informal story that provides context to the scene portrayed in this image.",
129
- "Give a {length} casual backstory explaining the events depicted in this artwork."
130
- ],
131
-
132
- ("style_prompt", "formal", False, False): [
133
- "Analyze this image through the lens of {style} art. Describe how it aligns with or diverges from {style_characteristics}.",
134
- "Examine this artwork in the context of the {style} movement, focusing on {style_focus} and how these elements are represented or reinterpreted in the image."
135
- ],
136
- ("style_prompt", "formal", False, True): [
137
- "Within {word_count} words, compare this image to the {style} style, highlighting elements that reflect or contrast with its key characteristics.",
138
- "Compose a concise {word_count}-word analysis of how this artwork relates to the {style} movement, noting its adherence to or departure from typical {style} elements."
139
- ],
140
- ("style_prompt", "formal", True, False): [
141
- "Write a {length} critique of this image, exploring its relationship to the {style} movement. Discuss composition, technique, and thematic elements in this context.",
142
- "Develop a {length} analysis of how this artwork incorporates or challenges the principles of {style}, considering its visual language and artistic approach."
143
- ],
144
- ("style_prompt", "informal", False, False): [
145
- "Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
146
- "Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
147
- ],
148
- ("style_prompt", "informal", False, True): [
149
- "In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
150
- "Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
151
- ],
152
- ("style_prompt", "informal", True, False): [
153
- "Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
154
- "Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
155
- ],
156
  }
157
 
158
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
@@ -257,122 +138,105 @@ text_model.eval()
257
  # Image Adapter
258
  print("Loading image adapter")
259
  image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
260
- image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
261
  image_adapter.eval()
262
  image_adapter.to("cuda")
263
 
264
- # After loading the tokenizer and model
265
- print(f"Tokenizer class: {type(tokenizer)}")
266
- print(f"BOS token: {tokenizer.bos_token}")
267
- print(f"BOS token ID: {tokenizer.bos_token_id}")
268
- print(f"EOS token: {tokenizer.eos_token}")
269
- print(f"EOS token ID: {tokenizer.eos_token_id}")
270
- print(f"Text model device: {text_model.device}")
271
-
272
- # Ensure the tokenizer has the necessary special tokens
273
- if tokenizer.bos_token_id is None or tokenizer.eos_token_id is None:
274
- print("Warning: BOS or EOS token is missing. Adding default tokens.")
275
- special_tokens_dict = {}
276
- if tokenizer.bos_token_id is None:
277
- special_tokens_dict['bos_token'] = '<|endoftext|>'
278
- if tokenizer.eos_token_id is None:
279
- special_tokens_dict['eos_token'] = '<|endoftext|>'
280
- num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
281
- print(f"Added {num_added_tokens} special tokens to the tokenizer.")
282
-
283
- # Resize token embeddings of the model if new tokens are added
284
- text_model.resize_token_embeddings(len(tokenizer))
285
-
286
- @spaces.GPU()
287
- @torch.no_grad()
288
- def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
289
- torch.cuda.empty_cache()
290
-
291
- # Handle caption_length
292
- length = None
293
- if caption_length != "any":
294
- if isinstance(caption_length, int):
295
- length = caption_length
296
- elif isinstance(caption_length, str):
297
- try:
298
- length = int(caption_length)
299
- except ValueError:
300
- # If it's not a number, treat it as a descriptive length
301
- length = caption_length
302
-
303
- # 'rng-tags' and 'training_prompt' don't have formal/informal tones
304
- if caption_type in ["rng-tags", "training_prompt"]:
305
- caption_tone = "formal"
306
-
307
- # Build prompt
308
- prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
309
- if prompt_key not in CAPTION_TYPE_MAP:
310
- raise ValueError(f"Invalid caption type: {prompt_key}")
311
-
312
- prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
313
- length=length,
314
- word_count=length,
315
- style=art_style,
316
- style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
317
- style_focus=STYLE_FOCUS.get(art_style, "its distinctive features")
318
- )
319
- print(f"Prompt: {prompt_str}")
320
 
321
- # Preprocess image
 
 
 
322
  image = input_image.resize((384, 384), Image.LANCZOS)
323
  pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
324
  pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
325
- pixel_values = pixel_values.to('cuda')
326
 
327
- # Tokenize the prompt
 
 
 
328
  prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
329
-
330
- # Embed image
331
- with torch.amp.autocast_mode.autocast('cuda', enabled=True):
332
- vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
333
- image_features = vision_outputs.hidden_states
334
- embedded_images = image_adapter(image_features)
335
- embedded_images = embedded_images.to('cuda')
336
-
337
- # Embed prompt
338
  prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
339
- assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
340
-
341
- # Check for bos_token_id and provide a fallback
342
- bos_token_id = tokenizer.bos_token_id
343
- if bos_token_id is None:
344
- print("Warning: bos_token_id is None. Using default value of 1.")
345
- bos_token_id = 1 # Common default, but may need adjustment
346
-
347
- embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
348
  eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
349
 
350
- # Construct prompts
351
  inputs_embeds = torch.cat([
352
- embedded_bos.expand(embedded_images.shape[0], -1, -1),
353
- embedded_images.to(dtype=embedded_bos.dtype),
354
- prompt_embeds.expand(embedded_images.shape[0], -1, -1),
355
- eot_embed.expand(embedded_images.shape[0], -1, -1),
356
  ], dim=1)
357
 
358
  input_ids = torch.cat([
359
- torch.tensor([[bos_token_id]], dtype=torch.long),
360
- torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
361
  prompt,
362
  torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
363
  ], dim=1).to('cuda')
364
  attention_mask = torch.ones_like(input_ids)
365
 
366
- generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)
367
 
368
- # Trim off the prompt
369
  generate_ids = generate_ids[:, input_ids.shape[1]:]
370
  if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
371
  generate_ids = generate_ids[:, :-1]
372
 
373
- caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
 
 
374
 
375
- return caption.strip()
 
 
376
 
377
  css = """
378
  h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
@@ -392,63 +256,110 @@ ul, ol {
392
  }
393
  """
394
 
395
- ART_STYLES = [
396
- "Impressionism", "Cubism", "Surrealism", "Abstract Expressionism", "Pop Art",
397
- "Minimalism", "Baroque", "Renaissance", "Art Nouveau", "Gothic",
398
- "Romanticism", "Realism", "Expressionism", "Fauvism", "Art Deco",
399
- "Futurism", "Dadaism", "Pointillism", "Rococo", "Neoclassicism"
400
- ]
401
-
402
- STYLE_CHARACTERISTICS = {
403
- "Impressionism": "loose brushstrokes, emphasis on light and color, everyday subjects",
404
- "Cubism": "geometric shapes, multiple perspectives, fragmented forms",
405
- "Surrealism": "dreamlike imagery, unexpected juxtapositions, subconscious exploration",
406
- "Abstract Expressionism": "expressive brushwork, emotional content, abstract forms",
407
- "Pop Art": "bright colors, popular culture references, satire",
408
- "Minimalism": "simple forms, limited color palette, emphasis on space",
409
- "Baroque": "dramatic lighting, elaborate detail, grandeur",
410
- "Renaissance": "realistic depictions, perspective, religious themes",
411
- "Art Nouveau": "stylized forms, organic shapes, decorative elements",
412
- "Gothic": "dark themes, dramatic lighting, architectural elements",
413
- "Romanticism": "emotional content, nature scenes, idealized figures",
414
- "Realism": "detailed depictions, realistic textures, everyday subjects",
415
- "Expressionism": "emotional content, distorted forms, abstract elements",
416
- "Fauvism": "bold colors, abstract forms, emotional content",
417
- "Art Deco": "geometric shapes, streamlined forms, modern aesthetics",
418
- "Futurism": "dynamic forms, speed, technology",
419
- "Dadaism": "anti-art, absurdity, subversion of traditional art",
420
- "Pointillism": "small dots of color, impressionistic style, emphasis on light",
421
- "Rococo": "ornate style, lighthearted themes, decorative elements",
422
- "Neoclassicism": "classical style, balance, symmetry"
 
 
423
  }
424
 
425
- STYLE_FOCUS = {
426
- "Impressionism": "capturing fleeting moments and atmospheric effects",
427
- "Cubism": "deconstructing and reassembling forms from multiple viewpoints",
428
- "Surrealism": "creating a sense of the uncanny and exploring the subconscious mind",
429
- "Abstract Expressionism": "expressing emotional content through abstract forms",
430
- "Pop Art": "commenting on popular culture and satirizing consumerism",
431
- "Minimalism": "exploring the relationship between form and space",
432
- "Baroque": "creating dramatic and grandiose compositions",
433
- "Renaissance": "depicting realistic scenes and exploring perspective",
434
- "Art Nouveau": "incorporating organic and decorative elements",
435
- "Gothic": "exploring dark themes and dramatic lighting",
436
- "Romanticism": "depicting emotional scenes and idealized figures",
437
- "Realism": "capturing detailed and realistic textures",
438
- "Expressionism": "expressing emotional content through distorted forms",
439
- "Fauvism": "emphasizing bold colors and emotional content",
440
- "Art Deco": "incorporating geometric shapes and modern aesthetics",
441
- "Futurism": "depicting speed, technology, and dynamism",
442
- "Dadaism": "subverting traditional art and exploring absurdity",
443
- "Pointillism": "capturing light and color through small dots",
444
- "Rococo": "creating lighthearted and decorative compositions",
445
- "Neoclassicism": "achieving balance and symmetry in classical style"
446
  }
447
 
 
448
  with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
449
  with gr.Tab("Welcome"):
450
  gr.Markdown(
451
- """
452
  <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
453
 
454
  # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
@@ -456,7 +367,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
456
  ## Accelerate Your Creative Workflow with Intelligent Image Analysis
457
 
458
  This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
459
- training prompts, or tags from existing artwork, fueling the creative process for GenAI models.
460
 
461
  ## 🚀 How It Works:
462
  1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
@@ -465,109 +376,147 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
465
  4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
466
  """
467
  )
468
-
469
  with gr.Tab("JoyCaption"):
470
- gr.Markdown("""
471
- # JoyCaption: AI-Powered Image Analysis Tool
 
 
 
472
 
473
- This tool helps you generate various types of text based on an uploaded image. Here's how to use it:
 
 
474
 
475
- 1. Upload an image
476
- 2. Choose your desired output type
477
- 3. Adjust settings as needed
478
- 4. Click 'Generate Caption' to get your result
479
- """)
 
 
480
 
481
  with gr.Row():
482
- with gr.Column(scale=1):
483
- input_image = gr.Image(type="pil", label="Upload Your Image")
484
 
485
  caption_type = gr.Dropdown(
486
- choices=[
487
- "descriptive",
488
- "training_prompt",
489
- "rng-tags",
490
- "thematic_analysis",
491
- "stylistic_comparison",
492
- "narrative_suggestion",
493
- "contextual_storytelling",
494
- "style_prompt"
495
- ],
496
- label="Output Type",
497
  value="descriptive",
498
  )
499
 
500
- gr.Markdown("""
501
- ### Output Types Explained:
502
- - **Descriptive**: A general description of the image
503
- - **Training Prompt**: A prompt for AI image generation
504
- - **RNG-Tags**: Tags for categorizing the image
505
- - **Thematic Analysis**: Exploration of themes in the image
506
- - **Stylistic Comparison**: Compares the image to art styles
507
- - **Narrative Suggestion**: A story idea based on the image
508
- - **Contextual Storytelling**: A background story for the image
509
- - **Style Prompt**: Analyzes the image in context of a specific art style
510
- """)
511
-
512
  caption_tone = gr.Dropdown(
513
  choices=["formal", "informal"],
514
- label="Tone",
515
  value="formal",
516
  )
517
 
518
- gr.Markdown("Choose between a formal (professional) or informal (casual) tone for the output.")
519
-
520
  caption_length = gr.Dropdown(
521
  choices=["any", "very short", "short", "medium-length", "long", "very long"] +
522
  [str(i) for i in range(20, 261, 10)],
523
- label="Length",
524
  value="any",
525
  )
526
 
527
- gr.Markdown("""
528
- Select the desired length of the output:
529
- - 'any': No specific length
530
- - Descriptive options: very short to very long
531
- - Numeric options: Specify exact word count (20 to 260 words)
532
- """)
533
-
534
- art_style = gr.Dropdown(
535
- choices=ART_STYLES,
536
- label="Art Style (for Style Prompt)",
537
- value="Impressionism",
538
- visible=False
539
  )
540
 
541
- gr.Markdown("Select an art style to analyze the image in that context. Only applicable for 'Style Prompt' output type.")
542
-
543
- with gr.Column(scale=1):
544
- output_caption = gr.Textbox(label="Generated Output", lines=10)
545
- generate_button = gr.Button("Generate Caption")
546
-
547
- gr.Markdown("""
548
- ### Additional Notes:
549
- - The 'Tone' setting doesn't affect 'RNG-Tags' and 'Training Prompt' outputs.
550
- - 'Art Style' is only used when 'Style Prompt' is selected as the output type.
551
- - The AI model analyzes the image and generates text based on your selections.
552
- """)
553
-
554
- def update_visibility(caption_type):
555
- return {
556
- art_style: gr.update(visible=(caption_type == "style_prompt")),
557
- caption_tone: gr.update(visible=(caption_type not in ["rng-tags", "training_prompt"]))
558
- }
559
-
560
- caption_type.change(
561
- fn=update_visibility,
562
- inputs=[caption_type],
563
- outputs=[art_style, caption_tone]
564
- )
 
 
565
 
566
- generate_button.click(
567
- fn=stream_chat,
568
- inputs=[input_image, caption_type, caption_tone, caption_length, art_style],
569
- outputs=[output_caption]
570
- )
571
 
572
  if __name__ == "__main__":
573
  demo.launch()
 
12
 
13
 
14
  CLIP_PATH = "google/siglip-so400m-patch14-384"
15
+ MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
16
  CHECKPOINT_PATH = Path("9em124t2-499968")
17
+ TITLE = "<h1><center>JoyCaption Alpha One (2024-09-20a)</center></h1>"
18
  CAPTION_TYPE_MAP = {
19
+ ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
20
+ ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
21
+ ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
22
+ ("descriptive", "informal", False, False): ["Write a descriptive caption for this image in a casual tone."],
23
+ ("descriptive", "informal", False, True): ["Write a descriptive caption for this image in a casual tone within {word_count} words."],
24
+ ("descriptive", "informal", True, False): ["Write a {length} descriptive caption for this image in a casual tone."],
25
+
26
+ ("training_prompt", "formal", False, False): ["Write a stable diffusion prompt for this image."],
27
+ ("training_prompt", "formal", False, True): ["Write a stable diffusion prompt for this image within {word_count} words."],
28
+ ("training_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt for this image."],
29
+
30
+ ("rng-tags", "formal", False, False): ["Write a list of Booru tags for this image."],
31
+ ("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
32
+ ("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
33
+
34
+ ("style_prompt", "formal", False, False): ["Generate a detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
35
+ ("style_prompt", "formal", False, True): ["Generate a detailed style prompt for this image within {word_count} words, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
36
+ ("style_prompt", "formal", True, False): ["Generate a {length} detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
 
 
37
  }
38
 
39
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
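Editor's note on the simplified CAPTION_TYPE_MAP above: each key is a 4-tuple of (caption type, tone, whether the length is a descriptive string, whether it is a numeric word count), and each value is a one-template list. A minimal sketch of the lookup that stream_chat performs further down, using a hypothetical build_prompt helper for illustration:

    # Sketch only: mirrors the key construction and .format() call in stream_chat.
    def build_prompt(caption_type: str, tone: str, length) -> str:
        key = (caption_type, tone, isinstance(length, str), isinstance(length, int))
        template = CAPTION_TYPE_MAP[key][0]   # each value is a one-item list
        return template.format(length=length, word_count=length)

    # build_prompt("descriptive", "formal", 40)
    # -> "Write a descriptive caption for this image in a formal tone within 40 words."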
 
138
  # Image Adapter
139
  print("Loading image adapter")
140
  image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
141
+ image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=True))
142
  image_adapter.eval()
143
  image_adapter.to("cuda")
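For context on the one-line change above: passing weights_only=True tells torch.load (in recent PyTorch releases) to restrict unpickling to tensors and basic containers instead of arbitrary Python objects, which is the safer way to load a third-party checkpoint. The call is equivalent to the two-step form below, shown only as an illustration:

    # Load the adapter checkpoint as a plain state dict, without executing
    # arbitrary pickled code, then copy it into the module.
    state_dict = torch.load(CHECKPOINT_PATH / "image_adapter.pt",
                            map_location="cpu", weights_only=True)
    image_adapter.load_state_dict(state_dict)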
144
 
 
145
 
146
+ def preprocess_image(input_image: Image.Image) -> torch.Tensor:
147
+ """
148
+ Preprocess the input image for the CLIP model.
149
+ """
150
  image = input_image.resize((384, 384), Image.LANCZOS)
151
  pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
152
  pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
153
+ return pixel_values.to('cuda')
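A quick sanity check for the new preprocess_image helper (illustrative only; it assumes the module's existing imports and a CUDA device, since the helper moves the tensor to the GPU):

    from PIL import Image

    dummy = Image.new("RGB", (1024, 768))   # any RGB image will do
    px = preprocess_image(dummy)
    assert px.shape == (1, 3, 384, 384)     # batch of one at SigLIP's 384x384 input size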
154
 
155
+ def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max_new_tokens: int = 300) -> str:
156
+ """
157
+ Generate a caption based on the image features and prompt.
158
+ """
159
  prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
 
 
160
  prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
161
+ embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
 
 
162
  eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
163
 
 
164
  inputs_embeds = torch.cat([
165
+ embedded_bos.expand(image_features.shape[0], -1, -1),
166
+ image_features.to(dtype=embedded_bos.dtype),
167
+ prompt_embeds.expand(image_features.shape[0], -1, -1),
168
+ eot_embed.expand(image_features.shape[0], -1, -1),
169
  ], dim=1)
170
 
171
  input_ids = torch.cat([
172
+ torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
173
+ torch.zeros((1, image_features.shape[1]), dtype=torch.long),
174
  prompt,
175
  torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
176
  ], dim=1).to('cuda')
177
  attention_mask = torch.ones_like(input_ids)
178
 
179
+ generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, suppress_tokens=None)
180
 
 
181
  generate_ids = generate_ids[:, input_ids.shape[1]:]
182
  if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
183
  generate_ids = generate_ids[:, :-1]
184
 
185
+ return tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0].strip()
186
+
187
+ @spaces.GPU()
188
+ @torch.no_grad()
189
+ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "", lighting_aspect: str = "", special_technique: str = "", color_effect: str = "") -> str:
190
+ """
191
+ Generate a caption or style prompt based on the input image and parameters.
192
+ """
193
+ torch.cuda.empty_cache()
194
+
195
+ length = None if caption_length == "any" else caption_length
+ if isinstance(length, str):
+ try:
+ length = int(length)
+ except ValueError:
+ pass  # descriptive lengths such as "short" stay as strings
201
+
202
+ if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
203
+ caption_tone = "formal"
204
+
205
+ prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
206
+ if prompt_key not in CAPTION_TYPE_MAP:
207
+ raise ValueError(f"Invalid caption type: {prompt_key}")
208
+
209
+ prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
210
+
211
+ if caption_type == "style_prompt":
212
+ prompt_str += f" Lens type: {lens_type} ({lens_types_info[lens_type]}). "
213
+ prompt_str += f"Film stock: {film_stocks_info[film_stock]}). "
214
+ prompt_str += f"Composition style: {composition_styles_info[composition_style]}). "
215
+ prompt_str += f"Lighting aspect: {lighting_aspects_info[lighting_aspect]}). "
216
+ prompt_str += f"Special technique: {special_techniques_info[special_technique]}). "
217
+ prompt_str += f"Color effect: {color_effects_info[color_effect]})."
218
+
219
+ # Debugging: Print the constructed prompt string
220
+ print(f"Constructed Prompt: {prompt_str}")
221
+
222
+ pixel_values = preprocess_image(input_image)
223
+
224
+ with torch.amp.autocast_mode.autocast('cuda', enabled=True):
225
+ vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
226
+ image_features = vision_outputs.hidden_states
227
+ embedded_images = image_adapter(image_features)
228
+ embedded_images = embedded_images.to('cuda')
229
+
230
+ # Load the model from MODEL_PATH
231
+ text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
232
+ text_model.eval()
233
+
234
+ # Debugging: Print the prompt string before passing to generate_caption
235
+ print(f"Prompt passed to generate_caption: {prompt_str}")
236
 
237
+ caption = generate_caption(text_model, tokenizer, embedded_images, prompt_str)
238
+
239
+ return caption
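One design note on stream_chat above: the text model is re-loaded from MODEL_PATH on every call, which is simple but slow on a Space. If that becomes a bottleneck, a common alternative (sketch only, not part of this commit) is to load it once at module scope and reuse the instance:

    # Hypothetical module-level cache; stream_chat would then drop its own
    # AutoModelForCausalLM.from_pretrained call and use this shared instance.
    text_model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16
    )
    text_model.eval()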
240
 
241
  css = """
242
  h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
 
256
  }
257
  """
258
 
259
+ # Add detailed descriptions for each option
260
+ lens_types_info = {
261
+ "Standard": "A versatile lens with a field of view similar to human vision.",
262
+ "Wide-angle": "Captures a wider field of view, great for landscapes and architecture. Applies moderate to strong lens effect with image warp.",
263
+ "Telephoto": "Used for distant subjects, gives an 'award-winning' or 'National Geographic' look. Creates interesting effects when prompted.",
264
+ "Macro": "For extreme close-up photography, revealing tiny details.",
265
+ "Fish-eye": "Ultra-wide-angle lens that creates a strong bubble-like distortion. Generates panoramic photos with the entire image warping into a bubble.",
266
+ "Tilt-shift": "Allows adjusting the plane of focus, creating a 'miniature' effect. Known for the 'diorama miniature look'.",
267
+ "Zoom lens": "Variable focal length lens. Often zooms in on the subject, perfect for creating a base for inpainting. Interesting effect on landscapes with motion blur.",
268
+ "GoPro": "Wide-angle lens with clean digital look. Excludes film grain and most filter effects, resulting in natural colors and regular saturation.",
269
+ "Pinhole camera": "Creates a unique, foggy, low-detail, historic photograph look. Used since the 1850s, with peak popularity in the 1930s."
270
+ }
271
+
272
+ film_stocks_info = {
273
+ "Kodak Portra": "Professional color negative film known for its natural skin tones and low contrast.",
274
+ "Fujifilm Velvia": "Slide film known for vibrant colors and high saturation, popular among landscape photographers.",
275
+ "Ilford Delta": "Black and white film known for its fine grain and high sharpness.",
276
+ "Kodak Tri-X": "Classic high-speed black and white film, known for its distinctive grain and wide exposure latitude.",
277
+ "Fujifilm Provia": "Color reversal film known for its natural color reproduction and fine grain.",
278
+ "Cinestill": "Color photos with fine/low grain and higher than average resolution. Colors are slightly oversaturated or slightly desaturated.",
279
+ "Ektachrome": "Color photos with fine/low to moderate grain. Colors on the colder part of spectrum or regular, with normal or slightly higher saturation.",
280
+ "Ektar": "Modern Kodak film. Color photos with little to no grain. Results look like regular modern photography with artistic angles.",
281
+ "Film Washi": "Mostly black and white photos with fine/low to moderate grain. Occasionally gives colored photos with low saturation. Distinct style with high black contrast and soft camera lens effect.",
282
+ "Fomapan": "Black and white photos with fine/low to moderate grain, highly artistic exposure and angles. Adds very soft lens effect without distortion, dark photo vignette.",
283
+ "Fujicolor": "Color photos with fine/low to moderate grain. Colors are slightly or notably desaturated, with entire color hue shifted in a very distinct manner.",
284
+ "Holga": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
285
+ "Instax": "Instant color photos similar to Polaroid but clearer. Near perfect colors, regular saturation, fine/low to medium grain.",
286
+ "Lomography": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
287
+ "Kodachrome": "Color photos with moderate grain. Colors on either colder part of spectrum or regular, with normal or slightly higher saturation.",
288
+ "Rollei": "Mostly black and white photos, sometimes color with fine/low grain. Can be sepia colored or have unusual hues and desaturation. Great for landscapes."
289
+ }
290
+
291
+ composition_styles_info = {
292
+ "Rule of Thirds": "Divides the frame into a 3x3 grid, placing key elements along the lines or at their intersections.",
293
+ "Golden Ratio": "Uses a spiral based on the golden ratio to create a balanced and aesthetically pleasing composition.",
294
+ "Symmetry": "Creates a mirror-like balance in the image, often used for architectural or nature photography.",
295
+ "Leading Lines": "Uses lines within the frame to draw the viewer's eye to the main subject or through the image.",
296
+ "Framing": "Uses elements within the scene to create a frame around the main subject.",
297
+ "Minimalism": "Simplifies the composition to its essential elements, often with a lot of negative space.",
298
+ "Fill the Frame": "The main subject dominates the entire frame, leaving little to no background.",
299
+ "Negative Space": "Uses empty space around the subject to create a sense of simplicity or isolation.",
300
+ "Centered Composition": "Places the main subject in the center of the frame, creating a sense of stability or importance.",
301
+ "Diagonal Lines": "Uses diagonal elements to create a sense of movement or dynamic tension in the image.",
302
+ "Triangular Composition": "Arranges elements in the frame to form a triangle, creating a sense of stability and harmony.",
303
+ "Radial Balance": "Arranges elements in a circular pattern around a central point, creating a sense of movement or completeness."
304
  }
305
 
306
+ lighting_aspects_info = {
307
+ "Natural light": "Uses available light from the sun or sky, often creating soft, even illumination.",
308
+ "Studio lighting": "Controlled artificial lighting setup, allowing for precise manipulation of light and shadow.",
309
+ "Back light": "Light source behind the subject, creating silhouettes or rim lighting effects.",
310
+ "Split light": "Strong light source at 90-degree angle, lighting one half of the subject while leaving the other in shadow.",
311
+ "Broad light": "Light source at an angle to the subject, producing well-lit photographs with soft to moderate shadows.",
312
+ "Dim light": "Weak or distant light source, creating lower than average brightness and often dramatic images.",
313
+ "Flash photography": "Uses a brief, intense burst of light. Can be fill flash (even lighting) or harsh flash (strong contrasts).",
314
+ "Sunlight": "Direct light from the sun, often creating strong contrasts and warm tones.",
315
+ "Moonlight": "Soft, cool light from the moon, often creating a mysterious or romantic atmosphere.",
316
+ "Spotlight": "Focused beam of light illuminating a specific area, creating high contrast between light and shadow.",
317
+ "High-key lighting": "Bright, even lighting with minimal shadows, creating a light and airy feel.",
318
+ "Low-key lighting": "Predominantly dark tones with selective lighting, creating a moody or dramatic atmosphere.",
319
+ "Rembrandt lighting": "Classic portrait lighting technique creating a triangle of light on the cheek of the subject."
 
 
 
 
 
 
 
320
  }
321
 
322
+ special_techniques_info = {
323
+ "Double exposure": "Superimposes two exposures to create a single image, often resulting in a dreamy or surreal effect.",
324
+ "Long exposure": "Uses a long shutter speed to capture motion over time, often creating smooth, blurred effects for moving elements.",
325
+ "Multiple exposure": "Superimposes multiple exposures, multiplying the subject or its key elements across the image.",
326
+ "HDR": "High Dynamic Range imaging, combining multiple exposures to capture a wider range of light and dark tones.",
327
+ "Bokeh effect": "Creates a soft, out-of-focus background, often with circular highlights.",
328
+ "Silhouette": "Captures the outline of a subject against a brighter background, creating a dramatic contrast.",
329
+ "Panning": "Follows a moving subject with the camera, creating a sharp subject with a blurred background.",
330
+ "Light painting": "Uses long exposure and moving light sources to 'paint' with light in the image.",
331
+ "Infrared photography": "Captures light in the infrared spectrum, often resulting in surreal, otherworldly images.",
332
+ "Ultraviolet photography": "Captures light in the ultraviolet spectrum, often revealing hidden patterns or creating a strong violet glow.",
333
+ "Kirlian photography": "High-voltage photographic technique that captures corona discharges around objects, creating a glowing effect.",
334
+ "Thermography": "Captures infrared radiation to create images based on temperature differences, resulting in false-color heat maps.",
335
+ "Astrophotography": "Specialized technique for capturing astronomical objects and celestial events, often resulting in stunning starry backgrounds.",
336
+ "Underwater photography": "Captures images beneath the surface of water, often in pools, seas, or aquariums.",
337
+ "Aerial photography": "Captures images from an elevated position, such as from drones, helicopters, or planes.",
338
+ "Macro photography": "Extreme close-up photography, revealing tiny details not visible to the naked eye."
339
+ }
340
+
341
+ color_effects_info = {
342
+ "Black and white": "Removes all color, leaving only shades of gray.",
343
+ "Sepia": "Reddish-brown monochrome effect, often associated with vintage photography.",
344
+ "Monochrome": "Uses variations of a single color.",
345
+ "Vintage color": "Muted or faded color palette reminiscent of old photographs.",
346
+ "Cross-processed": "Deliberate processing of film in the wrong chemicals, creating unusual color shifts.",
347
+ "Desaturated": "Reduces the intensity of all colors in the image.",
348
+ "Vivid colors": "Increases the saturation and intensity of colors.",
349
+ "Pastel colors": "Soft, pale colors with a light and airy feel.",
350
+ "High contrast": "Emphasizes the difference between light and dark areas in the image.",
351
+ "Low contrast": "Reduces the difference between light and dark areas, creating a softer look.",
352
+ "Color splash": "Converts most of the image to black and white while leaving one or more elements in color."
353
+ }
354
+
355
+ def get_dropdown_choices(info_dict):
356
+ return [f"{key}: {value}" for key, value in info_dict.items()]
357
+
358
+ # Gradio interface
359
  with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
360
  with gr.Tab("Welcome"):
361
  gr.Markdown(
362
+ """
363
  <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
364
 
365
  # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
 
367
  ## Accelerate Your Creative Workflow with Intelligent Image Analysis
368
 
369
  This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
370
+ training prompts, and tags from existing artwork, fueling the creative process for GenAI models.
371
 
372
  ## 🚀 How It Works:
373
  1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
 
376
  4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
377
  """
378
  )
379
+
380
  with gr.Tab("JoyCaption"):
381
+ with gr.Accordion("How to Use JoyCaption", open=False):
382
+ gr.Markdown("""
383
+ # How to Use JoyCaption
384
+
385
+ Hello, artist! Let's make some fun captions for your pictures. Here's how:
386
+
387
+ 1. **Pick a Picture**: Find a cool picture you want to talk about and upload it.
388
+
389
+ 2. **Choose What You Want**:
390
+ - **Caption Type**:
391
+ * "Descriptive" tells you what's in the picture
392
+ * "Training Prompt" helps computers make similar pictures
393
+ * "RNG-Tags" gives you short words about the picture
394
+ * "Style Prompt" creates detailed prompts for image generation
395
 
396
+ 3. **Pick a Style** (for "Descriptive" and "Style Prompt" only):
397
+ - "Formal" sounds like a teacher talking
398
+ - "Informal" sounds like a friend chatting
399
 
400
+ 4. **Decide How Long**:
401
+ - "Any" lets the computer decide
402
+ - Or pick a size from "very short" to "very long"
403
+ - You can even choose a specific number of words!
404
+
405
+ 5. **Advanced Options** (for "Style Prompt" only):
406
+ - Choose lens type, film stock, composition, and lighting details
407
+
408
+ 6. **Make the Caption**: Click the "Make My Caption!" button and watch the magic happen!
409
+
410
+ Remember, have fun and be creative with your captions!
411
+
412
+ ## Tips for Great Captions:
413
+ - Try different types to see what you like best
414
+ - Experiment with formal and informal tones for fun variations
415
+ - Adjust the length to get just the right amount of detail
416
+ - For "Style Prompt", play with the advanced options for more specific results
417
+ - If you don't like a caption, just click "Make My Caption!" again for a new one
418
+
419
+ Have a great time captioning your art!
420
+ """)
421
 
422
  with gr.Row():
423
+ with gr.Column():
424
+ input_image = gr.Image(type="pil", label="Input Image")
425
 
426
  caption_type = gr.Dropdown(
427
+ choices=["descriptive", "training_prompt", "rng-tags", "style_prompt"],
428
+ label="Caption Type",
 
 
 
429
  value="descriptive",
430
  )
431
 
 
 
432
  caption_tone = gr.Dropdown(
433
  choices=["formal", "informal"],
434
+ label="Caption Tone",
435
  value="formal",
436
  )
437
 
 
 
438
  caption_length = gr.Dropdown(
439
  choices=["any", "very short", "short", "medium-length", "long", "very long"] +
440
  [str(i) for i in range(20, 261, 10)],
441
+ label="Caption Length",
442
  value="any",
443
  )
444
 
445
+ lens_type = gr.Dropdown(
446
+ choices=get_dropdown_choices(lens_types_info),
447
+ label="Lens Type",
448
+ visible=False,
449
+ info="Select a lens type to define the perspective and field of view of the image."
 
450
  )
451
 
452
+ film_stock = gr.Dropdown(
453
+ choices=get_dropdown_choices(film_stocks_info),
454
+ label="Film Stock",
455
+ visible=False,
456
+ info="Choose a film stock to determine the color, grain, and overall look of the image."
457
+ )
458
+
459
+ composition_style = gr.Dropdown(
460
+ choices=get_dropdown_choices(composition_styles_info),
461
+ label="Composition Style",
462
+ visible=False,
463
+ info="Select a composition style to guide the arrangement of elements in the image."
464
+ )
465
+
466
+ lighting_aspect = gr.Dropdown(
467
+ choices=get_dropdown_choices(lighting_aspects_info),
468
+ label="Lighting Aspect",
469
+ visible=False,
470
+ info="Choose a lighting style to define the mood and atmosphere of the image."
471
+ )
472
+
473
+ special_technique = gr.Dropdown(
474
+ choices=get_dropdown_choices(special_techniques_info),
475
+ label="Special Technique",
476
+ visible=False,
477
+ info="Select a special photographic technique to add unique effects to the image."
478
+ )
479
+
480
+ color_effect = gr.Dropdown(
481
+ choices=get_dropdown_choices(color_effects_info),
482
+ label="Color Effect",
483
+ visible=False,
484
+ info="Choose a color effect to alter the overall color palette of the image."
485
+ )
486
+
487
+ gr.Markdown("**Note:** Caption tone doesn't affect `rng-tags`, `training_prompt`, and `style_prompt`.")
488
+
489
+ run_button = gr.Button("Make My Caption!")
490
+
491
+ with gr.Column():
492
+ output_caption = gr.Textbox(label="Generated Caption")
493
+
494
+ # Container for advanced options
495
+ advanced_options = gr.Column(visible=False)
496
+ with advanced_options:
497
+ gr.Markdown("### Advanced Options for Style Prompt")
498
+ lens_type.render()
499
+ film_stock.render()
500
+ composition_style.render()
501
+ lighting_aspect.render()
502
+ special_technique.render()
503
+ color_effect.render()
504
+
505
+ def update_style_options(caption_type):
506
+ return {
507
+ lens_type: gr.update(visible=caption_type == "style_prompt"),
508
+ film_stock: gr.update(visible=caption_type == "style_prompt"),
509
+ composition_style: gr.update(visible=caption_type == "style_prompt"),
510
+ lighting_aspect: gr.update(visible=caption_type == "style_prompt"),
511
+ special_technique: gr.update(visible=caption_type == "style_prompt"),
512
+ color_effect: gr.update(visible=caption_type == "style_prompt"),
513
+ advanced_options: gr.update(visible=caption_type == "style_prompt"),
514
+ }
515
+
516
+ caption_type.change(update_style_options, inputs=[caption_type], outputs=[lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect, advanced_options])
517
+
518
+ run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect], outputs=[output_caption])
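Since every control in update_style_options toggles on the same condition, the handler could also return one update per output in order, which reads a little tighter; an equivalent sketch (not part of this commit):

    def update_style_options(caption_type):
        show = caption_type == "style_prompt"
        # one visibility update per output, in the same order as the outputs list
        return [gr.update(visible=show)] * 7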
519
 
 
 
 
 
 
520
 
521
  if __name__ == "__main__":
522
  demo.launch()