Test-Caption-Captain

Sleeping

App Files Community

Severian committed on Sep 26, 2024

Commit

7c751fb

verified ·

1 Parent(s): 250653b

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -166

app.py CHANGED Viewed

@@ -15,144 +15,21 @@ CLIP_PATH = "google/siglip-so400m-patch14-384"
 MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
 CHECKPOINT_PATH = Path("9em124t2-499968")
 CAPTION_TYPE_MAP = {
-    ("descriptive", "formal", False, False): [
-        "Write a detailed, formal description of this image, focusing on composition, style, and artistic elements.",
-        "Provide a comprehensive, academic analysis of this artwork's visual characteristics and techniques."
-    ],
-    ("descriptive", "formal", False, True): [
-        "Craft a formal, concise description of this image within {word_count} words, highlighting key visual elements.",
-        "Summarize the artwork's main features and style in a formal tone, using no more than {word_count} words."
-    ],
-    ("descriptive", "formal", True, False): [
-        "Compose a {length} formal critique of this image, discussing its artistic merits and visual impact.",
-        "Create a {length} scholarly description of this artwork, analyzing its composition and aesthetic qualities."
-    ],
-    ("descriptive", "informal", False, False): [
-        "Describe this image as if you're explaining it to a friend, focusing on what stands out to you.",
-        "Give a casual, conversational rundown of what you see in this artwork and how it makes you feel."
-    ],
-    ("descriptive", "informal", False, True): [
-        "In about {word_count} words, give a laid-back description of this image's vibe and key features.",
-        "Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
-    ],
-    ("descriptive", "informal", True, False): [
-        "Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
-        "Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
-    ],
-    ("training_prompt", "formal", False, False): [
-        "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
-        "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
-    ],
-    ("training_prompt", "formal", False, True): [
-        "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
-        "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
-    ],
-    ("training_prompt", "formal", True, False): [
-        "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
-        "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
-    ],
-    ("rng-tags", "formal", False, False): [
-        "Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
-        "Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
-    ],
-    ("rng-tags", "formal", False, True): [
-        "Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
-        "Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
-    ],
-    ("rng-tags", "formal", True, False): [
-        "Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
-        "Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
-    ],
-    ("artistic_inspiration", "formal", False, False): [
-        "Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
-        "Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
-    ],
-    ("artistic_inspiration", "informal", False, False): [
-        "Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
-        "Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
-    ],
-    ("technical_breakdown", "formal", False, False): [
-        "Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
-        "Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
-    ],
-    ("emotional_response", "informal", False, False): [
-        "Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
-        "Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
-    ],
-    ("thematic_analysis", "formal", False, False): [
-        "Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
-        "Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
-    ],
-    ("thematic_analysis", "formal", False, True): [
-        "Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
-        "Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
-    ],
-    ("thematic_analysis", "formal", True, False): [
-        "Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
-        "Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
-    ],
-    ("stylistic_comparison", "informal", False, False): [
-        "Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
-        "Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
-    ],
-    ("stylistic_comparison", "informal", False, True): [
-        "In about {word_count} words, compare this image's style with other known art styles or artists.",
-        "Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
-    ],
-    ("stylistic_comparison", "informal", True, False): [
-        "Write a {length} casual comparison of this image's style with other art movements or famous artists.",
-        "Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
-    ],
-    ("narrative_suggestion", "formal", False, False): [
-        "Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
-        "Develop a brief storyline that complements the themes and mood depicted in this artwork."
-    ],
-    ("narrative_suggestion", "formal", False, True): [
-        "Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
-        "Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
-    ],
-    ("narrative_suggestion", "formal", True, False): [
-        "Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
-        "Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
-    ],
-    ("contextual_storytelling", "informal", False, False): [
-        "Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
-        "Imagine a background story for this artwork, explaining what's happening and why."
-    ],
-    ("contextual_storytelling", "informal", False, True): [
-        "In about {word_count} words, create a backstory for the scene depicted in this image.",
-        "Summarize a possible background narrative for this artwork in {word_count} words."
-    ],
-    ("contextual_storytelling", "informal", True, False): [
-        "Write a {length} informal story that provides context to the scene portrayed in this image.",
-        "Give a {length} casual backstory explaining the events depicted in this artwork."
-    ],
-    ("style_prompt", "formal", False, False): [
-        "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
-        "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
-    ],
-    ("style_prompt", "formal", False, True): [
-        "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
-        "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
-    ],
-    ("style_prompt", "formal", True, False): [
-        "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
-        "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
-    ],
-    ("style_prompt", "informal", False, False): [
-        "Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
-        "Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
-    ],
-    ("style_prompt", "informal", False, True): [
-        "In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
-        "Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
-    ],
-    ("style_prompt", "informal", True, False): [
-        "Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
-        "Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
-    ],
 }
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
@@ -278,8 +155,8 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
         except ValueError:
             pass
-    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
-    if caption_type == "rng-tags" or caption_type == "training_prompt":
         caption_tone = "formal"
     # Build prompt
@@ -289,12 +166,9 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
     prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
-    # Add style prompt details if applicable
     if caption_type == "style_prompt":
-        prompt_str += (f" The prompt should specifically include details about using a {lens_type} lens, "
-                       f"{film_stock} film stock, {composition} composition, and {lighting} lighting. "
-                       f"Format the output as a comma-separated list of descriptors and modifiers, "
-                       f"suitable for direct input into a Stable Diffusion interface.")
     print(f"Prompt: {prompt_str}")
@@ -317,15 +191,7 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
     # Embed prompt
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
     assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
-    # Check if bos_token_id exists
-    if tokenizer.bos_token_id is None:
-        print("Warning: bos_token_id is None. Using default value of 1.")
-        bos_token_id = 1
-    else:
-        bos_token_id = tokenizer.bos_token_id
-    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
     eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
     # Construct prompts
@@ -337,7 +203,7 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
     ], dim=1)
     input_ids = torch.cat([
-        torch.tensor([[bos_token_id]], dtype=torch.long),
         torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
         prompt,
         torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
@@ -490,17 +356,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
                 run_button = gr.Button("Make My Caption!")
             with gr.Column():
-                output_caption = gr.Textbox(label="Your Image Generation Prompt (Copy this for Stable Diffusion)", lines=10)
-        gr.Markdown("""
-        ## How to Use Your Generated Prompt:
-        1. For "Style Prompt" captions, the output is formatted for direct use in Stable Diffusion.
-        2. Simply copy the entire text from the output box.
-        3. Paste it into your preferred Stable Diffusion interface or any other AI image generation platform.
-        4. Adjust or add to the prompt as desired to fine-tune your image generation.
-        Remember, you can always regenerate or modify the prompt to get different results!
-        """)
         run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition, lighting], outputs=[output_caption])

 MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
 CHECKPOINT_PATH = Path("9em124t2-499968")
 CAPTION_TYPE_MAP = {
+    ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
+    ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
+    ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
+    ("descriptive", "informal", False, False): ["Write a descriptive caption for this image in a casual tone."],
+    ("descriptive", "informal", False, True): ["Write a descriptive caption for this image in a casual tone within {word_count} words."],
+    ("descriptive", "informal", True, False): ["Write a {length} descriptive caption for this image in a casual tone."],
+    ("training_prompt", "formal", False, False): ["Write a stable diffusion prompt for this image."],
+    ("training_prompt", "formal", False, True): ["Write a stable diffusion prompt for this image within {word_count} words."],
+    ("training_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt for this image."],
+    ("rng-tags", "formal", False, False): ["Write a list of Booru tags for this image."],
+    ("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
+    ("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
+    ("style_prompt", "formal", False, False): ["Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements."],
+    ("style_prompt", "formal", False, True): ["Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image."],
+    ("style_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques."],
 }
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
         except ValueError:
             pass
+    # 'rng-tags', 'training_prompt', and 'style_prompt' don't have formal/informal tones
+    if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
         caption_tone = "formal"
     # Build prompt
     prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
     if caption_type == "style_prompt":
+        prompt_str += (f" Include details about using a {lens_type} lens, "
+                       f"{film_stock} film stock, {composition} composition, and {lighting} lighting.")
     print(f"Prompt: {prompt_str}")
     # Embed prompt
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
     assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
+    embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
     eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
     # Construct prompts
     ], dim=1)
     input_ids = torch.cat([
+        torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
         torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
         prompt,
         torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
                 run_button = gr.Button("Make My Caption!")
             with gr.Column():
+                output_caption = gr.Textbox(label="Your Amazing Caption Appears Here", lines=10)
         run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition, lighting], outputs=[output_caption])