Update app.py
app.py
CHANGED
@@ -12,147 +12,28 @@ import torchvision.transforms.functional as TVF
 
 
 CLIP_PATH = "google/siglip-so400m-patch14-384"
-MODEL_PATH = "
+MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
 CHECKPOINT_PATH = Path("9em124t2-499968")
+TITLE = "<h1><center>JoyCaption Alpha One (2024-09-20a)</center></h1>"
 CAPTION_TYPE_MAP = {
-    ("descriptive", "formal", False, False): [
-
-
-    ],
-    ("descriptive", "formal", False, True): [
-
-
-    ],
-    ("descriptive", "formal", True, False): [
-
-
-    ],
-    ("descriptive", "informal", False, False): [
-
-
-    ],
-    ("descriptive", "informal", False, True): [
-
-        "Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
-    ],
-    ("descriptive", "informal", True, False): [
-        "Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
-        "Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
-    ],
-    ("training_prompt", "formal", False, False): [
-        "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
-        "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
-    ],
-    ("training_prompt", "formal", False, True): [
-        "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
-        "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
-    ],
-    ("training_prompt", "formal", True, False): [
-        "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
-        "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
-    ],
-    ("rng-tags", "formal", False, False): [
-        "Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
-        "Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
-    ],
-    ("rng-tags", "formal", False, True): [
-        "Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
-        "Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
-    ],
-    ("rng-tags", "formal", True, False): [
-        "Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
-        "Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
-    ],
-    ("artistic_inspiration", "formal", False, False): [
-        "Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
-        "Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
-    ],
-    ("artistic_inspiration", "informal", False, False): [
-        "Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
-        "Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
-    ],
-    ("technical_breakdown", "formal", False, False): [
-        "Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
-        "Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
-    ],
-    ("emotional_response", "informal", False, False): [
-        "Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
-        "Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
-    ],
-
-    ("thematic_analysis", "formal", False, False): [
-        "Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
-        "Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
-    ],
-    ("thematic_analysis", "formal", False, True): [
-        "Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
-        "Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
-    ],
-    ("thematic_analysis", "formal", True, False): [
-        "Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
-        "Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
-    ],
-    ("stylistic_comparison", "informal", False, False): [
-        "Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
-        "Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
-    ],
-    ("stylistic_comparison", "informal", False, True): [
-        "In about {word_count} words, compare this image's style with other known art styles or artists.",
-        "Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
-    ],
-    ("stylistic_comparison", "informal", True, False): [
-        "Write a {length} casual comparison of this image's style with other art movements or famous artists.",
-        "Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
-    ],
-    ("narrative_suggestion", "formal", False, False): [
-        "Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
-        "Develop a brief storyline that complements the themes and mood depicted in this artwork."
-    ],
-    ("narrative_suggestion", "formal", False, True): [
-        "Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
-        "Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
-    ],
-    ("narrative_suggestion", "formal", True, False): [
-        "Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
-        "Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
-    ],
-    ("contextual_storytelling", "informal", False, False): [
-        "Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
-        "Imagine a background story for this artwork, explaining what's happening and why."
-    ],
-    ("contextual_storytelling", "informal", False, True): [
-        "In about {word_count} words, create a backstory for the scene depicted in this image.",
-        "Summarize a possible background narrative for this artwork in {word_count} words."
-    ],
-    ("contextual_storytelling", "informal", True, False): [
-        "Write a {length} informal story that provides context to the scene portrayed in this image.",
-        "Give a {length} casual backstory explaining the events depicted in this artwork."
-    ],
-
-    ("style_prompt", "formal", False, False): [
-        "Analyze this image through the lens of {style} art. Describe how it aligns with or diverges from {style_characteristics}.",
-        "Examine this artwork in the context of the {style} movement, focusing on {style_focus} and how these elements are represented or reinterpreted in the image."
-    ],
-    ("style_prompt", "formal", False, True): [
-        "Within {word_count} words, compare this image to the {style} style, highlighting elements that reflect or contrast with its key characteristics.",
-        "Compose a concise {word_count}-word analysis of how this artwork relates to the {style} movement, noting its adherence to or departure from typical {style} elements."
-    ],
-    ("style_prompt", "formal", True, False): [
-        "Write a {length} critique of this image, exploring its relationship to the {style} movement. Discuss composition, technique, and thematic elements in this context.",
-        "Develop a {length} analysis of how this artwork incorporates or challenges the principles of {style}, considering its visual language and artistic approach."
-    ],
-    ("style_prompt", "informal", False, False): [
-        "Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
-        "Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
-    ],
-    ("style_prompt", "informal", False, True): [
-        "In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
-        "Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
-    ],
-    ("style_prompt", "informal", True, False): [
-        "Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
-        "Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
-    ],
+    ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
+    ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
+    ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
+    ("descriptive", "informal", False, False): ["Write a descriptive caption for this image in a casual tone."],
+    ("descriptive", "informal", False, True): ["Write a descriptive caption for this image in a casual tone within {word_count} words."],
+    ("descriptive", "informal", True, False): ["Write a {length} descriptive caption for this image in a casual tone."],
+
+    ("training_prompt", "formal", False, False): ["Write a stable diffusion prompt for this image."],
+    ("training_prompt", "formal", False, True): ["Write a stable diffusion prompt for this image within {word_count} words."],
+    ("training_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt for this image."],
+
+    ("rng-tags", "formal", False, False): ["Write a list of Booru tags for this image."],
+    ("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
+    ("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
+
+    ("style_prompt", "formal", False, False): ["Generate a detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
+    ("style_prompt", "formal", False, True): ["Generate a detailed style prompt for this image within {word_count} words, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
+    ("style_prompt", "formal", True, False): ["Generate a {length} detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
 }
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
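The rewrite above flattens CAPTION_TYPE_MAP to a single template per key, where the key is (caption_type, caption_tone, isinstance(length, str), isinstance(length, int)). A minimal sketch of how that lookup resolves, using a trimmed copy of the new map; note that the committed stream_chat (next hunk) raises on non-numeric lengths, while this sketch keeps them as descriptive strings so the {length} templates stay reachable:

import torch  # not needed for the lookup itself; shown bare for clarity

# Trimmed copy of the new map above.
CAPTION_TYPE_MAP = {
    ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
    ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
    ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
}

def build_prompt(caption_type: str, caption_tone: str, caption_length) -> str:
    # "any" means no length constraint; numeric strings become ints,
    # anything else (e.g. "very long") stays a descriptive string.
    length = None if caption_length == "any" else caption_length
    if isinstance(length, str):
        try:
            length = int(length)
        except ValueError:
            pass
    key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
    return CAPTION_TYPE_MAP[key][0].format(length=length, word_count=length)

print(build_prompt("descriptive", "formal", "any"))        # (False, False) template
print(build_prompt("descriptive", "formal", "very long"))  # {length} template
print(build_prompt("descriptive", "formal", "40"))         # {word_count} template

At most one of the two booleans is true, so "any" (length None) falls through to the (False, False) template.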
@@ -257,122 +138,105 @@ text_model.eval()
 # Image Adapter
 print("Loading image adapter")
 image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
-image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
+image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=True))
 image_adapter.eval()
 image_adapter.to("cuda")
 
-# After loading the tokenizer and model
-print(f"Tokenizer class: {type(tokenizer)}")
-print(f"BOS token: {tokenizer.bos_token}")
-print(f"BOS token ID: {tokenizer.bos_token_id}")
-print(f"EOS token: {tokenizer.eos_token}")
-print(f"EOS token ID: {tokenizer.eos_token_id}")
-print(f"Text model device: {text_model.device}")
-
-# Ensure the tokenizer has the necessary special tokens
-if tokenizer.bos_token_id is None or tokenizer.eos_token_id is None:
-    print("Warning: BOS or EOS token is missing. Adding default tokens.")
-    special_tokens_dict = {}
-    if tokenizer.bos_token_id is None:
-        special_tokens_dict['bos_token'] = '<|endoftext|>'
-    if tokenizer.eos_token_id is None:
-        special_tokens_dict['eos_token'] = '<|endoftext|>'
-    num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
-    print(f"Added {num_added_tokens} special tokens to the tokenizer.")
-
-# Resize token embeddings of the model if new tokens are added
-text_model.resize_token_embeddings(len(tokenizer))
-
-@spaces.GPU()
-@torch.no_grad()
-def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
-    torch.cuda.empty_cache()
-
-    # Handle caption_length
-    length = None
-    if caption_length != "any":
-        if isinstance(caption_length, int):
-            length = caption_length
-        elif isinstance(caption_length, str):
-            try:
-                length = int(caption_length)
-            except ValueError:
-                # If it's not a number, treat it as a descriptive length
-                length = caption_length
-
-    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
-    if caption_type in ["rng-tags", "training_prompt"]:
-        caption_tone = "formal"
-
-    # Build prompt
-    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
-    if prompt_key not in CAPTION_TYPE_MAP:
-        raise ValueError(f"Invalid caption type: {prompt_key}")
-
-    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
-        length=length,
-        word_count=length,
-        style=art_style,
-        style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
-        style_focus=STYLE_FOCUS.get(art_style, "its distinctive features")
-    )
-    print(f"Prompt: {prompt_str}")
 
-
+def preprocess_image(input_image: Image.Image) -> torch.Tensor:
+    """
+    Preprocess the input image for the CLIP model.
+    """
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-
+    return pixel_values.to('cuda')
 
-    pixel_values = pixel_values.to('cuda')
+def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max_new_tokens: int = 300) -> str:
+    """
+    Generate a caption based on the image features and prompt.
+    """
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
-
-    # Embed image
-    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
-        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
-        image_features = vision_outputs.hidden_states
-        embedded_images = image_adapter(image_features)
-        embedded_images = embedded_images.to('cuda')
-
-    # Embed prompt
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
-
-
-    # Check for bos_token_id and provide a fallback
-    bos_token_id = tokenizer.bos_token_id
-    if bos_token_id is None:
-        print("Warning: bos_token_id is None. Using default value of 1.")
-        bos_token_id = 1  # Common default, but may need adjustment
-
-    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
+    embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
     eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
 
-    # Construct prompts
     inputs_embeds = torch.cat([
-        embedded_bos.expand(
-
-        prompt_embeds.expand(
-        eot_embed.expand(
+        embedded_bos.expand(image_features.shape[0], -1, -1),
+        image_features.to(dtype=embedded_bos.dtype),
+        prompt_embeds.expand(image_features.shape[0], -1, -1),
+        eot_embed.expand(image_features.shape[0], -1, -1),
     ], dim=1)
 
     input_ids = torch.cat([
-        torch.tensor([[bos_token_id]], dtype=torch.long),
-        torch.zeros((1,
+        torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
+        torch.zeros((1, image_features.shape[1]), dtype=torch.long),
         prompt,
         torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
     ], dim=1).to('cuda')
     attention_mask = torch.ones_like(input_ids)
 
-    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=
+    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, suppress_tokens=None)
 
-    # Trim off the prompt
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
 
-    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
+    return tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0].strip()
+
+@spaces.GPU()
+@torch.no_grad()
+def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "", lighting_aspect: str = "", special_technique: str = "", color_effect: str = "") -> str:
+    """
+    Generate a caption or style prompt based on the input image and parameters.
+    """
+    torch.cuda.empty_cache()
+
+    try:
+        length = None if caption_length == "any" else caption_length
+        if isinstance(length, str):
+            length = int(length)
+    except ValueError:
+        raise ValueError(f"Invalid caption length: {caption_length}")
+
+    if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
+        caption_tone = "formal"
+
+    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
+    if prompt_key not in CAPTION_TYPE_MAP:
+        raise ValueError(f"Invalid caption type: {prompt_key}")
+
+    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
+
+    if caption_type == "style_prompt":
+        prompt_str += f" Lens type: {lens_type} ({lens_types_info[lens_type]}). "
+        prompt_str += f"Film stock: {film_stock} ({film_stocks_info[film_stock]}). "
+        prompt_str += f"Composition style: {composition_style} ({composition_styles_info[composition_style]}). "
+        prompt_str += f"Lighting aspect: {lighting_aspect} ({lighting_aspects_info[lighting_aspect]}). "
+        prompt_str += f"Special technique: {special_technique} ({special_techniques_info[special_technique]}). "
+        prompt_str += f"Color effect: {color_effect} ({color_effects_info[color_effect]})."
+
+    # Debugging: Print the constructed prompt string
+    print(f"Constructed Prompt: {prompt_str}")
+
+    pixel_values = preprocess_image(input_image)
+
+    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
+        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+        image_features = vision_outputs.hidden_states
+        embedded_images = image_adapter(image_features)
+        embedded_images = embedded_images.to('cuda')
+
+    # Load the model from MODEL_PATH
+    text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
+    text_model.eval()
+
+    # Debugging: Print the prompt string before passing to generate_caption
+    print(f"Prompt passed to generate_caption: {prompt_str}")
 
-    return caption.strip()
+    caption = generate_caption(text_model, tokenizer, embedded_images, prompt_str)
+
+    return caption
 
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
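In generate_caption above, inputs_embeds and input_ids must describe the same sequence layout: the BOS embedding, the image-adapter tokens, the prompt embeddings, then the learned <|eot_id|> embedding, with zero ids standing in for the image positions. A shape-only sketch with dummy tensors (the 38-token image length mirrors the adapter's constructor argument, which is an assumption, not something the diff states):

import torch

hidden, n_img, n_prompt = 16, 38, 5            # dummy hidden size / image tokens / prompt tokens
embedded_bos = torch.zeros(1, 1, hidden)       # embedding of the BOS token
image_tokens = torch.zeros(1, n_img, hidden)   # output of the image adapter
prompt_embeds = torch.zeros(1, n_prompt, hidden)
eot_embed = torch.zeros(1, 1, hidden)          # learned <|eot_id|> embedding

inputs_embeds = torch.cat([embedded_bos, image_tokens, prompt_embeds, eot_embed], dim=1)

# input_ids mirrors the layout; the zeros are placeholders for the image
# slots and exist only so the two sequences have matching lengths.
input_ids = torch.cat([
    torch.tensor([[1]], dtype=torch.long),         # bos_token_id (dummy value)
    torch.zeros((1, n_img), dtype=torch.long),     # image positions
    torch.zeros((1, n_prompt), dtype=torch.long),  # prompt token ids (dummy)
    torch.tensor([[2]], dtype=torch.long),         # <|eot_id|> (dummy value)
], dim=1)

assert inputs_embeds.shape[1] == input_ids.shape[1] == 1 + n_img + n_prompt + 1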
@@ -392,63 +256,110 @@ ul, ol {
 }
 """
 
-
-
-    "
-    "
-    "
-
-
-
-    "
-    "
-    "
-
-
-
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
+# Add detailed descriptions for each option
+lens_types_info = {
+    "Standard": "A versatile lens with a field of view similar to human vision.",
+    "Wide-angle": "Captures a wider field of view, great for landscapes and architecture. Applies moderate to strong lens effect with image warp.",
+    "Telephoto": "Used for distant subjects, gives an 'award-winning' or 'National Geographic' look. Creates interesting effects when prompted.",
+    "Macro": "For extreme close-up photography, revealing tiny details.",
+    "Fish-eye": "Ultra-wide-angle lens that creates a strong bubble-like distortion. Generates panoramic photos with the entire image warping into a bubble.",
+    "Tilt-shift": "Allows adjusting the plane of focus, creating a 'miniature' effect. Known for the 'diorama miniature look'.",
+    "Zoom lens": "Variable focal length lens. Often zooms in on the subject, perfect for creating a base for inpainting. Interesting effect on landscapes with motion blur.",
+    "GoPro": "Wide-angle lens with clean digital look. Excludes film grain and most filter effects, resulting in natural colors and regular saturation.",
+    "Pinhole camera": "Creates a unique, foggy, low-detail, historic photograph look. Used since the 1850s, with peak popularity in the 1930s."
+}
+
+film_stocks_info = {
+    "Kodak Portra": "Professional color negative film known for its natural skin tones and low contrast.",
+    "Fujifilm Velvia": "Slide film known for vibrant colors and high saturation, popular among landscape photographers.",
+    "Ilford Delta": "Black and white film known for its fine grain and high sharpness.",
+    "Kodak Tri-X": "Classic high-speed black and white film, known for its distinctive grain and wide exposure latitude.",
+    "Fujifilm Provia": "Color reversal film known for its natural color reproduction and fine grain.",
+    "Cinestill": "Color photos with fine/low grain and higher than average resolution. Colors are slightly oversaturated or slightly desaturated.",
+    "Ektachrome": "Color photos with fine/low to moderate grain. Colors on the colder part of spectrum or regular, with normal or slightly higher saturation.",
+    "Ektar": "Modern Kodak film. Color photos with little to no grain. Results look like regular modern photography with artistic angles.",
+    "Film Washi": "Mostly black and white photos with fine/low to moderate grain. Occasionally gives colored photos with low saturation. Distinct style with high black contrast and soft camera lens effect.",
+    "Fomapan": "Black and white photos with fine/low to moderate grain, highly artistic exposure and angles. Adds very soft lens effect without distortion, dark photo vignette.",
+    "Fujicolor": "Color photos with fine/low to moderate grain. Colors are slightly or notably desaturated, with entire color hue shifted in a very distinct manner.",
+    "Holga": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
+    "Instax": "Instant color photos similar to Polaroid but clearer. Near perfect colors, regular saturation, fine/low to medium grain.",
+    "Lomography": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
+    "Kodachrome": "Color photos with moderate grain. Colors on either colder part of spectrum or regular, with normal or slightly higher saturation.",
+    "Rollei": "Mostly black and white photos, sometimes color with fine/low grain. Can be sepia colored or have unusual hues and desaturation. Great for landscapes."
+}
+
+composition_styles_info = {
+    "Rule of Thirds": "Divides the frame into a 3x3 grid, placing key elements along the lines or at their intersections.",
+    "Golden Ratio": "Uses a spiral based on the golden ratio to create a balanced and aesthetically pleasing composition.",
+    "Symmetry": "Creates a mirror-like balance in the image, often used for architectural or nature photography.",
+    "Leading Lines": "Uses lines within the frame to draw the viewer's eye to the main subject or through the image.",
+    "Framing": "Uses elements within the scene to create a frame around the main subject.",
+    "Minimalism": "Simplifies the composition to its essential elements, often with a lot of negative space.",
+    "Fill the Frame": "The main subject dominates the entire frame, leaving little to no background.",
+    "Negative Space": "Uses empty space around the subject to create a sense of simplicity or isolation.",
+    "Centered Composition": "Places the main subject in the center of the frame, creating a sense of stability or importance.",
+    "Diagonal Lines": "Uses diagonal elements to create a sense of movement or dynamic tension in the image.",
+    "Triangular Composition": "Arranges elements in the frame to form a triangle, creating a sense of stability and harmony.",
+    "Radial Balance": "Arranges elements in a circular pattern around a central point, creating a sense of movement or completeness."
 }
 
-
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "Fauvism": "emphasizing bold colors and emotional content",
-    "Art Deco": "incorporating geometric shapes and modern aesthetics",
-    "Futurism": "depicting speed, technology, and dynamism",
-    "Dadaism": "subverting traditional art and exploring absurdity",
-    "Pointillism": "capturing light and color through small dots",
-    "Rococo": "creating lighthearted and decorative compositions",
-    "Neoclassicism": "achieving balance and symmetry in classical style"
+lighting_aspects_info = {
+    "Natural light": "Uses available light from the sun or sky, often creating soft, even illumination.",
+    "Studio lighting": "Controlled artificial lighting setup, allowing for precise manipulation of light and shadow.",
+    "Back light": "Light source behind the subject, creating silhouettes or rim lighting effects.",
+    "Split light": "Strong light source at 90-degree angle, lighting one half of the subject while leaving the other in shadow.",
+    "Broad light": "Light source at an angle to the subject, producing well-lit photographs with soft to moderate shadows.",
+    "Dim light": "Weak or distant light source, creating lower than average brightness and often dramatic images.",
+    "Flash photography": "Uses a brief, intense burst of light. Can be fill flash (even lighting) or harsh flash (strong contrasts).",
+    "Sunlight": "Direct light from the sun, often creating strong contrasts and warm tones.",
+    "Moonlight": "Soft, cool light from the moon, often creating a mysterious or romantic atmosphere.",
+    "Spotlight": "Focused beam of light illuminating a specific area, creating high contrast between light and shadow.",
+    "High-key lighting": "Bright, even lighting with minimal shadows, creating a light and airy feel.",
+    "Low-key lighting": "Predominantly dark tones with selective lighting, creating a moody or dramatic atmosphere.",
+    "Rembrandt lighting": "Classic portrait lighting technique creating a triangle of light on the cheek of the subject."
 }
 
+special_techniques_info = {
+    "Double exposure": "Superimposes two exposures to create a single image, often resulting in a dreamy or surreal effect.",
+    "Long exposure": "Uses a long shutter speed to capture motion over time, often creating smooth, blurred effects for moving elements.",
+    "Multiple exposure": "Superimposes multiple exposures, multiplying the subject or its key elements across the image.",
+    "HDR": "High Dynamic Range imaging, combining multiple exposures to capture a wider range of light and dark tones.",
+    "Bokeh effect": "Creates a soft, out-of-focus background, often with circular highlights.",
+    "Silhouette": "Captures the outline of a subject against a brighter background, creating a dramatic contrast.",
+    "Panning": "Follows a moving subject with the camera, creating a sharp subject with a blurred background.",
+    "Light painting": "Uses long exposure and moving light sources to 'paint' with light in the image.",
+    "Infrared photography": "Captures light in the infrared spectrum, often resulting in surreal, otherworldly images.",
+    "Ultraviolet photography": "Captures light in the ultraviolet spectrum, often revealing hidden patterns or creating a strong violet glow.",
+    "Kirlian photography": "High-voltage photographic technique that captures corona discharges around objects, creating a glowing effect.",
+    "Thermography": "Captures infrared radiation to create images based on temperature differences, resulting in false-color heat maps.",
+    "Astrophotography": "Specialized technique for capturing astronomical objects and celestial events, often resulting in stunning starry backgrounds.",
+    "Underwater photography": "Captures images beneath the surface of water, often in pools, seas, or aquariums.",
+    "Aerial photography": "Captures images from an elevated position, such as from drones, helicopters, or planes.",
+    "Macro photography": "Extreme close-up photography, revealing tiny details not visible to the naked eye."
+}
+
+color_effects_info = {
+    "Black and white": "Removes all color, leaving only shades of gray.",
+    "Sepia": "Reddish-brown monochrome effect, often associated with vintage photography.",
+    "Monochrome": "Uses variations of a single color.",
+    "Vintage color": "Muted or faded color palette reminiscent of old photographs.",
+    "Cross-processed": "Deliberate processing of film in the wrong chemicals, creating unusual color shifts.",
+    "Desaturated": "Reduces the intensity of all colors in the image.",
+    "Vivid colors": "Increases the saturation and intensity of colors.",
+    "Pastel colors": "Soft, pale colors with a light and airy feel.",
+    "High contrast": "Emphasizes the difference between light and dark areas in the image.",
+    "Low contrast": "Reduces the difference between light and dark areas, creating a softer look.",
+    "Color splash": "Converts most of the image to black and white while leaving one or more elements in color."
+}
+
+def get_dropdown_choices(info_dict):
+    return [f"{key}: {value}" for key, value in info_dict.items()]
+
+# Gradio interface
 with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
     with gr.Tab("Welcome"):
         gr.Markdown(
-
+            """
             <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
 
             # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
@@ -456,7 +367,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
             ## Accelerate Your Creative Workflow with Intelligent Image Analysis
 
             This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
-            training prompts,
+            training prompts, and tags from existing artwork, fueling the creative process for GenAI models.
 
             ## 🚀 How It Works:
             1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
@@ -465,109 +376,147 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
             4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
             """
         )
-
+
     with gr.Tab("JoyCaption"):
-        gr.
-
+        with gr.Accordion("How to Use JoyCaption", open=False):
+            gr.Markdown("""
+            # How to Use JoyCaption
+
+            Hello, artist! Let's make some fun captions for your pictures. Here's how:
+
+            1. **Pick a Picture**: Find a cool picture you want to talk about and upload it.
+
+            2. **Choose What You Want**:
+               - **Caption Type**:
+                 * "Descriptive" tells you what's in the picture
+                 * "Training Prompt" helps computers make similar pictures
+                 * "RNG-Tags" gives you short words about the picture
+                 * "Style Prompt" creates detailed prompts for image generation
 
-
+            3. **Pick a Style** (for "Descriptive" and "Style Prompt" only):
+               - "Formal" sounds like a teacher talking
+               - "Informal" sounds like a friend chatting
 
-
-
-
-
-
+            4. **Decide How Long**:
+               - "Any" lets the computer decide
+               - Or pick a size from "very short" to "very long"
+               - You can even choose a specific number of words!
+
+            5. **Advanced Options** (for "Style Prompt" only):
+               - Choose lens type, film stock, composition, and lighting details
+
+            6. **Make the Caption**: Click the "Make My Caption!" button and watch the magic happen!
+
+            Remember, have fun and be creative with your captions!
+
+            ## Tips for Great Captions:
+            - Try different types to see what you like best
+            - Experiment with formal and informal tones for fun variations
+            - Adjust the length to get just the right amount of detail
+            - For "Style Prompt", play with the advanced options for more specific results
+            - If you don't like a caption, just click "Make My Caption!" again for a new one
+
+            Have a great time captioning your art!
+            """)
 
         with gr.Row():
-            with gr.Column(
-                input_image = gr.Image(type="pil", label="
+            with gr.Column():
+                input_image = gr.Image(type="pil", label="Input Image")
 
                 caption_type = gr.Dropdown(
-                    choices=[
-                        "descriptive",
-                        "training_prompt",
-                        "rng-tags",
-                        "thematic_analysis",
-                        "stylistic_comparison",
-                        "narrative_suggestion",
-                        "contextual_storytelling",
-                        "style_prompt"
-                    ],
-                    label="Output Type",
+                    choices=["descriptive", "training_prompt", "rng-tags", "style_prompt"],
+                    label="Caption Type",
                     value="descriptive",
                 )
 
-                gr.Markdown("""
-                ### Output Types Explained:
-                - **Descriptive**: A general description of the image
-                - **Training Prompt**: A prompt for AI image generation
-                - **RNG-Tags**: Tags for categorizing the image
-                - **Thematic Analysis**: Exploration of themes in the image
-                - **Stylistic Comparison**: Compares the image to art styles
-                - **Narrative Suggestion**: A story idea based on the image
-                - **Contextual Storytelling**: A background story for the image
-                - **Style Prompt**: Analyzes the image in context of a specific art style
-                """)
-
                 caption_tone = gr.Dropdown(
                     choices=["formal", "informal"],
-                    label="Tone",
+                    label="Caption Tone",
                     value="formal",
                 )
 
-                gr.Markdown("Choose between a formal (professional) or informal (casual) tone for the output.")
-
                 caption_length = gr.Dropdown(
                     choices=["any", "very short", "short", "medium-length", "long", "very long"] +
                             [str(i) for i in range(20, 261, 10)],
-                    label="Length",
+                    label="Caption Length",
                     value="any",
                 )
 
-                gr.
-
-
-
-
-                """)
-
-                art_style = gr.Dropdown(
-                    choices=ART_STYLES,
-                    label="Art Style (for Style Prompt)",
-                    value="Impressionism",
-                    visible=False
+                lens_type = gr.Dropdown(
+                    choices=get_dropdown_choices(lens_types_info),
+                    label="Lens Type",
+                    visible=False,
+                    info="Select a lens type to define the perspective and field of view of the image."
                 )
 
-                gr.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                film_stock = gr.Dropdown(
+                    choices=get_dropdown_choices(film_stocks_info),
+                    label="Film Stock",
+                    visible=False,
+                    info="Choose a film stock to determine the color, grain, and overall look of the image."
+                )
+
+                composition_style = gr.Dropdown(
+                    choices=get_dropdown_choices(composition_styles_info),
+                    label="Composition Style",
+                    visible=False,
+                    info="Select a composition style to guide the arrangement of elements in the image."
+                )
+
+                lighting_aspect = gr.Dropdown(
+                    choices=get_dropdown_choices(lighting_aspects_info),
+                    label="Lighting Aspect",
+                    visible=False,
+                    info="Choose a lighting style to define the mood and atmosphere of the image."
+                )
+
+                special_technique = gr.Dropdown(
+                    choices=get_dropdown_choices(special_techniques_info),
+                    label="Special Technique",
+                    visible=False,
+                    info="Select a special photographic technique to add unique effects to the image."
+                )
+
+                color_effect = gr.Dropdown(
+                    choices=get_dropdown_choices(color_effects_info),
+                    label="Color Effect",
+                    visible=False,
+                    info="Choose a color effect to alter the overall color palette of the image."
+                )
+
+                gr.Markdown("**Note:** Caption tone doesn't affect `rng-tags`, `training_prompt`, and `style_prompt`.")
+
+                run_button = gr.Button("Make My Caption!")
+
+            with gr.Column():
+                output_caption = gr.Textbox(label="Generated Caption")
+
+        # Container for advanced options
+        advanced_options = gr.Column(visible=False)
+        with advanced_options:
+            gr.Markdown("### Advanced Options for Style Prompt")
+            lens_type.render()
+            film_stock.render()
+            composition_style.render()
+            lighting_aspect.render()
+            special_technique.render()
+            color_effect.render()
+
+        def update_style_options(caption_type):
+            return {
+                lens_type: gr.update(visible=caption_type == "style_prompt"),
+                film_stock: gr.update(visible=caption_type == "style_prompt"),
+                composition_style: gr.update(visible=caption_type == "style_prompt"),
+                lighting_aspect: gr.update(visible=caption_type == "style_prompt"),
+                special_technique: gr.update(visible=caption_type == "style_prompt"),
+                color_effect: gr.update(visible=caption_type == "style_prompt"),
+                advanced_options: gr.update(visible=caption_type == "style_prompt"),
+            }
+
+        caption_type.change(update_style_options, inputs=[caption_type], outputs=[lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect, advanced_options])
+
+        run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect], outputs=[output_caption])
 
-                generate_button.click(
-                    fn=stream_chat,
-                    inputs=[input_image, caption_type, caption_tone, caption_length, art_style],
-                    outputs=[output_caption]
-                )
 
 if __name__ == "__main__":
     demo.launch()
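The UI wiring in the last hunk relies on Gradio's dict-return pattern: update_style_options returns {component: gr.update(visible=...)} and caption_type.change lists those same components as outputs. A stripped-down sketch of the pattern with hypothetical component names (assumes a reasonably recent gradio):

import gradio as gr

with gr.Blocks() as sketch:
    kind = gr.Dropdown(choices=["descriptive", "style_prompt"], value="descriptive", label="Caption Type")
    lens = gr.Dropdown(choices=["Standard", "Macro"], visible=False, label="Lens Type")

    def toggle(kind_value):
        # Returning a dict of {component: gr.update(...)} updates only the
        # named components; outputs= must list those same components.
        return {lens: gr.update(visible=kind_value == "style_prompt")}

    kind.change(toggle, inputs=[kind], outputs=[lens])

if __name__ == "__main__":
    sketch.launch()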