Test-Caption-Captain

Sleeping

App Files Files Community

Severian committed on Sep 26, 2024

Commit

936fabc

verified ·

1 Parent(s): 368e071

Update app.py

Browse files

Files changed (1) hide show

app.py +358 -307

app.py CHANGED Viewed

@@ -12,28 +12,147 @@ import torchvision.transforms.functional as TVF
 CLIP_PATH = "google/siglip-so400m-patch14-384"
-MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
 CHECKPOINT_PATH = Path("9em124t2-499968")
-TITLE = "<h1><center>JoyCaption Alpha One (2024-09-20a)</center></h1>"
 CAPTION_TYPE_MAP = {
-	("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
-	("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
-	("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
-	("descriptive", "informal", False, False): ["Write a descriptive caption for this image in a casual tone."],
-	("descriptive", "informal", False, True): ["Write a descriptive caption for this image in a casual tone within {word_count} words."],
-	("descriptive", "informal", True, False): ["Write a {length} descriptive caption for this image in a casual tone."],
-	("training_prompt", "formal", False, False): ["Write a stable diffusion prompt for this image."],
-	("training_prompt", "formal", False, True): ["Write a stable diffusion prompt for this image within {word_count} words."],
-	("training_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt for this image."],
-	("rng-tags", "formal", False, False): ["Write a list of Booru tags for this image."],
-	("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
-	("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
-	("style_prompt", "formal", False, False): ["Generate a detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
-	("style_prompt", "formal", False, True): ["Generate a detailed style prompt for this image within {word_count} words, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
-	("style_prompt", "formal", True, False): ["Generate a {length} detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
 }
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
@@ -138,105 +257,122 @@ text_model.eval()
 # Image Adapter
 print("Loading image adapter")
 image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
-image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=True))
 image_adapter.eval()
 image_adapter.to("cuda")
-def preprocess_image(input_image: Image.Image) -> torch.Tensor:
-    """
-    Preprocess the input image for the CLIP model.
-    """
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-    return pixel_values.to('cuda')
-def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max_new_tokens: int = 300) -> str:
-    """
-    Generate a caption based on the image features and prompt.
-    """
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
-    embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
     eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
     inputs_embeds = torch.cat([
-        embedded_bos.expand(image_features.shape[0], -1, -1),
-        image_features.to(dtype=embedded_bos.dtype),
-        prompt_embeds.expand(image_features.shape[0], -1, -1),
-        eot_embed.expand(image_features.shape[0], -1, -1),
     ], dim=1)
     input_ids = torch.cat([
-        torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
-        torch.zeros((1, image_features.shape[1]), dtype=torch.long),
         prompt,
         torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
     ], dim=1).to('cuda')
     attention_mask = torch.ones_like(input_ids)
-    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, suppress_tokens=None)
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
-    return tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0].strip()
-@spaces.GPU()
-@torch.no_grad()
-def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "", lighting_aspect: str = "", special_technique: str = "", color_effect: str = "") -> str:
-    """
-    Generate a caption or style prompt based on the input image and parameters.
-    """
-    torch.cuda.empty_cache()
-    try:
-        length = None if caption_length == "any" else caption_length
-        if isinstance(length, str):
-            length = int(length)
-    except ValueError:
-        raise ValueError(f"Invalid caption length: {caption_length}")
-    if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
-        caption_tone = "formal"
-    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
-    if prompt_key not in CAPTION_TYPE_MAP:
-        raise ValueError(f"Invalid caption type: {prompt_key}")
-    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
-    if caption_type == "style_prompt":
-        prompt_str += f" Lens type: {lens_type} ({lens_types_info[lens_type]}). "
-        prompt_str += f"Film stock: {film_stocks_info[film_stock]}). "
-        prompt_str += f"Composition style: {composition_styles_info[composition_style]}). "
-        prompt_str += f"Lighting aspect: {lighting_aspects_info[lighting_aspect]}). "
-        prompt_str += f"Special technique: {special_techniques_info[special_technique]}). "
-        prompt_str += f"Color effect: {color_effects_info[color_effect]})."
-    # Debugging: Print the constructed prompt string
-    print(f"Constructed Prompt: {prompt_str}")
-    pixel_values = preprocess_image(input_image)
-    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
-        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
-        image_features = vision_outputs.hidden_states
-        embedded_images = image_adapter(image_features)
-        embedded_images = embedded_images.to('cuda')
-    # Load the model from MODEL_PATH
-    text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
-    text_model.eval()
-    # Debugging: Print the prompt string before passing to generate_caption
-    print(f"Prompt passed to generate_caption: {prompt_str}")
-    caption = generate_caption(text_model, tokenizer, embedded_images, prompt_str)
-    return caption
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
@@ -256,110 +392,63 @@ ul, ol {
 }
 """
-# Add detailed descriptions for each option
-lens_types_info = {
-    "Standard": "A versatile lens with a field of view similar to human vision.",
-    "Wide-angle": "Captures a wider field of view, great for landscapes and architecture. Applies moderate to strong lens effect with image warp.",
-    "Telephoto": "Used for distant subjects, gives an 'award-winning' or 'National Geographic' look. Creates interesting effects when prompted.",
-    "Macro": "For extreme close-up photography, revealing tiny details.",
-    "Fish-eye": "Ultra-wide-angle lens that creates a strong bubble-like distortion. Generates panoramic photos with the entire image warping into a bubble.",
-    "Tilt-shift": "Allows adjusting the plane of focus, creating a 'miniature' effect. Known for the 'diorama miniature look'.",
-    "Zoom lens": "Variable focal length lens. Often zooms in on the subject, perfect for creating a base for inpainting. Interesting effect on landscapes with motion blur.",
-    "GoPro": "Wide-angle lens with clean digital look. Excludes film grain and most filter effects, resulting in natural colors and regular saturation.",
-    "Pinhole camera": "Creates a unique, foggy, low-detail, historic photograph look. Used since the 1850s, with peak popularity in the 1930s."
-}
-film_stocks_info = {
-    "Kodak Portra": "Professional color negative film known for its natural skin tones and low contrast.",
-    "Fujifilm Velvia": "Slide film known for vibrant colors and high saturation, popular among landscape photographers.",
-    "Ilford Delta": "Black and white film known for its fine grain and high sharpness.",
-    "Kodak Tri-X": "Classic high-speed black and white film, known for its distinctive grain and wide exposure latitude.",
-    "Fujifilm Provia": "Color reversal film known for its natural color reproduction and fine grain.",
-    "Cinestill": "Color photos with fine/low grain and higher than average resolution. Colors are slightly oversaturated or slightly desaturated.",
-    "Ektachrome": "Color photos with fine/low to moderate grain. Colors on the colder part of spectrum or regular, with normal or slightly higher saturation.",
-    "Ektar": "Modern Kodak film. Color photos with little to no grain. Results look like regular modern photography with artistic angles.",
-    "Film Washi": "Mostly black and white photos with fine/low to moderate grain. Occasionally gives colored photos with low saturation. Distinct style with high black contrast and soft camera lens effect.",
-    "Fomapan": "Black and white photos with fine/low to moderate grain, highly artistic exposure and angles. Adds very soft lens effect without distortion, dark photo vignette.",
-    "Fujicolor": "Color photos with fine/low to moderate grain. Colors are slightly or notably desaturated, with entire color hue shifted in a very distinct manner.",
-    "Holga": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
-    "Instax": "Instant color photos similar to Polaroid but clearer. Near perfect colors, regular saturation, fine/low to medium grain.",
-    "Lomography": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
-    "Kodachrome": "Color photos with moderate grain. Colors on either colder part of spectrum or regular, with normal or slightly higher saturation.",
-    "Rollei": "Mostly black and white photos, sometimes color with fine/low grain. Can be sepia colored or have unusual hues and desaturation. Great for landscapes."
-}
-composition_styles_info = {
-    "Rule of Thirds": "Divides the frame into a 3x3 grid, placing key elements along the lines or at their intersections.",
-    "Golden Ratio": "Uses a spiral based on the golden ratio to create a balanced and aesthetically pleasing composition.",
-    "Symmetry": "Creates a mirror-like balance in the image, often used for architectural or nature photography.",
-    "Leading Lines": "Uses lines within the frame to draw the viewer's eye to the main subject or through the image.",
-    "Framing": "Uses elements within the scene to create a frame around the main subject.",
-    "Minimalism": "Simplifies the composition to its essential elements, often with a lot of negative space.",
-    "Fill the Frame": "The main subject dominates the entire frame, leaving little to no background.",
-    "Negative Space": "Uses empty space around the subject to create a sense of simplicity or isolation.",
-    "Centered Composition": "Places the main subject in the center of the frame, creating a sense of stability or importance.",
-    "Diagonal Lines": "Uses diagonal elements to create a sense of movement or dynamic tension in the image.",
-    "Triangular Composition": "Arranges elements in the frame to form a triangle, creating a sense of stability and harmony.",
-    "Radial Balance": "Arranges elements in a circular pattern around a central point, creating a sense of movement or completeness."
 }
-lighting_aspects_info = {
-    "Natural light": "Uses available light from the sun or sky, often creating soft, even illumination.",
-    "Studio lighting": "Controlled artificial lighting setup, allowing for precise manipulation of light and shadow.",
-    "Back light": "Light source behind the subject, creating silhouettes or rim lighting effects.",
-    "Split light": "Strong light source at 90-degree angle, lighting one half of the subject while leaving the other in shadow.",
-    "Broad light": "Light source at an angle to the subject, producing well-lit photographs with soft to moderate shadows.",
-    "Dim light": "Weak or distant light source, creating lower than average brightness and often dramatic images.",
-    "Flash photography": "Uses a brief, intense burst of light. Can be fill flash (even lighting) or harsh flash (strong contrasts).",
-    "Sunlight": "Direct light from the sun, often creating strong contrasts and warm tones.",
-    "Moonlight": "Soft, cool light from the moon, often creating a mysterious or romantic atmosphere.",
-    "Spotlight": "Focused beam of light illuminating a specific area, creating high contrast between light and shadow.",
-    "High-key lighting": "Bright, even lighting with minimal shadows, creating a light and airy feel.",
-    "Low-key lighting": "Predominantly dark tones with selective lighting, creating a moody or dramatic atmosphere.",
-    "Rembrandt lighting": "Classic portrait lighting technique creating a triangle of light on the cheek of the subject."
 }
-special_techniques_info = {
-    "Double exposure": "Superimposes two exposures to create a single image, often resulting in a dreamy or surreal effect.",
-    "Long exposure": "Uses a long shutter speed to capture motion over time, often creating smooth, blurred effects for moving elements.",
-    "Multiple exposure": "Superimposes multiple exposures, multiplying the subject or its key elements across the image.",
-    "HDR": "High Dynamic Range imaging, combining multiple exposures to capture a wider range of light and dark tones.",
-    "Bokeh effect": "Creates a soft, out-of-focus background, often with circular highlights.",
-    "Silhouette": "Captures the outline of a subject against a brighter background, creating a dramatic contrast.",
-    "Panning": "Follows a moving subject with the camera, creating a sharp subject with a blurred background.",
-    "Light painting": "Uses long exposure and moving light sources to 'paint' with light in the image.",
-    "Infrared photography": "Captures light in the infrared spectrum, often resulting in surreal, otherworldly images.",
-    "Ultraviolet photography": "Captures light in the ultraviolet spectrum, often revealing hidden patterns or creating a strong violet glow.",
-    "Kirlian photography": "High-voltage photographic technique that captures corona discharges around objects, creating a glowing effect.",
-    "Thermography": "Captures infrared radiation to create images based on temperature differences, resulting in false-color heat maps.",
-    "Astrophotography": "Specialized technique for capturing astronomical objects and celestial events, often resulting in stunning starry backgrounds.",
-    "Underwater photography": "Captures images beneath the surface of water, often in pools, seas, or aquariums.",
-    "Aerial photography": "Captures images from an elevated position, such as from drones, helicopters, or planes.",
-    "Macro photography": "Extreme close-up photography, revealing tiny details not visible to the naked eye."
-}
-color_effects_info = {
-    "Black and white": "Removes all color, leaving only shades of gray.",
-    "Sepia": "Reddish-brown monochrome effect, often associated with vintage photography.",
-    "Monochrome": "Uses variations of a single color.",
-    "Vintage color": "Muted or faded color palette reminiscent of old photographs.",
-    "Cross-processed": "Deliberate processing of film in the wrong chemicals, creating unusual color shifts.",
-    "Desaturated": "Reduces the intensity of all colors in the image.",
-    "Vivid colors": "Increases the saturation and intensity of colors.",
-    "Pastel colors": "Soft, pale colors with a light and airy feel.",
-    "High contrast": "Emphasizes the difference between light and dark areas in the image.",
-    "Low contrast": "Reduces the difference between light and dark areas, creating a softer look.",
-    "Color splash": "Converts most of the image to black and white while leaving one or more elements in color."
-}
-def get_dropdown_choices(info_dict):
-    return [f"{key}: {value}" for key, value in info_dict.items()]
-# Gradio interface
 with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
     with gr.Tab("Welcome"):
         gr.Markdown(
-            """
             <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
             # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
@@ -367,7 +456,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
             ## Accelerate Your Creative Workflow with Intelligent Image Analysis
             This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
-            training prompts, and tags from existing artwork, fueling the creative process for GenAI models.
             ## 🚀 How It Works:
             1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
@@ -376,147 +465,109 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
             4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
             """
         )
     with gr.Tab("JoyCaption"):
-        with gr.Accordion("How to Use JoyCaption", open=False):
-            gr.Markdown("""
-            # How to Use JoyCaption
-            Hello, artist! Let's make some fun captions for your pictures. Here's how:
-            1. **Pick a Picture**: Find a cool picture you want to talk about and upload it.
-            2. **Choose What You Want**:
-               - **Caption Type**:
-                 * "Descriptive" tells you what's in the picture
-                 * "Training Prompt" helps computers make similar pictures
-                 * "RNG-Tags" gives you short words about the picture
-                 * "Style Prompt" creates detailed prompts for image generation
-            3. **Pick a Style** (for "Descriptive" and "Style Prompt" only):
-               - "Formal" sounds like a teacher talking
-               - "Informal" sounds like a friend chatting
-            4. **Decide How Long**:
-               - "Any" lets the computer decide
-               - Or pick a size from "very short" to "very long"
-               - You can even choose a specific number of words!
-            5. **Advanced Options** (for "Style Prompt" only):
-               - Choose lens type, film stock, composition, and lighting details
-            6. **Make the Caption**: Click the "Make My Caption!" button and watch the magic happen!
-            Remember, have fun and be creative with your captions!
-            ## Tips for Great Captions:
-            - Try different types to see what you like best
-            - Experiment with formal and informal tones for fun variations
-            - Adjust the length to get just the right amount of detail
-            - For "Style Prompt", play with the advanced options for more specific results
-            - If you don't like a caption, just click "Make My Caption!" again for a new one
-            Have a great time captioning your art!
-            """)
         with gr.Row():
-            with gr.Column():
-                input_image = gr.Image(type="pil", label="Input Image")
                 caption_type = gr.Dropdown(
-                    choices=["descriptive", "training_prompt", "rng-tags", "style_prompt"],
-                    label="Caption Type",
                     value="descriptive",
                 )
                 caption_tone = gr.Dropdown(
                     choices=["formal", "informal"],
-                    label="Caption Tone",
                     value="formal",
                 )
                 caption_length = gr.Dropdown(
                     choices=["any", "very short", "short", "medium-length", "long", "very long"] +
                             [str(i) for i in range(20, 261, 10)],
-                    label="Caption Length",
                     value="any",
                 )
-                lens_type = gr.Dropdown(
-                    choices=get_dropdown_choices(lens_types_info),
-                    label="Lens Type",
-                    visible=False,
-                    info="Select a lens type to define the perspective and field of view of the image."
-                )
-                film_stock = gr.Dropdown(
-                    choices=get_dropdown_choices(film_stocks_info),
-                    label="Film Stock",
-                    visible=False,
-                    info="Choose a film stock to determine the color, grain, and overall look of the image."
-                )
-                composition_style = gr.Dropdown(
-                    choices=get_dropdown_choices(composition_styles_info),
-                    label="Composition Style",
-                    visible=False,
-                    info="Select a composition style to guide the arrangement of elements in the image."
                 )
-                lighting_aspect = gr.Dropdown(
-                    choices=get_dropdown_choices(lighting_aspects_info),
-                    label="Lighting Aspect",
-                    visible=False,
-                    info="Choose a lighting style to define the mood and atmosphere of the image."
-                )
-                special_technique = gr.Dropdown(
-                    choices=get_dropdown_choices(special_techniques_info),
-                    label="Special Technique",
-                    visible=False,
-                    info="Select a special photographic technique to add unique effects to the image."
-                )
-                color_effect = gr.Dropdown(
-                    choices=get_dropdown_choices(color_effects_info),
-                    label="Color Effect",
-                    visible=False,
-                    info="Choose a color effect to alter the overall color palette of the image."
-                )
-                gr.Markdown("**Note:** Caption tone doesn't affect `rng-tags`, `training_prompt`, and `style_prompt`.")
-                run_button = gr.Button("Make My Caption!")
-            with gr.Column():
-                output_caption = gr.Textbox(label="Generated Caption")
-                # Container for advanced options
-                advanced_options = gr.Column(visible=False)
-                with advanced_options:
-                    gr.Markdown("### Advanced Options for Style Prompt")
-                    lens_type.render()
-                    film_stock.render()
-                    composition_style.render()
-                    lighting_aspect.render()
-                    special_technique.render()
-                    color_effect.render()
-    def update_style_options(caption_type):
-        return {
-            lens_type: gr.update(visible=caption_type == "style_prompt"),
-            film_stock: gr.update(visible=caption_type == "style_prompt"),
-            composition_style: gr.update(visible=caption_type == "style_prompt"),
-            lighting_aspect: gr.update(visible=caption_type == "style_prompt"),
-            special_technique: gr.update(visible=caption_type == "style_prompt"),
-            color_effect: gr.update(visible=caption_type == "style_prompt"),
-            advanced_options: gr.update(visible=caption_type == "style_prompt"),
-        }
-    caption_type.change(update_style_options, inputs=[caption_type], outputs=[lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect, advanced_options])
-    run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect], outputs=[output_caption])
 if __name__ == "__main__":
     demo.launch()

 CLIP_PATH = "google/siglip-so400m-patch14-384"
+MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
 CHECKPOINT_PATH = Path("9em124t2-499968")
 CAPTION_TYPE_MAP = {
+    ("descriptive", "formal", False, False): [
+        "Write a detailed, formal description of this image, focusing on composition, style, and artistic elements.",
+        "Provide a comprehensive, academic analysis of this artwork's visual characteristics and techniques."
+    ],
+    ("descriptive", "formal", False, True): [
+        "Craft a formal, concise description of this image within {word_count} words, highlighting key visual elements.",
+        "Summarize the artwork's main features and style in a formal tone, using no more than {word_count} words."
+    ],
+    ("descriptive", "formal", True, False): [
+        "Compose a {length} formal critique of this image, discussing its artistic merits and visual impact.",
+        "Create a {length} scholarly description of this artwork, analyzing its composition and aesthetic qualities."
+    ],
+    ("descriptive", "informal", False, False): [
+        "Describe this image as if you're explaining it to a friend, focusing on what stands out to you.",
+        "Give a casual, conversational rundown of what you see in this artwork and how it makes you feel."
+    ],
+    ("descriptive", "informal", False, True): [
+        "In about {word_count} words, give a laid-back description of this image's vibe and key features.",
+        "Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
+    ],
+    ("descriptive", "informal", True, False): [
+        "Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
+        "Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
+    ],
+    ("training_prompt", "formal", False, False): [
+        "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
+        "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
+    ],
+    ("training_prompt", "formal", False, True): [
+        "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
+        "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
+    ],
+    ("training_prompt", "formal", True, False): [
+        "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
+        "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
+    ],
+    ("rng-tags", "formal", False, False): [
+        "Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
+        "Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
+    ],
+    ("rng-tags", "formal", False, True): [
+        "Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
+        "Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
+    ],
+    ("rng-tags", "formal", True, False): [
+        "Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
+        "Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
+    ],
+    ("artistic_inspiration", "formal", False, False): [
+        "Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
+        "Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
+    ],
+    ("artistic_inspiration", "informal", False, False): [
+        "Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
+        "Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
+    ],
+    ("technical_breakdown", "formal", False, False): [
+        "Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
+        "Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
+    ],
+    ("emotional_response", "informal", False, False): [
+        "Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
+        "Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
+    ],
+    ("thematic_analysis", "formal", False, False): [
+        "Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
+        "Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
+    ],
+    ("thematic_analysis", "formal", False, True): [
+        "Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
+        "Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
+    ],
+    ("thematic_analysis", "formal", True, False): [
+        "Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
+        "Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
+    ],
+    ("stylistic_comparison", "informal", False, False): [
+        "Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
+        "Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
+    ],
+    ("stylistic_comparison", "informal", False, True): [
+        "In about {word_count} words, compare this image's style with other known art styles or artists.",
+        "Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
+    ],
+    ("stylistic_comparison", "informal", True, False): [
+        "Write a {length} casual comparison of this image's style with other art movements or famous artists.",
+        "Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
+    ],
+    ("narrative_suggestion", "formal", False, False): [
+        "Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
+        "Develop a brief storyline that complements the themes and mood depicted in this artwork."
+    ],
+    ("narrative_suggestion", "formal", False, True): [
+        "Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
+        "Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
+    ],
+    ("narrative_suggestion", "formal", True, False): [
+        "Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
+        "Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
+    ],
+    ("contextual_storytelling", "informal", False, False): [
+        "Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
+        "Imagine a background story for this artwork, explaining what's happening and why."
+    ],
+    ("contextual_storytelling", "informal", False, True): [
+        "In about {word_count} words, create a backstory for the scene depicted in this image.",
+        "Summarize a possible background narrative for this artwork in {word_count} words."
+    ],
+    ("contextual_storytelling", "informal", True, False): [
+        "Write a {length} informal story that provides context to the scene portrayed in this image.",
+        "Give a {length} casual backstory explaining the events depicted in this artwork."
+    ],
+    ("style_prompt", "formal", False, False): [
+        "Analyze this image through the lens of {style} art. Describe how it aligns with or diverges from {style_characteristics}.",
+        "Examine this artwork in the context of the {style} movement, focusing on {style_focus} and how these elements are represented or reinterpreted in the image."
+    ],
+    ("style_prompt", "formal", False, True): [
+        "Within {word_count} words, compare this image to the {style} style, highlighting elements that reflect or contrast with its key characteristics.",
+        "Compose a concise {word_count}-word analysis of how this artwork relates to the {style} movement, noting its adherence to or departure from typical {style} elements."
+    ],
+    ("style_prompt", "formal", True, False): [
+        "Write a {length} critique of this image, exploring its relationship to the {style} movement. Discuss composition, technique, and thematic elements in this context.",
+        "Develop a {length} analysis of how this artwork incorporates or challenges the principles of {style}, considering its visual language and artistic approach."
+    ],
+    ("style_prompt", "informal", False, False): [
+        "Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
+        "Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
+    ],
+    ("style_prompt", "informal", False, True): [
+        "In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
+        "Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
+    ],
+    ("style_prompt", "informal", True, False): [
+        "Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
+        "Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
+    ],
 }
 # Hugging Face access token read from the environment; None when HF_TOKEN is unset.
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 # Image Adapter
 # NOTE(review): clip_model, text_model and tokenizer are created outside this excerpt.
 # The meaning of the positional arguments (False, False, 38, False) depends on the
 # ImageAdapter definition, which is not visible here -- confirm before changing them.
 print("Loading image adapter")
 image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
 # The checkpoint is deserialized onto the CPU; the module is moved to the GPU below.
+image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
 # Put the adapter in eval mode and move it to the GPU.
 image_adapter.eval()
 image_adapter.to("cuda")
+# After loading the tokenizer and model
 # Startup diagnostics: tokenizer class, BOS/EOS tokens with their ids, and the text model device.
+print(f"Tokenizer class: {type(tokenizer)}")
+print(f"BOS token: {tokenizer.bos_token}")
+print(f"BOS token ID: {tokenizer.bos_token_id}")
+print(f"EOS token: {tokenizer.eos_token}")
+print(f"EOS token ID: {tokenizer.eos_token_id}")
+print(f"Text model device: {text_model.device}")
+# Ensure the tokenizer has the necessary special tokens
 # Whichever of BOS/EOS is missing is set to '<|endoftext|>'.
 # NOTE(review): confirm '<|endoftext|>' is the right token for the loaded text model.
+if tokenizer.bos_token_id is None or tokenizer.eos_token_id is None:
+    print("Warning: BOS or EOS token is missing. Adding default tokens.")
+    special_tokens_dict = {}
+    if tokenizer.bos_token_id is None:
+        special_tokens_dict['bos_token'] = '<|endoftext|>'
+    if tokenizer.eos_token_id is None:
+        special_tokens_dict['eos_token'] = '<|endoftext|>'
+    num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+    print(f"Added {num_added_tokens} special tokens to the tokenizer.")
+    # Resize token embeddings of the model if new tokens are added
+    text_model.resize_token_embeddings(len(tokenizer))
+@spaces.GPU()
+@torch.no_grad()
+def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
+    """Generate text for an image from the selected caption type, tone, length and art style.
+
+    Args:
+        input_image: Image to describe; it is resized to 384x384 before encoding.
+        caption_type: First component of the CAPTION_TYPE_MAP key (e.g. "descriptive").
+        caption_tone: "formal" or "informal". Forced to "formal" for "rng-tags" and
+            "training_prompt". When the requested tone has no template for this
+            caption type, the other tone's template is used instead.
+        caption_length: "any" for no constraint, an int or numeric string for a word
+            count, or any other string used as a descriptive length.
+        art_style: Style name substituted into templates that reference {style}.
+
+    Returns:
+        The decoded generated text with surrounding whitespace stripped.
+
+    Raises:
+        ValueError: If no image is given, or if no template exists for the caption
+            type and length kind under either tone.
+    """
+    # Fail early with a clear message instead of an AttributeError on None.resize().
+    if input_image is None:
+        raise ValueError("No image provided. Please upload an image first.")
+    torch.cuda.empty_cache()
+    # Handle caption_length: None = unconstrained, int = word count, str = descriptive length
+    length = None
+    if caption_length != "any":
+        if isinstance(caption_length, int):
+            length = caption_length
+        elif isinstance(caption_length, str):
+            try:
+                length = int(caption_length)
+            except ValueError:
+                # If it's not a number, treat it as a descriptive length
+                length = caption_length
+    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
+    if caption_type in ["rng-tags", "training_prompt"]:
+        caption_tone = "formal"
+    # Build prompt
+    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
+    if prompt_key not in CAPTION_TYPE_MAP:
+        # Some caption types only define templates for one tone, while the UI offers
+        # both tones for every type. Use the other tone's template rather than failing.
+        fallback_tone = "informal" if caption_tone == "formal" else "formal"
+        fallback_key = (caption_type, fallback_tone) + prompt_key[2:]
+        if fallback_key not in CAPTION_TYPE_MAP:
+            raise ValueError(f"Invalid caption type: {prompt_key}")
+        print(f"No '{caption_tone}' prompt for '{caption_type}'; using '{fallback_tone}' tone instead.")
+        prompt_key = fallback_key
+    # Only the first template of each entry is used. str.format ignores keyword
+    # arguments a template does not reference, so all placeholders are always supplied.
+    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
+        length=length,
+        word_count=length,
+        style=art_style,
+        style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
+        style_focus=STYLE_FOCUS.get(art_style, "its distinctive features")
+    )
+    print(f"Prompt: {prompt_str}")
+    # Preprocess image
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
+    pixel_values = pixel_values.to('cuda')
+    # Tokenize the prompt
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
+    # Embed image
+    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
+        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+        image_features = vision_outputs.hidden_states
+        embedded_images = image_adapter(image_features)
+        embedded_images = embedded_images.to('cuda')
+    # Embed prompt
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
+    assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
+    # Check for bos_token_id and provide a fallback
+    bos_token_id = tokenizer.bos_token_id
+    if bos_token_id is None:
+        print("Warning: bos_token_id is None. Using default value of 1.")
+        bos_token_id = 1  # Common default, but may need adjustment
+    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
     eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
+    # Construct prompts: BOS, image embeddings, prompt embeddings, end-of-turn embedding
     inputs_embeds = torch.cat([
+        embedded_bos.expand(embedded_images.shape[0], -1, -1),
+        embedded_images.to(dtype=embedded_bos.dtype),
+        prompt_embeds.expand(embedded_images.shape[0], -1, -1),
+        eot_embed.expand(embedded_images.shape[0], -1, -1),
     ], dim=1)
+    # input_ids mirrors inputs_embeds position by position; the image positions are
+    # filled with zeros as placeholders because their content comes from inputs_embeds.
     input_ids = torch.cat([
+        torch.tensor([[bos_token_id]], dtype=torch.long),
+        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
         prompt,
         torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
     ], dim=1).to('cuda')
     attention_mask = torch.ones_like(input_ids)
+    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)
+    # Trim off the prompt
     generate_ids = generate_ids[:, input_ids.shape[1]:]
+    # Drop a trailing EOS / <|eot_id|> token so it does not appear in the decoded text.
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
+    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
+    return caption.strip()
 # Custom CSS passed to gr.Blocks(css=css) when the UI is built.
 # NOTE(review): the rule body is empty in this excerpt; declarations may have been
 # elided by the diff view -- confirm against the full file.
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
 }
 """
 # Art movements offered as choices of the "Art Style" dropdown. Each name is also a
 # key of STYLE_CHARACTERISTICS and STYLE_FOCUS.
+ART_STYLES = [
+    "Impressionism", "Cubism", "Surrealism", "Abstract Expressionism", "Pop Art",
+    "Minimalism", "Baroque", "Renaissance", "Art Nouveau", "Gothic",
+    "Romanticism", "Realism", "Expressionism", "Fauvism", "Art Deco",
+    "Futurism", "Dadaism", "Pointillism", "Rococo", "Neoclassicism"
+]
 # Art style name -> short list of hallmark traits. stream_chat substitutes the value
 # for {style_characteristics} in prompt templates and falls back to
 # "its unique elements" for a style that has no entry here.
+STYLE_CHARACTERISTICS = {
+    "Impressionism": "loose brushstrokes, emphasis on light and color, everyday subjects",
+    "Cubism": "geometric shapes, multiple perspectives, fragmented forms",
+    "Surrealism": "dreamlike imagery, unexpected juxtapositions, subconscious exploration",
+    "Abstract Expressionism": "expressive brushwork, emotional content, abstract forms",
+    "Pop Art": "bright colors, popular culture references, satire",
+    "Minimalism": "simple forms, limited color palette, emphasis on space",
+    "Baroque": "dramatic lighting, elaborate detail, grandeur",
+    "Renaissance": "realistic depictions, perspective, religious themes",
+    "Art Nouveau": "stylized forms, organic shapes, decorative elements",
+    "Gothic": "dark themes, dramatic lighting, architectural elements",
+    "Romanticism": "emotional content, nature scenes, idealized figures",
+    "Realism": "detailed depictions, realistic textures, everyday subjects",
+    "Expressionism": "emotional content, distorted forms, abstract elements",
+    "Fauvism": "bold colors, abstract forms, emotional content",
+    "Art Deco": "geometric shapes, streamlined forms, modern aesthetics",
+    "Futurism": "dynamic forms, speed, technology",
+    "Dadaism": "anti-art, absurdity, subversion of traditional art",
+    "Pointillism": "small dots of color, impressionistic style, emphasis on light",
+    "Rococo": "ornate style, lighthearted themes, decorative elements",
+    "Neoclassicism": "classical style, balance, symmetry"
 }
 # Art style name -> phrase describing what the movement concentrates on. stream_chat
 # substitutes the value for {style_focus} in prompt templates and falls back to
 # "its distinctive features" for a style that has no entry here.
+STYLE_FOCUS = {
+    "Impressionism": "capturing fleeting moments and atmospheric effects",
+    "Cubism": "deconstructing and reassembling forms from multiple viewpoints",
+    "Surrealism": "creating a sense of the uncanny and exploring the subconscious mind",
+    "Abstract Expressionism": "expressing emotional content through abstract forms",
+    "Pop Art": "commenting on popular culture and satirizing consumerism",
+    "Minimalism": "exploring the relationship between form and space",
+    "Baroque": "creating dramatic and grandiose compositions",
+    "Renaissance": "depicting realistic scenes and exploring perspective",
+    "Art Nouveau": "incorporating organic and decorative elements",
+    "Gothic": "exploring dark themes and dramatic lighting",
+    "Romanticism": "depicting emotional scenes and idealized figures",
+    "Realism": "capturing detailed and realistic textures",
+    "Expressionism": "expressing emotional content through distorted forms",
+    "Fauvism": "emphasizing bold colors and emotional content",
+    "Art Deco": "incorporating geometric shapes and modern aesthetics",
+    "Futurism": "depicting speed, technology, and dynamism",
+    "Dadaism": "subverting traditional art and exploring absurdity",
+    "Pointillism": "capturing light and color through small dots",
+    "Rococo": "creating lighthearted and decorative compositions",
+    "Neoclassicism": "achieving balance and symmetry in classical style"
 }
 # Build the Gradio UI: a static "Welcome" tab and the interactive "JoyCaption" tab.
 with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
     # Landing page text.
     # NOTE(review): the <img> URL looks like a placeholder, and the numbered steps jump
     # from 1 to 4 in this excerpt (lines may be elided by the diff view) -- confirm.
     with gr.Tab("Welcome"):
         gr.Markdown(
+			"""
             <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
             # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
             ## Accelerate Your Creative Workflow with Intelligent Image Analysis
             This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
+            training prompts, or tags from existing artwork, fueling the creative process for GenAI models.
             ## 🚀 How It Works:
             1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
             4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
             """
         )
     with gr.Tab("JoyCaption"):
+        gr.Markdown("""
+        # JoyCaption: AI-Powered Image Analysis Tool
+        This tool helps you generate various types of text based on an uploaded image. Here's how to use it:
+        1. Upload an image
+        2. Choose your desired output type
+        3. Adjust settings as needed
+        4. Click 'Generate Caption' to get your result
+        """)
         with gr.Row():
             # Left column: the image input and every generation setting.
+            with gr.Column(scale=1):
+                input_image = gr.Image(type="pil", label="Upload Your Image")
                 # The selected value is passed to stream_chat as caption_type.
                 caption_type = gr.Dropdown(
+                    choices=[
+                        "descriptive",
+                        "training_prompt",
+                        "rng-tags",
+                        "thematic_analysis",
+                        "stylistic_comparison",
+                        "narrative_suggestion",
+                        "contextual_storytelling",
+                        "style_prompt"
+                    ],
+                    label="Output Type",
                     value="descriptive",
                 )
+                gr.Markdown("""
+                ### Output Types Explained:
+                - **Descriptive**: A general description of the image
+                - **Training Prompt**: A prompt for AI image generation
+                - **RNG-Tags**: Tags for categorizing the image
+                - **Thematic Analysis**: Exploration of themes in the image
+                - **Stylistic Comparison**: Compares the image to art styles
+                - **Narrative Suggestion**: A story idea based on the image
+                - **Contextual Storytelling**: A background story for the image
+                - **Style Prompt**: Analyzes the image in context of a specific art style
+                """)
                 # Hidden by update_visibility for "rng-tags" and "training_prompt".
                 caption_tone = gr.Dropdown(
                     choices=["formal", "informal"],
+                    label="Tone",
                     value="formal",
                 )
+                gr.Markdown("Choose between a formal (professional) or informal (casual) tone for the output.")
                 # Choices: "any", five descriptive lengths, and word counts 20..260 in
                 # steps of 10 (as strings).
                 caption_length = gr.Dropdown(
                     choices=["any", "very short", "short", "medium-length", "long", "very long"] +
                             [str(i) for i in range(20, 261, 10)],
+                    label="Length",
                     value="any",
                 )
+                gr.Markdown("""
+                Select the desired length of the output:
+                - 'any': No specific length
+                - Descriptive options: very short to very long
+                - Numeric options: Specify exact word count (20 to 260 words)
+                """)
                 # Starts hidden; update_visibility shows it only for "style_prompt".
+                art_style = gr.Dropdown(
+                    choices=ART_STYLES,
+                    label="Art Style (for Style Prompt)",
+                    value="Impressionism",
+                    visible=False
                 )
                 # NOTE(review): this help text is not among the outputs of
                 # update_visibility, so it stays visible while the dropdown is hidden.
+                gr.Markdown("Select an art style to analyze the image in that context. Only applicable for 'Style Prompt' output type.")
             # Right column: the generated text and the button that triggers generation.
+            with gr.Column(scale=1):
+                output_caption = gr.Textbox(label="Generated Output", lines=10)
+                generate_button = gr.Button("Generate Caption")
+        gr.Markdown("""
+        ### Additional Notes:
+        - The 'Tone' setting doesn't affect 'RNG-Tags' and 'Training Prompt' outputs.
+        - 'Art Style' is only used when 'Style Prompt' is selected as the output type.
+        - The AI model analyzes the image and generates text based on your selections.
+        """)
         # Returns visibility updates keyed by component: art_style is visible only for
         # "style_prompt"; caption_tone is hidden for "rng-tags" and "training_prompt".
+        def update_visibility(caption_type):
+            return {
+                art_style: gr.update(visible=(caption_type == "style_prompt")),
+                caption_tone: gr.update(visible=(caption_type not in ["rng-tags", "training_prompt"]))
+            }
         # Re-evaluate visibility whenever the output type changes.
+        caption_type.change(
+            fn=update_visibility,
+            inputs=[caption_type],
+            outputs=[art_style, caption_tone]
+        )
         # Run stream_chat on click; the inputs are listed in the order of its parameters.
+        generate_button.click(
+            fn=stream_chat,
+            inputs=[input_image, caption_type, caption_tone, caption_length, art_style],
+            outputs=[output_caption]
+        )
 # Launch the Gradio app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch()