Severian committed on
Commit
bd55f23
·
verified ·
1 Parent(s): 799b4bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -242
app.py CHANGED
@@ -261,118 +261,84 @@ image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", m
261
  image_adapter.eval()
262
  image_adapter.to("cuda")
263
 
264
- # After loading the tokenizer and model
265
- print(f"Tokenizer class: {type(tokenizer)}")
266
- print(f"BOS token: {tokenizer.bos_token}")
267
- print(f"BOS token ID: {tokenizer.bos_token_id}")
268
- print(f"EOS token: {tokenizer.eos_token}")
269
- print(f"EOS token ID: {tokenizer.eos_token_id}")
270
- print(f"Text model device: {text_model.device}")
271
-
272
- # Ensure the tokenizer has the necessary special tokens
273
- if tokenizer.bos_token_id is None or tokenizer.eos_token_id is None:
274
- print("Warning: BOS or EOS token is missing. Adding default tokens.")
275
- special_tokens_dict = {}
276
- if tokenizer.bos_token_id is None:
277
- special_tokens_dict['bos_token'] = '<|endoftext|>'
278
- if tokenizer.eos_token_id is None:
279
- special_tokens_dict['eos_token'] = '<|endoftext|>'
280
- num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
281
- print(f"Added {num_added_tokens} special tokens to the tokenizer.")
282
-
283
- # Resize token embeddings of the model if new tokens are added
284
- text_model.resize_token_embeddings(len(tokenizer))
285
 
286
  @spaces.GPU()
287
  @torch.no_grad()
288
- def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
289
- torch.cuda.empty_cache()
290
-
291
- # Handle caption_length
292
- length = None
293
- if caption_length != "any":
294
- if isinstance(caption_length, int):
295
- length = caption_length
296
- elif isinstance(caption_length, str):
297
- try:
298
- length = int(caption_length)
299
- except ValueError:
300
- # If it's not a number, treat it as a descriptive length
301
- length = caption_length
302
-
303
- # 'rng-tags' and 'training_prompt' don't have formal/informal tones
304
- if caption_type in ["rng-tags", "training_prompt"]:
305
- caption_tone = "formal"
306
-
307
- # Build prompt
308
- prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
309
- if prompt_key not in CAPTION_TYPE_MAP:
310
- raise ValueError(f"Invalid caption type: {prompt_key}")
311
-
312
- prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
313
- length=length,
314
- word_count=length,
315
- style=art_style,
316
- style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
317
- style_focus=STYLE_FOCUS.get(art_style, "its distinctive features")
318
- )
319
- print(f"Prompt: {prompt_str}")
320
-
321
- # Preprocess image
322
- image = input_image.resize((384, 384), Image.LANCZOS)
323
- pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
324
- pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
325
- pixel_values = pixel_values.to('cuda')
326
-
327
- # Tokenize the prompt
328
- prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
329
-
330
- # Embed image
331
- with torch.amp.autocast_mode.autocast('cuda', enabled=True):
332
- vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
333
- image_features = vision_outputs.hidden_states
334
- embedded_images = image_adapter(image_features)
335
- embedded_images = embedded_images.to('cuda')
336
-
337
- # Embed prompt
338
- prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
339
- assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
340
-
341
- # Check for bos_token_id and provide a fallback
342
- bos_token_id = tokenizer.bos_token_id
343
- if bos_token_id is None:
344
- print("Warning: bos_token_id is None. Using default value of 1.")
345
- bos_token_id = 1 # Common default, but may need adjustment
346
-
347
- embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
348
- eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
349
-
350
- # Construct prompts
351
- inputs_embeds = torch.cat([
352
- embedded_bos.expand(embedded_images.shape[0], -1, -1),
353
- embedded_images.to(dtype=embedded_bos.dtype),
354
- prompt_embeds.expand(embedded_images.shape[0], -1, -1),
355
- eot_embed.expand(embedded_images.shape[0], -1, -1),
356
- ], dim=1)
357
-
358
- input_ids = torch.cat([
359
- torch.tensor([[bos_token_id]], dtype=torch.long),
360
- torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
361
- prompt,
362
- torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
363
- ], dim=1).to('cuda')
364
- attention_mask = torch.ones_like(input_ids)
365
-
366
- generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)
367
-
368
- # Trim off the prompt
369
- generate_ids = generate_ids[:, input_ids.shape[1]:]
370
- if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
371
- generate_ids = generate_ids[:, :-1]
372
-
373
- caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
374
-
375
- return caption.strip()
376
 
377
  css = """
378
  h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
@@ -392,71 +358,18 @@ ul, ol {
392
  }
393
  """
394
 
395
- ART_STYLES = [
396
- "Impressionism", "Cubism", "Surrealism", "Abstract Expressionism", "Pop Art",
397
- "Minimalism", "Baroque", "Renaissance", "Art Nouveau", "Gothic",
398
- "Romanticism", "Realism", "Expressionism", "Fauvism", "Art Deco",
399
- "Futurism", "Dadaism", "Pointillism", "Rococo", "Neoclassicism"
400
- ]
401
-
402
- STYLE_CHARACTERISTICS = {
403
- "Impressionism": "loose brushstrokes, emphasis on light and color, everyday subjects",
404
- "Cubism": "geometric shapes, multiple perspectives, fragmented forms",
405
- "Surrealism": "dreamlike imagery, unexpected juxtapositions, subconscious exploration",
406
- "Abstract Expressionism": "expressive brushwork, emotional content, abstract forms",
407
- "Pop Art": "bright colors, popular culture references, satire",
408
- "Minimalism": "simple forms, limited color palette, emphasis on space",
409
- "Baroque": "dramatic lighting, elaborate detail, grandeur",
410
- "Renaissance": "realistic depictions, perspective, religious themes",
411
- "Art Nouveau": "stylized forms, organic shapes, decorative elements",
412
- "Gothic": "dark themes, dramatic lighting, architectural elements",
413
- "Romanticism": "emotional content, nature scenes, idealized figures",
414
- "Realism": "detailed depictions, realistic textures, everyday subjects",
415
- "Expressionism": "emotional content, distorted forms, abstract elements",
416
- "Fauvism": "bold colors, abstract forms, emotional content",
417
- "Art Deco": "geometric shapes, streamlined forms, modern aesthetics",
418
- "Futurism": "dynamic forms, speed, technology",
419
- "Dadaism": "anti-art, absurdity, subversion of traditional art",
420
- "Pointillism": "small dots of color, impressionistic style, emphasis on light",
421
- "Rococo": "ornate style, lighthearted themes, decorative elements",
422
- "Neoclassicism": "classical style, balance, symmetry"
423
- }
424
-
425
- STYLE_FOCUS = {
426
- "Impressionism": "capturing fleeting moments and atmospheric effects",
427
- "Cubism": "deconstructing and reassembling forms from multiple viewpoints",
428
- "Surrealism": "creating a sense of the uncanny and exploring the subconscious mind",
429
- "Abstract Expressionism": "expressing emotional content through abstract forms",
430
- "Pop Art": "commenting on popular culture and satirizing consumerism",
431
- "Minimalism": "exploring the relationship between form and space",
432
- "Baroque": "creating dramatic and grandiose compositions",
433
- "Renaissance": "depicting realistic scenes and exploring perspective",
434
- "Art Nouveau": "incorporating organic and decorative elements",
435
- "Gothic": "exploring dark themes and dramatic lighting",
436
- "Romanticism": "depicting emotional scenes and idealized figures",
437
- "Realism": "capturing detailed and realistic textures",
438
- "Expressionism": "expressing emotional content through distorted forms",
439
- "Fauvism": "emphasizing bold colors and emotional content",
440
- "Art Deco": "incorporating geometric shapes and modern aesthetics",
441
- "Futurism": "depicting speed, technology, and dynamism",
442
- "Dadaism": "subverting traditional art and exploring absurdity",
443
- "Pointillism": "capturing light and color through small dots",
444
- "Rococo": "creating lighthearted and decorative compositions",
445
- "Neoclassicism": "achieving balance and symmetry in classical style"
446
- }
447
-
448
  with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
449
  with gr.Tab("Welcome"):
450
  gr.Markdown(
451
  """
452
- <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
453
 
454
  # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
455
 
456
  ## Accelerate Your Creative Workflow with Intelligent Image Analysis
457
 
458
  This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
459
- training prompts, or tags from existing artwork, fueling the creative process for GenAI models.
460
 
461
  ## 🚀 How It Works:
462
  1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
@@ -468,109 +381,72 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
468
 
469
  with gr.Tab("JoyCaption"):
470
  gr.Markdown("""
471
- # JoyCaption: AI-Powered Image Analysis Tool
472
 
473
- This tool helps you generate various types of text based on an uploaded image. Here's how to use it:
474
 
475
- 1. Upload an image
476
- 2. Choose your desired output type
477
- 3. Adjust settings as needed
478
- 4. Click 'Generate Caption' to get your result
479
- """)
 
 
480
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
  with gr.Row():
482
- with gr.Column(scale=1):
483
- input_image = gr.Image(type="pil", label="Upload Your Image")
484
 
485
  caption_type = gr.Dropdown(
486
- choices=[
487
- "descriptive",
488
- "training_prompt",
489
- "rng-tags",
490
- "thematic_analysis",
491
- "stylistic_comparison",
492
- "narrative_suggestion",
493
- "contextual_storytelling",
494
- "style_prompt"
495
- ],
496
- label="Output Type",
497
  value="descriptive",
498
  )
499
 
500
- gr.Markdown("""
501
- ### Output Types Explained:
502
- - **Descriptive**: A general description of the image
503
- - **Training Prompt**: A prompt for AI image generation
504
- - **RNG-Tags**: Tags for categorizing the image
505
- - **Thematic Analysis**: Exploration of themes in the image
506
- - **Stylistic Comparison**: Compares the image to art styles
507
- - **Narrative Suggestion**: A story idea based on the image
508
- - **Contextual Storytelling**: A background story for the image
509
- - **Style Prompt**: Analyzes the image in context of a specific art style
510
- """)
511
-
512
  caption_tone = gr.Dropdown(
513
  choices=["formal", "informal"],
514
- label="Tone",
515
  value="formal",
516
  )
517
 
518
- gr.Markdown("Choose between a formal (professional) or informal (casual) tone for the output.")
519
-
520
  caption_length = gr.Dropdown(
521
  choices=["any", "very short", "short", "medium-length", "long", "very long"] +
522
  [str(i) for i in range(20, 261, 10)],
523
- label="Length",
524
  value="any",
525
  )
526
 
527
- gr.Markdown("""
528
- Select the desired length of the output:
529
- - 'any': No specific length
530
- - Descriptive options: very short to very long
531
- - Numeric options: Specify exact word count (20 to 260 words)
532
- """)
533
-
534
- art_style = gr.Dropdown(
535
- choices=ART_STYLES,
536
- label="Art Style (for Style Prompt)",
537
- value="Impressionism",
538
- visible=False
539
- )
540
-
541
- gr.Markdown("Select an art style to analyze the image in that context. Only applicable for 'Style Prompt' output type.")
542
 
543
- with gr.Column(scale=1):
544
- output_caption = gr.Textbox(label="Generated Output", lines=10)
545
- generate_button = gr.Button("Generate Caption")
 
546
 
547
  gr.Markdown("""
548
- ### Additional Notes:
549
- - The 'Tone' setting doesn't affect 'RNG-Tags' and 'Training Prompt' outputs.
550
- - 'Art Style' is only used when 'Style Prompt' is selected as the output type.
551
- - The AI model analyzes the image and generates text based on your selections.
 
 
 
552
  """)
553
 
554
- run_button = gr.Button("Caption")
555
-
556
- with gr.Column():
557
- output_caption = gr.Textbox(label="Caption")
558
-
559
-
560
- caption_type.change(
561
- fn=lambda x: gr.update(visible=(x == "style_prompt")),
562
- inputs=[caption_type],
563
- outputs=[art_style]
564
- )
565
-
566
- generate_button.click(
567
- fn=stream_chat,
568
- inputs=[input_image, caption_type, caption_tone, caption_length, art_style],
569
- outputs=[output_caption]
570
- )
571
-
572
- run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, art_style], outputs=[output_caption])
573
-
574
 
575
  if __name__ == "__main__":
576
  demo.launch()
 
261
  image_adapter.eval()
262
  image_adapter.to("cuda")
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
  @spaces.GPU()
266
  @torch.no_grad()
267
+ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int) -> str:
268
+ torch.cuda.empty_cache()
269
+
270
+ # 'any' means no length specified
271
+ length = None if caption_length == "any" else caption_length
272
+
273
+ if isinstance(length, str):
274
+ try:
275
+ length = int(length)
276
+ except ValueError:
277
+ pass
278
+
279
+ # 'rng-tags' and 'training_prompt' don't have formal/informal tones
280
+ if caption_type == "rng-tags" or caption_type == "training_prompt":
281
+ caption_tone = "formal"
282
+
283
+ # Build prompt
284
+ prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
285
+ if prompt_key not in CAPTION_TYPE_MAP:
286
+ raise ValueError(f"Invalid caption type: {prompt_key}")
287
+
288
+ prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
289
+ print(f"Prompt: {prompt_str}")
290
+
291
+ # Preprocess image
292
+ #image = clip_processor(images=input_image, return_tensors='pt').pixel_values
293
+ image = input_image.resize((384, 384), Image.LANCZOS)
294
+ pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
295
+ pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
296
+ pixel_values = pixel_values.to('cuda')
297
+
298
+ # Tokenize the prompt
299
+ prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
300
+
301
+ # Embed image
302
+ with torch.amp.autocast_mode.autocast('cuda', enabled=True):
303
+ vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
304
+ image_features = vision_outputs.hidden_states
305
+ embedded_images = image_adapter(image_features)
306
+ embedded_images = embedded_images.to('cuda')
307
+
308
+ # Embed prompt
309
+ prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
310
+ assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
311
+ embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
312
+ eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
313
+
314
+ # Construct prompts
315
+ inputs_embeds = torch.cat([
316
+ embedded_bos.expand(embedded_images.shape[0], -1, -1),
317
+ embedded_images.to(dtype=embedded_bos.dtype),
318
+ prompt_embeds.expand(embedded_images.shape[0], -1, -1),
319
+ eot_embed.expand(embedded_images.shape[0], -1, -1),
320
+ ], dim=1)
321
+
322
+ input_ids = torch.cat([
323
+ torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
324
+ torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
325
+ prompt,
326
+ torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
327
+ ], dim=1).to('cuda')
328
+ attention_mask = torch.ones_like(input_ids)
329
+
330
+ #generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=False, suppress_tokens=None)
331
+ #generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, top_k=10, temperature=0.5, suppress_tokens=None)
332
+ generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9
333
+
334
+ # Trim off the prompt
335
+ generate_ids = generate_ids[:, input_ids.shape[1]:]
336
+ if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
337
+ generate_ids = generate_ids[:, :-1]
338
+
339
+ caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
340
+
341
+ return caption.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
  css = """
344
  h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
 
358
  }
359
  """
360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
362
  with gr.Tab("Welcome"):
363
  gr.Markdown(
364
  """
365
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/eO4MsESKd3K99rYiUuled.png" alt="Yamamoto Logo" class="centered-image">
366
 
367
  # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
368
 
369
  ## Accelerate Your Creative Workflow with Intelligent Image Analysis
370
 
371
  This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
372
+ training prompts, and tags from existing artwork, fueling the creative process for GenAI models.
373
 
374
  ## 🚀 How It Works:
375
  1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
 
381
 
382
  with gr.Tab("JoyCaption"):
383
  gr.Markdown("""
384
+ # How to Use JoyCaption
385
 
386
+ Hello, artist! Let's make some fun captions for your pictures. Here's how:
387
 
388
+ 1. **Pick a Picture**: Find a cool picture you want to talk about and upload it.
389
+
390
+ 2. **Choose What You Want**:
391
+ - **Caption Type**:
392
+ * "Descriptive" tells you what's in the picture
393
+ * "Training Prompt" helps computers make similar pictures
394
+ * "RNG-Tags" gives you short words about the picture
395
 
396
+ 3. **Pick a Style** (for "Descriptive" only):
397
+ - "Formal" sounds like a teacher talking
398
+ - "Informal" sounds like a friend chatting
399
+
400
+ 4. **Decide How Long**:
401
+ - "Any" lets the computer decide
402
+ - Or pick a size from "very short" to "very long"
403
+ - You can even choose a specific number of words!
404
+
405
+ 5. **Make the Caption**: Click the "Caption" button and watch the magic happen!
406
+
407
+ Remember, have fun and be creative with your captions!
408
+ """)
409
+
410
  with gr.Row():
411
+ with gr.Column():
412
+ input_image = gr.Image(type="pil", label="Upload Your Picture Here")
413
 
414
  caption_type = gr.Dropdown(
415
+ choices=["descriptive", "training_prompt", "rng-tags"],
416
+ label="What Kind of Caption Do You Want?",
 
 
 
 
 
 
 
 
 
417
  value="descriptive",
418
  )
419
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  caption_tone = gr.Dropdown(
421
  choices=["formal", "informal"],
422
+ label="How Should It Sound? (For 'Descriptive' Only)",
423
  value="formal",
424
  )
425
 
 
 
426
  caption_length = gr.Dropdown(
427
  choices=["any", "very short", "short", "medium-length", "long", "very long"] +
428
  [str(i) for i in range(20, 261, 10)],
429
+ label="How Long Should It Be?",
430
  value="any",
431
  )
432
 
433
+ gr.Markdown("**Friendly Reminder:** The tone (formal/informal) only works for 'Descriptive' captions.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
 
435
+ run_button = gr.Button("Make My Caption!")
436
+
437
+ with gr.Column():
438
+ output_caption = gr.Textbox(label="Your Amazing Caption Appears Here")
439
 
440
  gr.Markdown("""
441
+ ## Tips for Great Captions:
442
+ - Try different types to see what you like best
443
+ - Experiment with formal and informal tones for fun variations
444
+ - Adjust the length to get just the right amount of detail
445
+ - If you don't like a caption, just click "Make My Caption!" again for a new one
446
+
447
+ Have a great time captioning your art!
448
  """)
449
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
451
# Script entry point: launch the Gradio app only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    demo.launch()