Update app.py
app.py CHANGED
@@ -12,24 +12,28 @@ import torchvision.transforms.functional as TVF
 
 
 CLIP_PATH = "google/siglip-so400m-patch14-384"
-MODEL_PATH = "
+MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
 CHECKPOINT_PATH = Path("9em124t2-499968")
+TITLE = "<h1><center>JoyCaption Alpha One (2024-09-20a)</center></h1>"
 CAPTION_TYPE_MAP = {
-[… 15 removed lines truncated in the diff view]
+    ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
+    ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
+    ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
+    ("descriptive", "informal", False, False): ["Write a descriptive caption for this image in a casual tone."],
+    ("descriptive", "informal", False, True): ["Write a descriptive caption for this image in a casual tone within {word_count} words."],
+    ("descriptive", "informal", True, False): ["Write a {length} descriptive caption for this image in a casual tone."],
+
+    ("training_prompt", "formal", False, False): ["Write a stable diffusion prompt for this image."],
+    ("training_prompt", "formal", False, True): ["Write a stable diffusion prompt for this image within {word_count} words."],
+    ("training_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt for this image."],
+
+    ("rng-tags", "formal", False, False): ["Write a list of Booru tags for this image."],
+    ("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
+    ("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
+
+    ("style_prompt", "formal", False, False): ["Generate a detailed style prompt for this image, including lens type, film stock, composition notes, and lighting aspects."],
+    ("style_prompt", "formal", False, True): ["Generate a detailed style prompt for this image within {word_count} words, including lens type, film stock, composition notes, and lighting aspects."],
+    ("style_prompt", "formal", True, False): ["Generate a {length} detailed style prompt for this image, including lens type, film stock, composition notes, and lighting aspects."],
 }
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)

@@ -139,91 +143,87 @@ image_adapter.eval()
 image_adapter.to("cuda")
 
 
-[… 4 removed lines truncated in the diff view]
-                composition: str = "rule of thirds", lighting: str = "natural") -> str:
-    torch.cuda.empty_cache()
-
-    # 'any' means no length specified
-    length = None if caption_length == "any" else caption_length
-
-    if isinstance(length, str):
-        try:
-            length = int(length)
-        except ValueError:
-            pass
-
-    # 'rng-tags', 'training_prompt', and 'style_prompt' don't have formal/informal tones
-    if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
-        caption_tone = "formal"
-
-    # Build prompt
-    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
-    if prompt_key not in CAPTION_TYPE_MAP:
-        raise ValueError(f"Invalid caption type: {prompt_key}")
-
-    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
-
-    if caption_type == "style_prompt":
-        prompt_str += (f" Include details about using a {lens_type} lens, "
-                       f"{film_stock} film stock, {composition} composition, and {lighting} lighting.")
-
-    print(f"Prompt: {prompt_str}")
-
-    # Preprocess image
+def preprocess_image(input_image: Image.Image) -> torch.Tensor:
+    """
+    Preprocess the input image for the CLIP model.
+    """
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-[… 1 removed line truncated in the diff view]
+    return pixel_values.to('cuda')
 
-[… 1 removed line truncated in the diff view]
+def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max_new_tokens: int = 300) -> str:
+    """
+    Generate a caption based on the image features and prompt.
+    """
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
-
-    # Embed image
-    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
-        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
-        image_features = vision_outputs.hidden_states
-        embedded_images = image_adapter(image_features)
-        embedded_images = embedded_images.to('cuda')
-
-    # Embed prompt
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
-    assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
     embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
     eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
 
-    # Construct prompts
     inputs_embeds = torch.cat([
-        embedded_bos.expand(
-[… 1 removed line truncated in the diff view]
-        prompt_embeds.expand(
-        eot_embed.expand(
+        embedded_bos.expand(image_features.shape[0], -1, -1),
+        image_features.to(dtype=embedded_bos.dtype),
+        prompt_embeds.expand(image_features.shape[0], -1, -1),
+        eot_embed.expand(image_features.shape[0], -1, -1),
     ], dim=1)
 
     input_ids = torch.cat([
         torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
-        torch.zeros((1,
+        torch.zeros((1, image_features.shape[1]), dtype=torch.long),
         prompt,
         torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
     ], dim=1).to('cuda')
     attention_mask = torch.ones_like(input_ids)
 
-    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=
+    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, suppress_tokens=None)
 
-    # Trim off the prompt
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
 
-[… 1 removed line truncated in the diff view]
+    return tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0].strip()
+
+@spaces.GPU()
+@torch.no_grad()
+def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "") -> str:
+    """
+    Generate a caption or style prompt based on the input image and parameters.
+    """
+    torch.cuda.empty_cache()
 
-[… 1 removed line truncated in the diff view]
+    try:
+        length = None if caption_length == "any" else caption_length
+        if isinstance(length, str):
+            length = int(length)
+    except ValueError:
+        raise ValueError(f"Invalid caption length: {caption_length}")
+
+    if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
+        caption_tone = "formal"
+
+    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
+    if prompt_key not in CAPTION_TYPE_MAP:
+        raise ValueError(f"Invalid caption type: {prompt_key}")
+
+    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
+
     if caption_type == "style_prompt":
-[… 1 removed line truncated in the diff view]
+        prompt_str += f" Lens type: {lens_type}. Film stock: {film_stock}. Composition style: {composition_style}."
+
+    print(f"Prompt: {prompt_str}")
 
-[… 1 removed line truncated in the diff view]
+    pixel_values = preprocess_image(input_image)
+
+    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
+        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+        image_features = vision_outputs.hidden_states
+        embedded_images = image_adapter(image_features)
+        embedded_images = embedded_images.to('cuda')
+
+    caption = generate_caption(text_model, tokenizer, embedded_images, prompt_str)
+
+    return caption
 
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {

@@ -243,10 +243,11 @@ ul, ol {
 }
 """
 
+# Gradio interface
 with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
     with gr.Tab("Welcome"):
         gr.Markdown(
-[… 1 removed line truncated in the diff view]
+            """
             <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
 
             # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration

@@ -263,7 +264,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
             4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
             """
         )
-[… 1 removed line truncated in the diff view]
+
     with gr.Tab("JoyCaption"):
         with gr.Accordion("How to Use JoyCaption", open=False):
            gr.Markdown("""

@@ -308,58 +309,68 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
 
         with gr.Row():
             with gr.Column():
-                input_image = gr.Image(type="pil", label="
+                input_image = gr.Image(type="pil", label="Input Image")
 
                 caption_type = gr.Dropdown(
                     choices=["descriptive", "training_prompt", "rng-tags", "style_prompt"],
-                    label="
+                    label="Caption Type",
                     value="descriptive",
                 )
 
                 caption_tone = gr.Dropdown(
                     choices=["formal", "informal"],
-                    label="
+                    label="Caption Tone",
                     value="formal",
                 )
 
                 caption_length = gr.Dropdown(
                     choices=["any", "very short", "short", "medium-length", "long", "very long"] +
                             [str(i) for i in range(20, 261, 10)],
-                    label="
+                    label="Caption Length",
                     value="any",
                 )
 
-[… 19 removed lines truncated in the diff view]
-                    value="natural",
-                )
-
-                gr.Markdown("**Friendly Reminder:** The tone and advanced options only work for specific caption types.")
+                lens_type = gr.Dropdown(
+                    choices=["Wide-angle", "Standard", "Telephoto", "Macro", "Fish-eye"],
+                    label="Lens Type",
+                    visible=False,
+                )
+
+                film_stock = gr.Dropdown(
+                    choices=["Kodak Portra", "Fujifilm Velvia", "Ilford Delta", "Kodak Tri-X", "Fujifilm Provia"],
+                    label="Film Stock",
+                    visible=False,
+                )
+
+                composition_style = gr.Dropdown(
+                    choices=["Rule of Thirds", "Golden Ratio", "Symmetry", "Leading Lines", "Framing"],
+                    label="Composition Style",
+                    visible=False,
+                )
+
+                gr.Markdown("**Note:** Caption tone doesn't affect `rng-tags`, `training_prompt`, and `style_prompt`.")
 
                 run_button = gr.Button("Make My Caption!")
 
             with gr.Column():
-                output_caption = gr.Textbox(label="
+                output_caption = gr.Textbox(label="Generated Caption")
+                copy_button = gr.Button("Copy to Clipboard")
+
+                def update_style_options(caption_type):
+                    return {
+                        lens_type: gr.update(visible=caption_type == "style_prompt"),
+                        film_stock: gr.update(visible=caption_type == "style_prompt"),
+                        composition_style: gr.update(visible=caption_type == "style_prompt"),
+                    }
+
+                caption_type.change(update_style_options, inputs=[caption_type], outputs=[lens_type, film_stock, composition_style])
+
+                run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition_style], outputs=[output_caption])
 
-[… 1 removed line truncated in the diff view]
+                def copy_to_clipboard():
+                    return None
 
+                copy_button.click(fn=copy_to_clipboard, inputs=[], outputs=[])
 
 if __name__ == "__main__":
     demo.launch()