|
import spaces |
|
import gradio as gr |
|
from huggingface_hub import InferenceClient |
|
from torch import nn |
|
from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM |
|
from pathlib import Path |
|
import torch |
|
import torch.amp.autocast_mode |
|
from PIL import Image |
|
import os |
|
import torchvision.transforms.functional as TVF |
|
|
|
|
|
# Hub identifiers for the vision tower and the language model, and the local
# directory that holds the fine-tuned checkpoint files.
CLIP_PATH = "google/siglip-so400m-patch14-384"
MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
CHECKPOINT_PATH = Path("9em124t2-499968")

# Prompt templates, keyed by
#     (caption_type, tone, length_is_descriptive_string, length_is_word_count)
# Both flags are False when no length was requested. Templates may contain the
# placeholders {length}, {word_count}, {style}, {style_characteristics} and
# {style_focus}, which are filled in with str.format by the caption function.
CAPTION_TYPE_MAP = {
    # --- descriptive ---------------------------------------------------------
    ("descriptive", "formal", False, False): [
        "Write a detailed, formal description of this image, focusing on composition, style, and artistic elements.",
        "Provide a comprehensive, academic analysis of this artwork's visual characteristics and techniques.",
    ],
    ("descriptive", "formal", False, True): [
        "Craft a formal, concise description of this image within {word_count} words, highlighting key visual elements.",
        "Summarize the artwork's main features and style in a formal tone, using no more than {word_count} words.",
    ],
    ("descriptive", "formal", True, False): [
        "Compose a {length} formal critique of this image, discussing its artistic merits and visual impact.",
        "Create a {length} scholarly description of this artwork, analyzing its composition and aesthetic qualities.",
    ],
    ("descriptive", "informal", False, False): [
        "Describe this image as if you're explaining it to a friend, focusing on what stands out to you.",
        "Give a casual, conversational rundown of what you see in this artwork and how it makes you feel.",
    ],
    ("descriptive", "informal", False, True): [
        "In about {word_count} words, give a laid-back description of this image's vibe and key features.",
        "Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words.",
    ],
    ("descriptive", "informal", True, False): [
        "Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
        "Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye.",
    ],
    # --- training_prompt -----------------------------------------------------
    ("training_prompt", "formal", False, False): [
        "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
        "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork.",
    ],
    ("training_prompt", "formal", False, True): [
        "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
        "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content.",
    ],
    ("training_prompt", "formal", True, False): [
        "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
        "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image.",
    ],
    # --- rng-tags ------------------------------------------------------------
    ("rng-tags", "formal", False, False): [
        "Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
        "Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood.",
    ],
    ("rng-tags", "formal", False, True): [
        "Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
        "Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features.",
    ],
    ("rng-tags", "formal", True, False): [
        "Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
        "Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork.",
    ],
    # --- artistic_inspiration / technical_breakdown / emotional_response -----
    ("artistic_inspiration", "formal", False, False): [
        "Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
        "Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series.",
    ],
    ("artistic_inspiration", "informal", False, False): [
        "Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
        "Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make.",
    ],
    ("technical_breakdown", "formal", False, False): [
        "Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
        "Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study.",
    ],
    ("emotional_response", "informal", False, False): [
        "Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
        "Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking.",
    ],
    # --- thematic_analysis ---------------------------------------------------
    ("thematic_analysis", "formal", False, False): [
        "Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
        "Analyze the primary and secondary themes of this artwork, discussing their significance and interplay.",
    ],
    ("thematic_analysis", "formal", False, True): [
        "Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
        "Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages.",
    ],
    ("thematic_analysis", "formal", True, False): [
        "Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
        "Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance.",
    ],
    # --- stylistic_comparison ------------------------------------------------
    ("stylistic_comparison", "informal", False, False): [
        "Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
        "Describe how this artwork's style relates to [specific artist/style], and what makes it unique.",
    ],
    ("stylistic_comparison", "informal", False, True): [
        "In about {word_count} words, compare this image's style with other known art styles or artists.",
        "Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words.",
    ],
    ("stylistic_comparison", "informal", True, False): [
        "Write a {length} casual comparison of this image's style with other art movements or famous artists.",
        "Give a {length} relaxed description of how this artwork's style aligns or differs from other genres.",
    ],
    # --- narrative_suggestion ------------------------------------------------
    ("narrative_suggestion", "formal", False, False): [
        "Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
        "Develop a brief storyline that complements the themes and mood depicted in this artwork.",
    ],
    ("narrative_suggestion", "formal", False, True): [
        "Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
        "Compose a concise story idea based on the themes and composition of this artwork in {word_count} words.",
    ],
    ("narrative_suggestion", "formal", True, False): [
        "Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
        "Develop a {length} scholarly storyline that reflects the mood and composition of this artwork.",
    ],
    # --- contextual_storytelling ---------------------------------------------
    ("contextual_storytelling", "informal", False, False): [
        "Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
        "Imagine a background story for this artwork, explaining what's happening and why.",
    ],
    ("contextual_storytelling", "informal", False, True): [
        "In about {word_count} words, create a backstory for the scene depicted in this image.",
        "Summarize a possible background narrative for this artwork in {word_count} words.",
    ],
    ("contextual_storytelling", "informal", True, False): [
        "Write a {length} informal story that provides context to the scene portrayed in this image.",
        "Give a {length} casual backstory explaining the events depicted in this artwork.",
    ],
    # --- style_prompt (uses the {style*} placeholders) -----------------------
    ("style_prompt", "formal", False, False): [
        "Analyze this image through the lens of {style} art. Describe how it aligns with or diverges from {style_characteristics}.",
        "Examine this artwork in the context of the {style} movement, focusing on {style_focus} and how these elements are represented or reinterpreted in the image.",
    ],
    ("style_prompt", "formal", False, True): [
        "Within {word_count} words, compare this image to the {style} style, highlighting elements that reflect or contrast with its key characteristics.",
        "Compose a concise {word_count}-word analysis of how this artwork relates to the {style} movement, noting its adherence to or departure from typical {style} elements.",
    ],
    ("style_prompt", "formal", True, False): [
        "Write a {length} critique of this image, exploring its relationship to the {style} movement. Discuss composition, technique, and thematic elements in this context.",
        "Develop a {length} analysis of how this artwork incorporates or challenges the principles of {style}, considering its visual language and artistic approach.",
    ],
    ("style_prompt", "informal", False, False): [
        "Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
        "Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?",
    ],
    ("style_prompt", "informal", False, True): [
        "In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
        "Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?",
    ],
    ("style_prompt", "informal", True, False): [
        "Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
        "Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?",
    ],
}

# Optional Hugging Face access token taken from the environment (None if unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
|
class ImageAdapter(nn.Module):
    """Map vision-model hidden states to a sequence of text-model-sized embeddings.

    The adapter optionally layer-normalises and adds a learned positional
    embedding to the selected hidden states, runs them through a two-layer
    GELU MLP, and wraps the result with two learned marker embeddings (row 0
    in front, row 1 behind). A third learned row is exposed through
    ``get_eot_embedding``.

    Args:
        input_features: Width of a single vision hidden state.
        output_features: Width of the produced embeddings.
        ln1: Apply ``LayerNorm`` before the MLP when True.
        pos_emb: Add a learned ``(num_image_tokens, width)`` embedding when True.
        num_image_tokens: Number of image tokens expected by ``pos_emb``.
        deep_extract: Concatenate five hidden-state layers instead of using
            only the second-to-last one.
    """

    def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int, deep_extract: bool):
        super().__init__()
        self.deep_extract = deep_extract

        # With deep extraction five layers are concatenated along the feature
        # axis, so everything in front of the MLP sees five times the width.
        projector_in = input_features * 5 if deep_extract else input_features

        self.linear1 = nn.Linear(projector_in, output_features)
        self.activation = nn.GELU()
        self.linear2 = nn.Linear(output_features, output_features)
        self.ln1 = nn.LayerNorm(projector_in) if ln1 else nn.Identity()
        self.pos_emb = nn.Parameter(torch.zeros(num_image_tokens, projector_in)) if pos_emb else None

        # Three learned extra tokens: indices 0/1 wrap the image sequence in
        # forward(); index 2 is returned by get_eot_embedding().
        self.other_tokens = nn.Embedding(3, output_features)
        self.other_tokens.weight.data.normal_(mean=0.0, std=0.02)

    def forward(self, vision_outputs: torch.Tensor):
        """Return the wrapped image embeddings for an indexable stack of hidden states.

        ``vision_outputs`` is indexed per layer; each selected layer must be a
        3-D tensor. The result has two more positions along dim 1 than the
        input layers.
        """
        penultimate = vision_outputs[-2]
        if not self.deep_extract:
            x = penultimate
        else:
            selected = [penultimate] + [vision_outputs[idx] for idx in (3, 7, 13, 20)]
            x = torch.cat(selected, dim=-1)
            assert x.dim() == 3, f"Expected 3, got {x.dim()}"
            expected_width = penultimate.shape[-1] * 5
            assert x.shape[-1] == expected_width, f"Expected {expected_width}, got {x.shape[-1]}"

        x = self.ln1(x)

        if self.pos_emb is not None:
            assert x.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}"
            x = x + self.pos_emb

        x = self.linear2(self.activation(self.linear1(x)))

        # Look up marker rows 0 and 1 once per batch element.
        batch_size = x.shape[0]
        marker_ids = torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(batch_size, -1)
        other_tokens = self.other_tokens(marker_ids)
        expected_shape = (batch_size, 2, x.shape[2])
        assert other_tokens.shape == expected_shape, f"Expected {expected_shape}, got {other_tokens.shape}"

        leading, trailing = other_tokens[:, 0:1], other_tokens[:, 1:2]
        return torch.cat((leading, x, trailing), dim=1)

    def get_eot_embedding(self):
        """Return the learned embedding stored at index 2 as a 1-D tensor."""
        eot_index = torch.tensor([2], device=self.other_tokens.weight.device)
        return self.other_tokens(eot_index).squeeze(0)
|
|
|
|
|
|
|
|
|
# --- Vision tower -------------------------------------------------------------
print("Loading CLIP")
clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
# Only the vision half of the pretrained model is kept.
clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model

# Optional fine-tuned vision weights shipped alongside the checkpoint.
if (CHECKPOINT_PATH / "clip_model.pt").exists():
    print("Loading VLM's custom vision model")
    checkpoint = torch.load(CHECKPOINT_PATH / "clip_model.pt", map_location='cpu')
    # Drop the "_orig_mod.module." prefix from parameter names so the keys
    # line up with the plain vision model (presumably left behind by wrapper
    # modules used during training -- confirm against the training code).
    checkpoint = {
        param_name.replace("_orig_mod.module.", ""): tensor
        for param_name, tensor in checkpoint.items()
    }
    clip_model.load_state_dict(checkpoint)
    del checkpoint

# Inference only: evaluation mode, no gradients, resident on the GPU.
clip_model.eval().requires_grad_(False)
clip_model.to("cuda")

# --- Tokenizer ----------------------------------------------------------------
print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)), f"Tokenizer is of type {type(tokenizer)}"
|
|
|
|
|
# --- Language model -----------------------------------------------------------
print("Loading LLM")
# Path.exists has to be *called*: the bare bound method is always truthy, which
# made the fallback branch below unreachable and forced a load from a directory
# that might not be there.
if (CHECKPOINT_PATH / "text_model").exists():
    print("Loading VLM's custom text model")
    text_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16)
else:
    # No fine-tuned text model in the checkpoint: use the base model.
    text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)

text_model.eval()
|
|
|
|
|
# --- Image adapter ------------------------------------------------------------
print("Loading image adapter")
image_adapter = ImageAdapter(
    input_features=clip_model.config.hidden_size,
    output_features=text_model.config.hidden_size,
    ln1=False,
    pos_emb=False,
    num_image_tokens=38,
    deep_extract=False,
)
image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
image_adapter.eval()
image_adapter.to("cuda")

# --- Tokenizer / model diagnostics --------------------------------------------
print(f"Tokenizer class: {type(tokenizer)}")
print(f"BOS token: {tokenizer.bos_token}")
print(f"BOS token ID: {tokenizer.bos_token_id}")
print(f"EOS token: {tokenizer.eos_token}")
print(f"EOS token ID: {tokenizer.eos_token_id}")
print(f"Text model device: {text_model.device}")

# Give the tokenizer a BOS and/or EOS token when it does not define one, then
# make the embedding matrix match the tokenizer size.
if None in (tokenizer.bos_token_id, tokenizer.eos_token_id):
    print("Warning: BOS or EOS token is missing. Adding default tokens.")
    special_tokens_dict = {
        token_name: '<|endoftext|>'
        for token_name, token_id in (
            ('bos_token', tokenizer.bos_token_id),
            ('eos_token', tokenizer.eos_token_id),
        )
        if token_id is None
    }
    num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    print(f"Added {num_added_tokens} special tokens to the tokenizer.")

    text_model.resize_token_embeddings(len(tokenizer))
|
|
|
@spaces.GPU()
@torch.no_grad()
def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
    """Generate text about ``input_image`` with the vision model, adapter and LLM.

    Args:
        input_image: Image to describe.
        caption_type: Kind of output; first element of the ``CAPTION_TYPE_MAP`` key.
        caption_tone: ``"formal"`` or ``"informal"``; forced to ``"formal"`` for
            ``"rng-tags"`` and ``"training_prompt"``.
        caption_length: ``"any"`` for no constraint, a descriptive string such
            as ``"short"``, or a word count given as an int or numeric string.
        art_style: Style name substituted into the ``{style}`` placeholders.

    Returns:
        The generated text with surrounding whitespace stripped.

    Raises:
        gr.Error: If no image was supplied.
        ValueError: If ``CAPTION_TYPE_MAP`` has no template for the requested
            type / tone / length combination.
    """
    # The image component yields None when nothing was uploaded; fail with a
    # readable message instead of an AttributeError on ``.resize``.
    if input_image is None:
        raise gr.Error("Please upload an image before generating a caption.")

    torch.cuda.empty_cache()

    # Normalise the requested length: None (no constraint), an int word count,
    # or a descriptive string.
    length = None
    if isinstance(caption_length, str) and caption_length != "any":
        try:
            length = int(caption_length)
        except ValueError:
            # Not numeric, e.g. "short": keep the descriptive string.
            length = caption_length
    elif isinstance(caption_length, int):
        length = caption_length

    # Tag and training-prompt templates only exist in the formal tone.
    if caption_type in ["rng-tags", "training_prompt"]:
        caption_tone = "formal"

    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
    if prompt_key not in CAPTION_TYPE_MAP:
        raise ValueError(f"Invalid caption type: {prompt_key}")

    # Always use the first template; unused placeholders are ignored by format().
    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
        length=length,
        word_count=length,
        style=art_style,
        style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
        style_focus=STYLE_FOCUS.get(art_style, "its distinctive features"),
    )
    print(f"Prompt: {prompt_str}")

    # Preprocess: 384x384, scale to [0, 1], then normalise to [-1, 1].
    image = input_image.resize((384, 384), Image.LANCZOS)
    pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
    pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
    pixel_values = pixel_values.to('cuda')

    # Tokenize the prompt without special tokens; BOS and EOT are added below.
    prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)

    # Embed the image under autocast.
    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
        image_features = vision_outputs.hidden_states
        embedded_images = image_adapter(image_features)
        embedded_images = embedded_images.to('cuda')

    # Embed the prompt.
    prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
    assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"

    bos_token_id = tokenizer.bos_token_id
    if bos_token_id is None:
        print("Warning: bos_token_id is None. Using default value of 1.")
        bos_token_id = 1

    # "<|eot_id|>" may be missing from the loaded tokenizer's vocabulary, in
    # which case the lookup can yield None and torch.tensor() would reject it.
    # Its embedding comes from the adapter (eot_embed), so the id is only
    # needed as a placeholder in input_ids: fall back to EOS, then to 0.
    eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if eot_token_id is None:
        eot_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0

    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
    eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)

    # Sequence layout: [BOS] [image tokens] [prompt tokens] [EOT]
    batch_size = embedded_images.shape[0]
    inputs_embeds = torch.cat([
        embedded_bos.expand(batch_size, -1, -1),
        embedded_images.to(dtype=embedded_bos.dtype),
        prompt_embeds.expand(batch_size, -1, -1),
        eot_embed.expand(batch_size, -1, -1),
    ], dim=1)

    # Token ids mirroring the embeddings; image positions use id 0 as filler.
    input_ids = torch.cat([
        torch.tensor([[bos_token_id]], dtype=torch.long),
        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
        prompt,
        torch.tensor([[eot_token_id]], dtype=torch.long),
    ], dim=1).to('cuda')
    attention_mask = torch.ones_like(input_ids)

    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)

    # Keep only the newly generated tokens and drop one trailing end marker.
    generate_ids = generate_ids[:, input_ids.shape[1]:]
    if generate_ids.shape[1] > 0 and generate_ids[0, -1].item() in (tokenizer.eos_token_id, eot_token_id):
        generate_ids = generate_ids[:, :-1]

    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]

    return caption.strip()
|
|
|
# Stylesheet passed to gr.Blocks: centres headings, paragraphs, lists, links
# and ".centered-image" elements, and keeps such images within their container.
css = """
h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
    text-align: center;
    display: block;
    margin-left: auto;
    margin-right: auto;
}
ul, ol {
    margin-left: auto;
    margin-right: auto;
    display: table;
}
.centered-image {
    max-width: 100%;
    height: auto;
}
"""
|
|
|
# Art styles offered in the "Art Style" dropdown.
ART_STYLES = [
    "Impressionism",
    "Cubism",
    "Surrealism",
    "Abstract Expressionism",
    "Pop Art",
    "Minimalism",
    "Baroque",
    "Renaissance",
    "Art Nouveau",
    "Gothic",
    "Romanticism",
    "Realism",
    "Expressionism",
    "Fauvism",
    "Art Deco",
    "Futurism",
    "Dadaism",
    "Pointillism",
    "Rococo",
    "Neoclassicism",
]

# Per-style text substituted for the {style_characteristics} placeholder.
STYLE_CHARACTERISTICS = {
    "Impressionism": "loose brushstrokes, emphasis on light and color, everyday subjects",
    "Cubism": "geometric shapes, multiple perspectives, fragmented forms",
    "Surrealism": "dreamlike imagery, unexpected juxtapositions, subconscious exploration",
    "Abstract Expressionism": "expressive brushwork, emotional content, abstract forms",
    "Pop Art": "bright colors, popular culture references, satire",
    "Minimalism": "simple forms, limited color palette, emphasis on space",
    "Baroque": "dramatic lighting, elaborate detail, grandeur",
    "Renaissance": "realistic depictions, perspective, religious themes",
    "Art Nouveau": "stylized forms, organic shapes, decorative elements",
    "Gothic": "dark themes, dramatic lighting, architectural elements",
    "Romanticism": "emotional content, nature scenes, idealized figures",
    "Realism": "detailed depictions, realistic textures, everyday subjects",
    "Expressionism": "emotional content, distorted forms, abstract elements",
    "Fauvism": "bold colors, abstract forms, emotional content",
    "Art Deco": "geometric shapes, streamlined forms, modern aesthetics",
    "Futurism": "dynamic forms, speed, technology",
    "Dadaism": "anti-art, absurdity, subversion of traditional art",
    "Pointillism": "small dots of color, impressionistic style, emphasis on light",
    "Rococo": "ornate style, lighthearted themes, decorative elements",
    "Neoclassicism": "classical style, balance, symmetry",
}

# Per-style text substituted for the {style_focus} placeholder.
STYLE_FOCUS = {
    "Impressionism": "capturing fleeting moments and atmospheric effects",
    "Cubism": "deconstructing and reassembling forms from multiple viewpoints",
    "Surrealism": "creating a sense of the uncanny and exploring the subconscious mind",
    "Abstract Expressionism": "expressing emotional content through abstract forms",
    "Pop Art": "commenting on popular culture and satirizing consumerism",
    "Minimalism": "exploring the relationship between form and space",
    "Baroque": "creating dramatic and grandiose compositions",
    "Renaissance": "depicting realistic scenes and exploring perspective",
    "Art Nouveau": "incorporating organic and decorative elements",
    "Gothic": "exploring dark themes and dramatic lighting",
    "Romanticism": "depicting emotional scenes and idealized figures",
    "Realism": "capturing detailed and realistic textures",
    "Expressionism": "expressing emotional content through distorted forms",
    "Fauvism": "emphasizing bold colors and emotional content",
    "Art Deco": "incorporating geometric shapes and modern aesthetics",
    "Futurism": "depicting speed, technology, and dynamism",
    "Dadaism": "subverting traditional art and exploring absurdity",
    "Pointillism": "capturing light and color through small dots",
    "Rococo": "creating lighthearted and decorative compositions",
    "Neoclassicism": "achieving balance and symmetry in classical style",
}
|
|
|
# Gradio UI: a welcome tab and the captioning tab.
with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
    with gr.Tab("Welcome"):
        gr.Markdown(
            """
            <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">

            # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration

            ## Accelerate Your Creative Workflow with Intelligent Image Analysis

            This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
            training prompts, or tags from existing artwork, fueling the creative process for GenAI models.

            ## 🚀 How It Works:
            1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
            2. **Choose Your Output**: Select from descriptive captions, training prompts, or tags.
            3. **Customize the Results**: Adjust tone, length, and other parameters to fine-tune the output.
            4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
            """
        )

    with gr.Tab("JoyCaption"):
        gr.Markdown("""
            # JoyCaption: AI-Powered Image Analysis Tool

            This tool helps you generate various types of text based on an uploaded image. Here's how to use it:

            1. Upload an image
            2. Choose your desired output type
            3. Adjust settings as needed
            4. Click 'Generate Caption' to get your result
            """)

        with gr.Row():
            # Left column: image upload and generation settings.
            with gr.Column(scale=1):
                input_image = gr.Image(type="pil", label="Upload Your Image")

                caption_type = gr.Dropdown(
                    choices=[
                        "descriptive",
                        "training_prompt",
                        "rng-tags",
                        "thematic_analysis",
                        "stylistic_comparison",
                        "narrative_suggestion",
                        "contextual_storytelling",
                        "style_prompt"
                    ],
                    label="Output Type",
                    value="descriptive",
                )

                gr.Markdown("""
                    ### Output Types Explained:
                    - **Descriptive**: A general description of the image
                    - **Training Prompt**: A prompt for AI image generation
                    - **RNG-Tags**: Tags for categorizing the image
                    - **Thematic Analysis**: Exploration of themes in the image
                    - **Stylistic Comparison**: Compares the image to art styles
                    - **Narrative Suggestion**: A story idea based on the image
                    - **Contextual Storytelling**: A background story for the image
                    - **Style Prompt**: Analyzes the image in context of a specific art style
                    """)

                caption_tone = gr.Dropdown(
                    choices=["formal", "informal"],
                    label="Tone",
                    value="formal",
                )

                gr.Markdown("Choose between a formal (professional) or informal (casual) tone for the output.")

                caption_length = gr.Dropdown(
                    choices=["any", "very short", "short", "medium-length", "long", "very long"] +
                            [str(i) for i in range(20, 261, 10)],
                    label="Length",
                    value="any",
                )

                gr.Markdown("""
                    Select the desired length of the output:
                    - 'any': No specific length
                    - Descriptive options: very short to very long
                    - Numeric options: Specify exact word count (20 to 260 words)
                    """)

                # Hidden until "style_prompt" is selected (see the change handler below).
                art_style = gr.Dropdown(
                    choices=ART_STYLES,
                    label="Art Style (for Style Prompt)",
                    value="Impressionism",
                    visible=False
                )

                gr.Markdown("Select an art style to analyze the image in that context. Only applicable for 'Style Prompt' output type.")

            # Right column: primary output and its button.
            with gr.Column(scale=1):
                output_caption = gr.Textbox(label="Generated Output", lines=10)
                generate_button = gr.Button("Generate Caption")

        gr.Markdown("""
            ### Additional Notes:
            - The 'Tone' setting doesn't affect 'RNG-Tags' and 'Training Prompt' outputs.
            - 'Art Style' is only used when 'Style Prompt' is selected as the output type.
            - The AI model analyzes the image and generates text based on your selections.
            """)

        run_button = gr.Button("Caption")

        with gr.Column():
            # Previously this textbox was also bound to `output_caption`, which
            # shadowed the "Generated Output" box above so that it never
            # received any text. It now has its own name.
            legacy_caption = gr.Textbox(label="Caption")

    # Show the art-style dropdown only for the "style_prompt" output type.
    caption_type.change(
        fn=lambda x: gr.update(visible=(x == "style_prompt")),
        inputs=[caption_type],
        outputs=[art_style]
    )

    # Each button fills the textbox that sits next to it.
    caption_inputs = [input_image, caption_type, caption_tone, caption_length, art_style]
    generate_button.click(
        fn=stream_chat,
        inputs=caption_inputs,
        outputs=[output_caption]
    )
    run_button.click(fn=stream_chat, inputs=caption_inputs, outputs=[legacy_caption])


if __name__ == "__main__":
    demo.launch()