Update app.py
app.py
CHANGED
@@ -12,147 +12,28 @@ import torchvision.transforms.functional as TVF
 
 
 CLIP_PATH = "google/siglip-so400m-patch14-384"
-MODEL_PATH = "
+MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
 CHECKPOINT_PATH = Path("9em124t2-499968")
+TITLE = "<h1><center>JoyCaption Alpha One (2024-09-20a)</center></h1>"
 CAPTION_TYPE_MAP = {
-    ("descriptive", "formal", False, False): [
-
-
-    ],
-    ("descriptive", "formal", False, True): [
-
-
-    ],
-    ("descriptive", "formal", True, False): [
-
-
-    ],
-    ("descriptive", "informal", False, False): [
-
-
-    ],
-    ("descriptive", "informal", False, True): [
-
-        "Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
-    ],
-    ("descriptive", "informal", True, False): [
-        "Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
-        "Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
-    ],
-    ("training_prompt", "formal", False, False): [
-        "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
-        "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
-    ],
-    ("training_prompt", "formal", False, True): [
-        "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
-        "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
-    ],
-    ("training_prompt", "formal", True, False): [
-        "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
-        "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
-    ],
-    ("rng-tags", "formal", False, False): [
-        "Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
-        "Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
-    ],
-    ("rng-tags", "formal", False, True): [
-        "Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
-        "Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
-    ],
-    ("rng-tags", "formal", True, False): [
-        "Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
-        "Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
-    ],
-    ("artistic_inspiration", "formal", False, False): [
-        "Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
-        "Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
-    ],
-    ("artistic_inspiration", "informal", False, False): [
-        "Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
-        "Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
-    ],
-    ("technical_breakdown", "formal", False, False): [
-        "Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
-        "Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
-    ],
-    ("emotional_response", "informal", False, False): [
-        "Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
-        "Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
-    ],
-
-    ("thematic_analysis", "formal", False, False): [
-        "Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
-        "Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
-    ],
-    ("thematic_analysis", "formal", False, True): [
-        "Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
-        "Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
-    ],
-    ("thematic_analysis", "formal", True, False): [
-        "Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
-        "Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
-    ],
-    ("stylistic_comparison", "informal", False, False): [
-        "Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
-        "Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
-    ],
-    ("stylistic_comparison", "informal", False, True): [
-        "In about {word_count} words, compare this image's style with other known art styles or artists.",
-        "Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
-    ],
-    ("stylistic_comparison", "informal", True, False): [
-        "Write a {length} casual comparison of this image's style with other art movements or famous artists.",
-        "Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
-    ],
-    ("narrative_suggestion", "formal", False, False): [
-        "Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
-        "Develop a brief storyline that complements the themes and mood depicted in this artwork."
-    ],
-    ("narrative_suggestion", "formal", False, True): [
-        "Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
-        "Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
-    ],
-    ("narrative_suggestion", "formal", True, False): [
-        "Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
-        "Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
-    ],
-    ("contextual_storytelling", "informal", False, False): [
-        "Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
-        "Imagine a background story for this artwork, explaining what's happening and why."
-    ],
-    ("contextual_storytelling", "informal", False, True): [
-        "In about {word_count} words, create a backstory for the scene depicted in this image.",
-        "Summarize a possible background narrative for this artwork in {word_count} words."
-    ],
-    ("contextual_storytelling", "informal", True, False): [
-        "Write a {length} informal story that provides context to the scene portrayed in this image.",
-        "Give a {length} casual backstory explaining the events depicted in this artwork."
-    ],
-
-    ("style_prompt", "formal", False, False): [
-        "Analyze this image through the lens of {style} art. Describe how it aligns with or diverges from {style_characteristics}.",
-        "Examine this artwork in the context of the {style} movement, focusing on {style_focus} and how these elements are represented or reinterpreted in the image."
-    ],
-    ("style_prompt", "formal", False, True): [
-        "Within {word_count} words, compare this image to the {style} style, highlighting elements that reflect or contrast with its key characteristics.",
-        "Compose a concise {word_count}-word analysis of how this artwork relates to the {style} movement, noting its adherence to or departure from typical {style} elements."
-    ],
-    ("style_prompt", "formal", True, False): [
-        "Write a {length} critique of this image, exploring its relationship to the {style} movement. Discuss composition, technique, and thematic elements in this context.",
-        "Develop a {length} analysis of how this artwork incorporates or challenges the principles of {style}, considering its visual language and artistic approach."
-    ],
-    ("style_prompt", "informal", False, False): [
-        "Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
-        "Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
-    ],
-    ("style_prompt", "informal", False, True): [
-        "In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
-        "Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
-    ],
-    ("style_prompt", "informal", True, False): [
-        "Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
-        "Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
-    ],
+    ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
+    ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
+    ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
+    ("descriptive", "informal", False, False): ["Write a descriptive caption for this image in a casual tone."],
+    ("descriptive", "informal", False, True): ["Write a descriptive caption for this image in a casual tone within {word_count} words."],
+    ("descriptive", "informal", True, False): ["Write a {length} descriptive caption for this image in a casual tone."],
+
+    ("training_prompt", "formal", False, False): ["Write a stable diffusion prompt for this image."],
+    ("training_prompt", "formal", False, True): ["Write a stable diffusion prompt for this image within {word_count} words."],
+    ("training_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt for this image."],
+
+    ("rng-tags", "formal", False, False): ["Write a list of Booru tags for this image."],
+    ("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
+    ("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
+
+    ("style_prompt", "formal", False, False): ["Generate a detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
+    ("style_prompt", "formal", False, True): ["Generate a detailed style prompt for this image within {word_count} words, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
+    ("style_prompt", "formal", True, False): ["Generate a {length} detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
 }
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
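The rewrite above flattens CAPTION_TYPE_MAP to a single template per key, where the key is (caption_type, caption_tone, isinstance(length, str), isinstance(length, int)). A minimal sketch of how that lookup resolves, using a trimmed copy of the new map; note that the committed stream_chat (next hunk) raises on non-numeric lengths, while this sketch keeps them as descriptive strings so the {length} templates stay reachable:

import torch  # not needed for the lookup itself; shown bare for clarity

# Trimmed copy of the new map above.
CAPTION_TYPE_MAP = {
    ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
    ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
    ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
}

def build_prompt(caption_type: str, caption_tone: str, caption_length) -> str:
    # "any" means no length constraint; numeric strings become ints,
    # anything else (e.g. "very long") stays a descriptive string.
    length = None if caption_length == "any" else caption_length
    if isinstance(length, str):
        try:
            length = int(length)
        except ValueError:
            pass
    key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
    return CAPTION_TYPE_MAP[key][0].format(length=length, word_count=length)

print(build_prompt("descriptive", "formal", "any"))        # (False, False) template
print(build_prompt("descriptive", "formal", "very long"))  # {length} template
print(build_prompt("descriptive", "formal", "40"))         # {word_count} template

At most one of the two booleans is true, so "any" (length None) falls through to the (False, False) template.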
@@ -257,122 +138,105 @@ text_model.eval()
 # Image Adapter
 print("Loading image adapter")
 image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
-image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
+image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=True))
 image_adapter.eval()
 image_adapter.to("cuda")
 
-# After loading the tokenizer and model
-print(f"Tokenizer class: {type(tokenizer)}")
-print(f"BOS token: {tokenizer.bos_token}")
-print(f"BOS token ID: {tokenizer.bos_token_id}")
-print(f"EOS token: {tokenizer.eos_token}")
-print(f"EOS token ID: {tokenizer.eos_token_id}")
-print(f"Text model device: {text_model.device}")
-
-# Ensure the tokenizer has the necessary special tokens
-if tokenizer.bos_token_id is None or tokenizer.eos_token_id is None:
-    print("Warning: BOS or EOS token is missing. Adding default tokens.")
-    special_tokens_dict = {}
-    if tokenizer.bos_token_id is None:
-        special_tokens_dict['bos_token'] = '<|endoftext|>'
-    if tokenizer.eos_token_id is None:
-        special_tokens_dict['eos_token'] = '<|endoftext|>'
-    num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
-    print(f"Added {num_added_tokens} special tokens to the tokenizer.")
-
-# Resize token embeddings of the model if new tokens are added
-text_model.resize_token_embeddings(len(tokenizer))
-
-@spaces.GPU()
-@torch.no_grad()
-def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
-    torch.cuda.empty_cache()
-
-    # Handle caption_length
-    length = None
-    if caption_length != "any":
-        if isinstance(caption_length, int):
-            length = caption_length
-        elif isinstance(caption_length, str):
-            try:
-                length = int(caption_length)
-            except ValueError:
-                # If it's not a number, treat it as a descriptive length
-                length = caption_length
-
-    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
-    if caption_type in ["rng-tags", "training_prompt"]:
-        caption_tone = "formal"
-
-    # Build prompt
-    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
-    if prompt_key not in CAPTION_TYPE_MAP:
-        raise ValueError(f"Invalid caption type: {prompt_key}")
-
-    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
-        length=length,
-        word_count=length,
-        style=art_style,
-        style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
-        style_focus=STYLE_FOCUS.get(art_style, "its distinctive features")
-    )
-    print(f"Prompt: {prompt_str}")
 
-
+def preprocess_image(input_image: Image.Image) -> torch.Tensor:
+    """
+    Preprocess the input image for the CLIP model.
+    """
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-
+    return pixel_values.to('cuda')
 
-    pixel_values = pixel_values.to('cuda')
+def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max_new_tokens: int = 300) -> str:
+    """
+    Generate a caption based on the image features and prompt.
+    """
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
-
-    # Embed image
-    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
-        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
-        image_features = vision_outputs.hidden_states
-        embedded_images = image_adapter(image_features)
-        embedded_images = embedded_images.to('cuda')
-
-    # Embed prompt
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
-
-
-    # Check for bos_token_id and provide a fallback
-    bos_token_id = tokenizer.bos_token_id
-    if bos_token_id is None:
-        print("Warning: bos_token_id is None. Using default value of 1.")
-        bos_token_id = 1  # Common default, but may need adjustment
-
-    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
+    embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
     eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
 
-    # Construct prompts
     inputs_embeds = torch.cat([
-        embedded_bos.expand(
-
-        prompt_embeds.expand(
-        eot_embed.expand(
+        embedded_bos.expand(image_features.shape[0], -1, -1),
+        image_features.to(dtype=embedded_bos.dtype),
+        prompt_embeds.expand(image_features.shape[0], -1, -1),
+        eot_embed.expand(image_features.shape[0], -1, -1),
     ], dim=1)
 
     input_ids = torch.cat([
-        torch.tensor([[bos_token_id]], dtype=torch.long),
-        torch.zeros((1,
+        torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
+        torch.zeros((1, image_features.shape[1]), dtype=torch.long),
         prompt,
         torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
     ], dim=1).to('cuda')
     attention_mask = torch.ones_like(input_ids)
 
-    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=
+    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, suppress_tokens=None)
 
-    # Trim off the prompt
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
 
-    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
+    return tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0].strip()
+
+@spaces.GPU()
+@torch.no_grad()
+def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "", lighting_aspect: str = "", special_technique: str = "", color_effect: str = "") -> str:
+    """
+    Generate a caption or style prompt based on the input image and parameters.
+    """
+    torch.cuda.empty_cache()
+
+    try:
+        length = None if caption_length == "any" else caption_length
+        if isinstance(length, str):
+            length = int(length)
+    except ValueError:
+        raise ValueError(f"Invalid caption length: {caption_length}")
+
+    if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
+        caption_tone = "formal"
+
+    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
+    if prompt_key not in CAPTION_TYPE_MAP:
+        raise ValueError(f"Invalid caption type: {prompt_key}")
+
+    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
+
+    if caption_type == "style_prompt":
+        prompt_str += f" Lens type: {lens_type} ({lens_types_info[lens_type]}). "
+        prompt_str += f"Film stock: {film_stock} ({film_stocks_info[film_stock]}). "
+        prompt_str += f"Composition style: {composition_style} ({composition_styles_info[composition_style]}). "
+        prompt_str += f"Lighting aspect: {lighting_aspect} ({lighting_aspects_info[lighting_aspect]}). "
+        prompt_str += f"Special technique: {special_technique} ({special_techniques_info[special_technique]}). "
+        prompt_str += f"Color effect: {color_effect} ({color_effects_info[color_effect]})."
+
+    # Debugging: Print the constructed prompt string
+    print(f"Constructed Prompt: {prompt_str}")
+
+    pixel_values = preprocess_image(input_image)
+
+    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
+        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+        image_features = vision_outputs.hidden_states
+        embedded_images = image_adapter(image_features)
+        embedded_images = embedded_images.to('cuda')
+
+    # Load the model from MODEL_PATH
+    text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
+    text_model.eval()
+
+    # Debugging: Print the prompt string before passing to generate_caption
+    print(f"Prompt passed to generate_caption: {prompt_str}")
 
-    return caption.strip()
+    caption = generate_caption(text_model, tokenizer, embedded_images, prompt_str)
+
+    return caption
 
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
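In generate_caption above, inputs_embeds and input_ids must describe the same sequence layout: the BOS embedding, the image-adapter tokens, the prompt embeddings, then the learned <|eot_id|> embedding, with zero ids standing in for the image positions. A shape-only sketch with dummy tensors (the 38-token image length mirrors the adapter's constructor argument, which is an assumption, not something the diff states):

import torch

hidden, n_img, n_prompt = 16, 38, 5            # dummy hidden size / image tokens / prompt tokens
embedded_bos = torch.zeros(1, 1, hidden)       # embedding of the BOS token
image_tokens = torch.zeros(1, n_img, hidden)   # output of the image adapter
prompt_embeds = torch.zeros(1, n_prompt, hidden)
eot_embed = torch.zeros(1, 1, hidden)          # learned <|eot_id|> embedding

inputs_embeds = torch.cat([embedded_bos, image_tokens, prompt_embeds, eot_embed], dim=1)

# input_ids mirrors the layout; the zeros are placeholders for the image
# slots and exist only so the two sequences have matching lengths.
input_ids = torch.cat([
    torch.tensor([[1]], dtype=torch.long),         # bos_token_id (dummy value)
    torch.zeros((1, n_img), dtype=torch.long),     # image positions
    torch.zeros((1, n_prompt), dtype=torch.long),  # prompt token ids (dummy)
    torch.tensor([[2]], dtype=torch.long),         # <|eot_id|> (dummy value)
], dim=1)

assert inputs_embeds.shape[1] == input_ids.shape[1] == 1 + n_img + n_prompt + 1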
@@ -392,63 +256,110 @@ ul, ol {
 }
 """
 
-
-
-    "
-    "
-    "
-
-
-
-    "
-    "
-    "
-
-
-
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
+# Add detailed descriptions for each option
+lens_types_info = {
+    "Standard": "A versatile lens with a field of view similar to human vision.",
+    "Wide-angle": "Captures a wider field of view, great for landscapes and architecture. Applies moderate to strong lens effect with image warp.",
+    "Telephoto": "Used for distant subjects, gives an 'award-winning' or 'National Geographic' look. Creates interesting effects when prompted.",
+    "Macro": "For extreme close-up photography, revealing tiny details.",
+    "Fish-eye": "Ultra-wide-angle lens that creates a strong bubble-like distortion. Generates panoramic photos with the entire image warping into a bubble.",
+    "Tilt-shift": "Allows adjusting the plane of focus, creating a 'miniature' effect. Known for the 'diorama miniature look'.",
+    "Zoom lens": "Variable focal length lens. Often zooms in on the subject, perfect for creating a base for inpainting. Interesting effect on landscapes with motion blur.",
+    "GoPro": "Wide-angle lens with clean digital look. Excludes film grain and most filter effects, resulting in natural colors and regular saturation.",
+    "Pinhole camera": "Creates a unique, foggy, low-detail, historic photograph look. Used since the 1850s, with peak popularity in the 1930s."
+}
+
+film_stocks_info = {
+    "Kodak Portra": "Professional color negative film known for its natural skin tones and low contrast.",
+    "Fujifilm Velvia": "Slide film known for vibrant colors and high saturation, popular among landscape photographers.",
+    "Ilford Delta": "Black and white film known for its fine grain and high sharpness.",
+    "Kodak Tri-X": "Classic high-speed black and white film, known for its distinctive grain and wide exposure latitude.",
+    "Fujifilm Provia": "Color reversal film known for its natural color reproduction and fine grain.",
+    "Cinestill": "Color photos with fine/low grain and higher than average resolution. Colors are slightly oversaturated or slightly desaturated.",
+    "Ektachrome": "Color photos with fine/low to moderate grain. Colors on the colder part of spectrum or regular, with normal or slightly higher saturation.",
+    "Ektar": "Modern Kodak film. Color photos with little to no grain. Results look like regular modern photography with artistic angles.",
+    "Film Washi": "Mostly black and white photos with fine/low to moderate grain. Occasionally gives colored photos with low saturation. Distinct style with high black contrast and soft camera lens effect.",
+    "Fomapan": "Black and white photos with fine/low to moderate grain, highly artistic exposure and angles. Adds very soft lens effect without distortion, dark photo vignette.",
+    "Fujicolor": "Color photos with fine/low to moderate grain. Colors are slightly or notably desaturated, with entire color hue shifted in a very distinct manner.",
+    "Holga": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
+    "Instax": "Instant color photos similar to Polaroid but clearer. Near perfect colors, regular saturation, fine/low to medium grain.",
+    "Lomography": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
+    "Kodachrome": "Color photos with moderate grain. Colors on either colder part of spectrum or regular, with normal or slightly higher saturation.",
+    "Rollei": "Mostly black and white photos, sometimes color with fine/low grain. Can be sepia colored or have unusual hues and desaturation. Great for landscapes."
+}
+
+composition_styles_info = {
+    "Rule of Thirds": "Divides the frame into a 3x3 grid, placing key elements along the lines or at their intersections.",
+    "Golden Ratio": "Uses a spiral based on the golden ratio to create a balanced and aesthetically pleasing composition.",
+    "Symmetry": "Creates a mirror-like balance in the image, often used for architectural or nature photography.",
+    "Leading Lines": "Uses lines within the frame to draw the viewer's eye to the main subject or through the image.",
+    "Framing": "Uses elements within the scene to create a frame around the main subject.",
+    "Minimalism": "Simplifies the composition to its essential elements, often with a lot of negative space.",
+    "Fill the Frame": "The main subject dominates the entire frame, leaving little to no background.",
+    "Negative Space": "Uses empty space around the subject to create a sense of simplicity or isolation.",
+    "Centered Composition": "Places the main subject in the center of the frame, creating a sense of stability or importance.",
+    "Diagonal Lines": "Uses diagonal elements to create a sense of movement or dynamic tension in the image.",
+    "Triangular Composition": "Arranges elements in the frame to form a triangle, creating a sense of stability and harmony.",
+    "Radial Balance": "Arranges elements in a circular pattern around a central point, creating a sense of movement or completeness."
 }
 
-
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "Fauvism": "emphasizing bold colors and emotional content",
-    "Art Deco": "incorporating geometric shapes and modern aesthetics",
-    "Futurism": "depicting speed, technology, and dynamism",
-    "Dadaism": "subverting traditional art and exploring absurdity",
-    "Pointillism": "capturing light and color through small dots",
-    "Rococo": "creating lighthearted and decorative compositions",
-    "Neoclassicism": "achieving balance and symmetry in classical style"
+lighting_aspects_info = {
+    "Natural light": "Uses available light from the sun or sky, often creating soft, even illumination.",
+    "Studio lighting": "Controlled artificial lighting setup, allowing for precise manipulation of light and shadow.",
+    "Back light": "Light source behind the subject, creating silhouettes or rim lighting effects.",
+    "Split light": "Strong light source at 90-degree angle, lighting one half of the subject while leaving the other in shadow.",
+    "Broad light": "Light source at an angle to the subject, producing well-lit photographs with soft to moderate shadows.",
+    "Dim light": "Weak or distant light source, creating lower than average brightness and often dramatic images.",
+    "Flash photography": "Uses a brief, intense burst of light. Can be fill flash (even lighting) or harsh flash (strong contrasts).",
+    "Sunlight": "Direct light from the sun, often creating strong contrasts and warm tones.",
+    "Moonlight": "Soft, cool light from the moon, often creating a mysterious or romantic atmosphere.",
+    "Spotlight": "Focused beam of light illuminating a specific area, creating high contrast between light and shadow.",
+    "High-key lighting": "Bright, even lighting with minimal shadows, creating a light and airy feel.",
+    "Low-key lighting": "Predominantly dark tones with selective lighting, creating a moody or dramatic atmosphere.",
+    "Rembrandt lighting": "Classic portrait lighting technique creating a triangle of light on the cheek of the subject."
 }
 
+special_techniques_info = {
+    "Double exposure": "Superimposes two exposures to create a single image, often resulting in a dreamy or surreal effect.",
+    "Long exposure": "Uses a long shutter speed to capture motion over time, often creating smooth, blurred effects for moving elements.",
+    "Multiple exposure": "Superimposes multiple exposures, multiplying the subject or its key elements across the image.",
+    "HDR": "High Dynamic Range imaging, combining multiple exposures to capture a wider range of light and dark tones.",
+    "Bokeh effect": "Creates a soft, out-of-focus background, often with circular highlights.",
+    "Silhouette": "Captures the outline of a subject against a brighter background, creating a dramatic contrast.",
+    "Panning": "Follows a moving subject with the camera, creating a sharp subject with a blurred background.",
+    "Light painting": "Uses long exposure and moving light sources to 'paint' with light in the image.",
+    "Infrared photography": "Captures light in the infrared spectrum, often resulting in surreal, otherworldly images.",
+    "Ultraviolet photography": "Captures light in the ultraviolet spectrum, often revealing hidden patterns or creating a strong violet glow.",
+    "Kirlian photography": "High-voltage photographic technique that captures corona discharges around objects, creating a glowing effect.",
+    "Thermography": "Captures infrared radiation to create images based on temperature differences, resulting in false-color heat maps.",
+    "Astrophotography": "Specialized technique for capturing astronomical objects and celestial events, often resulting in stunning starry backgrounds.",
+    "Underwater photography": "Captures images beneath the surface of water, often in pools, seas, or aquariums.",
+    "Aerial photography": "Captures images from an elevated position, such as from drones, helicopters, or planes.",
+    "Macro photography": "Extreme close-up photography, revealing tiny details not visible to the naked eye."
+}
+
+color_effects_info = {
+    "Black and white": "Removes all color, leaving only shades of gray.",
+    "Sepia": "Reddish-brown monochrome effect, often associated with vintage photography.",
+    "Monochrome": "Uses variations of a single color.",
+    "Vintage color": "Muted or faded color palette reminiscent of old photographs.",
+    "Cross-processed": "Deliberate processing of film in the wrong chemicals, creating unusual color shifts.",
+    "Desaturated": "Reduces the intensity of all colors in the image.",
+    "Vivid colors": "Increases the saturation and intensity of colors.",
+    "Pastel colors": "Soft, pale colors with a light and airy feel.",
+    "High contrast": "Emphasizes the difference between light and dark areas in the image.",
+    "Low contrast": "Reduces the difference between light and dark areas, creating a softer look.",
+    "Color splash": "Converts most of the image to black and white while leaving one or more elements in color."
+}
+
+def get_dropdown_choices(info_dict):
+    return [f"{key}: {value}" for key, value in info_dict.items()]
+
+# Gradio interface
 with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
     with gr.Tab("Welcome"):
         gr.Markdown(
-
+            """
             <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
 
             # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
@@ -456,7 +367,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
             ## Accelerate Your Creative Workflow with Intelligent Image Analysis
 
             This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
-            training prompts,
+            training prompts, and tags from existing artwork, fueling the creative process for GenAI models.
 
             ## 🚀 How It Works:
             1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
@@ -465,109 +376,147 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
             4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
             """
         )
-
+
     with gr.Tab("JoyCaption"):
-        gr.
-
+        with gr.Accordion("How to Use JoyCaption", open=False):
+            gr.Markdown("""
+            # How to Use JoyCaption
+
+            Hello, artist! Let's make some fun captions for your pictures. Here's how:
+
+            1. **Pick a Picture**: Find a cool picture you want to talk about and upload it.
+
+            2. **Choose What You Want**:
+               - **Caption Type**:
+                 * "Descriptive" tells you what's in the picture
+                 * "Training Prompt" helps computers make similar pictures
+                 * "RNG-Tags" gives you short words about the picture
+                 * "Style Prompt" creates detailed prompts for image generation
 
-
+            3. **Pick a Style** (for "Descriptive" and "Style Prompt" only):
+               - "Formal" sounds like a teacher talking
+               - "Informal" sounds like a friend chatting
 
-
-
-
-
-
+            4. **Decide How Long**:
+               - "Any" lets the computer decide
+               - Or pick a size from "very short" to "very long"
+               - You can even choose a specific number of words!
+
+            5. **Advanced Options** (for "Style Prompt" only):
+               - Choose lens type, film stock, composition, and lighting details
+
+            6. **Make the Caption**: Click the "Make My Caption!" button and watch the magic happen!
+
+            Remember, have fun and be creative with your captions!
+
+            ## Tips for Great Captions:
+            - Try different types to see what you like best
+            - Experiment with formal and informal tones for fun variations
+            - Adjust the length to get just the right amount of detail
+            - For "Style Prompt", play with the advanced options for more specific results
+            - If you don't like a caption, just click "Make My Caption!" again for a new one
+
+            Have a great time captioning your art!
+            """)
 
         with gr.Row():
-            with gr.Column(
-                input_image = gr.Image(type="pil", label="
+            with gr.Column():
+                input_image = gr.Image(type="pil", label="Input Image")
 
                 caption_type = gr.Dropdown(
-                    choices=[
-                        "descriptive",
-                        "training_prompt",
-                        "rng-tags",
-                        "thematic_analysis",
-                        "stylistic_comparison",
-                        "narrative_suggestion",
-                        "contextual_storytelling",
-                        "style_prompt"
-                    ],
-                    label="Output Type",
+                    choices=["descriptive", "training_prompt", "rng-tags", "style_prompt"],
+                    label="Caption Type",
                     value="descriptive",
                 )
 
-                gr.Markdown("""
-                ### Output Types Explained:
-                - **Descriptive**: A general description of the image
-                - **Training Prompt**: A prompt for AI image generation
-                - **RNG-Tags**: Tags for categorizing the image
-                - **Thematic Analysis**: Exploration of themes in the image
-                - **Stylistic Comparison**: Compares the image to art styles
-                - **Narrative Suggestion**: A story idea based on the image
-                - **Contextual Storytelling**: A background story for the image
-                - **Style Prompt**: Analyzes the image in context of a specific art style
-                """)
-
                 caption_tone = gr.Dropdown(
                     choices=["formal", "informal"],
-                    label="Tone",
+                    label="Caption Tone",
                     value="formal",
                 )
 
-                gr.Markdown("Choose between a formal (professional) or informal (casual) tone for the output.")
-
                 caption_length = gr.Dropdown(
                     choices=["any", "very short", "short", "medium-length", "long", "very long"] +
                             [str(i) for i in range(20, 261, 10)],
-                    label="Length",
+                    label="Caption Length",
                     value="any",
                 )
 
-                gr.
-
-
-
-
-                """)
-
-                art_style = gr.Dropdown(
-                    choices=ART_STYLES,
-                    label="Art Style (for Style Prompt)",
-                    value="Impressionism",
-                    visible=False
+                lens_type = gr.Dropdown(
+                    choices=get_dropdown_choices(lens_types_info),
+                    label="Lens Type",
+                    visible=False,
+                    info="Select a lens type to define the perspective and field of view of the image."
                 )
 
-                gr.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                film_stock = gr.Dropdown(
+                    choices=get_dropdown_choices(film_stocks_info),
+                    label="Film Stock",
+                    visible=False,
+                    info="Choose a film stock to determine the color, grain, and overall look of the image."
+                )
+
+                composition_style = gr.Dropdown(
+                    choices=get_dropdown_choices(composition_styles_info),
+                    label="Composition Style",
+                    visible=False,
+                    info="Select a composition style to guide the arrangement of elements in the image."
+                )
+
+                lighting_aspect = gr.Dropdown(
+                    choices=get_dropdown_choices(lighting_aspects_info),
+                    label="Lighting Aspect",
+                    visible=False,
+                    info="Choose a lighting style to define the mood and atmosphere of the image."
+                )
+
+                special_technique = gr.Dropdown(
+                    choices=get_dropdown_choices(special_techniques_info),
+                    label="Special Technique",
+                    visible=False,
+                    info="Select a special photographic technique to add unique effects to the image."
+                )
+
+                color_effect = gr.Dropdown(
+                    choices=get_dropdown_choices(color_effects_info),
+                    label="Color Effect",
+                    visible=False,
+                    info="Choose a color effect to alter the overall color palette of the image."
+                )
+
+                gr.Markdown("**Note:** Caption tone doesn't affect `rng-tags`, `training_prompt`, and `style_prompt`.")
+
+                run_button = gr.Button("Make My Caption!")
+
+            with gr.Column():
+                output_caption = gr.Textbox(label="Generated Caption")
+
+        # Container for advanced options
+        advanced_options = gr.Column(visible=False)
+        with advanced_options:
+            gr.Markdown("### Advanced Options for Style Prompt")
+            lens_type.render()
+            film_stock.render()
+            composition_style.render()
+            lighting_aspect.render()
+            special_technique.render()
+            color_effect.render()
+
+        def update_style_options(caption_type):
+            return {
+                lens_type: gr.update(visible=caption_type == "style_prompt"),
+                film_stock: gr.update(visible=caption_type == "style_prompt"),
+                composition_style: gr.update(visible=caption_type == "style_prompt"),
+                lighting_aspect: gr.update(visible=caption_type == "style_prompt"),
+                special_technique: gr.update(visible=caption_type == "style_prompt"),
+                color_effect: gr.update(visible=caption_type == "style_prompt"),
+                advanced_options: gr.update(visible=caption_type == "style_prompt"),
+            }
+
+        caption_type.change(update_style_options, inputs=[caption_type], outputs=[lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect, advanced_options])
+
+        run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect], outputs=[output_caption])
 
-                generate_button.click(
-                    fn=stream_chat,
-                    inputs=[input_image, caption_type, caption_tone, caption_length, art_style],
-                    outputs=[output_caption]
-                )
 
 if __name__ == "__main__":
     demo.launch()
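The UI wiring in the last hunk relies on Gradio's dict-return pattern: update_style_options returns {component: gr.update(visible=...)} and caption_type.change lists those same components as outputs. A stripped-down sketch of the pattern with hypothetical component names (assumes a reasonably recent gradio):

import gradio as gr

with gr.Blocks() as sketch:
    kind = gr.Dropdown(choices=["descriptive", "style_prompt"], value="descriptive", label="Caption Type")
    lens = gr.Dropdown(choices=["Standard", "Macro"], visible=False, label="Lens Type")

    def toggle(kind_value):
        # Returning a dict of {component: gr.update(...)} updates only the
        # named components; outputs= must list those same components.
        return {lens: gr.update(visible=kind_value == "style_prompt")}

    kind.change(toggle, inputs=[kind], outputs=[lens])

if __name__ == "__main__":
    sketch.launch()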