Update app.py
app.py
CHANGED
@@ -12,28 +12,147 @@ import torchvision.transforms.functional as TVF
 CLIP_PATH = "google/siglip-so400m-patch14-384"
-MODEL_PATH = "
 CHECKPOINT_PATH = Path("9em124t2-499968")
-TITLE = "<h1><center>JoyCaption Alpha One (2024-09-20a)</center></h1>"
 CAPTION_TYPE_MAP = {
-    … (removed entries, old lines 19-36; truncated in this view)
 }
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
@@ -138,105 +257,122 @@ text_model.eval()
 # Image Adapter
 print("Loading image adapter")
 image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
-image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"
 image_adapter.eval()
 image_adapter.to("cuda")
 
-
-    """
-    Preprocess the input image for the CLIP model.
-    """
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-
 
-
-    """
-    Generate a caption based on the image features and prompt.
-    """
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
-
     eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
 
     inputs_embeds = torch.cat([
-        embedded_bos.expand(
-
-        prompt_embeds.expand(
-        eot_embed.expand(
     ], dim=1)
 
     input_ids = torch.cat([
-        torch.tensor([[
-        torch.zeros((1,
         prompt,
         torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
     ], dim=1).to('cuda')
     attention_mask = torch.ones_like(input_ids)
 
-    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=
 
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
 
-
-@spaces.GPU()
-@torch.no_grad()
-def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "", lighting_aspect: str = "", special_technique: str = "", color_effect: str = "") -> str:
-    """
-    Generate a caption or style prompt based on the input image and parameters.
-    """
-    torch.cuda.empty_cache()
-
-    try:
-        length = None if caption_length == "any" else caption_length
-        if isinstance(length, str):
-            length = int(length)
-    except ValueError:
-        raise ValueError(f"Invalid caption length: {caption_length}")
-
-    if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
-        caption_tone = "formal"
-
-    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
-    if prompt_key not in CAPTION_TYPE_MAP:
-        raise ValueError(f"Invalid caption type: {prompt_key}")
-
-    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
-
-    if caption_type == "style_prompt":
-        prompt_str += f" Lens type: {lens_type} ({lens_types_info[lens_type]}). "
-        prompt_str += f"Film stock: {film_stocks_info[film_stock]}). "
-        prompt_str += f"Composition style: {composition_styles_info[composition_style]}). "
-        prompt_str += f"Lighting aspect: {lighting_aspects_info[lighting_aspect]}). "
-        prompt_str += f"Special technique: {special_techniques_info[special_technique]}). "
-        prompt_str += f"Color effect: {color_effects_info[color_effect]})."
-
-    # Debugging: Print the constructed prompt string
-    print(f"Constructed Prompt: {prompt_str}")
-
-    pixel_values = preprocess_image(input_image)
-
-    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
-        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
-        image_features = vision_outputs.hidden_states
-        embedded_images = image_adapter(image_features)
-        embedded_images = embedded_images.to('cuda')
-
-    # Load the model from MODEL_PATH
-    text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
-    text_model.eval()
-
-    # Debugging: Print the prompt string before passing to generate_caption
-    print(f"Prompt passed to generate_caption: {prompt_str}")
 
-    caption
-
-    return caption
 
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
@@ -256,110 +392,63 @@ ul, ol {
 }
 """
 
-    … (removed entries, old lines 259-286; truncated in this view)
-    "Kodachrome": "Color photos with moderate grain. Colors on either colder part of spectrum or regular, with normal or slightly higher saturation.",
-    "Rollei": "Mostly black and white photos, sometimes color with fine/low grain. Can be sepia colored or have unusual hues and desaturation. Great for landscapes."
-}
-
-composition_styles_info = {
-    "Rule of Thirds": "Divides the frame into a 3x3 grid, placing key elements along the lines or at their intersections.",
-    "Golden Ratio": "Uses a spiral based on the golden ratio to create a balanced and aesthetically pleasing composition.",
-    "Symmetry": "Creates a mirror-like balance in the image, often used for architectural or nature photography.",
-    "Leading Lines": "Uses lines within the frame to draw the viewer's eye to the main subject or through the image.",
-    "Framing": "Uses elements within the scene to create a frame around the main subject.",
-    "Minimalism": "Simplifies the composition to its essential elements, often with a lot of negative space.",
-    "Fill the Frame": "The main subject dominates the entire frame, leaving little to no background.",
-    "Negative Space": "Uses empty space around the subject to create a sense of simplicity or isolation.",
-    "Centered Composition": "Places the main subject in the center of the frame, creating a sense of stability or importance.",
-    "Diagonal Lines": "Uses diagonal elements to create a sense of movement or dynamic tension in the image.",
-    "Triangular Composition": "Arranges elements in the frame to form a triangle, creating a sense of stability and harmony.",
-    "Radial Balance": "Arranges elements in a circular pattern around a central point, creating a sense of movement or completeness."
 }
 
-
-    … (removed entries, old lines 307-319; truncated in this view)
 }
 
-special_techniques_info = {
-    "Double exposure": "Superimposes two exposures to create a single image, often resulting in a dreamy or surreal effect.",
-    "Long exposure": "Uses a long shutter speed to capture motion over time, often creating smooth, blurred effects for moving elements.",
-    "Multiple exposure": "Superimposes multiple exposures, multiplying the subject or its key elements across the image.",
-    "HDR": "High Dynamic Range imaging, combining multiple exposures to capture a wider range of light and dark tones.",
-    "Bokeh effect": "Creates a soft, out-of-focus background, often with circular highlights.",
-    "Silhouette": "Captures the outline of a subject against a brighter background, creating a dramatic contrast.",
-    "Panning": "Follows a moving subject with the camera, creating a sharp subject with a blurred background.",
-    "Light painting": "Uses long exposure and moving light sources to 'paint' with light in the image.",
-    "Infrared photography": "Captures light in the infrared spectrum, often resulting in surreal, otherworldly images.",
-    "Ultraviolet photography": "Captures light in the ultraviolet spectrum, often revealing hidden patterns or creating a strong violet glow.",
-    "Kirlian photography": "High-voltage photographic technique that captures corona discharges around objects, creating a glowing effect.",
-    "Thermography": "Captures infrared radiation to create images based on temperature differences, resulting in false-color heat maps.",
-    "Astrophotography": "Specialized technique for capturing astronomical objects and celestial events, often resulting in stunning starry backgrounds.",
-    "Underwater photography": "Captures images beneath the surface of water, often in pools, seas, or aquariums.",
-    "Aerial photography": "Captures images from an elevated position, such as from drones, helicopters, or planes.",
-    "Macro photography": "Extreme close-up photography, revealing tiny details not visible to the naked eye."
-}
-
-color_effects_info = {
-    "Black and white": "Removes all color, leaving only shades of gray.",
-    "Sepia": "Reddish-brown monochrome effect, often associated with vintage photography.",
-    "Monochrome": "Uses variations of a single color.",
-    "Vintage color": "Muted or faded color palette reminiscent of old photographs.",
-    "Cross-processed": "Deliberate processing of film in the wrong chemicals, creating unusual color shifts.",
-    "Desaturated": "Reduces the intensity of all colors in the image.",
-    "Vivid colors": "Increases the saturation and intensity of colors.",
-    "Pastel colors": "Soft, pale colors with a light and airy feel.",
-    "High contrast": "Emphasizes the difference between light and dark areas in the image.",
-    "Low contrast": "Reduces the difference between light and dark areas, creating a softer look.",
-    "Color splash": "Converts most of the image to black and white while leaving one or more elements in color."
-}
-
-def get_dropdown_choices(info_dict):
-    return [f"{key}: {value}" for key, value in info_dict.items()]
-
-# Gradio interface
 with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
     with gr.Tab("Welcome"):
         gr.Markdown(
-
             <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
 
             # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
@@ -367,7 +456,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
             ## Accelerate Your Creative Workflow with Intelligent Image Analysis
 
             This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
-            training prompts,
 
             ## 🚀 How It Works:
             1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
@@ -376,147 +465,109 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
             4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
             """
         )
-
     with gr.Tab("JoyCaption"):
-
-
-        # How to Use JoyCaption
-
-        Hello, artist! Let's make some fun captions for your pictures. Here's how:
-
-        1. **Pick a Picture**: Find a cool picture you want to talk about and upload it.
-
-        2. **Choose What You Want**:
-           - **Caption Type**:
-             * "Descriptive" tells you what's in the picture
-             * "Training Prompt" helps computers make similar pictures
-             * "RNG-Tags" gives you short words about the picture
-             * "Style Prompt" creates detailed prompts for image generation
 
-
-           - "Formal" sounds like a teacher talking
-           - "Informal" sounds like a friend chatting
 
-        … (removed lines 400-404; truncated in this view)
-        5. **Advanced Options** (for "Style Prompt" only):
-           - Choose lens type, film stock, composition, and lighting details
-
-        6. **Make the Caption**: Click the "Make My Caption!" button and watch the magic happen!
-
-        Remember, have fun and be creative with your captions!
-
-        ## Tips for Great Captions:
-        - Try different types to see what you like best
-        - Experiment with formal and informal tones for fun variations
-        - Adjust the length to get just the right amount of detail
-        - For "Style Prompt", play with the advanced options for more specific results
-        - If you don't like a caption, just click "Make My Caption!" again for a new one
-
-        Have a great time captioning your art!
-        """)
 
         with gr.Row():
-            with gr.Column():
-                input_image = gr.Image(type="pil", label="
 
                 caption_type = gr.Dropdown(
-                    choices=[
-
                     value="descriptive",
                 )
 
                 caption_tone = gr.Dropdown(
                     choices=["formal", "informal"],
-                    label="
                     value="formal",
                 )
 
                 caption_length = gr.Dropdown(
                     choices=["any", "very short", "short", "medium-length", "long", "very long"] +
                             [str(i) for i in range(20, 261, 10)],
-                    label="
                     value="any",
                 )
 
-                … (removed lines 445-449; truncated in this view)
-                )
-
-                    choices=
-                    label="
-
-                )
-
-                composition_style = gr.Dropdown(
-                    choices=get_dropdown_choices(composition_styles_info),
-                    label="Composition Style",
-                    visible=False,
-                    info="Select a composition style to guide the arrangement of elements in the image."
                 )
 
-                … (removed lines 466-490; truncated in this view)
-            with gr.Column():
-                output_caption = gr.Textbox(label="Generated Caption")
-
-                # Container for advanced options
-                advanced_options = gr.Column(visible=False)
-                with advanced_options:
-                    gr.Markdown("### Advanced Options for Style Prompt")
-                    lens_type.render()
-                    film_stock.render()
-                    composition_style.render()
-                    lighting_aspect.render()
-                    special_technique.render()
-                    color_effect.render()
-
-        def update_style_options(caption_type):
-            return {
-                lens_type: gr.update(visible=caption_type == "style_prompt"),
-                film_stock: gr.update(visible=caption_type == "style_prompt"),
-                composition_style: gr.update(visible=caption_type == "style_prompt"),
-                lighting_aspect: gr.update(visible=caption_type == "style_prompt"),
-                special_technique: gr.update(visible=caption_type == "style_prompt"),
-                color_effect: gr.update(visible=caption_type == "style_prompt"),
-                advanced_options: gr.update(visible=caption_type == "style_prompt"),
-            }
-
-        caption_type.change(update_style_options, inputs=[caption_type], outputs=[lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect, advanced_options])
-
-        run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect], outputs=[output_caption])
 
 
 if __name__ == "__main__":
     demo.launch()
 CLIP_PATH = "google/siglip-so400m-patch14-384"
+MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
 CHECKPOINT_PATH = Path("9em124t2-499968")
 CAPTION_TYPE_MAP = {
+    ("descriptive", "formal", False, False): [
+        "Write a detailed, formal description of this image, focusing on composition, style, and artistic elements.",
+        "Provide a comprehensive, academic analysis of this artwork's visual characteristics and techniques."
+    ],
+    ("descriptive", "formal", False, True): [
+        "Craft a formal, concise description of this image within {word_count} words, highlighting key visual elements.",
+        "Summarize the artwork's main features and style in a formal tone, using no more than {word_count} words."
+    ],
+    ("descriptive", "formal", True, False): [
+        "Compose a {length} formal critique of this image, discussing its artistic merits and visual impact.",
+        "Create a {length} scholarly description of this artwork, analyzing its composition and aesthetic qualities."
+    ],
+    ("descriptive", "informal", False, False): [
+        "Describe this image as if you're explaining it to a friend, focusing on what stands out to you.",
+        "Give a casual, conversational rundown of what you see in this artwork and how it makes you feel."
+    ],
+    ("descriptive", "informal", False, True): [
+        "In about {word_count} words, give a laid-back description of this image's vibe and key features.",
+        "Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
+    ],
+    ("descriptive", "informal", True, False): [
+        "Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
+        "Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
+    ],
+    ("training_prompt", "formal", False, False): [
+        "Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
+        "Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
+    ],
+    ("training_prompt", "formal", False, True): [
+        "Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
+        "Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
+    ],
+    ("training_prompt", "formal", True, False): [
+        "Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
+        "Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
+    ],
+    ("rng-tags", "formal", False, False): [
+        "Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
+        "Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
+    ],
+    ("rng-tags", "formal", False, True): [
+        "Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
+        "Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
+    ],
+    ("rng-tags", "formal", True, False): [
+        "Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
+        "Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
+    ],
+    ("artistic_inspiration", "formal", False, False): [
+        "Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
+        "Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
+    ],
+    ("artistic_inspiration", "informal", False, False): [
+        "Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
+        "Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
+    ],
+    ("technical_breakdown", "formal", False, False): [
+        "Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
+        "Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
+    ],
+    ("emotional_response", "informal", False, False): [
+        "Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
+        "Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
+    ],
+
+    ("thematic_analysis", "formal", False, False): [
+        "Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
+        "Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
+    ],
+    ("thematic_analysis", "formal", False, True): [
+        "Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
+        "Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
+    ],
+    ("thematic_analysis", "formal", True, False): [
+        "Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
+        "Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
+    ],
+    ("stylistic_comparison", "informal", False, False): [
+        "Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
+        "Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
+    ],
+    ("stylistic_comparison", "informal", False, True): [
+        "In about {word_count} words, compare this image's style with other known art styles or artists.",
+        "Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
+    ],
+    ("stylistic_comparison", "informal", True, False): [
+        "Write a {length} casual comparison of this image's style with other art movements or famous artists.",
+        "Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
+    ],
+    ("narrative_suggestion", "formal", False, False): [
+        "Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
+        "Develop a brief storyline that complements the themes and mood depicted in this artwork."
+    ],
+    ("narrative_suggestion", "formal", False, True): [
+        "Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
+        "Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
+    ],
+    ("narrative_suggestion", "formal", True, False): [
+        "Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
+        "Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
+    ],
+    ("contextual_storytelling", "informal", False, False): [
+        "Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
+        "Imagine a background story for this artwork, explaining what's happening and why."
+    ],
+    ("contextual_storytelling", "informal", False, True): [
+        "In about {word_count} words, create a backstory for the scene depicted in this image.",
+        "Summarize a possible background narrative for this artwork in {word_count} words."
+    ],
+    ("contextual_storytelling", "informal", True, False): [
+        "Write a {length} informal story that provides context to the scene portrayed in this image.",
+        "Give a {length} casual backstory explaining the events depicted in this artwork."
+    ],
+
+    ("style_prompt", "formal", False, False): [
+        "Analyze this image through the lens of {style} art. Describe how it aligns with or diverges from {style_characteristics}.",
+        "Examine this artwork in the context of the {style} movement, focusing on {style_focus} and how these elements are represented or reinterpreted in the image."
+    ],
+    ("style_prompt", "formal", False, True): [
+        "Within {word_count} words, compare this image to the {style} style, highlighting elements that reflect or contrast with its key characteristics.",
+        "Compose a concise {word_count}-word analysis of how this artwork relates to the {style} movement, noting its adherence to or departure from typical {style} elements."
+    ],
+    ("style_prompt", "formal", True, False): [
+        "Write a {length} critique of this image, exploring its relationship to the {style} movement. Discuss composition, technique, and thematic elements in this context.",
+        "Develop a {length} analysis of how this artwork incorporates or challenges the principles of {style}, considering its visual language and artistic approach."
+    ],
+    ("style_prompt", "informal", False, False): [
+        "Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
+        "Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
+    ],
+    ("style_prompt", "informal", False, True): [
+        "In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
+        "Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
+    ],
+    ("style_prompt", "informal", True, False): [
+        "Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
+        "Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
+    ],
 }
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 # Image Adapter
 print("Loading image adapter")
 image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
+image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
 image_adapter.eval()
 image_adapter.to("cuda")
 
+# After loading the tokenizer and model
+print(f"Tokenizer class: {type(tokenizer)}")
+print(f"BOS token: {tokenizer.bos_token}")
+print(f"BOS token ID: {tokenizer.bos_token_id}")
+print(f"EOS token: {tokenizer.eos_token}")
+print(f"EOS token ID: {tokenizer.eos_token_id}")
+print(f"Text model device: {text_model.device}")
+
+# Ensure the tokenizer has the necessary special tokens
+if tokenizer.bos_token_id is None or tokenizer.eos_token_id is None:
+    print("Warning: BOS or EOS token is missing. Adding default tokens.")
+    special_tokens_dict = {}
+    if tokenizer.bos_token_id is None:
+        special_tokens_dict['bos_token'] = '<|endoftext|>'
+    if tokenizer.eos_token_id is None:
+        special_tokens_dict['eos_token'] = '<|endoftext|>'
+    num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+    print(f"Added {num_added_tokens} special tokens to the tokenizer.")
+
+# Resize token embeddings of the model if new tokens are added
+text_model.resize_token_embeddings(len(tokenizer))
+
+@spaces.GPU()
+@torch.no_grad()
+def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
+    torch.cuda.empty_cache()
+
+    # Handle caption_length
+    length = None
+    if caption_length != "any":
+        if isinstance(caption_length, int):
+            length = caption_length
+        elif isinstance(caption_length, str):
+            try:
+                length = int(caption_length)
+            except ValueError:
+                # If it's not a number, treat it as a descriptive length
+                length = caption_length
+
+    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
+    if caption_type in ["rng-tags", "training_prompt"]:
+        caption_tone = "formal"
+
+    # Build prompt
+    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
+    if prompt_key not in CAPTION_TYPE_MAP:
+        raise ValueError(f"Invalid caption type: {prompt_key}")
+
+    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
+        length=length,
+        word_count=length,
+        style=art_style,
+        style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
+        style_focus=STYLE_FOCUS.get(art_style, "its distinctive features")
+    )
+    print(f"Prompt: {prompt_str}")
 
+    # Preprocess image
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
+    pixel_values = pixel_values.to('cuda')
 
+    # Tokenize the prompt
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
+
+    # Embed image
+    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
+        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+        image_features = vision_outputs.hidden_states
+        embedded_images = image_adapter(image_features)
+        embedded_images = embedded_images.to('cuda')
+
+    # Embed prompt
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
+    assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
+
+    # Check for bos_token_id and provide a fallback
+    bos_token_id = tokenizer.bos_token_id
+    if bos_token_id is None:
+        print("Warning: bos_token_id is None. Using default value of 1.")
+        bos_token_id = 1  # Common default, but may need adjustment
+
+    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
     eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
 
+    # Construct prompts
     inputs_embeds = torch.cat([
+        embedded_bos.expand(embedded_images.shape[0], -1, -1),
+        embedded_images.to(dtype=embedded_bos.dtype),
+        prompt_embeds.expand(embedded_images.shape[0], -1, -1),
+        eot_embed.expand(embedded_images.shape[0], -1, -1),
     ], dim=1)
 
     input_ids = torch.cat([
+        torch.tensor([[bos_token_id]], dtype=torch.long),
+        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
         prompt,
         torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
     ], dim=1).to('cuda')
     attention_mask = torch.ones_like(input_ids)
 
+    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)
 
+    # Trim off the prompt
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
 
+    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
 
+    return caption.strip()
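Note: the sequence assembly above pairs inputs_embeds with a same-length input_ids row (BOS, zeros standing in for the image tokens, prompt ids, <|eot_id|>); the trim step then uses input_ids.shape[1]. A small, self-contained sketch of that shape bookkeeping with dummy tensors; the dimensions and token ids below are made up for illustration:

# Shape bookkeeping sketch with dummy tensors (not the app's real model calls).
import torch

hidden = 64                                    # stand-in for text_model.config.hidden_size
bos_emb = torch.randn(1, 1, hidden)            # embedded BOS token
image_emb = torch.randn(1, 38, hidden)         # embedded image tokens from the adapter
prompt_emb = torch.randn(1, 12, hidden)        # embedded prompt tokens
eot_emb = torch.randn(1, 1, hidden)            # learned <|eot_id|> embedding

inputs_embeds = torch.cat([bos_emb, image_emb, prompt_emb, eot_emb], dim=1)

input_ids = torch.cat([
    torch.tensor([[1]], dtype=torch.long),                     # BOS id
    torch.zeros((1, image_emb.shape[1]), dtype=torch.long),    # placeholders under the image embeddings
    torch.randint(0, 1000, (1, prompt_emb.shape[1])),          # prompt token ids
    torch.tensor([[2]], dtype=torch.long),                     # <|eot_id|> id
], dim=1)

# The trim step above slices generate_ids at input_ids.shape[1], so the two must line up.
assert input_ids.shape[1] == inputs_embeds.shape[1]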
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
 …
 }
 """
+ART_STYLES = [
+    "Impressionism", "Cubism", "Surrealism", "Abstract Expressionism", "Pop Art",
+    "Minimalism", "Baroque", "Renaissance", "Art Nouveau", "Gothic",
+    "Romanticism", "Realism", "Expressionism", "Fauvism", "Art Deco",
+    "Futurism", "Dadaism", "Pointillism", "Rococo", "Neoclassicism"
+]
+
+STYLE_CHARACTERISTICS = {
+    "Impressionism": "loose brushstrokes, emphasis on light and color, everyday subjects",
+    "Cubism": "geometric shapes, multiple perspectives, fragmented forms",
+    "Surrealism": "dreamlike imagery, unexpected juxtapositions, subconscious exploration",
+    "Abstract Expressionism": "expressive brushwork, emotional content, abstract forms",
+    "Pop Art": "bright colors, popular culture references, satire",
+    "Minimalism": "simple forms, limited color palette, emphasis on space",
+    "Baroque": "dramatic lighting, elaborate detail, grandeur",
+    "Renaissance": "realistic depictions, perspective, religious themes",
+    "Art Nouveau": "stylized forms, organic shapes, decorative elements",
+    "Gothic": "dark themes, dramatic lighting, architectural elements",
+    "Romanticism": "emotional content, nature scenes, idealized figures",
+    "Realism": "detailed depictions, realistic textures, everyday subjects",
+    "Expressionism": "emotional content, distorted forms, abstract elements",
+    "Fauvism": "bold colors, abstract forms, emotional content",
+    "Art Deco": "geometric shapes, streamlined forms, modern aesthetics",
+    "Futurism": "dynamic forms, speed, technology",
+    "Dadaism": "anti-art, absurdity, subversion of traditional art",
+    "Pointillism": "small dots of color, impressionistic style, emphasis on light",
+    "Rococo": "ornate style, lighthearted themes, decorative elements",
+    "Neoclassicism": "classical style, balance, symmetry"
 }
 
+STYLE_FOCUS = {
+    "Impressionism": "capturing fleeting moments and atmospheric effects",
+    "Cubism": "deconstructing and reassembling forms from multiple viewpoints",
+    "Surrealism": "creating a sense of the uncanny and exploring the subconscious mind",
+    "Abstract Expressionism": "expressing emotional content through abstract forms",
+    "Pop Art": "commenting on popular culture and satirizing consumerism",
+    "Minimalism": "exploring the relationship between form and space",
+    "Baroque": "creating dramatic and grandiose compositions",
+    "Renaissance": "depicting realistic scenes and exploring perspective",
+    "Art Nouveau": "incorporating organic and decorative elements",
+    "Gothic": "exploring dark themes and dramatic lighting",
+    "Romanticism": "depicting emotional scenes and idealized figures",
+    "Realism": "capturing detailed and realistic textures",
+    "Expressionism": "expressing emotional content through distorted forms",
+    "Fauvism": "emphasizing bold colors and emotional content",
+    "Art Deco": "incorporating geometric shapes and modern aesthetics",
+    "Futurism": "depicting speed, technology, and dynamism",
+    "Dadaism": "subverting traditional art and exploring absurdity",
+    "Pointillism": "capturing light and color through small dots",
+    "Rococo": "creating lighthearted and decorative compositions",
+    "Neoclassicism": "achieving balance and symmetry in classical style"
 }
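Note: these two dictionaries feed the {style_characteristics} and {style_focus} placeholders in the style_prompt templates; the .get() fallbacks in stream_chat keep unknown styles from raising KeyError. A short illustrative check (the literal values printed are examples, not part of the commit):

# Illustrative use of the same fallbacks applied in stream_chat.
style = "Impressionism"
template = CAPTION_TYPE_MAP[("style_prompt", "formal", False, False)][0]
print(template.format(
    style=style,
    style_characteristics=STYLE_CHARACTERISTICS.get(style, "its unique elements"),
    style_focus=STYLE_FOCUS.get(style, "its distinctive features"),
    length=None, word_count=None,
))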
 with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
     with gr.Tab("Welcome"):
         gr.Markdown(
+            """
             <img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
 
             # 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
 
             ## Accelerate Your Creative Workflow with Intelligent Image Analysis
 
             This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
+            training prompts, or tags from existing artwork, fueling the creative process for GenAI models.
 
             ## 🚀 How It Works:
             1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
             4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
             """
         )
+
     with gr.Tab("JoyCaption"):
+        gr.Markdown("""
+        # JoyCaption: AI-Powered Image Analysis Tool
 
+        This tool helps you generate various types of text based on an uploaded image. Here's how to use it:
 
+        1. Upload an image
+        2. Choose your desired output type
+        3. Adjust settings as needed
+        4. Click 'Generate Caption' to get your result
+        """)
 
         with gr.Row():
+            with gr.Column(scale=1):
+                input_image = gr.Image(type="pil", label="Upload Your Image")
 
                 caption_type = gr.Dropdown(
+                    choices=[
+                        "descriptive",
+                        "training_prompt",
+                        "rng-tags",
+                        "thematic_analysis",
+                        "stylistic_comparison",
+                        "narrative_suggestion",
+                        "contextual_storytelling",
+                        "style_prompt"
+                    ],
+                    label="Output Type",
                     value="descriptive",
                 )
 
+                gr.Markdown("""
+                ### Output Types Explained:
+                - **Descriptive**: A general description of the image
+                - **Training Prompt**: A prompt for AI image generation
+                - **RNG-Tags**: Tags for categorizing the image
+                - **Thematic Analysis**: Exploration of themes in the image
+                - **Stylistic Comparison**: Compares the image to art styles
+                - **Narrative Suggestion**: A story idea based on the image
+                - **Contextual Storytelling**: A background story for the image
+                - **Style Prompt**: Analyzes the image in context of a specific art style
+                """)
+
                 caption_tone = gr.Dropdown(
                     choices=["formal", "informal"],
+                    label="Tone",
                     value="formal",
                 )
 
+                gr.Markdown("Choose between a formal (professional) or informal (casual) tone for the output.")
+
                 caption_length = gr.Dropdown(
                     choices=["any", "very short", "short", "medium-length", "long", "very long"] +
                             [str(i) for i in range(20, 261, 10)],
+                    label="Length",
                     value="any",
                 )
 
+                gr.Markdown("""
+                Select the desired length of the output:
+                - 'any': No specific length
+                - Descriptive options: very short to very long
+                - Numeric options: Specify exact word count (20 to 260 words)
+                """)
+
+                art_style = gr.Dropdown(
+                    choices=ART_STYLES,
+                    label="Art Style (for Style Prompt)",
+                    value="Impressionism",
+                    visible=False
                 )
 
+                gr.Markdown("Select an art style to analyze the image in that context. Only applicable for 'Style Prompt' output type.")
+
+            with gr.Column(scale=1):
+                output_caption = gr.Textbox(label="Generated Output", lines=10)
+                generate_button = gr.Button("Generate Caption")
+
+                gr.Markdown("""
+                ### Additional Notes:
+                - The 'Tone' setting doesn't affect 'RNG-Tags' and 'Training Prompt' outputs.
+                - 'Art Style' is only used when 'Style Prompt' is selected as the output type.
+                - The AI model analyzes the image and generates text based on your selections.
+                """)
+
+        def update_visibility(caption_type):
+            return {
+                art_style: gr.update(visible=(caption_type == "style_prompt")),
+                caption_tone: gr.update(visible=(caption_type not in ["rng-tags", "training_prompt"]))
+            }
+
+        caption_type.change(
+            fn=update_visibility,
+            inputs=[caption_type],
+            outputs=[art_style, caption_tone]
+        )
 
+        generate_button.click(
+            fn=stream_chat,
+            inputs=[input_image, caption_type, caption_tone, caption_length, art_style],
+            outputs=[output_caption]
+        )
 
 if __name__ == "__main__":
     demo.launch()
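Note: the visibility wiring added above follows the usual Gradio pattern of a change handler returning gr.update(...) objects for the components listed in outputs. A stripped-down, standalone sketch of the same pattern; the component names and choices here are illustrative, not taken from the commit:

# Standalone sketch of a dropdown-driven visibility toggle, as used above.
import gradio as gr

with gr.Blocks() as mini_demo:
    kind = gr.Dropdown(choices=["descriptive", "style_prompt"], value="descriptive", label="Output Type")
    style = gr.Dropdown(choices=["Impressionism", "Cubism"], value="Impressionism",
                        label="Art Style", visible=False)

    def toggle(kind_value):
        # Show the style dropdown only when "style_prompt" is selected.
        return gr.update(visible=(kind_value == "style_prompt"))

    kind.change(fn=toggle, inputs=[kind], outputs=[style])

if __name__ == "__main__":
    mini_demo.launch()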