Update app.py
Browse files
app.py
CHANGED
@@ -31,9 +31,9 @@ CAPTION_TYPE_MAP = {
|
|
31 |
("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
|
32 |
("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
|
33 |
|
34 |
-
("style_prompt", "formal", False, False): ["Generate a detailed style prompt for this image, including lens type, film stock, composition notes, and
|
35 |
-
("style_prompt", "formal", False, True): ["Generate a detailed style prompt for this image within {word_count} words, including lens type, film stock, composition notes, and
|
36 |
-
("style_prompt", "formal", True, False): ["Generate a {length} detailed style prompt for this image, including lens type, film stock, composition notes, and
|
37 |
}
|
38 |
|
39 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
@@ -186,7 +186,7 @@ def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max
|
|
186 |
|
187 |
@spaces.GPU()
|
188 |
@torch.no_grad()
|
189 |
-
def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "") -> str:
|
190 |
"""
|
191 |
Generate a caption or style prompt based on the input image and parameters.
|
192 |
"""
|
@@ -209,7 +209,12 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
|
|
209 |
prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
|
210 |
|
211 |
if caption_type == "style_prompt":
|
212 |
-
prompt_str += f" Lens type: {lens_type}
|
|
|
|
|
|
|
|
|
|
|
213 |
|
214 |
print(f"Prompt: {prompt_str}")
|
215 |
|
@@ -243,12 +248,116 @@ ul, ol {
|
|
243 |
}
|
244 |
"""
|
245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
# Gradio interface
|
247 |
with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
|
248 |
with gr.Tab("Welcome"):
|
249 |
gr.Markdown(
|
250 |
"""
|
251 |
-
<img src="https://
|
252 |
|
253 |
# 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
|
254 |
|
@@ -331,46 +440,73 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
|
|
331 |
)
|
332 |
|
333 |
lens_type = gr.Dropdown(
|
334 |
-
choices=
|
335 |
label="Lens Type",
|
336 |
visible=False,
|
|
|
337 |
)
|
338 |
|
339 |
film_stock = gr.Dropdown(
|
340 |
-
choices=
|
341 |
label="Film Stock",
|
342 |
visible=False,
|
|
|
343 |
)
|
344 |
|
345 |
composition_style = gr.Dropdown(
|
346 |
-
choices=
|
347 |
label="Composition Style",
|
348 |
visible=False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
349 |
)
|
350 |
|
351 |
gr.Markdown("**Note:** Caption tone doesn't affect `rng-tags`, `training_prompt`, and `style_prompt`.")
|
352 |
|
|
|
|
|
353 |
run_button = gr.Button("Make My Caption!")
|
354 |
|
355 |
with gr.Column():
|
356 |
output_caption = gr.Textbox(label="Generated Caption")
|
357 |
copy_button = gr.Button("Copy to Clipboard")
|
|
|
358 |
|
359 |
def update_style_options(caption_type):
|
360 |
return {
|
361 |
lens_type: gr.update(visible=caption_type == "style_prompt"),
|
362 |
film_stock: gr.update(visible=caption_type == "style_prompt"),
|
363 |
composition_style: gr.update(visible=caption_type == "style_prompt"),
|
|
|
|
|
|
|
364 |
}
|
365 |
|
366 |
-
caption_type.change(update_style_options, inputs=[caption_type], outputs=[lens_type, film_stock, composition_style])
|
367 |
|
368 |
-
run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition_style], outputs=[output_caption])
|
369 |
-
|
370 |
-
def copy_to_clipboard():
|
371 |
-
return None
|
372 |
|
373 |
-
copy_button.click(
|
374 |
|
375 |
if __name__ == "__main__":
|
376 |
demo.launch()
|
|
|
31 |
("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
|
32 |
("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
|
33 |
|
34 |
+
("style_prompt", "formal", False, False): ["Generate a detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
|
35 |
+
("style_prompt", "formal", False, True): ["Generate a detailed style prompt for this image within {word_count} words, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
|
36 |
+
("style_prompt", "formal", True, False): ["Generate a {length} detailed style prompt for this image, including lens type, film stock, composition notes, lighting aspects, and any special photographic techniques."],
|
37 |
}
|
38 |
|
39 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
|
186 |
|
187 |
@spaces.GPU()
|
188 |
@torch.no_grad()
|
189 |
+
def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "", lighting_aspect: str = "", special_technique: str = "", color_effect: str = "") -> str:
|
190 |
"""
|
191 |
Generate a caption or style prompt based on the input image and parameters.
|
192 |
"""
|
|
|
209 |
prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
|
210 |
|
211 |
if caption_type == "style_prompt":
|
212 |
+
prompt_str += f" Lens type: {lens_type} ({lens_types_info[lens_type]}). "
|
213 |
+
prompt_str += f"Film stock: {film_stock} ({film_stocks_info[film_stock]}). "
|
214 |
+
prompt_str += f"Composition style: {composition_style} ({composition_styles_info[composition_style]}). "
|
215 |
+
prompt_str += f"Lighting aspect: {lighting_aspect} ({lighting_aspects_info[lighting_aspect]}). "
|
216 |
+
prompt_str += f"Special technique: {special_technique} ({special_techniques_info[special_technique]}). "
|
217 |
+
prompt_str += f"Color effect: {color_effect} ({color_effects_info[color_effect]})."
|
218 |
|
219 |
print(f"Prompt: {prompt_str}")
|
220 |
|
|
|
248 |
}
|
249 |
"""
|
250 |
|
251 |
+
js = """
|
252 |
+
function copyToClipboard() {
|
253 |
+
var copyText = document.querySelector('.output-text textarea');
|
254 |
+
copyText.select();
|
255 |
+
document.execCommand('copy');
|
256 |
+
}
|
257 |
+
"""
|
258 |
+
|
259 |
+
# Add detailed descriptions for each option
|
260 |
+
lens_types_info = {
|
261 |
+
"Standard": "A versatile lens with a field of view similar to human vision.",
|
262 |
+
"Wide-angle": "Captures a wider field of view, great for landscapes and architecture. Applies moderate to strong lens effect with image warp.",
|
263 |
+
"Telephoto": "Used for distant subjects, gives an 'award-winning' or 'National Geographic' look. Creates interesting effects when prompted.",
|
264 |
+
"Macro": "For extreme close-up photography, revealing tiny details.",
|
265 |
+
"Fish-eye": "Ultra-wide-angle lens that creates a strong bubble-like distortion. Generates panoramic photos with the entire image warping into a bubble.",
|
266 |
+
"Tilt-shift": "Allows adjusting the plane of focus, creating a 'miniature' effect. Known for the 'diorama miniature look'.",
|
267 |
+
"Zoom lens": "Variable focal length lens. Often zooms in on the subject, perfect for creating a base for inpainting. Interesting effect on landscapes with motion blur.",
|
268 |
+
"GoPro": "Wide-angle lens with clean digital look. Excludes film grain and most filter effects, resulting in natural colors and regular saturation.",
|
269 |
+
"Pinhole camera": "Creates a unique, foggy, low-detail, historic photograph look. Used since the 1850s, with peak popularity in the 1930s."
|
270 |
+
}
|
271 |
+
|
272 |
+
film_stocks_info = {
|
273 |
+
"Kodak Portra": "Professional color negative film known for its natural skin tones and low contrast.",
|
274 |
+
"Fujifilm Velvia": "Slide film known for vibrant colors and high saturation, popular among landscape photographers.",
|
275 |
+
"Ilford Delta": "Black and white film known for its fine grain and high sharpness.",
|
276 |
+
"Kodak Tri-X": "Classic high-speed black and white film, known for its distinctive grain and wide exposure latitude.",
|
277 |
+
"Fujifilm Provia": "Color reversal film known for its natural color reproduction and fine grain.",
|
278 |
+
"Cinestill": "Color photos with fine/low grain and higher than average resolution. Colors are slightly oversaturated or slightly desaturated.",
|
279 |
+
"Ektachrome": "Color photos with fine/low to moderate grain. Colors on the colder part of spectrum or regular, with normal or slightly higher saturation.",
|
280 |
+
"Ektar": "Modern Kodak film. Color photos with little to no grain. Results look like regular modern photography with artistic angles.",
|
281 |
+
"Film Washi": "Mostly black and white photos with fine/low to moderate grain. Occasionally gives colored photos with low saturation. Distinct style with high black contrast and soft camera lens effect.",
|
282 |
+
"Fomapan": "Black and white photos with fine/low to moderate grain, highly artistic exposure and angles. Adds very soft lens effect without distortion, dark photo vignette.",
|
283 |
+
"Fujicolor": "Color photos with fine/low to moderate grain. Colors are slightly or notably desaturated, with entire color hue shifted in a very distinct manner.",
|
284 |
+
"Holga": "Color photos with moderate to fine/low grain. Similar to Lomography in style, but with less grain. Good chance of black and white photography depending on subject.",
|
285 |
+
"Instax": "Instant color photos similar to Polaroid but clearer. Near perfect colors, regular saturation, fine/low to medium grain.",
|
286 |
+
"Lomography": "Color photos with high grain. Colors are either very oversaturated or slightly desaturated. Distinct contrast of black. Often applies photographic vignette.",
|
287 |
+
"Kodachrome": "Color photos with moderate grain. Colors on either colder part of spectrum or regular, with normal or slightly higher saturation.",
|
288 |
+
"Rollei": "Mostly black and white photos, sometimes color with fine/low grain. Can be sepia colored or have unusual hues and desaturation. Great for landscapes."
|
289 |
+
}
|
290 |
+
|
291 |
+
composition_styles_info = {
|
292 |
+
"Rule of Thirds": "Divides the frame into a 3x3 grid, placing key elements along the lines or at their intersections.",
|
293 |
+
"Golden Ratio": "Uses a spiral based on the golden ratio to create a balanced and aesthetically pleasing composition.",
|
294 |
+
"Symmetry": "Creates a mirror-like balance in the image, often used for architectural or nature photography.",
|
295 |
+
"Leading Lines": "Uses lines within the frame to draw the viewer's eye to the main subject or through the image.",
|
296 |
+
"Framing": "Uses elements within the scene to create a frame around the main subject.",
|
297 |
+
"Minimalism": "Simplifies the composition to its essential elements, often with a lot of negative space.",
|
298 |
+
"Fill the Frame": "The main subject dominates the entire frame, leaving little to no background.",
|
299 |
+
"Negative Space": "Uses empty space around the subject to create a sense of simplicity or isolation.",
|
300 |
+
"Centered Composition": "Places the main subject in the center of the frame, creating a sense of stability or importance.",
|
301 |
+
"Diagonal Lines": "Uses diagonal elements to create a sense of movement or dynamic tension in the image.",
|
302 |
+
"Triangular Composition": "Arranges elements in the frame to form a triangle, creating a sense of stability and harmony.",
|
303 |
+
"Radial Balance": "Arranges elements in a circular pattern around a central point, creating a sense of movement or completeness."
|
304 |
+
}
|
305 |
+
|
306 |
+
lighting_aspects_info = {
|
307 |
+
"Natural light": "Uses available light from the sun or sky, often creating soft, even illumination.",
|
308 |
+
"Studio lighting": "Controlled artificial lighting setup, allowing for precise manipulation of light and shadow.",
|
309 |
+
"Back light": "Light source behind the subject, creating silhouettes or rim lighting effects.",
|
310 |
+
"Split light": "Strong light source at 90-degree angle, lighting one half of the subject while leaving the other in shadow.",
|
311 |
+
"Broad light": "Light source at an angle to the subject, producing well-lit photographs with soft to moderate shadows.",
|
312 |
+
"Dim light": "Weak or distant light source, creating lower than average brightness and often dramatic images.",
|
313 |
+
"Flash photography": "Uses a brief, intense burst of light. Can be fill flash (even lighting) or harsh flash (strong contrasts).",
|
314 |
+
"Sunlight": "Direct light from the sun, often creating strong contrasts and warm tones.",
|
315 |
+
"Moonlight": "Soft, cool light from the moon, often creating a mysterious or romantic atmosphere.",
|
316 |
+
"Spotlight": "Focused beam of light illuminating a specific area, creating high contrast between light and shadow.",
|
317 |
+
"High-key lighting": "Bright, even lighting with minimal shadows, creating a light and airy feel.",
|
318 |
+
"Low-key lighting": "Predominantly dark tones with selective lighting, creating a moody or dramatic atmosphere.",
|
319 |
+
"Rembrandt lighting": "Classic portrait lighting technique creating a triangle of light on the cheek of the subject."
|
320 |
+
}
|
321 |
+
|
322 |
+
special_techniques_info = {
|
323 |
+
"Double exposure": "Superimposes two exposures to create a single image, often resulting in a dreamy or surreal effect.",
|
324 |
+
"Long exposure": "Uses a long shutter speed to capture motion over time, often creating smooth, blurred effects for moving elements.",
|
325 |
+
"Multiple exposure": "Superimposes multiple exposures, multiplying the subject or its key elements across the image.",
|
326 |
+
"HDR": "High Dynamic Range imaging, combining multiple exposures to capture a wider range of light and dark tones.",
|
327 |
+
"Bokeh effect": "Creates a soft, out-of-focus background, often with circular highlights.",
|
328 |
+
"Silhouette": "Captures the outline of a subject against a brighter background, creating a dramatic contrast.",
|
329 |
+
"Panning": "Follows a moving subject with the camera, creating a sharp subject with a blurred background.",
|
330 |
+
"Light painting": "Uses long exposure and moving light sources to 'paint' with light in the image.",
|
331 |
+
"Infrared photography": "Captures light in the infrared spectrum, often resulting in surreal, otherworldly images.",
|
332 |
+
"Ultraviolet photography": "Captures light in the ultraviolet spectrum, often revealing hidden patterns or creating a strong violet glow.",
|
333 |
+
"Kirlian photography": "High-voltage photographic technique that captures corona discharges around objects, creating a glowing effect.",
|
334 |
+
"Thermography": "Captures infrared radiation to create images based on temperature differences, resulting in false-color heat maps.",
|
335 |
+
"Astrophotography": "Specialized technique for capturing astronomical objects and celestial events, often resulting in stunning starry backgrounds.",
|
336 |
+
"Underwater photography": "Captures images beneath the surface of water, often in pools, seas, or aquariums.",
|
337 |
+
"Aerial photography": "Captures images from an elevated position, such as from drones, helicopters, or planes.",
|
338 |
+
"Macro photography": "Extreme close-up photography, revealing tiny details not visible to the naked eye."
|
339 |
+
}
|
340 |
+
|
341 |
+
color_effects_info = {
|
342 |
+
"Black and white": "Removes all color, leaving only shades of gray.",
|
343 |
+
"Sepia": "Reddish-brown monochrome effect, often associated with vintage photography.",
|
344 |
+
"Monochrome": "Uses variations of a single color.",
|
345 |
+
"Vintage color": "Muted or faded color palette reminiscent of old photographs.",
|
346 |
+
"Cross-processed": "Deliberate processing of film in the wrong chemicals, creating unusual color shifts.",
|
347 |
+
"Desaturated": "Reduces the intensity of all colors in the image.",
|
348 |
+
"Vivid colors": "Increases the saturation and intensity of colors.",
|
349 |
+
"Pastel colors": "Soft, pale colors with a light and airy feel.",
|
350 |
+
"High contrast": "Emphasizes the difference between light and dark areas in the image.",
|
351 |
+
"Low contrast": "Reduces the difference between light and dark areas, creating a softer look.",
|
352 |
+
"Color splash": "Converts most of the image to black and white while leaving one or more elements in color."
|
353 |
+
}
|
354 |
+
|
355 |
# Gradio interface
|
356 |
with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
|
357 |
with gr.Tab("Welcome"):
|
358 |
gr.Markdown(
|
359 |
"""
|
360 |
+
<img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/EqvglEeWdTpCqWAcuP6-9.png">
|
361 |
|
362 |
# 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
|
363 |
|
|
|
440 |
)
|
441 |
|
442 |
lens_type = gr.Dropdown(
|
443 |
+
choices=list(lens_types_info.keys()),
|
444 |
label="Lens Type",
|
445 |
visible=False,
|
446 |
+
info="Select a lens type to define the perspective and field of view of the image."
|
447 |
)
|
448 |
|
449 |
film_stock = gr.Dropdown(
|
450 |
+
choices=list(film_stocks_info.keys()),
|
451 |
label="Film Stock",
|
452 |
visible=False,
|
453 |
+
info="Choose a film stock to determine the color, grain, and overall look of the image."
|
454 |
)
|
455 |
|
456 |
composition_style = gr.Dropdown(
|
457 |
+
choices=list(composition_styles_info.keys()),
|
458 |
label="Composition Style",
|
459 |
visible=False,
|
460 |
+
info="Select a composition style to guide the arrangement of elements in the image."
|
461 |
+
)
|
462 |
+
|
463 |
+
lighting_aspect = gr.Dropdown(
|
464 |
+
choices=list(lighting_aspects_info.keys()),
|
465 |
+
label="Lighting Aspect",
|
466 |
+
visible=False,
|
467 |
+
info="Choose a lighting style to define the mood and atmosphere of the image."
|
468 |
+
)
|
469 |
+
|
470 |
+
special_technique = gr.Dropdown(
|
471 |
+
choices=list(special_techniques_info.keys()),
|
472 |
+
label="Special Technique",
|
473 |
+
visible=False,
|
474 |
+
info="Select a special photographic technique to add unique effects to the image."
|
475 |
+
)
|
476 |
+
|
477 |
+
color_effect = gr.Dropdown(
|
478 |
+
choices=list(color_effects_info.keys()),
|
479 |
+
label="Color Effect",
|
480 |
+
visible=False,
|
481 |
+
info="Choose a color effect to alter the overall color palette of the image."
|
482 |
)
|
483 |
|
484 |
gr.Markdown("**Note:** Caption tone doesn't affect `rng-tags`, `training_prompt`, and `style_prompt`.")
|
485 |
|
486 |
+

|
487 |
+
|
488 |
run_button = gr.Button("Make My Caption!")
|
489 |
|
490 |
with gr.Column():
|
491 |
output_caption = gr.Textbox(label="Generated Caption")
|
492 |
copy_button = gr.Button("Copy to Clipboard")
|
493 |
+
# Embed the `js` snippet (which defines copyToClipboard) in the page inside a <script> tag.
# NOTE(review): browsers generally do not execute <script> elements that are inserted
# as HTML content after page load — confirm copyToClipboard is really defined at runtime.
gr.HTML("<script>" + js + "</script>")
|
494 |
|
495 |
def update_style_options(caption_type):
    """Show the style-specific dropdowns only for the "style_prompt" caption type.

    Returns a dict mapping each style dropdown component to a ``gr.update``
    whose ``visible`` flag is True exactly when ``caption_type`` equals
    "style_prompt".
    """
    show_style_controls = caption_type == "style_prompt"
    style_controls = (
        lens_type,
        film_stock,
        composition_style,
        lighting_aspect,
        special_technique,
        color_effect,
    )
    # One independent update object per component, in the same order as before.
    return {
        control: gr.update(visible=show_style_controls)
        for control in style_controls
    }
|
504 |
|
505 |
+
# Re-evaluate the visibility of the style-only dropdowns whenever the caption type changes.
caption_type.change(
    fn=update_style_options,
    inputs=[caption_type],
    outputs=[
        lens_type,
        film_stock,
        composition_style,
        lighting_aspect,
        special_technique,
        color_effect,
    ],
)
|
506 |
|
507 |
+
# Run caption generation; the input order must match stream_chat's positional parameters.
run_button.click(
    fn=stream_chat,
    inputs=[
        input_image,
        caption_type,
        caption_tone,
        caption_length,
        lens_type,
        film_stock,
        composition_style,
        lighting_aspect,
        special_technique,
        color_effect,
    ],
    outputs=[output_caption],
)
|
|
|
|
|
|
|
508 |
|
509 |
+
# Copy the generated caption to the clipboard entirely on the client side.
# The previous handler passed the call expression "copyToClipboard()" where a
# JS *function* is expected, and relied on a <script> injected through gr.HTML
# plus a '.output-text' selector that no visible component carries. Passing the
# textbox as an input hands its current value straight to an inline function,
# so no global helper or DOM lookup is needed.
# NOTE(review): `_js` is the Gradio 3.x keyword (renamed `js` in Gradio 4) —
# keep in sync with the installed Gradio version.
copy_button.click(
    None,
    inputs=[output_caption],
    outputs=None,
    _js="(text) => { navigator.clipboard.writeText(text || ''); return []; }",
)
|
510 |
|
511 |
if __name__ == "__main__":
|
512 |
demo.launch()
|