|
import gradio as gr |
|
import spaces |
|
import json |
|
import re |
|
from gradio_client import Client |
|
|
|
|
|
|
|
# Client for the hosted Kosmos-2 Space, created once when the module is
# loaded and used by get_caption() for every captioning request.
kosmos2_client = Client(src="https://ydshieh-kosmos-2.hf.space/")
|
|
|
def get_caption(image_in):
    """Return a detailed caption for an image using the Kosmos-2 client.

    Args:
        image_in: Image reference forwarded unchanged to
            ``kosmos2_client.predict``.

    Returns:
        The caption text that follows the "Describe this image in detail:"
        prefix. If that prefix cannot be found, the whole reconstructed
        sentence is returned (stripped) instead of failing.
    """
    kosmos2_result = kosmos2_client.predict(
        image_in,
        "Detailed",
        fn_index=4,
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # The second element of the result is opened as a JSON file; the first
    # item of each entry is a text fragment of the caption.
    with open(kosmos2_result[1], "r", encoding="utf-8") as f:
        data = json.load(f)

    full_sentence = " ".join(sublist[0] for sublist in data)

    # DOTALL lets the capture group span line breaks, so a multi-line
    # caption is still recognised.
    match = re.search(
        r"^Describe this image in detail:\s*(.*)$",
        full_sentence,
        flags=re.DOTALL,
    )
    if match:
        description = match.group(1)
        print(description)
    else:
        print("Unable to locate valid description.")
        # Fall back to the raw text so the caller always gets a string;
        # previously this path left `description` unbound and raised
        # UnboundLocalError at the return statement.
        description = full_sentence.strip()

    return description
|
|
|
def get_caption_from_MD(image_in):
    """Ask the remote moondream1 Space to describe an image.

    Args:
        image_in: Image reference forwarded unchanged to the Space's
            ``/answer_question`` endpoint.

    Returns:
        Whatever the endpoint returns for the fixed question
        "Describe precisely the image.".
    """
    # A new client is created on every call.
    moondream = Client("https://vikhyatk-moondream1.hf.space/")
    answer = moondream.predict(
        image_in,
        "Describe precisely the image.",
        api_name="/answer_question",
    )
    print(answer)
    return answer
|
|
|
def get_magnet(prompt):
    """Request a music clip for a text prompt from the remote MAGNeT Space.

    Args:
        prompt: Text describing the music to generate.

    Returns:
        The second element of the ``/predict_full`` response.
    """
    amended_prompt = f"{prompt}"
    print(amended_prompt)

    magnet_client = Client("https://fffiloni-magnet.hf.space/")

    # Positional inputs of the "/predict_full" endpoint, in call order.
    # NOTE(review): only the model name and the prompt are self-explanatory
    # here; the empty string and the numeric values are presumably model path
    # and sampling/decoding settings -- confirm against the Space's API page.
    endpoint_inputs = (
        "facebook/magnet-medium-10secs",
        "",
        amended_prompt,
        3,
        0.9,
        10,
        1,
        20,
        10,
        10,
        10,
        "prod-stride1 (new!)",
    )
    response = magnet_client.predict(*endpoint_inputs, api_name="/predict_full")
    print(response)

    return response[1]
|
|
|
import re |
|
import torch |
|
from transformers import pipeline |
|
|
|
# Hub identifiers of instruction-tuned chat models. Only the Zephyr one is
# loaded below; the Mixtral name is defined but not used in this block.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Text-generation pipeline built once at import time and reused by infer().
pipe = pipeline(
    task="text-generation",
    model=zephyr_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
|
|
|
# System message for the chat model: turn an image description into a short
# musical prompt. This text is sent to the model verbatim.
agent_maker_sys = """
You are an AI whose job is to help users create their own music which its genre will reflect the character or scene from an image described by users.
In particular, you need to respond succintly with few musical words, in a friendly tone, write a musical prompt for a music generation model.

For example, if a user says, "a picture of a man in a black suit and tie riding a black dragon", provide immediately a musical prompt corresponding to the image description.
Immediately STOP after that. It should be EXACTLY in this format:
"A grand orchestral arrangement with thunderous percussion, epic brass fanfares, and soaring strings, creating a cinematic atmosphere fit for a heroic battle"
"""

# Prompt prefix: the system turn followed by an opened user turn. The caller
# appends the user's text after the "<|user|>" line.
instruction = (
    "\n<|system|>\n"
    + agent_maker_sys
    + "</s>\n<|user|>\n"
)
|
|
|
@spaces.GPU(enable_queue=True, duration=60)
def infer(image_in):
    """Turn an image into a musical prompt and a generated music clip.

    The image is captioned with ``get_caption``, the caption is given to the
    text-generation pipeline ``pipe`` to obtain a musical prompt, and that
    prompt is passed to ``get_magnet``.

    Args:
        image_in: Image reference passed unchanged to ``get_caption``.

    Returns:
        A ``(musical_prompt, music)`` tuple, where ``music`` is the value
        returned by ``get_magnet``.

    Raises:
        gr.Error: If ``image_in`` is ``None``.
    """
    if image_in is None:
        # Fail early with a readable message instead of an error raised
        # from inside the remote captioning call.
        raise gr.Error("Please provide an image first.")

    gr.Info("Getting image caption with Kosmos2...")
    user_prompt = get_caption(image_in)

    # Chat-style prompt: system instructions, then the caption as user turn.
    prompt = f"{instruction.strip()}\n{user_prompt}</s>"

    gr.Info("Building a musical prompt according to the image caption ...")
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )
    generated_text = outputs[0]["generated_text"]

    # Remove everything from the system tag through the assistant tag so
    # only the model's answer remains.
    pattern = r"<\|system\|>.*?<\|assistant\|>"
    cleaned_text, removed = re.subn(pattern, "", generated_text, flags=re.DOTALL)
    if not removed and generated_text.startswith(prompt):
        # The prompt built above does not end with an assistant tag, so the
        # pattern only matches when the model writes that tag itself. When
        # it does not, cut the echoed prompt off instead of forwarding the
        # system and user text to the music model.
        cleaned_text = generated_text[len(prompt):]

    # Strip once so the text sent to MAGNeT and the text shown in the UI
    # are the same string.
    cleaned_text = cleaned_text.strip()

    print(f"SUGGESTED Musical prompt: {cleaned_text}")

    gr.Info("Now calling MAGNet for music ...")
    music_o = get_magnet(cleaned_text)

    return cleaned_text, music_o
|
|
|
# Heading and sub-heading text rendered at the top of the page.
demo_title = "Image to Music V2"
description = "Get music from a picture"

# Page stylesheet: centers the main column and caps its width.
css = """
#col-container{
    margin: 0 auto;
    max-width: 980px;
    text-align: left;
}
"""
|
|
|
# Page layout: a header, an input column (image + button) next to an output
# column (musical prompt + audio), and a row of clickable example images.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(f"""
        <h2 style="text-align: center;">{demo_title}</h2>
        <p style="text-align: center;">{description}</p>
        """)

        with gr.Row():
            # Left: user input.
            with gr.Column():
                image_in = gr.Image(
                    label="Image reference",
                    type="filepath",
                    elem_id="image-in",
                )
                submit_btn = gr.Button("Make music from my pic !")

            # Right: generated prompt and music.
            with gr.Column():
                caption = gr.Textbox(label="Musical prompt", max_lines=3)
                result = gr.Audio(label="Music")

        with gr.Row():
            # Examples are not pre-computed; infer runs when one is used.
            gr.Examples(
                examples=[
                    ["examples/monalisa.png"],
                    ["examples/santa.png"],
                    ["examples/ocean_poet.jpeg"],
                    ["examples/winter_hiking.png"],
                    ["examples/teatime.jpeg"],
                    ["examples/news_experts.jpeg"],
                    ["examples/chicken_adobo.jpeg"],
                ],
                fn=infer,
                inputs=[image_in],
                outputs=[caption, result],
                cache_examples=False,
            )

    # Clicking the button runs the full image -> prompt -> music pipeline.
    submit_btn.click(
        fn=infer,
        inputs=[image_in],
        outputs=[caption, result],
    )

# Enable request queuing and start the app without the public API page.
demo.queue().launch(show_api=False)