update apps and examples
Files changed:
- .gitattributes +2 -0
- app.py +244 -393
- data/audio/Suri's Improv.mp3 +3 -0
- data/audio/like_no_tomorrow_20sec.wav +3 -0
- gradio_components/image.py +47 -4
- gradio_components/model_cards.py +75 -0
- gradio_components/prediction.py +215 -108
.gitattributes CHANGED
@@ -40,3 +40,5 @@ data/audio/twinkle_twinkle_little_stars_mozart.mp3 filter=lfs diff=lfs merge=lfs
 *.mp3 !text !filter !merge !diff
 *.jpeg filter=lfs diff=lfs merge=lfs -text
 data/audio/old_town_road20sec.mp3 filter=lfs diff=lfs merge=lfs -text
+data/audio/Suri's[[:space:]]Improv.mp3 filter=lfs diff=lfs merge=lfs -text
+data/audio/like_no_tomorrow_20sec.wav filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,10 +1,20 @@
 import os
-
 import gradio as gr
+from audiocraft.models import MAGNeT, MusicGen, AudioGen
 
-from gradio_components.image import generate_caption, improve_prompt
+# from gradio_components.image import generate_caption, improve_prompt
+from gradio_components.image import generate_caption_gpt4
 from gradio_components.prediction import predict, transcribe
 
+import re
+import argparse
+from gradio_components.model_cards import TEXT_TO_MIDI_MODELS, TEXT_TO_SOUND_MODELS, MELODY_CONTINUATION_MODELS, TEXT_TO_MUSIC_MODELS, MODEL_CARDS, MELODY_CONDITIONED_MODELS
+import ast
+import json
+
+
 theme = gr.themes.Glass(
     primary_hue="fuchsia",
     secondary_hue="indigo",
@@ -67,463 +77,304 @@ theme = gr.themes.Glass(
 )
 
 
-_AUDIOCRAFT_MODELS = [
-    "facebook/musicgen-melody",
-    "facebook/musicgen-medium",
-    "facebook/musicgen-small",
-    "facebook/musicgen-large",
-    "facebook/musicgen-melody-large",
-    "facebook/audiogen-medium",
-]
-
-
-def generate_prompt(difficulty, style):
-    _DIFFICULTY_MAPPIN = {
-        "Easy": "beginner player",
-        "Medium": "player who has 2-3 years experience",
-        "Hard": "player who has more than 4 years experiences",
-    }
-    prompt = "piano only music for a {} to practice with the touch of {}".format(
-        _DIFFICULTY_MAPPIN[difficulty], style
-    )
-    return prompt
-
-
-def toggle_melody_condition(melody_condition):
-    if melody_condition:
-        return gr.Audio(
-            sources=["microphone", "upload"],
-            label="Record or upload your audio",
-            show_label=True,
-            visible=True,
-        )
-    else:
-        return gr.Audio(
-            sources=["microphone", "upload"],
-            label="Record or upload your audio",
-            show_label=True,
-            visible=False,
-        )
-
-
-def toggle_custom_prompt(customize, difficulty, style):
-    if customize:
-        return gr.Textbox(label="Type your prompt", interactive=True, visible=True)
-    else:
-        prompt = generate_prompt(difficulty, style)
-        return gr.Textbox(
-            label="Generated Prompt", value=prompt, interactive=False, visible=True
-        )
-
-
-def show_caption(show_caption_condition, description, prompt):
-    if show_caption_condition:
-        return (
-            gr.Textbox(
-                label="Image Caption",
-                value=description,
-                interactive=False,
-                show_label=True,
-                visible=True,
-            ),
-            gr.Textbox(
-                label="Generated Prompt",
-                value=prompt,
-                interactive=True,
-                show_label=True,
-                visible=True,
-            ),
-            gr.Button("Generate Music", interactive=True, visible=True),
-        )
-    else:
-        return (
-            gr.Textbox(
-                label="Image Caption",
-                value=description,
-                interactive=False,
-                show_label=True,
-                visible=False,
-            ),
-            gr.Textbox(
-                label="Generated Prompt",
-                value=prompt,
-                interactive=True,
-                show_label=True,
-                visible=False,
-            ),
-            gr.Button(label="Generate Music", interactive=True, visible=True),
-        )
-
-
-def optimize_fn(prompt):
-    return prompt
-
-
-def display_prompt(prompt):
-    return gr.Textbox(
-        label="Generated Prompt", value=prompt, interactive=False, visible=True
-    )
-
-
-def post_submit(show_caption, model_path, image_input):
-    _, description, prompt = generate_caption(image_input, model_path)
-    return (
-        gr.Textbox(
-            label="Image Caption",
-            value=description,
-            interactive=False,
-            show_label=True,
-            visible=show_caption,
-        ),
-        gr.Textbox(
-            label="Generated Prompt",
-            value=prompt,
-            interactive=True,
-            show_label=True,
-            visible=show_caption,
-        ),
-        gr.Button("Generate Music", interactive=True, visible=True),
-    )
-
-
-def UI():
-    with gr.Blocks() as demo:
-        with gr.Tab("Generate Music by
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        model_path = gr.Dropdown(
-                            choices=_AUDIOCRAFT_MODELS,
-                            label="Select the model",
-                            value="facebook/musicgen-
-                        )
-                    with gr.Row():
-                        duration = gr.Slider(
-                            minimum=10,
-                            maximum=60,
-                            value=10,
-                            label="Duration",
-                            interactive=True,
-                        )
-                    with gr.Row():
-                        temperature = gr.Number(
-                            label="Temperature", value=1.0, interactive=True
-                        )
-                        sample_rate = gr.Number(
-                            label="output music sample rate",
-                            value=32000,
-                            interactive=True,
-                        )
-                            label="music genre",
-                            interactive=True,
-                        )
-                with gr.Column():
-                    show_prompt = gr.Button("Show the prompt", interactive=True)
-                    prompt_text = gr.Textbox(
-                        "Optimized Prompt", interactive=False, visible=False
-                    )
-                    optimize.click(optimize_fn, inputs=[prompt], outputs=prompt)
-                    show_prompt.click(
-                        display_prompt, inputs=[prompt], outputs=prompt_text
-                    )
-
-                with gr.Column():
-                    with gr.Row():
-                        melody = gr.Audio(
-                            label="Record or upload your audio",
-                            # interactive=True,
-                            show_label=True,
-                        )
-                    with gr.Row():
-                        submit = gr.Button("Generate Music")
-                    with gr.Row():
-                        transcribe_button.click(
-                            transcribe, inputs=[output_audio], outputs=d
-                        )
-
-            gr.Examples(
-                examples=[
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/audio/twinkle_twinkle_little_stars_mozart_20sec"
-                            ".mp3",
-                        ),
-                        "Easy",
-                        32000,
-                        20,
-                    ],
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/audio/golden_hour_20sec.mp3",
-                        ),
-                        "Easy",
-                        32000,
-                        20,
-                    ],
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/audio/turkish_march_mozart_20sec.mp3",
-                        ),
-                        "Easy",
-                        32000,
-                        20,
-                    ],
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/audio/golden_hour_20sec.mp3",
-                        ),
-                    ],
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/audio/golden_hour_20sec.mp3",
-                        ),
-                        "Hard",
-                        32000,
-                        40,
-                    ],
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/audio/golden_hour_20sec.mp3",
-                        ),
-                    ]
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/audio/old_town_road20sec.mp3",
-                        ),
-                        "Hard",
-                        32000,
-                        40,
-                    ],
-                ],
-                inputs=[
-                label="Audio Examples",
-                outputs=[output_audio],
-                # cache_examples=True,
-            )
-
-        with gr.Tab("Generate Music by image"):
-            with gr.Column():
-                with gr.Row():
-                    image_input = gr.Image("Upload an image", type="filepath")
-                        label=
-                    generate = gr.Button(
-                        "Generate Music", interactive=True, visible=False
-                    )
-
-                with gr.Column():
-                    with gr.Row():
-                        model_path = gr.Dropdown(
-                            choices=_AUDIOCRAFT_MODELS,
-                            label="Select the model",
-                            value="facebook/musicgen-large",
-                        )
-                    with gr.Row():
-                        duration = gr.Slider(
-                            minimum=10,
-                            maximum=60,
-                            value=10,
-                            label="Duration",
-                            interactive=True,
-                        )
-                        topk = gr.Number(label="Top-k", value=250, interactive=True)
-                        topp = gr.Number(label="Top-p", value=0, interactive=True)
-                        temperature = gr.Number(
-                            label="Temperature", value=1.0, interactive=True
-                        )
-                        sample_rate = gr.Number(
-                            label="output music sample rate", value=32000, interactive=True
-                        )
-                with gr.Column():
-                    output_audio = gr.Audio(
-                        "listen to the generated music",
-                        type="filepath",
-                        show_label=True,
-                    )
-                    transcribe_button = gr.Button("Transcribe")
-                    d = gr.DownloadButton("Download the file", visible=False)
-                submit.click(
-                    fn=post_submit,
-                    inputs=[show_prompt, model_path, image_input],
-                    outputs=[description, prompt, generate],
-                )
-                show_prompt.change(
-                    fn=show_caption,
-                    inputs=[show_prompt, description, prompt],
-                    outputs=[description, prompt, generate],
-                )
-                transcribe_button.click(transcribe, inputs=[output_audio], outputs=d)
-                generate.click(
-                    fn=predict,
-                    inputs=[
-                        model_path,
-                        prompt,
-                        melody,
-                        duration,
-                        topk,
-                        topp,
-                        temperature,
-                        sample_rate,
-                    ],
-                    outputs=output_audio,
-                )
-
-            gr.Examples(
-                examples=[
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/image/kids_drawing.jpeg",
-                        ),
-                        False,
-                        None,
-                        "facebook/musicgen-large",
-                    ],
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/image/cat.jpeg",
-                        ),
-                        None,
-                        "facebook/musicgen-large",
-                    ],
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/image/cat.jpeg",
-                        ),
-                        True,
-                        "./data/audio/the_nutcracker_dance_of_the_reed_flutes.mp3",
-                        "facebook/musicgen-melody-large",
-                    ],
-                    [
-                        os.path.join(
-                            os.path.dirname(__file__),
-                            "./data/image/beach.jpeg",
-                        ),
-                        None,
-                        "facebook/audiogen-medium",
-                    ],
-                ],
-                inputs=[image_input,
-                label="Audio Examples",
-                outputs=[output_audio],
-                # cache_examples=True,
-            )
-
-    demo.queue().launch()
+def generate_prompt(prompt, style):
+    prompt = ','.join([prompt] + style)
+    return prompt
+
+
+def UI(share=False):
+    with gr.Blocks() as demo:
+        with gr.Tab("Generate Music by text"):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        model_path = gr.Dropdown(
+                            choices=TEXT_TO_MUSIC_MODELS,
+                            label="Select the model",
+                            value="facebook/musicgen-large",
+                        )
+
+                    with gr.Row():
+                        text_prompt = gr.Textbox(
+                            label="Let's make a song about ...",
+                            value="First day learning music generation at Stanford University",
+                            interactive=True,
+                            visible=True,
+                        )
+                        num_outputs = gr.Number(
+                            label="Number of outputs",
+                            value=1,
+                            minimum=1,
+                            maximum=10,
+                            interactive=True,
+                        )
+
+                    with gr.Row():
+                        style = gr.CheckboxGroup(
+                            ["Jazz", "Classical Music", "Hip Hop", "Ragga Jungle", "Dark Jazz", "Soul", "Blues", "80s Rock N Roll"],
+                            value=None,
+                            label="music genre",
+                            interactive=True,
+                        )
+
+                        @gr.on(inputs=[style], outputs=text_prompt)
+                        def update_prompt(style):
+                            return generate_prompt(text_prompt.value, style)
+
+                    config_output_textbox = gr.Textbox(label="Model Configs", visible=False)
+
+                    @gr.render(inputs=model_path)
+                    def show_config_options(model_path):
+                        print(model_path)
+
+                        with gr.Accordion("Model Generation Configs"):
+                            if "magnet" in model_path:
+                                with gr.Row():
+                                    top_k = gr.Number(label="Top-k", value=300, interactive=True)
+                                    top_p = gr.Number(label="Top-p", value=0, interactive=True)
+                                    temperature = gr.Number(
+                                        label="Temperature", value=1.0, interactive=True
+                                    )
+                                    span_arrangement = gr.Radio(["nonoverlap", "stride1"], value='nonoverlap', label="span arrangement", info="Use either non-overlapping spans ('nonoverlap') or overlapping spans ('stride1')")
+
+                                    @gr.on(inputs=[top_k, top_p, temperature, span_arrangement], outputs=config_output_textbox)
+                                    def return_model_configs(top_k, top_p, temperature, span_arrangement):
+                                        return {"top_k": top_k, "top_p": top_p, "temperature": temperature, "span_arrangement": span_arrangement}
+                            else:
+                                with gr.Row():
+                                    duration = gr.Slider(
+                                        minimum=10,
+                                        maximum=30,
+                                        value=30,
+                                        label="Duration",
+                                        interactive=True,
+                                    )
+                                    use_sampling = gr.Checkbox(label="Use Sampling", interactive=True, value=True)
+                                    top_k = gr.Number(label="Top-k", value=300, interactive=True)
+                                    top_p = gr.Number(label="Top-p", value=0, interactive=True)
+                                    temperature = gr.Number(
+                                        label="Temperature", value=1.0, interactive=True
+                                    )
+
+                                    @gr.on(inputs=[duration, use_sampling, top_k, top_p, temperature], outputs=config_output_textbox)
+                                    def return_model_configs(duration, use_sampling, top_k, top_p, temperature):
+                                        return {"duration": duration, "use_sampling": use_sampling, "top_k": top_k, "top_p": top_p, "temperature": temperature}
+
+                with gr.Column():
+                    with gr.Row():
+                        melody = gr.Audio(sources=["upload"], type="numpy", label="File",
+                                          interactive=True, elem_id="melody-input", visible=False)
+                        submit = gr.Button("Generate Music")
+                    result_text = gr.Textbox(label="Generated Music (text)", type="text", interactive=False)
+                    print(result_text)
+                    output_audios = []
+
+                    @gr.render(inputs=result_text)
+                    def show_output_audio(tmp_paths):
+                        if tmp_paths:
+                            tmp_paths = ast.literal_eval(tmp_paths)
+                            print(tmp_paths)
+                            for i in range(len(tmp_paths)):
+                                tmp_path = tmp_paths[i]
+                                _audio = gr.Audio(value=tmp_path, label=f"Generated Music {i}", type='filepath', interactive=False, visible=True)
+                                output_audios.append(_audio)
+
+                    submit.click(
+                        fn=predict,
+                        inputs=[model_path, config_output_textbox, text_prompt, melody, num_outputs],
+                        outputs=result_text,
+                        queue=True
+                    )
+
+        with gr.Tab("Generate Music by melody"):
+            with gr.Column():
+                with gr.Row():
+                    radio_melody_condition = gr.Radio(["Music Continuation", "Music Conditioning"], value=None, label="Select the condition")
+                    model_path2 = gr.Dropdown(label="model")
+
+                    @gr.on(inputs=radio_melody_condition, outputs=model_path2)
+                    def model_selection(radio_melody_condition):
+                        if radio_melody_condition == "Music Continuation":
+                            model_path2 = gr.Dropdown(
+                                choices=MELODY_CONTINUATION_MODELS,
+                                label="Select the model",
+                                value="facebook/musicgen-large",
+                                interactive=True,
+                                visible=True
+                            )
+                        elif radio_melody_condition == "Music Conditioning":
+                            model_path2 = gr.Dropdown(
+                                choices=MELODY_CONDITIONED_MODELS,
+                                label="Select the model",
+                                value="facebook/musicgen-melody-large",
+                                interactive=True,
+                                visible=True
+                            )
+                        else:
+                            model_path2 = gr.Dropdown(
+                                choices=TEXT_TO_SOUND_MODELS,
+                                label="Select the model",
+                                value="facebook/musicgen-large",
+                                interactive=True,
+                                visible=False
+                            )
+                        return model_path2
+
+                upload_melody = gr.Audio(sources=["upload", "microphone"], type="filepath", label="File")
+                prompt_text2 = gr.Textbox(
+                    label="Let's make a song about ...",
+                    value=None,
+                    interactive=True,
+                    visible=True,
+                )
+                with gr.Row():
+                    config_output_textbox2 = gr.Textbox(
+                        label="Model Configs",
+                        visible=True)
+                with gr.Row():
+                    duration2 = gr.Number(10, label="Duration", interactive=True)
+                    num_outputs2 = gr.Number(1, label="Number of outputs", interactive=True)
+
+                @gr.on(inputs=[duration2], outputs=config_output_textbox2)
+                def return_model_configs2(duration):
+                    return {"duration": duration, "use_sampling": True, "top_k": 300, "top_p": 0, "temperature": 1}
+
+                submit2 = gr.Button("Generate Music")
+                result_text2 = gr.Textbox(label="Generated Music (melody)", type="text", interactive=False, visible=True)
+                submit2.click(
+                    fn=predict,
+                    inputs=[model_path2, config_output_textbox2, prompt_text2, upload_melody, num_outputs2],
+                    outputs=result_text2,
+                    queue=True
+                )
+
+                @gr.render(inputs=result_text2)
+                def show_output_audio(tmp_paths):
+                    if tmp_paths:
+                        tmp_paths = ast.literal_eval(tmp_paths)
+                        print(tmp_paths)
+                        for i in range(len(tmp_paths)):
+                            tmp_path = tmp_paths[i]
+                            _audio = gr.Audio(value=tmp_path, label=f"Generated Music {i}", type='filepath', interactive=False)
+                            output_audios.append(_audio)
+
+                gr.Examples(
+                    examples=[
+                        [
+                            os.path.join(
+                                os.path.dirname(__file__), "./data/audio/Suri's Improv.mp3"
+                            ),
+                            30,
+                            "facebook/musicgen-large",
+                            "Music Continuation",
+                        ],
+                        [
+                            os.path.join(
+                                os.path.dirname(__file__), "./data/audio/like_no_tomorrow_20sec.wav"
+                            ),
+                            40,
+                            "facebook/musicgen-melody-large",
+                            "Music Conditioning",
+                        ],
+                    ],
+                    inputs=[upload_melody, duration2, model_path2, radio_melody_condition],
+                )
+
+        with gr.Tab("Generate Music by image"):
+            with gr.Column():
+                with gr.Row():
+                    image_input = gr.Image("Upload an image", type="filepath")
+                    with gr.Accordion("Image Captioning", open=False):
+                        image_description = gr.Textbox(label='image description', visible=True, interactive=False)
+                        image_caption = gr.Textbox(label='generated text prompt', visible=True, interactive=True)
+
+                        @gr.on(inputs=image_input, outputs=[image_description, image_caption])
+                        def generate_image_text_prompt(image_input):
+                            if image_input:
+                                image_description, image_caption = generate_caption_gpt4(image_input, model_path)
+                                # message_object, description, prompt = generate_caption_claude3(image_input, model_path)
+                                return image_description, image_caption
+                            return "", ""
+
+                with gr.Row():
+                    melody3 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="File", visible=True)
+                with gr.Column():
+                    model_path3 = gr.Dropdown(
+                        choices=TEXT_TO_SOUND_MODELS + TEXT_TO_MUSIC_MODELS + MELODY_CONDITIONED_MODELS,
+                        label="Select the model",
+                        value="facebook/musicgen-large",
+                    )
+                    duration3 = gr.Number(30, visible=False, label="Duration")
+                submit3 = gr.Button("Generate Music")
+                result_text3 = gr.Textbox(label="Generated Music (image)", type="text", interactive=False, visible=True)
+
+                def predict_image_music(model_path3, image_caption, duration3, melody3):
+                    model_configs = {"duration": duration3, "use_sampling": True, "top_k": 250, "top_p": 0, "temperature": 1}
+                    return predict(
+                        model_version=model_path3,
+                        generation_configs=model_configs,
+                        prompt_text=image_caption,
+                        prompt_wav=melody3
+                    )
+
+                submit3.click(
+                    fn=predict_image_music,
+                    inputs=[model_path3, image_caption, duration3, melody3],
+                    outputs=result_text3,
+                    queue=True
+                )
+
+                @gr.render(inputs=result_text3)
+                def show_output_audio(tmp_paths):
+                    if tmp_paths:
+                        tmp_paths = ast.literal_eval(tmp_paths)
+                        print(tmp_paths)
+                        for i in range(len(tmp_paths)):
+                            tmp_path = tmp_paths[i]
+                            _audio = gr.Audio(value=tmp_path, label=f"Generated Music {i}", type='filepath', interactive=False)
+                            output_audios.append(_audio)
+
+                @gr.render(inputs=result_text3)
+                def show_transcribed_audio(tmp_paths):
+                    transcribe(tmp_paths)
+
+                gr.Examples(
+                    examples=[
+                        [
+                            os.path.join(
+                                os.path.dirname(__file__), "./data/image/beach.jpeg"
+                            ),
+                            "facebook/musicgen-large",
+                            30,
+                            None,
+                        ],
+                        [
+                            os.path.join(
+                                os.path.dirname(__file__), "./data/image/beach.jpeg"
+                            ),
+                            "facebook/audiogen-medium",
+                            15,
+                            None,
+                        ],
+                        [
+                            os.path.join(
+                                os.path.dirname(__file__), "./data/image/beach.jpeg"
+                            ),
+                            "facebook/musicgen-melody-large",
+                            30,
+                            os.path.join(
+                                os.path.dirname(__file__), "./data/audio/Suri's Improv.mp3"
+                            ),
+                        ],
+                        [
+                            os.path.join(
+                                os.path.dirname(__file__), "./data/image/cat.jpeg"
+                            ),
+                            "facebook/musicgen-large",
+                            30,
+                            None,
+                        ],
+                    ],
+                    inputs=[image_input, model_path3, duration3, melody3],
+                )
+
+    demo.queue().launch(share=share)
 
 
 if __name__ == "__main__":
-    UI()
+    # Create the parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--share', action='store_true', help='Enable sharing.')
+    args = parser.parse_args()
+
+    UI(share=args.share)
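Note on the pattern used throughout the new app.py: each tab's event handler returns the generated file paths as a stringified Python list into a hidden-ish Textbox, and a `@gr.render` block re-parses that string with `ast.literal_eval` and rebuilds one `gr.Audio` player per path. A minimal self-contained sketch of just this mechanism (not part of the commit; assumes Gradio 4.x with `@gr.render`, and `fake_generate` is a stand-in for the real `predict`):

import ast

import gradio as gr


def fake_generate(prompt):
    # A real handler would synthesize audio and return the temp-file paths.
    return str(["/tmp/sample_0.wav", "/tmp/sample_1.wav"])


with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    go = gr.Button("Generate")
    result_text = gr.Textbox(visible=False)  # carries the stringified path list

    @gr.render(inputs=result_text)
    def show_audios(tmp_paths):
        # Re-runs whenever result_text changes; rebuilds the audio players.
        if tmp_paths:
            for i, path in enumerate(ast.literal_eval(tmp_paths)):
                gr.Audio(value=path, label=f"Generated Music {i}", type="filepath")

    go.click(fake_generate, inputs=prompt, outputs=result_text)

if __name__ == "__main__":
    demo.launch()

Round-tripping a list through `str()`/`ast.literal_eval` works here because the values are plain path strings; a `gr.State` holding the list directly would avoid the parse step.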
data/audio/Suri's Improv.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:077b6d42a3ee05b15c3a02d7e2aaad7841e52b005d0443ac4aa280464a9a9c96
+size 163337
data/audio/like_no_tomorrow_20sec.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cabd551cfeea4bb608a010aa33118e744b55b816fc71697ffab7edb5ff350805
+size 7680088
gradio_components/image.py CHANGED
@@ -4,9 +4,11 @@ import os
 
 import anthropic
 import gradio as gr
+from openai import OpenAI
+
 
 # Remember to put your API Key here
-client = anthropic.Anthropic(api_key=os.
+client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
 
 # image1_url = "https://i.abcnewsfe.com/a/7d849ccc-e0fe-4416-959d-85889e338add/dune-1-ht-bb-231212_1702405287482_hpMain_16x9.jpeg"
 image1_media_type = "image/jpeg"
@@ -20,12 +22,22 @@ The model was trained with descriptions from a stock music catalog, descriptions
 
 Try to make the prompt simple and concise with only 1-2 sentences
 
-"""
+only return a dictionary, with two items `description` and `prompt`
+
+for example
+{
+    "description": "A serene beach at sunset with gentle waves and a distant ship.",
+    "prompt": "A calming instrumental with gentle guitar, soft piano, and ocean waves sound effects, perfect for a relaxing moment by the sea."
+}
+"""
 
 SYSTEM_PROMPT_AUDIO = """You are an expert llm prompt engineer, you understand the structure of llms and facebook musicgen text to audio model. You will be provided with an image, and require to output a prompt for the musicgen model to capture the essence of the image. Try to do it step by step, evaluate and analyze the image thoroughly. After that, develop a prompt that contains the detail of what background sounds this image should have. This prompt will be provided to audiogen model to generate a 15s audio clip.
 Try to make the prompt simple and concise with only 1-2 sentences
 
+only return a dictionary, with two items `description` and `prompt`
+for example
+{"description": "A serene beach scene at sunset with gentle waves lapping on the shore and a distant ship sailing on the water.",
+ "prompt": "Gentle waves flowing on the beach at sunset, with a distant ship in the background."}
 """
 
 PROMPT_IMPROVEMENT_GENERATE_PROMPT = """
@@ -58,8 +70,39 @@ def improve_prompt(prompt):
     prompt = message_object["prompt"]
     return message_object, prompt
 
-
-def generate_caption(image_file, model_file, progress=gr.Progress()):
+def generate_caption_gpt4(image_file, model_file):
+    client = OpenAI()
+    if model_file == "facebook/audiogen-medium":
+        system_prompt = SYSTEM_PROMPT_AUDIO
+    else:
+        system_prompt = SYSTEM_PROMPT
+    with open(image_file, "rb") as f:
+        image_encoded = base64.b64encode(f.read()).decode("utf-8")
+    response = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text",
+                     "text": system_prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{image_encoded}",
+                        },
+                    },
+                ],
+            }
+        ],
+        max_tokens=300,
+    )
+    message = json.loads(response.choices[0].message.content)
+    return message['description'], message['prompt']
+
+
+def generate_caption_claude3(image_file, model_file, progress=gr.Progress()):
     if model_file == "facebook/audiogen-medium":
         system_prompt = SYSTEM_PROMPT_AUDIO
     else:
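Note that `generate_caption_gpt4` parses the model reply with `json.loads`, so it relies on GPT-4o honoring the "only return a dictionary" instruction in the system prompts; a reply wrapped in markdown fences or prose would raise a `JSONDecodeError`. A hypothetical call, assuming `OPENAI_API_KEY` is set in the environment and the example image exists locally:

# Hypothetical usage of the new GPT-4o captioner (not part of the commit).
from gradio_components.image import generate_caption_gpt4

description, prompt = generate_caption_gpt4(
    image_file="data/image/beach.jpeg",
    model_file="facebook/musicgen-large",  # any non-audiogen model selects SYSTEM_PROMPT
)
print(description)  # one-sentence summary of the scene
print(prompt)       # 1-2 sentence MusicGen prompt derived from the image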
gradio_components/model_cards.py ADDED
@@ -0,0 +1,75 @@
+import re
+
+TEXT_TO_MUSIC_MODELS = [
+    "facebook/musicgen-medium",
+    "facebook/musicgen-small",
+    "facebook/musicgen-large",
+    'facebook/magnet-small-10secs',
+    'facebook/magnet-medium-10secs',
+    'facebook/magnet-small-30secs',
+    'facebook/magnet-medium-30secs',
+    # "facebook/musicgen-stereo-small",
+    # "facebook/musicgen-stereo-medium",
+    # "facebook/musicgen-stereo-large",
+]
+
+TEXT_TO_MIDI_MODELS = [
+    "musiclang/musiclang-v2",
+]
+
+MELODY_CONTINUATION_MODELS = [
+    "facebook/musicgen-medium",
+    "facebook/musicgen-small",
+    "facebook/musicgen-large",
+]
+
+TEXT_TO_SOUND_MODELS = [
+    'facebook/audio-magnet-small',
+    'facebook/audio-magnet-medium',
+    "facebook/audiogen-medium",
+]
+
+MELODY_CONDITIONED_MODELS = [
+    "facebook/musicgen-melody",
+    "facebook/musicgen-melody-large",
+    # "facebook/musicgen-stereo-melody",
+    # "facebook/musicgen-stereo-melody-large",
+]
+
+STEREO_MODEL = [
+    "facebook/musicgen-stereo-small",
+    "facebook/musicgen-stereo-medium",
+    "facebook/musicgen-stereo-large",
+    "facebook/musicgen-stereo-melody",
+    "facebook/musicgen-stereo-melody-large",
+]
+
+
+MODEL_CARDS = {
+    "text-to-music": TEXT_TO_MUSIC_MODELS,
+    "text-to-midi": TEXT_TO_MIDI_MODELS,
+    "text-to-sound": TEXT_TO_SOUND_MODELS,
+    "melody-conditioned": MELODY_CONDITIONED_MODELS,
+}
+
+MODEL_DISCLAIMERS = {
+    "facebook/musicgen-melody": "1.5B transformer decoder also supporting melody conditioning.",
+    "facebook/musicgen-medium": "1.5B transformer decoder.",
+    "facebook/musicgen-small": "300M transformer decoder.",
+    "facebook/musicgen-large": "3.3B transformer decoder.",
+    "facebook/musicgen-melody-large": "3.3B transformer decoder also supporting melody conditioning.",
+    'facebook/magnet-small-10secs': "A 300M non-autoregressive transformer generating 10-second music samples conditioned on text.",
+    'facebook/magnet-medium-10secs': "A 1.5B non-autoregressive transformer; 10-second music samples.",
+    'facebook/magnet-small-30secs': "A 300M non-autoregressive transformer; 30-second music samples.",
+    'facebook/magnet-medium-30secs': "A 1.5B non-autoregressive transformer; 30-second music samples.",
+    # "musiclang/musiclang-v2": "This model generates music from text prompts.",  TODO: Implement MusicLang
+    'facebook/audio-magnet-small': "A 300M non-autoregressive transformer generating 10-second sound effects conditioned on text.",
+    'facebook/audio-magnet-medium': "A 1.5B non-autoregressive transformer; 10-second sound effects.",
+    "facebook/audiogen-medium": "1.5B transformer decoder capable of generating sound effects conditioned on text.",
+}
+
+
+def print_model_cards():
+    for key, value in MODEL_CARDS.items():
+        print(key, ":", value)
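These tables are what the tabs in app.py build their dropdowns from; a small sketch of consuming them directly (not part of the commit), pairing each model name with its `MODEL_DISCLAIMERS` blurb as helper text:

from gradio_components.model_cards import MODEL_CARDS, MODEL_DISCLAIMERS, TEXT_TO_MUSIC_MODELS

assert "facebook/musicgen-large" in TEXT_TO_MUSIC_MODELS

for category, models in MODEL_CARDS.items():
    print(category)
    for name in models:
        # .get() covers entries with no disclaimer, e.g. musiclang/musiclang-v2
        print(" ", name, "-", MODEL_DISCLAIMERS.get(name, "no description"))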
gradio_components/prediction.py CHANGED
@@ -8,77 +8,177 @@ import gradio as gr
 import torch
 from audiocraft.data.audio import audio_write
 from audiocraft.data.audio_utils import convert_audio
-from audiocraft.models import AudioGen, MusicGen
+from audiocraft.models import AudioGen, MusicGen, MAGNeT
 from basic_pitch import ICASSP_2022_MODEL_PATH
-from transformers import AutoModelForSeq2SeqLM
+# from transformers import AutoModelForSeq2SeqLM
+from concurrent.futures import ProcessPoolExecutor
+import typing as tp
+import warnings
+import json
+import ast
+import torchaudio
 
+MODEL = None
 
-def load_model(version=
+def load_model(version='facebook/musicgen-large'):
+    global MODEL
+    if MODEL is None or MODEL.name != version:
+        del MODEL
+        MODEL = None  # in case loading would crash
+        print("Loading model", version)
+        if "magnet" in version:
+            MODEL = MAGNeT.get_pretrained(version)
+        elif "musicgen" in version:
+            MODEL = MusicGen.get_pretrained(version)
+        elif "musiclang" in version:
+            # TODO: Implement MusicLang
+            pass
+        elif "audiogen" in version:
+            MODEL = AudioGen.get_pretrained(version)
+        else:
+            raise ValueError("Invalid model version")
+
+    return MODEL
+
+
+pool = ProcessPoolExecutor(4)
+
+
+class FileCleaner:
+    def __init__(self, file_lifetime: float = 3600):
+        self.file_lifetime = file_lifetime
+        self.files = []
+
+    def add(self, path: tp.Union[str, Path]):
+        self._cleanup()
+        self.files.append((time.time(), Path(path)))
+
+    def _cleanup(self):
+        now = time.time()
+        for time_added, path in list(self.files):
+            if now - time_added > self.file_lifetime:
+                if path.exists():
+                    path.unlink()
+                self.files.pop(0)
+            else:
+                break
+
+
+file_cleaner = FileCleaner()
+
+
+def inference_musicgen_text_to_music(model, configs, text, num_outputs=1):
+    model.set_generation_params(**configs)
+    descriptions = [text for _ in range(num_outputs)]
+    output = model.generate(descriptions=descriptions, progress=True, return_tokens=False)
+    return output
+
+
+def inference_musicgen_continuation(model, configs, text, prompt_waveform, prompt_sr, num_outputs=1):
+    model.set_generation_params(**configs)
+    # melody, prompt_sr = torchaudio.load(prompt_waveform)
+    # descriptions = [text for _ in range(num_outputs)]
+    # prompt = [prompt_waveform for _ in range(num_outputs)]
+    output = model.generate_continuation(prompt_waveform, prompt_sample_rate=prompt_sr, progress=True, return_tokens=False)
+    return output
+
+
+def inference_musicgen_melody_condition(model, configs, text, prompt_waveform, prompt_sr, num_outputs=1):
+    model.set_generation_params(**configs)
+    descriptions = [text for _ in range(num_outputs)]
+    output = model.generate_with_chroma(
+        descriptions=descriptions,
+        melody_wavs=prompt_waveform,
+        melody_sample_rate=prompt_sr,
+        progress=True,
+        return_tokens=False
+    )
+    return output
+
+
+def inference_magnet(model, configs, text, num_outputs=1):
+    model.set_generation_params(**configs)
+    descriptions = [text for _ in range(num_outputs)]
+    output = model.generate(descriptions=descriptions, progress=True, return_tokens=False)
+    return output
+
+
+def inference_magnet_audio(model, configs, text, num_outputs=1):
+    model.set_generation_params(**configs)
+    descriptions = [text for _ in range(num_outputs)]
+    output = model.generate(descriptions=descriptions, progress=True, return_tokens=False)
+    return output
+
+
+def inference_audiogen(model, configs, text, num_outputs=1):
+    model.set_generation_params(**configs)
+    descriptions = [text for _ in range(num_outputs)]
+    output = model.generate(descriptions=descriptions, progress=True, return_tokens=False)
+    return output
+
+
+def inference_musiclang():
+    # TODO: Implement MusicLang
+    pass
+
+
+def process_audio(gr_audio, prompt_duration, model):
+    # audio, sr = torch.from_numpy(gr_audio[1]).to(model.device).float().t(), gr_audio[0]
+    audio, sr = torchaudio.load(gr_audio)
+    audio = audio[..., :int(prompt_duration * sr)]
+    return audio, sr
+
+
+_MODEL_INFERENCES = {
+    "facebook/musicgen-small": inference_musicgen_text_to_music,
+    "facebook/musicgen-medium": inference_musicgen_text_to_music,
+    "facebook/musicgen-large": inference_musicgen_text_to_music,
+    "facebook/musicgen-melody": inference_musicgen_melody_condition,
+    "facebook/musicgen-melody-large": inference_musicgen_melody_condition,
+    "facebook/magnet-small-10secs": inference_magnet,
+    "facebook/magnet-medium-10secs": inference_magnet,
+    "facebook/magnet-small-30secs": inference_magnet,
+    "facebook/magnet-medium-30secs": inference_magnet,
+    "facebook/audio-magnet-small": inference_magnet_audio,
+    "facebook/audio-magnet-medium": inference_magnet_audio,
+    "facebook/audiogen-medium": inference_audiogen,
+    "musicgen-continuation": inference_musicgen_continuation,
+}
 
 def _do_predictions(
     model_file,
     model,
-    texts,
-    melodies,
-    duration,
+    text,
+    melody=None,
+    mel_sample_rate=None,
     progress=False,
-    target_sr=32000,
-    target_ac=1,
+    num_generations=1,
     **gen_kwargs,
 ):
     print(
-        "new batch",
-        len(texts),
-        texts,
-        [None if m is None else (m[0], m[1].shape) for m in melodies],
+        "new generation",
+        text,
+        None if melody is None else melody.shape
     )
     be = time.time()
-    processed_melodies = []
-    model.set_generation_params(duration=duration)
-    for melody in melodies:
-        if melody is None:
-            processed_melodies.append(None)
-        else:
-            sr, melody = (
-                melody[0],
-                torch.from_numpy(melody[1]).to(model.device).float().t(),
-            )
-            print(f"Input audio sample rate is {sr}")
-            if melody.dim() == 1:
-                melody = melody[None]
-            melody = melody[..., : int(sr * duration)]
-            melody = convert_audio(melody, sr, target_sr, target_ac)
-            processed_melodies.append(melody)
-
     try:
-        if any(m is not None for m in processed_melodies):
-            # melody condition
-            outputs = model.generate_with_chroma(
-                melody_sample_rate=target_sr,
-                progress=progress,
-                return_tokens=False,
-            )
-        else:
-            if model_file == "facebook/audiogen-medium":
-                # audio condition
-                outputs = model.generate(texts, progress=progress)
-            else:
-                # text condition
+        if melody is not None:
+            # melody condition or continuation
+            if 'melody' in model_file:
+                # melody condition - musicgen-melody, musicgen-melody-large
+                inference_func = _MODEL_INFERENCES[model_file]
+            else:
+                # melody continuation
+                inference_func = _MODEL_INFERENCES['musicgen-continuation']
+            outputs = inference_func(model, gen_kwargs, text, melody, mel_sample_rate, num_generations)
+        else:
+            # text-to-music, text-to-sound
+            inference_func = _MODEL_INFERENCES[model_file]
+            outputs = inference_func(model, gen_kwargs, text, num_generations)
 
     except RuntimeError as e:
         raise gr.Error("Error while generating " + e.args[0])
     outputs = outputs.detach().cpu().float()
+    out_audios = []
+    video_processes = []
     for output in outputs:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
            audio_write(
@@ -90,45 +190,36 @@ def _do_predictions(
                 loudness_compressor=True,
                 add_suffix=False,
             )
+            # video_processes.append(pool.submit(make_waveform, file.name))
+            out_audios.append(file.name)
+            file_cleaner.add(file.name)
+    # out_videos = [video.result() for video in video_processes]
+    # for video in out_videos:
+    #     file_cleaner.add(video)
+
+    print("generation finished", len(outputs), time.time() - be)
+    return out_audios
 
+def make_waveform(*args, **kwargs):
+    # Further remove some warnings.
+    be = time.time()
+    with warnings.catch_warnings():
+        warnings.simplefilter('ignore')
+        out = gr.make_waveform(*args, **kwargs)
+        print("Make a video took", time.time() - be)
+        return out
+
 def predict(
-    model_path,
-    topk,
-    topp,
-    temperature,
-    target_sr,
+    model_version,
+    generation_configs,
+    prompt_text=None,
+    prompt_wav=None,
+    num_generations=1,
     progress=gr.Progress(),
 ):
     global INTERRUPTING
-    global USE_DIFFUSION
     INTERRUPTING = False
     progress(0, desc="Loading model...")
-    model_path = model_path.strip()
-    # if model_path:
-    #     if not Path(model_path).exists():
-    #         raise gr.Error(f"Model path {model_path} doesn't exist.")
-    #     if not Path(model_path).is_dir():
-    #         raise gr.Error(f"Model path {model_path} must be a folder containing "
-    #                        "state_dict.bin and compression_state_dict_.bin.")
-    if temperature < 0:
-        raise gr.Error("Temperature must be >= 0.")
-    if topk < 0:
-        raise gr.Error("Topk must be non-negative.")
-    if topp < 0:
-        raise gr.Error("Topp must be non-negative.")
-
-    topk = int(topk)
-    model = load_model(model_path)
-
-    max_generated = 0
-
     def _progress(generated, to_generate):
         nonlocal max_generated
         max_generated = max(generated, max_generated)
@@ -136,40 +227,56 @@ def predict(
         if INTERRUPTING:
             raise gr.Error("Interrupted.")
 
+    model = load_model(model_version)
     model.set_custom_progress_callback(_progress)
+    if isinstance(generation_configs, str):
+        generation_configs = ast.literal_eval(generation_configs)
+    max_generated = 0
+    if prompt_wav is not None:
+        melody, mel_sample_rate = process_audio(prompt_wav, generation_configs['duration'], model)
+    else:
+        melody, mel_sample_rate = None, None
 
+    audios = _do_predictions(
+        model_version,
         model,
+        prompt_text,
+        melody,
+        mel_sample_rate,
         progress=True,
-        top_k=topk,
-        top_p=topp,
-        temperature=temperature,
-        gradio_progress=progress,
+        num_generations=num_generations,
+        **generation_configs,
     )
-    return
+    return audios
 
 
 def transcribe(audio_path):
+    """
+    Transcribe an audio file to MIDI using the basic_pitch model.
+    """
     # model_output, midi_data, note_events = predict("generated_0.wav")
+    tmp_paths = ast.literal_eval(audio_path)
+    download_buttons = []
+    for audio_path in tmp_paths:
+        model_output, midi_data, note_events = basic_pitch.inference.predict(
+            audio_path=audio_path,
+            model_or_model_path=ICASSP_2022_MODEL_PATH,
+        )
 
-    with NamedTemporaryFile("wb", suffix=".mid", delete=False) as file:
-        try:
-            midi_data.write(file)
-            print(f"midi file saved to {file.name}")
-        except Exception as e:
-            print(f"Error while writing midi file: {e}")
-            raise e
+        with NamedTemporaryFile("wb", suffix=".mid", delete=False) as file:
+            try:
+                midi_data.write(file)
+                print(f"midi file saved to {file.name}")
+            except Exception as e:
+                print(f"Error while writing midi file: {e}")
+                raise e
+            download_buttons.append(gr.DownloadButton(
+                value=file.name, label=f"Download MIDI file {file.name}", visible=True
+            ))
+            file_cleaner.add(file.name)
 
-    return gr.DownloadButton(
-        value=file.name, label=f"Download MIDI file {file.name}", visible=True
-    )
+    return download_buttons
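The new predict() keeps a single cached model in the module-level MODEL global (reloading only when the requested version changes) and accepts generation_configs either as a dict or, when wired to the Gradio Textbox, as its string repr. A rough sketch of the call contract (not part of the commit; in the app this runs inside a Gradio event so that gr.Progress is live, and calling it standalone is purely for illustration):

# Hypothetical direct call against the new API; model download happens on first use.
from gradio_components.prediction import predict

paths = predict(
    model_version="facebook/musicgen-small",  # smallest model keeps the demo cheap
    generation_configs={"duration": 10, "use_sampling": True,
                        "top_k": 300, "top_p": 0, "temperature": 1.0},
    prompt_text="lo-fi piano over soft rain",
    prompt_wav=None,      # no melody: plain text-to-music path
    num_generations=1,
)
print(paths)  # list of temp .wav paths, e.g. ['/tmp/tmpab12cd.wav']

app.py then passes str(paths) through a Textbox into its @gr.render blocks, and FileCleaner deletes each temp file roughly an hour after creation.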