# Hugging Face Space: SuriC-nyc/MagicMusicMachine (status when captured: Sleeping)
# File size: 18,670 bytes

import os
import gradio as gr
from audiocraft.models import MAGNeT, MusicGen, AudioGen

# from gradio_components.image import generate_caption, improve_prompt
from gradio_components.image import generate_caption_gpt4
from gradio_components.prediction import predict, transcribe

import re
import argparse
from gradio_components.model_cards import TEXT_TO_MIDI_MODELS, TEXT_TO_SOUND_MODELS, MELODY_CONTINUATION_MODELS, TEXT_TO_MUSIC_MODELS, MODEL_CARDS, MELODY_CONDITIONED_MODELS
import ast
import json


    

# Per-variable overrides applied on top of the stock Glass theme. Kept as a
# mapping so the long list of overrides is separate from the theme constructor.
_THEME_OVERRIDES = {
    # Page and embed surfaces
    "body_background_fill_dark": "*background_fill_primary",
    "embed_radius": "*table_radius",
    "background_fill_primary": "*neutral_50",
    "background_fill_primary_dark": "*neutral_950",
    "background_fill_secondary_dark": "*neutral_900",
    # Borders
    "border_color_accent": "*neutral_600",
    "border_color_accent_subdued": "*color_accent",
    "border_color_primary_dark": "*neutral_700",
    # Blocks, block labels and block titles
    "block_background_fill": "*background_fill_primary",
    "block_background_fill_dark": "*neutral_800",
    "block_border_width": "1px",
    "block_label_background_fill": "*background_fill_primary",
    "block_label_background_fill_dark": "*background_fill_secondary",
    "block_label_text_color": "*neutral_500",
    "block_label_text_size": "*text_sm",
    "block_label_text_weight": "400",
    "block_shadow": "none",
    "block_shadow_dark": "none",
    "block_title_text_color": "*neutral_500",
    "block_title_text_weight": "400",
    # Panels
    "panel_border_width": "0",
    "panel_border_width_dark": "0",
    # Checkboxes
    "checkbox_background_color_dark": "*neutral_800",
    "checkbox_border_width": "*input_border_width",
    "checkbox_label_border_width": "*input_border_width",
    # Inputs
    "input_background_fill": "*neutral_100",
    "input_background_fill_dark": "*neutral_700",
    "input_border_color_focus_dark": "*neutral_700",
    "input_border_width": "0px",
    "input_border_width_dark": "0px",
    # Sliders
    "slider_color": "#2563eb",
    "slider_color_dark": "#2563eb",
    # Tables
    "table_even_background_fill_dark": "*neutral_950",
    "table_odd_background_fill_dark": "*neutral_900",
    # Buttons
    "button_border_width": "*input_border_width",
    "button_shadow_active": "none",
    "button_primary_background_fill": "*primary_200",
    "button_primary_background_fill_dark": "*primary_700",
    "button_primary_background_fill_hover": "*button_primary_background_fill",
    "button_primary_background_fill_hover_dark": "*button_primary_background_fill",
    "button_secondary_background_fill": "*neutral_200",
    "button_secondary_background_fill_dark": "*neutral_600",
    "button_secondary_background_fill_hover": "*button_secondary_background_fill",
    "button_secondary_background_fill_hover_dark": "*button_secondary_background_fill",
    "button_cancel_background_fill": "*button_secondary_background_fill",
    "button_cancel_background_fill_dark": "*button_secondary_background_fill",
    "button_cancel_background_fill_hover": "*button_cancel_background_fill",
    "button_cancel_background_fill_hover_dark": "*button_cancel_background_fill",
}

# Glass theme with fuchsia/indigo/slate hues and a Source Sans Pro font stack,
# customised with the overrides above.
theme = gr.themes.Glass(
    primary_hue="fuchsia",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=[
        gr.themes.GoogleFont("Source Sans Pro"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
).set(**_THEME_OVERRIDES)



def generate_prompt(prompt, style):
    """Join a base prompt and the selected style tags into one comma-separated string.

    Args:
        prompt: Base prompt text. ``None`` or an empty string is skipped so the
            result never starts with a stray comma.
        style: Iterable of style names to append, in order. ``None`` is treated
            as "no styles selected".

    Returns:
        The prompt followed by each style, separated by ``,`` with no spaces.
        An empty string when there is neither a prompt nor any style.
    """
    parts = [prompt] if prompt else []
    parts.extend(style or [])
    return ','.join(parts)


def _render_generated_audio(tmp_paths):
    """Create one read-only audio player per generated file.

    Intended to be called from inside a ``gr.render`` function, so the players
    are created while that render region is being (re)built.

    Args:
        tmp_paths: Text taken from a result textbox. When non-empty it is
            parsed with ``ast.literal_eval`` and iterated as a sequence of
            audio file paths. An empty value renders nothing.
    """
    if not tmp_paths:
        return
    audio_paths = ast.literal_eval(tmp_paths)
    print(audio_paths)
    for index, audio_path in enumerate(audio_paths):
        gr.Audio(
            value=audio_path,
            label=f"Generated Music {index}",
            type='filepath',
            interactive=False,
            visible=True,
        )


def UI(share=False):
    """Build the three-tab music generation interface and launch it.

    The tabs generate music from a text prompt, from an uploaded or recorded
    melody, and from an image (via a generated caption). Each tab writes the
    output of ``predict`` into a result textbox, and a ``gr.render`` region
    turns that text into audio players.

    Args:
        share: Forwarded to ``launch(share=...)``.
    """
    # Apply the module-level theme; it was previously built but never used.
    with gr.Blocks(theme=theme) as demo:
        # ------------------------------------------------------------------
        # Tab 1: text -> music
        # ------------------------------------------------------------------
        with gr.Tab("Generate Music by text"):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        model_path = gr.Dropdown(
                            choices=TEXT_TO_MUSIC_MODELS,
                            label="Select the model",
                            value="facebook/musicgen-large",
                        )

                    with gr.Row():
                        text_prompt = gr.Textbox(
                            label="Let's make a song about ...",
                            value="First day learning music generation in Standford university",
                            interactive=True,
                            visible=True,
                        )
                        num_outputs = gr.Number(
                            label="Number of outputs",
                            value=1,
                            minimum=1,
                            maximum=10,
                            interactive=True,
                        )

                    with gr.Row():
                        style = gr.CheckboxGroup(
                            ["Jazz", "Classical Music", "Hip Hop", "Ragga Jungle", "Dark Jazz", "Soul", "Blues", "80s Rock N Roll"],
                            value=None,
                            label="music genre",
                            interactive=True,
                        )

                        # NOTE(review): `text_prompt.value` is read from the component
                        # object rather than passed in as an event input - presumably
                        # this is the textbox's initial text, not what the user typed.
                        # Confirm this is intended.
                        @gr.on(inputs=[style], outputs=text_prompt)
                        def update_prompt(style):
                            return generate_prompt(text_prompt.value, style)

                    # Hidden textbox carrying the generation settings to `predict`.
                    config_output_textbox = gr.Textbox(label="Model Configs", visible=False)

                    # The available settings depend on the selected model, so the
                    # controls are rebuilt whenever the model dropdown changes.
                    @gr.render(inputs=model_path)
                    def show_config_options(model_path):
                        print(model_path)

                        with gr.Accordion("Model Generation Configs"):
                            if "magnet" in model_path:
                                with gr.Row():
                                    top_k = gr.Number(label="Top-k", value=300, interactive=True)
                                    top_p = gr.Number(label="Top-p", value=0, interactive=True)
                                    temperature = gr.Number(
                                        label="Temperature", value=1.0, interactive=True
                                    )
                                    span_arrangement = gr.Radio(["nonoverlap", "stride1"], value='nonoverlap', label="span arrangment", info=" Use either non-overlapping spans ('nonoverlap') or overlapping spans ('stride1') ")

                                @gr.on(inputs=[top_k, top_p, temperature, span_arrangement], outputs=config_output_textbox)
                                def return_model_configs(top_k, top_p, temperature, span_arrangement):
                                    return {"top_k": top_k, "top_p": top_p, "temperature": temperature, "span_arrangement": span_arrangement}
                            else:
                                with gr.Row():
                                    duration = gr.Slider(
                                        minimum=10,
                                        maximum=30,
                                        value=30,
                                        label="Duration",
                                        interactive=True,
                                    )
                                    use_sampling = gr.Checkbox(label="Use Sampling", interactive=True, value=True)
                                    top_k = gr.Number(label="Top-k", value=300, interactive=True)
                                    top_p = gr.Number(label="Top-p", value=0, interactive=True)
                                    temperature = gr.Number(
                                        label="Temperature", value=1.0, interactive=True
                                    )

                                @gr.on(inputs=[duration, use_sampling, top_k, top_p, temperature], outputs=config_output_textbox)
                                def return_model_configs(duration, use_sampling, top_k, top_p, temperature):
                                    return {"duration": duration, "use_sampling": use_sampling, "top_k": top_k, "top_p": top_p, "temperature": temperature}

                with gr.Column():
                    with gr.Row():
                        # Hidden melody input; passed to `predict` as its audio argument.
                        melody = gr.Audio(sources=["upload"], type="numpy", label="File",
                                        interactive=True, elem_id="melody-input", visible=False)
                        submit = gr.Button("Generate Music")
                    result_text = gr.Textbox(label="Generated Music (text)", type="text", interactive=False)

                    @gr.render(inputs=result_text)
                    def show_output_audio(tmp_paths):
                        _render_generated_audio(tmp_paths)

                    submit.click(
                        fn=predict,
                        inputs=[model_path, config_output_textbox, text_prompt, melody, num_outputs],
                        outputs=result_text,
                        queue=True
                        )

        # ------------------------------------------------------------------
        # Tab 2: melody -> music (continuation or conditioning)
        # ------------------------------------------------------------------
        with gr.Tab("Generate Music by melody"):
            with gr.Column():
                with gr.Row():
                    radio_melody_condition = gr.Radio(["Muisc Continuation", "Music Conditioning"], value=None, label="Select the condition")
                    model_path2 = gr.Dropdown(label="model")

                    # Swap the model dropdown's choices to match the selected
                    # condition; with no recognised condition it is hidden.
                    @gr.on(inputs=radio_melody_condition, outputs=model_path2)
                    def model_selection(radio_melody_condition):
                        if radio_melody_condition == "Muisc Continuation":
                            model_path2 = gr.Dropdown(
                                choices=MELODY_CONTINUATION_MODELS,
                                label="Select the model",
                                value="facebook/musicgen-large",
                                interactive=True,
                                visible=True
                            )
                        elif radio_melody_condition == "Music Conditioning":
                            model_path2 = gr.Dropdown(
                                choices=MELODY_CONDITIONED_MODELS,
                                label="Select the model",
                                value="facebook/musicgen-melody-large",
                                interactive=True,
                                visible=True
                            )
                        else:
                            model_path2 = gr.Dropdown(
                                choices=TEXT_TO_SOUND_MODELS,
                                label="Select the model",
                                value="facebook/musicgen-large",
                                interactive=True,
                                visible=False
                            )
                        return model_path2

                    upload_melody = gr.Audio(sources=["upload", "microphone"], type="filepath", label="File")
                    prompt_text2 = gr.Textbox(
                        label="Let's make a song about ...",
                        value=None,
                        interactive=True,
                        visible=True,
                    )
                with gr.Row():
                    config_output_textbox2 = gr.Textbox(
                        label="Model Configs",
                        visible=True)
                    with gr.Row():
                        duration2 = gr.Number(10, label="Duration", interactive=True)
                        num_outputs2 = gr.Number(1, label="Number of outputs", interactive=True)

                    # Only the duration is user-controlled here; the other
                    # generation settings are fixed.
                    @gr.on(inputs=[duration2], outputs=config_output_textbox2)
                    def return_model_configs2(duration):
                        return {"duration": duration, "use_sampling": True, "top_k": 300, "top_p": 0, "temperature": 1}

                    submit2 = gr.Button("Generate Music")
                    result_text2 = gr.Textbox(label="Generated Music (melody)", type="text", interactive=False, visible=True)
                    submit2.click(
                        fn=predict,
                        inputs=[model_path2, config_output_textbox2, prompt_text2, upload_melody, num_outputs2],
                        outputs=result_text2,
                        queue=True
                    )

                    @gr.render(inputs=result_text2)
                    def show_output_audio(tmp_paths):
                        _render_generated_audio(tmp_paths)

            gr.Examples(
                examples = [
                    [
                        os.path.join(
                            os.path.dirname(__file__), "./data/audio/Suri's Improv.mp3"
                        ),
                        30,
                        "facebook/musicgen-large",
                        "Muisc Continuation",
                    ],
                    [
                        os.path.join(
                            os.path.dirname(__file__), "./data/audio/lie_no_tomorrow_20sec.wav"
                        ),
                        40,
                        "facebook/musicgen-melody-large",
                        "Music Conditioning",
                    ]
                ],
                inputs=[upload_melody, duration2, model_path2, radio_melody_condition],
            )

        # ------------------------------------------------------------------
        # Tab 3: image -> caption -> music
        # ------------------------------------------------------------------
        with gr.Tab("Generate Music by image"):
            with gr.Column():
                with gr.Row():
                    # The caption must be passed as `label`; the first positional
                    # argument of gr.Image is the image value itself.
                    image_input = gr.Image(label="Upload an image", type="filepath")
                    with gr.Accordion("Image Captioning", open=False):
                        image_description = gr.Textbox(label='image description', visible=True, interactive=False)
                        image_caption = gr.Textbox(label='generated text prompt', visible=True, interactive=True)

                    # NOTE(review): `model_path` here is the dropdown component from
                    # the first tab, not a selected model name - confirm what
                    # `generate_caption_gpt4` expects as its second argument.
                    @gr.on(inputs=image_input, outputs=[image_description, image_caption])
                    def generate_image_text_prompt(image_input):
                        if image_input:
                            description, caption = generate_caption_gpt4(image_input, model_path)
                            return description, caption
                        return "", ""
                with gr.Row():
                    melody3 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="File", visible=True)
            with gr.Column():
                model_path3 = gr.Dropdown(
                    choices=TEXT_TO_SOUND_MODELS + TEXT_TO_MUSIC_MODELS + MELODY_CONDITIONED_MODELS,
                    label="Select the model",
                    value="facebook/musicgen-large",
                )
                duration3 = gr.Number(30, visible=False, label="Duration")
                submit3 = gr.Button("Generate Music")
                result_text3 = gr.Textbox(label="Generated Music (image)", type="text", interactive=False, visible=True)

                def predict_image_music(model_path3, image_caption, duration3, melody3):
                    """Call ``predict`` with fixed sampling settings and the chosen duration."""
                    model_configs = {"duration": duration3, "use_sampling": True, "top_k": 250, "top_p": 0, "temperature": 1}
                    return predict(
                        model_version = model_path3,
                        generation_configs = model_configs,
                        prompt_text = image_caption,
                        prompt_wav = melody3
                        )

                submit3.click(
                    fn=predict_image_music,
                    inputs=[model_path3, image_caption, duration3, melody3],
                    outputs=result_text3,
                    queue=True
                )

                @gr.render(inputs=result_text3)
                def show_output_audio(tmp_paths):
                    _render_generated_audio(tmp_paths)

                @gr.render(inputs=result_text3)
                def show_transcribt_audio(tmp_paths):
                    # Skip while there is no generated output, matching the
                    # audio renderers' empty-value guard.
                    if tmp_paths:
                        transcribe(tmp_paths)

            gr.Examples(
                examples = [
                    [
                        os.path.join(
                            os.path.dirname(__file__), "./data/image/beach.jpeg"
                        ),
                        "facebook/musicgen-large",
                        30,
                        None,
                    ],
                    [
                        os.path.join(
                            os.path.dirname(__file__), "./data/image/beach.jpeg"
                        ),
                        "facebook/audiogen-medium",
                        15,
                        None,
                    ],
                    [
                        os.path.join(
                            os.path.dirname(__file__), "./data/image/beach.jpeg"
                        ),
                        "facebook/musicgen-melody-large",
                        30,
                        os.path.join(
                            os.path.dirname(__file__), "./data/audio/Suri's Improv.mp3"
                        ),
                    ],
                    [
                        os.path.join(
                            os.path.dirname(__file__), "./data/image/cat.jpeg"
                        ),
                        "facebook/musicgen-large",
                        30,
                        None,
                    ],
                ],
                inputs=[image_input, model_path3, duration3, melody3],
            )

    demo.queue().launch(share=share)


if __name__ == "__main__":
    # Command-line entry point: the single flag is passed to UI as `share`.
    cli = argparse.ArgumentParser()
    cli.add_argument('--share', action='store_true', help='Enable sharing.')
    UI(share=cli.parse_args().share)