Spaces:

SuriC-nyc
/

MagicMusicMachine

Sleeping

App Files Files Community

suric committed on Mar 13, 2024

Commit

6a24aec

1 Parent(s): da26cf8

update image-to-music tab

Browse files

Files changed (10) hide show

.gitattributes +1 -0
app.py +443 -202
data/audio/the_nutcracker_dance_of_the_reed_flutes.mp3 +3 -0
data/image/.DS_Store +0 -0
data/image/beach.jpeg +3 -0
data/image/cat.jpeg +3 -0
data/image/kids_drawing.jpeg +3 -0
gradio_components/image.py +59 -0
gradio_components/prediction.py +57 -23
requirements.txt +2 -1

.gitattributes CHANGED Viewed

@@ -38,3 +38,4 @@ data/audio/turkish_march_mozart.mp3 filter=lfs diff=lfs merge=lfs -text
 data/audio/twinkle_twinkle_little_stars_mozart.mp3 filter=lfs diff=lfs merge=lfs -text
 *.mp3 filter=lfs diff=lfs merge=lfs -text
 *.mp3 !text !filter !merge !diff

 data/audio/twinkle_twinkle_little_stars_mozart.mp3 filter=lfs diff=lfs merge=lfs -text
 *.mp3 filter=lfs diff=lfs merge=lfs -text
 *.mp3 !text !filter !merge !diff
+*.jpeg filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -2,217 +2,458 @@ import os
 import gradio as gr
 from gradio_components.prediction import predict, transcribe
 theme = gr.themes.Glass(
-	primary_hue="fuchsia",
-	secondary_hue="indigo",
-	neutral_hue="slate",
-	font=[gr.themes.GoogleFont('Source Sans Pro'), 'ui-sans-serif', 'system-ui',
-	      'sans-serif'],
-	).set(
-	body_background_fill_dark='*background_fill_primary',
-	embed_radius='*table_radius',
-	background_fill_primary='*neutral_50',
-	background_fill_primary_dark='*neutral_950',
-	background_fill_secondary_dark='*neutral_900',
-	border_color_accent='*neutral_600',
-	border_color_accent_subdued='*color_accent',
-	border_color_primary_dark='*neutral_700',
-	block_background_fill='*background_fill_primary',
-	block_background_fill_dark='*neutral_800',
-	block_border_width='1px',
-	block_label_background_fill='*background_fill_primary',
-	block_label_background_fill_dark='*background_fill_secondary',
-	block_label_text_color='*neutral_500',
-	block_label_text_size='*text_sm',
-	block_label_text_weight='400',
-	block_shadow='none',
-	block_shadow_dark='none',
-	block_title_text_color='*neutral_500',
-	block_title_text_weight='400',
-	panel_border_width='0',
-	panel_border_width_dark='0',
-	checkbox_background_color_dark='*neutral_800',
-	checkbox_border_width='*input_border_width',
-	checkbox_label_border_width='*input_border_width',
-	input_background_fill='*neutral_100',
-	input_background_fill_dark='*neutral_700',
-	input_border_color_focus_dark='*neutral_700',
-	input_border_width='0px',
-	input_border_width_dark='0px',
-	slider_color='#2563eb',
-	slider_color_dark='#2563eb',
-	table_even_background_fill_dark='*neutral_950',
-	table_odd_background_fill_dark='*neutral_900',
-	button_border_width='*input_border_width',
-	button_shadow_active='none',
-	button_primary_background_fill='*primary_200',
-	button_primary_background_fill_dark='*primary_700',
-	button_primary_background_fill_hover='*button_primary_background_fill',
-	button_primary_background_fill_hover_dark='*button_primary_background_fill',
-	button_secondary_background_fill='*neutral_200',
-	button_secondary_background_fill_dark='*neutral_600',
-	button_secondary_background_fill_hover='*button_secondary_background_fill',
-	button_secondary_background_fill_hover_dark='*button_secondary_background_fill',
-	button_cancel_background_fill='*button_secondary_background_fill',
-	button_cancel_background_fill_dark='*button_secondary_background_fill',
-	button_cancel_background_fill_hover='*button_cancel_background_fill',
-	button_cancel_background_fill_hover_dark='*button_cancel_background_fill'
-	)
-_AUDIOCRAFT_MODELS = ["facebook/musicgen-melody",
-                      "facebook/musicgen-medium",
-                      "facebook/musicgen-small",
-                      "facebook/musicgen-large",
-                      "facebook/musicgen-melody-large"]
 def generate_prompt(difficulty, style):
-	_DIFFICULTY_MAPPIN = {
-		"Easy": "beginner player",
-		"Medum": "player who has 2-3 years experience",
-		"Hard": "player who has more than 4 years experiences"
-		}
-	prompt = 'piano only music for a {} to pratice with the touch of {}'.format(
-		_DIFFICULTY_MAPPIN[difficulty], style
-		)
-	return prompt
 def UI():
-	with gr.Blocks() as demo:
-		with gr.Tab("Generate Music by melody"):
-			with gr.Row():
-				with gr.Column():
-					with gr.Row():
-						model_path = gr.Dropdown(
-							choices=_AUDIOCRAFT_MODELS,
-							label="Select the model",
-							value="facebook/musicgen-melody-large"
-							)
-					with gr.Row():
-						duration = gr.Slider(
-							minimum=10,
-							maximum=60,
-							value=10,
-							label="Duration",
-							interactive=True
-							)
-					with gr.Row():
-						topk = gr.Number(label="Top-k", value=250, interactive=True)
-						topp = gr.Number(label="Top-p", value=0, interactive=True)
-						temperature = gr.Number(
-							label="Temperature", value=1.0, interactive=True
-							)
-						sample_rate = gr.Number(
-							label="output music sample rate", value=32000,
-							interactive=True
-							)
-						difficulty = gr.Radio(
-							["Easy", "Medium", "Hard"], label="Difficulty",
-							value="Easy", interactive=True
-							)
-						style = gr.Radio(
-							["Jazz", "Classical Music", "Hip Hop", "Others"],
-							value="Classical Music", label="music genre",
-							interactive=True
-							)
-						if style == "Others":
-							style = gr.Textbox(label="Type your music genre")
-						prompt = generate_prompt(difficulty.value, style.value)
-						customize = gr.Checkbox(
-							label="Customize the prompt", interactive=True
-							)
-						if customize:
-							prompt = gr.Textbox(label="Type your prompt")
-				with gr.Column():
-					with gr.Row():
-						melody = gr.Audio(
-							sources=["microphone", "upload"],
-							label="Record or upload your audio",
-							#interactive=True,
-							show_label=True,
-							)
-					with gr.Row():
-						submit = gr.Button("Generate Music")
-						output_audio = gr.Audio("listen to the generated music", type="filepath")
-					with gr.Row():
-						transcribe_button = gr.Button("Transcribe")
-						d = gr.DownloadButton("Download the file", visible=False)
-						transcribe_button.click(transcribe, inputs=[output_audio], outputs=d)
-			submit.click(
-				fn=predict,
-				inputs=[model_path, prompt, melody, duration, topk, topp, temperature,
-				        sample_rate],
-				outputs=output_audio
-				)
-		gr.Examples(
-			examples=[
-				[
-					os.path.join(
-						os.path.dirname(__file__),
-						"./data/audio/twinkle_twinkle_little_stars_mozart_20sec.mp3"
-						),
-					"Easy",
-					32000,
-					20
-					],
-				[
-					os.path.join(
-						os.path.dirname(__file__),
-						"./data/audio/golden_hour_20sec.mp3"
-						),
-					"Easy",
-					32000,
-					20
-					],
-				[
-					os.path.join(
-						os.path.dirname(__file__),
-						"./data/audio/turkish_march_mozart_20sec.mp3"
-						),
-					"Easy",
-					32000,
-					20
-					],
-				[
-					os.path.join(
-						os.path.dirname(__file__),
-						"./data/audio/golden_hour_20sec.mp3"
-						),
-					"Hard",
-					32000,
-					20
-					],
-				[
-					os.path.join(
-						os.path.dirname(__file__),
-						"./data/audio/golden_hour_20sec.mp3"
-						),
-					"Hard",
-					32000,
-					40
-					],
-				[
-					os.path.join(
-						os.path.dirname(__file__),
-						"./data/audio/golden_hour_20sec.mp3"
-						),
-					"Hard",
-					16000,
-					20
-					],
-				],
-			inputs=[melody, difficulty, sample_rate, duration],
-			label="Audio Examples",
-			outputs=[output_audio],
-			# cache_examples=True,
-			)
-	demo.queue().launch()
 if __name__ == "__main__":
-	UI()

 import gradio as gr
+from gradio_components.image import generate_caption
 from gradio_components.prediction import predict, transcribe
+# Custom theme derived from gr.themes.Glass with overridden colours, borders
+# and button fills.
+# NOTE(review): UI() below calls gr.Blocks() without theme=..., so this object
+# does not appear to be applied anywhere in this file - confirm intent.
 theme = gr.themes.Glass(
+    primary_hue="fuchsia",
+    secondary_hue="indigo",
+    neutral_hue="slate",
+    font=[
+        gr.themes.GoogleFont("Source Sans Pro"),
+        "ui-sans-serif",
+        "system-ui",
+        "sans-serif",
+    ],
+).set(
+    body_background_fill_dark="*background_fill_primary",
+    embed_radius="*table_radius",
+    background_fill_primary="*neutral_50",
+    background_fill_primary_dark="*neutral_950",
+    background_fill_secondary_dark="*neutral_900",
+    border_color_accent="*neutral_600",
+    border_color_accent_subdued="*color_accent",
+    border_color_primary_dark="*neutral_700",
+    block_background_fill="*background_fill_primary",
+    block_background_fill_dark="*neutral_800",
+    block_border_width="1px",
+    block_label_background_fill="*background_fill_primary",
+    block_label_background_fill_dark="*background_fill_secondary",
+    block_label_text_color="*neutral_500",
+    block_label_text_size="*text_sm",
+    block_label_text_weight="400",
+    block_shadow="none",
+    block_shadow_dark="none",
+    block_title_text_color="*neutral_500",
+    block_title_text_weight="400",
+    panel_border_width="0",
+    panel_border_width_dark="0",
+    checkbox_background_color_dark="*neutral_800",
+    checkbox_border_width="*input_border_width",
+    checkbox_label_border_width="*input_border_width",
+    input_background_fill="*neutral_100",
+    input_background_fill_dark="*neutral_700",
+    input_border_color_focus_dark="*neutral_700",
+    input_border_width="0px",
+    input_border_width_dark="0px",
+    slider_color="#2563eb",
+    slider_color_dark="#2563eb",
+    table_even_background_fill_dark="*neutral_950",
+    table_odd_background_fill_dark="*neutral_900",
+    button_border_width="*input_border_width",
+    button_shadow_active="none",
+    button_primary_background_fill="*primary_200",
+    button_primary_background_fill_dark="*primary_700",
+    button_primary_background_fill_hover="*button_primary_background_fill",
+    button_primary_background_fill_hover_dark="*button_primary_background_fill",
+    button_secondary_background_fill="*neutral_200",
+    button_secondary_background_fill_dark="*neutral_600",
+    button_secondary_background_fill_hover="*button_secondary_background_fill",
+    button_secondary_background_fill_hover_dark="*button_secondary_background_fill",
+    button_cancel_background_fill="*button_secondary_background_fill",
+    button_cancel_background_fill_dark="*button_secondary_background_fill",
+    button_cancel_background_fill_hover="*button_cancel_background_fill",
+    button_cancel_background_fill_hover_dark="*button_cancel_background_fill",
+)
+# Checkpoint names offered by the "Select the model" dropdowns in both tabs.
+# NOTE(review): the last entry is an AudioGen checkpoint, while load_model() in
+# gradio_components/prediction.py calls MusicGen.get_pretrained - confirm that
+# predict() handles it.
+_AUDIOCRAFT_MODELS = [
+    "facebook/musicgen-melody",
+    "facebook/musicgen-medium",
+    "facebook/musicgen-small",
+    "facebook/musicgen-large",
+    "facebook/musicgen-melody-large",
+    "facebook/audiogen-medium",
+]
 def generate_prompt(difficulty, style):
+    """Build the default text prompt for a piano practice piece.
+
+    Args:
+        difficulty: One of "Easy", "Medium" or "Hard", i.e. the choices of
+            the "Difficulty" radio button.
+        style: Music genre, inserted verbatim into the prompt.
+
+    Returns:
+        The prompt string.
+
+    Raises:
+        KeyError: If ``difficulty`` is not one of the three known levels.
+    """
+    # Keys must match the "Difficulty" radio choices exactly; the earlier
+    # "Medum" spelling made the "Medium" choice raise KeyError.
+    difficulty_mapping = {
+        "Easy": "beginner player",
+        "Medium": "player who has 2-3 years experience",
+        "Hard": "player who has more than 4 years experiences",
+    }
+    return "piano only music for a {} to practice with the touch of {}".format(
+        difficulty_mapping[difficulty], style
+    )
+def toggle_melody_condition(melody_condition):
+    """Return the melody audio input, shown only when the checkbox is ticked.
+
+    Args:
+        melody_condition: State of the "Generate music by melody" checkbox.
+
+    Returns:
+        A ``gr.Audio`` update whose ``visible`` flag follows the checkbox.
+    """
+    # Both states share every setting except visibility.
+    is_visible = True if melody_condition else False
+    return gr.Audio(
+        sources=["microphone", "upload"],
+        label="Record or upload your audio",
+        show_label=True,
+        visible=is_visible,
+    )
+def show_caption(show_caption_condition, description, prompt):
+    """Show or hide the caption and prompt boxes when the checkbox changes.
+
+    Args:
+        show_caption_condition: State of the "Show the prompt" checkbox.
+        description: Current image caption text.
+        prompt: Current generated prompt text.
+
+    Returns:
+        A tuple of updates for the caption textbox, the prompt textbox and the
+        "Generate Music" button. The button is always returned visible.
+    """
+    visible = bool(show_caption_condition)
+    return (
+        gr.Textbox(
+            label="Image Caption",
+            value=description,
+            interactive=False,
+            show_label=True,
+            visible=visible,
+        ),
+        gr.Textbox(
+            label="Generated Prompt",
+            value=prompt,
+            interactive=True,
+            show_label=True,
+            visible=visible,
+        ),
+        # The button text is the positional `value`; the hidden branch used to
+        # pass it as `label=`, which is not a gr.Button parameter.
+        gr.Button("Generate Music", interactive=True, visible=True),
+    )
+def post_submit(show_caption, image_input):
+    """Caption the uploaded image and reveal the "Generate Music" button.
+
+    Args:
+        show_caption: State of the "Show the prompt" checkbox; it decides
+            whether the two textboxes are visible. (Inside this function the
+            name hides the module-level ``show_caption`` function.)
+        image_input: Value of the image input, passed to ``generate_caption``.
+
+    Returns:
+        A tuple of updates for the caption textbox, the prompt textbox and the
+        "Generate Music" button.
+    """
+    _parsed, description, prompt = generate_caption(image_input)
+    caption_box = gr.Textbox(
+        label="Image Caption",
+        value=description,
+        interactive=False,
+        show_label=True,
+        visible=show_caption,
+    )
+    prompt_box = gr.Textbox(
+        label="Generated Prompt",
+        value=prompt,
+        interactive=True,
+        show_label=True,
+        visible=show_caption,
+    )
+    generate_button = gr.Button("Generate Music", interactive=True, visible=True)
+    return (caption_box, prompt_box, generate_button)
 def UI():
+    """Build the two-tab Gradio interface and launch it.
+
+    The "Generate Music by melody" tab sends the model choice, a text prompt,
+    a recorded/uploaded melody and the sampling settings to ``predict``. The
+    "Generate Music by image" tab first fills a caption and prompt through
+    ``post_submit`` and then sends the same set of inputs to ``predict``.
+    Both tabs pass the generated audio to ``transcribe`` on request. The
+    function ends by calling ``demo.queue().launch()``.
+    """
+    # NOTE(review): the module-level `theme` is not passed to gr.Blocks here,
+    # so it looks unused - confirm whether theme=theme was intended.
+    with gr.Blocks() as demo:
+        with gr.Tab("Generate Music by melody"):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        model_path = gr.Dropdown(
+                            choices=_AUDIOCRAFT_MODELS,
+                            label="Select the model",
+                            value="facebook/musicgen-melody-large",
+                        )
+                    with gr.Row():
+                        duration = gr.Slider(
+                            minimum=10,
+                            maximum=60,
+                            value=10,
+                            label="Duration",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        topk = gr.Number(label="Top-k", value=250, interactive=True)
+                        topp = gr.Number(label="Top-p", value=0, interactive=True)
+                        temperature = gr.Number(
+                            label="Temperature", value=1.0, interactive=True
+                        )
+                        sample_rate = gr.Number(
+                            label="output music sample rate",
+                            value=32000,
+                            interactive=True,
+                        )
+                        difficulty = gr.Radio(
+                            ["Easy", "Medium", "Hard"],
+                            label="Difficulty",
+                            value="Easy",
+                            interactive=True,
+                        )
+                        style = gr.Radio(
+                            ["Jazz", "Classical Music", "Hip Hop", "Others"],
+                            value="Classical Music",
+                            label="music genre",
+                            interactive=True,
+                        )
+                        # NOTE(review): `style` is the gr.Radio component here,
+                        # not its selected value, so this build-time comparison
+                        # looks like it can never be true - confirm, and use an
+                        # event handler if a free-text genre is wanted.
+                        if style == "Others":
+                            style = gr.Textbox(label="Type your music genre")
+                        # NOTE(review): computed once, at build time, from the
+                        # default radio values, and replaced just below whenever
+                        # `customize` is truthy (presumably always, since it is
+                        # a component object) - verify.
+                        prompt = generate_prompt(difficulty.value, style.value)
+                        customize = gr.Checkbox(
+                            label="Customize the prompt", interactive=True
+                        )
+                        if customize:
+                            prompt = gr.Textbox(label="Type your prompt")
+                with gr.Column():
+                    with gr.Row():
+                        melody = gr.Audio(
+                            sources=["microphone", "upload"],
+                            label="Record or upload your audio",
+                            # interactive=True,
+                            show_label=True,
+                        )
+                    with gr.Row():
+                        submit = gr.Button("Generate Music")
+                        # The caption is a label; the first positional argument
+                        # of gr.Audio is the initial value (a file path).
+                        output_audio = gr.Audio(
+                            label="listen to the generated music", type="filepath"
+                        )
+                    with gr.Row():
+                        transcribe_button = gr.Button("Transcribe")
+                        d = gr.DownloadButton("Download the file", visible=False)
+                        transcribe_button.click(
+                            transcribe, inputs=[output_audio], outputs=d
+                        )
+            submit.click(
+                fn=predict,
+                inputs=[
+                    model_path,
+                    prompt,
+                    melody,
+                    duration,
+                    topk,
+                    topp,
+                    temperature,
+                    sample_rate,
+                ],
+                outputs=output_audio,
+            )
+            gr.Examples(
+                examples=[
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/audio/twinkle_twinkle_little_stars_mozart_20sec"
+                            ".mp3",
+                        ),
+                        "Easy",
+                        32000,
+                        20,
+                    ],
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/audio/golden_hour_20sec.mp3",
+                        ),
+                        "Easy",
+                        32000,
+                        20,
+                    ],
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/audio/turkish_march_mozart_20sec.mp3",
+                        ),
+                        "Easy",
+                        32000,
+                        20,
+                    ],
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/audio/golden_hour_20sec.mp3",
+                        ),
+                        "Hard",
+                        32000,
+                        20,
+                    ],
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/audio/golden_hour_20sec.mp3",
+                        ),
+                        "Hard",
+                        32000,
+                        40,
+                    ],
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/audio/golden_hour_20sec.mp3",
+                        ),
+                        "Hard",
+                        16000,
+                        20,
+                    ],
+                ],
+                inputs=[melody, difficulty, sample_rate, duration],
+                label="Audio Examples",
+                outputs=[output_audio],
+                # cache_examples=True,
+            )
+        with gr.Tab("Generate Music by image"):
+            # The names below (melody, prompt, submit, ...) are rebound for
+            # this tab; the first tab's handlers are already wired above.
+            with gr.Row():
+                with gr.Column():
+                    # Caption passed as label=, not as the initial image value.
+                    image_input = gr.Image(label="Upload an image", type="filepath")
+                    melody_condition = gr.Checkbox(
+                        label="Generate music by melody", interactive=True, value=False
+                    )
+                    melody = gr.Audio(
+                        sources=["microphone", "upload"],
+                        label="Record or upload your audio",
+                        show_label=True,
+                        visible=False,
+                    )
+                    melody_condition.change(
+                        fn=toggle_melody_condition,
+                        inputs=[melody_condition],
+                        outputs=melody,
+                    )
+                    description = gr.Textbox(
+                        label="Image Captioning",
+                        show_label=True,
+                        interactive=False,
+                        visible=False,
+                    )
+                    prompt = gr.Textbox(
+                        label="Generated Prompt",
+                        show_label=True,
+                        interactive=True,
+                        visible=False,
+                    )
+                    show_prompt = gr.Checkbox(label="Show the prompt", interactive=True)
+                    submit = gr.Button("submit", interactive=True, visible=True)
+                    generate = gr.Button(
+                        "Generate Music", interactive=True, visible=False
+                    )
+                    submit.click(
+                        fn=post_submit,
+                        inputs=[show_prompt, image_input],
+                        outputs=[description, prompt, generate],
+                    )
+                    show_prompt.change(
+                        fn=show_caption,
+                        inputs=[show_prompt, description, prompt],
+                        outputs=[description, prompt, generate],
+                    )
+                with gr.Column():
+                    with gr.Row():
+                        model_path = gr.Dropdown(
+                            choices=_AUDIOCRAFT_MODELS,
+                            label="Select the model",
+                            value="facebook/musicgen-large",
+                        )
+                    with gr.Row():
+                        duration = gr.Slider(
+                            minimum=10,
+                            maximum=60,
+                            value=10,
+                            label="Duration",
+                            interactive=True,
+                        )
+                    topk = gr.Number(label="Top-k", value=250, interactive=True)
+                    topp = gr.Number(label="Top-p", value=0, interactive=True)
+                    temperature = gr.Number(
+                        label="Temperature", value=1.0, interactive=True
+                    )
+                    sample_rate = gr.Number(
+                        label="output music sample rate", value=32000, interactive=True
+                    )
+                with gr.Column():
+                    output_audio = gr.Audio(
+                        label="listen to the generated music",
+                        type="filepath",
+                        show_label=True,
+                    )
+                    transcribe_button = gr.Button("Transcribe")
+                    d = gr.DownloadButton("Download the file", visible=False)
+            transcribe_button.click(transcribe, inputs=[output_audio], outputs=d)
+            generate.click(
+                fn=predict,
+                inputs=[
+                    model_path,
+                    prompt,
+                    melody,
+                    duration,
+                    topk,
+                    topp,
+                    temperature,
+                    sample_rate,
+                ],
+                outputs=output_audio,
+            )
+            gr.Examples(
+                examples=[
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/image/kids_drawing.jpeg",
+                        ),
+                        False,
+                        None,
+                        "facebook/musicgen-large",
+                    ],
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/image/cat.jpeg",
+                        ),
+                        False,
+                        None,
+                        "facebook/musicgen-large",
+                    ],
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/image/cat.jpeg",
+                        ),
+                        True,
+                        # Resolved against this file like every other example,
+                        # so it does not depend on the working directory.
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/audio/the_nutcracker_dance_of_the_reed_flutes.mp3",
+                        ),
+                        "facebook/musicgen-melody-large",
+                    ],
+                    [
+                        os.path.join(
+                            os.path.dirname(__file__),
+                            "./data/image/beach.jpeg",
+                        ),
+                        False,
+                        None,
+                        "facebook/audiogen-medium",
+                    ],
+                ],
+                inputs=[image_input, melody_condition, melody, model_path],
+                label="Audio Examples",
+                outputs=[output_audio],
+                # cache_examples=True,
+            )
+    demo.queue().launch()
 if __name__ == "__main__":
+    UI()

data/audio/the_nutcracker_dance_of_the_reed_flutes.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa933b78b2380d325e6436d91de191c7abcb82e4c62ef2ed52a04868233a5012
+size 3577581

data/image/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

data/image/beach.jpeg ADDED Viewed

Git LFS Details

SHA256: 1b742f5752fcab31147a6c213e3b60f56c45b35344faa0a7266ddb95944bcfa3
Pointer size: 130 Bytes
Size of remote file: 39.7 kB

data/image/cat.jpeg ADDED Viewed

Git LFS Details

SHA256: 5f63e517121b2e3e8b21d1cbba5e1fac9e5317da7bbc9980dbaf622cf2439518
Pointer size: 132 Bytes
Size of remote file: 2.4 MB

data/image/kids_drawing.jpeg ADDED Viewed

Git LFS Details

SHA256: d8802f50a0b4353fb9f76f9291dfff758a0109aa437c9b32282c80b74e471d84
Pointer size: 131 Bytes
Size of remote file: 639 kB

gradio_components/image.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import base64
+import json
+import os
+import anthropic
+import gradio as gr
+# The API key is read from the ANTHROPIC_API_KEY environment variable; the
+# lookup below raises KeyError at import time when it is not set.
+client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
+# Media type declared for images sent to the API (used by generate_caption).
+image1_media_type = "image/jpeg"
+# System prompt sent with every captioning request. It asks for a JSON reply
+# with the keys `description` and `prompt`, which generate_caption parses.
+SYSTEM_PROMPT = """You are an expert llm prompt engineer, you understand the structure of llms and facebook musicgen text to audio model. You will be provided with an image, and required to output a prompt for the musicgen model to capture the essence of the image. Try to do it step by step, evaluate and analyze the image thoroughly. After that, develop a prompt that contains music genre, style, instrument, and all the other details needed. This prompt will be provided to musicgen model to generate a 15s audio clip.
+Here are some descriptions from musicgen model:
+The model was trained with descriptions from a stock music catalog, descriptions that will work best should include some level of detail on the instruments present, along with some intended use case (e.g. adding “perfect for a commercial” can somehow help).
+Try to make the prompt simple and concise with only 1-2 sentences
+Make sure the output is in JSON format, with two items `description` and `prompt`"""
+def generate_caption(image_file, progress=gr.Progress()):
+    """Ask the Anthropic API for a caption and a music prompt for an image.
+
+    Args:
+        image_file: Path of the image file to send.
+        progress: Gradio progress tracker used to report status.
+
+    Returns:
+        A tuple ``(message_object, description, prompt)``: the parsed JSON
+        reply followed by its ``description`` and ``prompt`` entries.
+
+    Raises:
+        gr.Error: If no image was supplied, or the reply is not JSON with the
+            expected ``description`` and ``prompt`` keys.
+    """
+    import mimetypes
+
+    if not image_file:
+        raise gr.Error("Please upload an image first.")
+    with open(image_file, "rb") as f:
+        image_encoded = base64.b64encode(f.read()).decode("utf-8")
+    # Declare the file's own image type when it can be guessed from the name;
+    # otherwise fall back to the module default.
+    guessed_type, _ = mimetypes.guess_type(image_file)
+    if guessed_type and guessed_type.startswith("image/"):
+        media_type = guessed_type
+    else:
+        media_type = image1_media_type
+    progress(0, desc="Starting image captioning...")
+    message = client.messages.create(
+        model="claude-3-opus-20240229",
+        max_tokens=1024,
+        system=SYSTEM_PROMPT,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": media_type,
+                            "data": image_encoded,
+                        },
+                    },
+                    {"type": "text", "text": "develop the prompt based on this image"},
+                ],
+            }
+        ],
+    )
+    # Progress is a fraction between 0 and 1, so "done" is 1.0 (not 100).
+    progress(1.0, desc="image captioning...Done!")
+    reply_text = message.content[0].text
+    # Keep only the outermost {...} in case the reply surrounds the JSON with
+    # extra text or a code fence; a pure-JSON reply is left unchanged.
+    start, end = reply_text.find("{"), reply_text.rfind("}")
+    if start != -1 and end > start:
+        reply_text = reply_text[start : end + 1]
+    try:
+        message_object = json.loads(reply_text)
+        description = message_object["description"]
+        prompt = message_object["prompt"]
+    except (json.JSONDecodeError, KeyError, TypeError) as err:
+        raise gr.Error(
+            "Could not read a description and prompt from the captioning reply."
+        ) from err
+    print(description)
+    print(prompt)
+    return message_object, description, prompt

gradio_components/prediction.py CHANGED Viewed

@@ -1,36 +1,53 @@
 import time
-import torch
-from audiocraft.data.audio_utils import convert_audio
-from audiocraft.data.audio import audio_write
-import gradio as gr
-from audiocraft.models import MusicGen
-from tempfile import NamedTemporaryFile
 from pathlib import Path
-from transformers import AutoModelForSeq2SeqLM
 import basic_pitch
 import basic_pitch.inference
 from basic_pitch import ICASSP_2022_MODEL_PATH
-def load_model(version='facebook/musicgen-melody'):
     return MusicGen.get_pretrained(version)
-def _do_predictions(model, texts, melodies, duration, progress=False, gradio_progress=None, target_sr=32000, target_ac = 1, **gen_kwargs):
-    print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
     be = time.time()
     processed_melodies = []
     for melody in melodies:
         if melody is None:
             processed_melodies.append(None)
         else:
-            sr, melody = melody[0], torch.from_numpy(melody[1]).to(model.device).float().t()
             print(f"Input audio sample rate is {sr}")
             if melody.dim() == 1:
                 melody = melody[None]
-            melody = melody[..., :int(sr * duration)]
             melody = convert_audio(melody, sr, target_sr, target_ac)
             processed_melodies.append(melody)
@@ -42,7 +59,7 @@ def _do_predictions(model, texts, melodies, duration, progress=False, gradio_pro
                 melody_wavs=processed_melodies,
                 melody_sample_rate=target_sr,
                 progress=progress,
-                return_tokens=False
             )
         else:
             # text only
@@ -55,14 +72,30 @@ def _do_predictions(model, texts, melodies, duration, progress=False, gradio_pro
     for output in outputs:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
             audio_write(
-                file.name, output, model.sample_rate, strategy="loudness",
-                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
             out_wavs.append(file.name)
     print("generation finished", len(texts), time.time() - be)
     return out_wavs
-def predict(model_path, text, melody, duration, topk, topp, temperature, target_sr, progress=gr.Progress()):
     global INTERRUPTING
     global USE_DIFFUSION
     INTERRUPTING = False
@@ -92,6 +125,7 @@ def predict(model_path, text, melody, duration, topk, topp, temperature, target_
         progress((min(max_generated, to_generate), to_generate))
         if INTERRUPTING:
             raise gr.Error("Interrupted.")
     model.set_custom_progress_callback(_progress)
     wavs = _do_predictions(
@@ -105,7 +139,8 @@ def predict(model_path, text, melody, duration, topk, topp, temperature, target_
         top_k=topk,
         top_p=topp,
         temperature=temperature,
-        gradio_progress=progress)
     return wavs[0]
@@ -114,7 +149,7 @@ def transcribe(audio_path):
     model_output, midi_data, note_events = basic_pitch.inference.predict(
         audio_path=audio_path,
         model_or_model_path=ICASSP_2022_MODEL_PATH,
-        )
     with NamedTemporaryFile("wb", suffix=".mid", delete=False) as file:
         try:
@@ -125,6 +160,5 @@ def transcribe(audio_path):
             raise e
     return gr.DownloadButton(
-        value=file.name,
-        label=f"Download MIDI file {file.name}",
-        visible=True)

 import time
 from pathlib import Path
+from tempfile import NamedTemporaryFile
 import basic_pitch
 import basic_pitch.inference
+import gradio as gr
+import torch
+from audiocraft.data.audio import audio_write
+from audiocraft.data.audio_utils import convert_audio
+from audiocraft.models import MusicGen
 from basic_pitch import ICASSP_2022_MODEL_PATH
+from transformers import AutoModelForSeq2SeqLM
+def load_model(version="facebook/musicgen-melody"):
+    """Return the pretrained MusicGen model for the checkpoint name ``version``."""
     return MusicGen.get_pretrained(version)
+def _do_predictions(
+    model,
+    texts,
+    melodies,
+    duration,
+    progress=False,
+    gradio_progress=None,
+    target_sr=32000,
+    target_ac=1,
+    **gen_kwargs,
+):
+    print(
+        "new batch",
+        len(texts),
+        texts,
+        [None if m is None else (m[0], m[1].shape) for m in melodies],
+    )
     be = time.time()
     processed_melodies = []
     for melody in melodies:
         if melody is None:
             processed_melodies.append(None)
         else:
+            sr, melody = (
+                melody[0],
+                torch.from_numpy(melody[1]).to(model.device).float().t(),
+            )
             print(f"Input audio sample rate is {sr}")
             if melody.dim() == 1:
                 melody = melody[None]
+            melody = melody[..., : int(sr * duration)]
             melody = convert_audio(melody, sr, target_sr, target_ac)
             processed_melodies.append(melody)
                 melody_wavs=processed_melodies,
                 melody_sample_rate=target_sr,
                 progress=progress,
+                return_tokens=False,
             )
         else:
             # text only
     for output in outputs:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
             audio_write(
+                file.name,
+                output,
+                model.sample_rate,
+                strategy="loudness",
+                loudness_headroom_db=16,
+                loudness_compressor=True,
+                add_suffix=False,
+            )
             out_wavs.append(file.name)
     print("generation finished", len(texts), time.time() - be)
     return out_wavs
+def predict(
+    model_path,
+    text,
+    melody,
+    duration,
+    topk,
+    topp,
+    temperature,
+    target_sr,
+    progress=gr.Progress(),
+):
     global INTERRUPTING
     global USE_DIFFUSION
     INTERRUPTING = False
         progress((min(max_generated, to_generate), to_generate))
         if INTERRUPTING:
             raise gr.Error("Interrupted.")
     model.set_custom_progress_callback(_progress)
     wavs = _do_predictions(
         top_k=topk,
         top_p=topp,
         temperature=temperature,
+        gradio_progress=progress,
+    )
     return wavs[0]
     model_output, midi_data, note_events = basic_pitch.inference.predict(
         audio_path=audio_path,
         model_or_model_path=ICASSP_2022_MODEL_PATH,
+    )
     with NamedTemporaryFile("wb", suffix=".mid", delete=False) as file:
         try:
             raise e
     return gr.DownloadButton(
+        value=file.name, label=f"Download MIDI file {file.name}", visible=True
+    )

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ torch==2.1.0
 audiocraft
 basic-pitch
 gradio
-tensorflow==2.15.0

 audiocraft
 basic-pitch
 gradio
+tensorflow==2.15.0
+anthropic