|
import binascii |
|
import os |
|
|
|
import gradio as gr |
|
import librosa |
|
import numpy as np |
|
import pretty_midi |
|
import torch |
|
import yt_dlp |
|
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor |
|
|
|
from utils import cli_to_api, mp3_write, normalize |
|
|
|
# Working directories: audio downloaded from YouTube, and generated MIDI/MP3 files.
yt_video_dir = "./yt_dir"
outputs_dir = "./midi_wav_outputs"
os.makedirs(outputs_dir, exist_ok=True)
os.makedirs(yt_video_dir, exist_ok=True)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The model and its processor are loaded once at import time and shared by every request.
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")

# Composer (arranger) names offered in the UI. Materialized as a list because the
# dropdown expects a plain list of choices rather than a live dict view.
composers = list(model.generation_config.composer_to_feature_token.keys())
|
|
|
|
|
def get_audio_from_yt_video(yt_link: str):
    """Download the audio track of a YouTube video as an MP3 file.

    Args:
        yt_link: URL of the video, as typed into the UI textbox.

    Returns:
        A 2-tuple holding the downloaded file's path twice, one value for each
        Gradio output component wired to this callback.

    Raises:
        gr.Error: If the link is empty or yt-dlp fails to download it.
    """
    yt_link = (yt_link or "").strip()
    if not yt_link:
        # Fail early with a readable message instead of an opaque yt-dlp error.
        raise gr.Error("Please enter a YouTube link first.")

    # 8 random bytes, hex-encoded, make name clashes between downloads unlikely.
    filename = binascii.hexlify(os.urandom(8)).decode() + ".mp3"
    filename = os.path.join(yt_video_dir, filename)

    # Translate the yt-dlp command-line flags into YoutubeDL options.
    yt_opt = cli_to_api(
        [
            "--extract-audio",
            "--audio-format",
            "mp3",
            "--restrict-filenames",
            "-o",
            filename,
        ]
    )
    try:
        with yt_dlp.YoutubeDL(yt_opt) as ydl:
            ydl.download([yt_link])
    except yt_dlp.utils.DownloadError as err:
        # gr.Error messages are shown to the user in the Gradio UI.
        raise gr.Error(
            "Could not download audio from the given YouTube link. "
            "Please check the URL and try again."
        ) from err

    return filename, filename
|
|
|
|
|
def inference(file_uploaded, composer):
    """Generate a piano cover for an audio file and render the output files.

    Args:
        file_uploaded: Path of the input audio file (the UI's audio component
            is configured with ``type="filepath"``).
        composer: Name of the composer/arranger style passed to ``model.generate``.

    Returns:
        The 5-tuple produced by ``prepare_output_file``.

    Raises:
        gr.Error: If no audio file has been provided.
    """
    if not file_uploaded:
        # Without this guard a missing upload reaches librosa as None and
        # fails with an error that means nothing to the user.
        raise gr.Error("Please upload an audio file or download one from a YouTube link first.")

    # sr=None asks librosa to keep the file's own sampling rate.
    pop_y, sr = librosa.load(file_uploaded, sr=None)

    inputs = processor(audio=pop_y, sampling_rate=sr, return_tensors="pt").to(device)
    model_output = model.generate(input_features=inputs["input_features"], composer=composer)

    # Decoding is done on CPU copies of the generated tokens and the extracted features.
    tokenizer_output = processor.batch_decode(
        token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu")
    )["pretty_midi_objects"]

    return prepare_output_file(tokenizer_output, sr, pop_y)
|
|
|
|
|
def prepare_output_file(tokenizer_output: "list[pretty_midi.PrettyMIDI]", sr: int, pop_y: np.ndarray):
    """Write the generated MIDI, its synthesized audio, and a mix with the input.

    Only the first element of ``tokenizer_output`` is used.

    Args:
        tokenizer_output: Sequence of decoded MIDI objects; element 0 is rendered.
        sr: Sampling rate used both for synthesis and for the written MP3 files.
        pop_y: Samples of the original audio that the cover was generated from.

    Returns:
        ``(midi_mp3, midi_mp3, midi_file, mix_mp3, mix_mp3)`` as file paths. The
        duplicated entries feed the paired player/download components in the UI.
    """
    # Derive all three output paths from one random stem rather than from
    # str.replace(".mid", ...), which would also rewrite any ".mid" that
    # happened to appear elsewhere in the path.
    output_stem = os.path.join(outputs_dir, "p2p_" + binascii.hexlify(os.urandom(8)).decode())
    midi_output = output_stem + ".mid"
    midi_y_path = output_stem + ".mp3"
    stereo_path = output_stem + ".mix.mp3"

    # Save the MIDI file and its synthesized audio.
    piano_midi = tokenizer_output[0]
    piano_midi.write(midi_output)
    midi_y: np.ndarray = piano_midi.fluidsynth(sr)
    mp3_write(midi_y_path, sr, normalize(midi_y), normalized=True)

    # Zero-pad whichever signal is shorter so both have the same length.
    length_gap = len(pop_y) - len(midi_y)
    if length_gap > 0:
        midi_y = np.pad(midi_y, (0, length_gap))
    elif length_gap < 0:
        pop_y = np.pad(pop_y, (0, -length_gap))

    # Two channels: the piano rendering, and the original audio at half amplitude.
    stereo = np.stack((midi_y, pop_y * 0.5))

    # Transposed before writing, giving one row per sample and one column per channel.
    mp3_write(stereo_path, sr, normalize(stereo.T), normalized=True)

    return midi_y_path, midi_y_path, midi_output, stereo_path, stereo_path
|
|
|
|
|
# Gradio UI: audio input (upload or YouTube), arranger selection, outputs, and examples.
block = gr.Blocks()

# NOTE(review): the container nesting below follows the conventional
# Group > Column > Tabs reading of this layout — confirm against the rendered UI.
with block:
    # Page header.
    gr.HTML(
        """
        <div style="text-align: center; max-width: 400px; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <h1 style="font-weight: 900; margin-bottom: 7px;">
              Pop2piano
            </h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%">
            A demo for Pop2Piano:Pop Audio-based Piano Cover Generation.<br>
            Please select the composer(Arranger) and upload the pop audio or enter the YouTube link and then click Generate.
          </p>
        </div>
        """
    )

    # Inputs: an audio source (direct upload or YouTube download) and the arranger.
    with gr.Group():
        with gr.Column():
            with gr.Blocks() as audio_select:
                with gr.Tab("Upload Audio"):
                    file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path = gr.Audio(
                        label="Audio Extracted from the YouTube Video", interactive=False
                    )
                    # The downloaded file is previewed here and also placed into the
                    # upload component, so Generate uses it as the input.
                    yt_btn.click(
                        get_audio_from_yt_video,
                        inputs=[yt_link],
                        outputs=[yt_audio_path, file_uploaded],
                    )
        with gr.Column():
            composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
            generate_btn = gr.Button("Generate")

    # Outputs: players on the first row, downloadable files on the second.
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Listen to the generated MIDI. </h3> </div>
            """
        )
        with gr.Row(equal_height=True):
            stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
            wav_output1 = gr.Audio(label="Listen to the Generated MIDI")

        with gr.Row():
            stereo_mix2 = gr.File(label="Download the Stereo Mix (.mp3)")
            wav_output2 = gr.File(label="Download the Generated MIDI (.mp3)")
            midi_output = gr.File(label="Download the Generated MIDI (.mid)")
        # The output order must match the tuple returned by `inference`.
        generate_btn.click(
            inference,
            inputs=[file_uploaded, composer],
            outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
        )

    # Example input, wired to the same function and components as the Generate button.
    with gr.Group():
        gr.Examples(
            [
                ["./examples/custom_song.mp3", "composer1"],
            ],
            fn=inference,
            inputs=[file_uploaded, composer],
            outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
            cache_examples=True,
        )

    # Footer links.
    gr.HTML(
        """
        <div class="footer">
            <center><p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a>
            <center><a href="https://huggingface.co/docs/transformers/main/model_doc/pop2piano" style="text-decoration: underline;" target="_blank">HuggingFace Model Docs</a>
            <center><a href="https://github.com/sweetcocoa/pop2piano" style="text-decoration: underline;" target="_blank">Github</a>
            </p>
        </div>
        """
    )

block.launch(debug=False)
|
|