File size: 7,619 Bytes
1a572e4
 
 
db4880c
 
 
1a572e4
db4880c
7a3b53b
db4880c
 
 
1a572e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db4880c
 
1a572e4
db4880c
1a572e4
db4880c
 
1a572e4
 
 
db4880c
 
1a572e4
 
db4880c
 
 
 
 
1a572e4
 
db4880c
1a572e4
 
 
db4880c
 
1a572e4
db4880c
 
 
 
1a572e4
 
db4880c
1a572e4
 
 
 
 
 
 
 
 
db4880c
 
 
 
 
 
 
 
 
1a572e4
 
 
 
 
 
7a3b53b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a572e4
 
7a3b53b
 
 
 
 
1a572e4
 
 
 
 
db4880c
 
 
1a572e4
 
db4880c
 
 
 
 
 
 
 
 
1a572e4
 
 
 
 
 
 
 
 
db4880c
 
 
 
 
 
1a572e4
 
 
 
 
 
db4880c
 
 
 
 
 
 
 
 
 
1a572e4
 
 
 
db4880c
 
 
 
 
 
 
1a572e4
db4880c
 
 
 
e94f209
1a572e4
 
db4880c
e94f209
7a3b53b
 
 
1a572e4
 
 
 
db4880c
1a572e4
 
 
 
 
 
7a3b53b
 
 
 
 
db4880c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import os
import binascii
import warnings

import torch
import librosa
import numpy as np
import pytube as pt  # to download the youtube videos as audios
import gradio as gr
import soundfile as sf  # to make the stereo mix

from pytube.exceptions import VideoUnavailable
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor


# Working directories: one for audio downloaded from YouTube, one for the
# generated MIDI / WAV files. Both are created at import time.
yt_video_dir = "./yt_dir"
outputs_dir = "./midi_wav_outputs"
os.makedirs(outputs_dir, exist_ok=True)
os.makedirs(yt_video_dir, exist_ok=True)

# Load the Pop2Piano checkpoint once and move the model to the GPU when one is
# available; the processor stays on the CPU side.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
# Materialize the composer names as a real list: a dict view is not a list,
# and the dropdown that consumes these names expects a list of choices.
composers = list(model.generation_config.composer_to_feature_token.keys())


def get_audio_from_yt_video(yt_link):
    """Download the audio track of a YouTube video into ``yt_video_dir``.

    Args:
        yt_link: URL of the YouTube video.

    Returns:
        A 2-tuple containing the same value twice (one entry per output
        component wired to this callback): the path of the downloaded file,
        or ``None`` when nothing could be downloaded.
    """
    # A blank link cannot identify a video; handle it in the same
    # warn-and-return-None style used for unavailable videos below.
    if not yt_link or not yt_link.strip():
        warnings.warn("No YouTube link was provided")
        return None, None

    # Random hex name so that two downloads never share a file name.
    filename = os.path.join(yt_video_dir, binascii.hexlify(os.urandom(8)).decode() + ".mp4")
    try:
        # `first()` yields None instead of raising when no audio-only stream exists.
        audio_stream = pt.YouTube(yt_link).streams.filter(only_audio=True).first()
        if audio_stream is None:
            warnings.warn(f"No audio stream found at {yt_link}")
            filename = None
        else:
            audio_stream.download(filename=filename)
    except VideoUnavailable as e:
        warnings.warn(f"Video Not Found at {yt_link} ({e})")
        filename = None

    return filename, filename


def inference(file_uploaded, composer):
    """Generate a piano cover of an audio file with the Pop2Piano model.

    Args:
        file_uploaded: Path of the audio file to convert.
        composer: Composer (arranger) name forwarded to ``model.generate``.

    Returns:
        The ``(wav_path, wav_path, midi_path)`` tuple produced by
        ``prepare_output_file``.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Without this guard a missing upload surfaces as an opaque loader error.
    if not file_uploaded:
        raise gr.Error("Please upload an audio file or download one from a YouTube link first.")

    # to save the native sampling rate of the file, sr=None is used, but this can cause some silent errors where the
    # generated output will not be upto the desired quality. If that happens please consider switching sr to 44100 Hz.
    waveform, sr = librosa.load(file_uploaded, sr=None)

    inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)
    model_output = model.generate(input_features=inputs["input_features"], composer=composer)
    # Decoding is done with both the token ids and the extracted features moved to the CPU.
    tokenizer_output = processor.batch_decode(
        token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu")
    )["pretty_midi_objects"]

    return prepare_output_file(tokenizer_output, sr)


def prepare_output_file(tokenizer_output, sr: int):
    """Write the first generated MIDI object to disk as ``.mid`` and ``.wav``.

    Args:
        tokenizer_output: Sequence whose first element exposes ``write(path)``
            and ``fluidsynth(sr)`` (the decoded MIDI object).
        sr: Sampling rate passed to ``fluidsynth`` and used for the WAV file.

    Returns:
        ``(wav_path, wav_path, midi_path)``; the WAV path appears twice, one
        entry per output component that displays it.
    """
    # Add some random values so that no two file names are same
    output_stem = os.path.join(outputs_dir, "output_" + binascii.hexlify(os.urandom(8)).decode())
    # Build both paths from the shared stem; a str.replace(".mid", ".wav") on
    # the full path would rewrite every ".mid" occurrence, not just the suffix.
    midi_output = output_stem + ".mid"
    wav_output = output_stem + ".wav"

    # write the .mid and its wav files
    midi_object = tokenizer_output[0]
    midi_object.write(midi_output)
    midi_wav: np.ndarray = midi_object.fluidsynth(sr)
    sf.write(wav_output, midi_wav, samplerate=sr)

    return wav_output, wav_output, midi_output


def get_stereo(pop_path, midi, pop_scale=0.5):
    """Combine the piano rendering and the pop audio into a two-channel WAV.

    The piano audio is written to the first channel and the pop audio,
    multiplied by ``pop_scale``, to the second channel.

    Args:
        pop_path: Path of the original pop audio file.
        midi: The rendered piano audio, either an object exposing its path as
            ``.name`` or a plain path string.
        pop_scale: Factor applied to the pop channel.

    Returns:
        The path of the written stereo mix, twice (one entry per output
        component wired to this callback).

    Raises:
        gr.Error: If the pop audio or the piano rendering is missing.
    """
    if not pop_path or midi is None:
        raise gr.Error(
            "Please provide the pop audio and generate the piano cover before requesting the stereo mix."
        )

    midi_path = midi if isinstance(midi, str) else midi.name

    pop_y, sr = librosa.load(pop_path, sr=None)
    # Load the piano track at the pop sampling rate so that both channels
    # share the single rate the mix is written with.
    midi_y, _ = librosa.load(midi_path, sr=sr)

    # Zero-pad the shorter signal at its end so both have the same length.
    target_len = max(len(pop_y), len(midi_y))
    midi_y = np.pad(midi_y, (0, target_len - len(midi_y)))
    pop_y = np.pad(pop_y, (0, target_len - len(pop_y)))
    stereo = np.stack((midi_y, pop_y * pop_scale))

    # Always write to a fresh file in the outputs directory. Deriving the name
    # via pop_path.replace("output", ...) is a no-op when the path has no
    # "output" substring, which made the mix overwrite the input audio file.
    stereo_mix_path = os.path.join(
        outputs_dir, "output_stereo_mix_" + binascii.hexlify(os.urandom(8)).decode() + ".wav"
    )
    sf.write(
        file=stereo_mix_path,
        data=stereo.T,  # (samples, channels) layout
        samplerate=sr,
        format="wav",
    )

    return stereo_mix_path, stereo_mix_path


# Thanks a lot to "https://huggingface.co/Taithrah" for this theme.
# taken from https://huggingface.co/spaces/NoCrypt/miku
block = gr.Blocks(theme="Taithrah/Minimal")

with block:
    # Page header with the title and short usage instructions.
    gr.HTML(
        """
            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex;
                  align-items: center;
                  gap: 0.8rem;
                  font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px;">
                  Pop2piano
                </h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                A demo for Pop2Piano:Pop Audio-based Piano Cover Generation.<br>
                Please select the composer(Arranger) and upload the pop audio or enter the YouTube link and then click Generate.
              </p>
            </div>
        """
    )

    # Input section: upload an audio file directly, or fetch one from YouTube.
    with gr.Group():
        with gr.Row(equal_height=True):
            with gr.Column():
                file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
            with gr.Column():
                with gr.Row():
                    yt_link = gr.Textbox(
                        label="Enter YouTube Link of the Video", autofocus=True, lines=3
                    )
                    yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")

                yt_audio_path = gr.Audio(
                    label="Audio Extracted from the YouTube Video", interactive=False
                )
                # The downloaded file fills both the preview player and the upload slot.
                yt_btn.click(
                    get_audio_from_yt_video,
                    inputs=[yt_link],
                    outputs=[yt_audio_path, file_uploaded],
                )

    # Generation section: pick the arranger and run the model.
    with gr.Group():
        with gr.Column():
            # list(...) so the dropdown receives a list even when `composers` is a dict view.
            composer = gr.Dropdown(label="Arranger", choices=list(composers), value="composer1")
            generate_btn = gr.Button("Generate")

        # Same constructor keyword as the input row above, instead of the `.style()` call.
        with gr.Row(equal_height=True):
            wav_output2 = gr.File(label="Download the Generated MIDI (.wav)")
            wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
            midi_output = gr.File(label="Download the Generated MIDI (.mid)")
            generate_btn.click(
                inference,
                inputs=[file_uploaded, composer],
                outputs=[wav_output1, wav_output2, midi_output],
            )

    # Stereo-mix section: combine the original audio with the generated piano audio.
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Get the Stereo Mix from the Pop Music and Generated MIDI </h3> </div>
            """
        )
        # Bound directly to the Slider component (no accidental 1-tuple).
        pop_scale = gr.Slider(
            0,
            1,
            value=0.5,
            label="Choose the ratio between Pop and MIDI",
            info="1.0 = Only Pop, 0.0=Only MIDI",
            interactive=True,
        )
        stereo_btn = gr.Button("Get Stereo Mix")
        with gr.Row():
            stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
            stereo_mix2 = gr.File(label="Download the Stereo Mix")

        stereo_btn.click(
            get_stereo,
            inputs=[file_uploaded, wav_output2, pop_scale],
            outputs=[stereo_mix1, stereo_mix2],
        )

    # Examples and footer links.
    with gr.Group():
        gr.Examples(
            [
                ["./examples/custom_song.mp3", "composer1"],
            ],
            fn=inference,
            inputs=[file_uploaded, composer],
            outputs=[wav_output1, wav_output2, midi_output],
            cache_examples=True,
        )
        gr.HTML(
            """
        <div class="footer">
                    <center>The design for this Space is taken from <a href="https://huggingface.co/spaces/NoCrypt/miku"> NoCrypt/miku </a>
        </div>
        """
        )

        gr.HTML(
            """
        <div class="footer">
                    <center><p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a> 
                    <center><a href="https://huggingface.co/docs/transformers/main/model_doc/pop2piano" style="text-decoration: underline;" target="_blank">HuggingFace Model Docs</a>
                    <center><a href="https://github.com/sweetcocoa/pop2piano" style="text-decoration: underline;" target="_blank">Github</a>
                    </p>
        </div>
        """
        )

block.launch(debug=False)