import gradio as gr
import numpy as np
import soundfile as sf
import noisereduce as nr
from audio_separator.separator import Separator
import os
import tempfile
import logging
import time

# --- Configuration ---
OUTPUT_DIR = "output_audio" # Directory to store final outputs temporarily before Gradio handles them
TEMP_SEP_DIR_PREFIX = "sep_temp_" # Prefix for temporary directories used by audio-separator
# Choose a UVR5 model supported by audio-separator. 
# Examples: 'UVR-MDX-NET Voc FT', 'UVR_MDXNET_KARA_2' , 'UVR-MDX-NET-Inst_1', etc.
# Check audio-separator documentation or repo for available models.
# 'UVR-MDX-NET Voc FT' is often a good starting point for vocals.
MODEL_NAME = "UVR-MDX-NET Voc FT" 
# You can also specify a local model file path:
# MODEL_NAME = "/path/to/your/local/model.onnx" 

# --- Setup ---
# Configure logging for audio-separator and this script
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize the Separator class (this might download the model on first run)
logger.info(f"Initializing audio separator with model: {MODEL_NAME}...")
try:
    # Initialize the Separator without binding it to a model yet; the model name is
    # passed to separate() further down (see also the hedged load_model sketch below).
    # Hardware acceleration (CUDA, CoreML) is typically auto-detected when the matching
    # onnxruntime / PyTorch build is installed, so no device argument is passed here.
    separator = Separator(log_level=logging.INFO)
    logger.info("Separator initialized successfully.")
except Exception as e:
    logger.error(f"Failed to initialize audio separator: {e}", exc_info=True)
    # Optionally, raise the exception or handle it to prevent app launch
    raise RuntimeError(f"Failed to initialize audio separator: {e}")


# --- Core Processing Function ---
def enhance_vocal(input_audio_path):
    """
    Separates vocals using UVR5 (via audio-separator), enhances them using noisereduce,
    and returns the path to the enhanced vocal audio file.
    """
    if input_audio_path is None:
        logger.warning("No input audio file provided.")
        return None, "Error: No input audio file provided. Please upload an audio file."

    logger.info(f"Processing audio file: {input_audio_path}")
    processing_start_time = time.time()

    try:
        # --- Step 1: Vocal Separation using audio-separator (UVR5) ---
        logger.info(f"Starting vocal separation using model: {MODEL_NAME}...") # Log model name here
        separation_start_time = time.time()

        with tempfile.TemporaryDirectory(prefix=TEMP_SEP_DIR_PREFIX) as temp_sep_dir:
            logger.info(f"Using temporary directory for separation: {temp_sep_dir}")

            try:
                # Perform the separation, passing the model name and output directory to
                # separate(). This call signature is version-dependent; check the
                # audio-separator documentation for your installed version if it fails
                # (a load_model-based alternative is sketched near the initialization above).
                output_paths = separator.separate(
                    input_audio_path,
                    output_dir=temp_sep_dir,
                    model_name=MODEL_NAME,
                    # Other arguments (e.g. output_format='wav') may be needed depending on the version.
                )

                separation_duration = time.time() - separation_start_time
                logger.info(f"Separation completed in {separation_duration:.2f} seconds.")
                logger.info(f"Separation output files: {output_paths}")

                # Find the vocal (and optionally the instrumental) track among the outputs.
                # audio-separator names stems along the lines of
                # "<input> (Vocals)_<model>.wav" / "<input> (Instrumental)_<model>.wav",
                # so identify them by substring rather than relying on an exact suffix.
                vocal_track_path = None
                instrumental_track_path = None  # Kept in case it is wanted as a noise profile later

                for output_path in output_paths:
                    filename = os.path.basename(output_path).lower()
                    # "no_vocals" is an instrumental naming variant, so exclude it from the vocal match.
                    if "vocals" in filename and "no_vocals" not in filename:
                        vocal_track_path = output_path
                        logger.info(f"Found vocal track: {vocal_track_path}")
                    elif "instrumental" in filename or "no_vocals" in filename:
                        instrumental_track_path = output_path
                        logger.info(f"Found instrumental track: {instrumental_track_path}")


                if vocal_track_path is None:
                    logger.error(f"Could not find the vocal track in separation results: {output_paths}")
                    return None, "Error: Vocal separation failed to produce a recognizable vocal track."
                
                if not os.path.exists(vocal_track_path):
                    logger.error(f"Identified vocal track path does not exist: {vocal_track_path}")
                    return None, "Error: Separated vocal track file is missing."

            except Exception as sep_exc:
                logger.error(f"Error during audio separation process: {sep_exc}", exc_info=True)
                return None, f"Error during vocal separation: {str(sep_exc)}"

            # --- Step 2: Load the Separated Vocal Track ---
            logger.info(f"Loading separated vocal track: {vocal_track_path}")
            try:
                vocal_data, sr = sf.read(vocal_track_path, dtype='float32')
                logger.info(f"Vocal track loaded successfully. Sample Rate: {sr}, Duration: {len(vocal_data)/sr:.2f}s")
                # Ensure mono for noisereduce if it's stereo (take average or first channel)
                if vocal_data.ndim > 1 and vocal_data.shape[1] > 1:
                    logger.info("Vocal track is stereo, converting to mono for noise reduction.")
                    vocal_data = np.mean(vocal_data, axis=1)  # Average the channels

            except Exception as read_exc:
                logger.error(f"Error reading separated vocal file {vocal_track_path}: {read_exc}", exc_info=True)
                return None, f"Error reading separated vocal file: {str(read_exc)}"


            # --- Step 3: Noise Reduction using noisereduce ---
            logger.info("Applying noise reduction to the vocal track...")
            nr_start_time = time.time()
            
            try:
                # Simple approach: let noisereduce estimate noise from the vocal track itself
                # Parameters can be tuned (e.g., n_fft, hop_length, prop_decrease)
                # Look at noisereduce documentation for advanced usage.
                reduced_noise_vocals = nr.reduce_noise(y=vocal_data, 
                                                       sr=sr,
                                                       prop_decrease=1.0, # Aggressiveness of reduction
                                                       stationary=False, # Good for non-stationary background noise often present in music
                                                       n_jobs=-1) # Use all available CPU cores
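                # A gentler, hedged alternative if the defaults sound too aggressive
                # (artifacts / "underwater" vocals): prop_decrease < 1.0 keeps some of the
                # original signal and stationary=True suits steady hiss. These values are
                # starting points, not tuned recommendations:
                # reduced_noise_vocals = nr.reduce_noise(
                #     y=vocal_data, sr=sr, prop_decrease=0.8, stationary=True, n_jobs=-1
                # )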
                
                # # --- Alternative (more complex) Noise Reduction Strategy ---
                # # If you trust the separation completely, you could potentially use the
                # # instrumental track as a noise profile. This is EXPERIMENTAL and might
                # # remove desirable vocal characteristics if separation isn't perfect.
                # if instrumental_track_path and os.path.exists(instrumental_track_path):
                #     try:
                #         logger.info(f"Loading instrumental track for noise profile: {instrumental_track_path}")
                #         instrumental_data, inst_sr = sf.read(instrumental_track_path, dtype='float32')
                #         if inst_sr != sr:
                #             logger.warning(f"Sample rate mismatch! Vocal SR={sr}, Instrumental SR={inst_sr}. Resampling needed or risk error.")
                #             # Add resampling code here if necessary (e.g., using librosa)
                #         else:
                #              if instrumental_data.ndim > 1 and instrumental_data.shape[1] > 1:
                #                  instrumental_data = np.mean(instrumental_data, axis=1) # Mono
                #              if len(instrumental_data) > len(vocal_data): # Ensure same length (trim longer)
                #                  instrumental_data = instrumental_data[:len(vocal_data)]
                #              elif len(vocal_data) > len(instrumental_data): # Pad shorter (less ideal)
                #                   instrumental_data = np.pad(instrumental_data, (0, len(vocal_data) - len(instrumental_data)))

                #              logger.info("Using instrumental track as noise profile for reduction.")
                #              reduced_noise_vocals = nr.reduce_noise(y=vocal_data,
                #                                                     sr=sr,
                #                                                     y_noise=instrumental_data,
                #                                                     prop_decrease=1.0,
                #                                                     stationary=False,
                #                                                     n_jobs=-1)

                #     except Exception as noise_profile_exc:
                #         logger.error(f"Failed to use instrumental track as noise profile: {noise_profile_exc}. Falling back to standard reduction.", exc_info=True)
                #         # Fallback to standard reduction if using instrumental fails
                #         reduced_noise_vocals = nr.reduce_noise(y=vocal_data, sr=sr, prop_decrease=1.0, stationary=False, n_jobs=-1)
                # # --- End Alternative Strategy ---

                nr_duration = time.time() - nr_start_time
                logger.info(f"Noise reduction completed in {nr_duration:.2f} seconds.")
            except Exception as nr_exc:
                logger.error(f"Error during noise reduction: {nr_exc}", exc_info=True)
                return None, f"Error during noise reduction: {str(nr_exc)}"


            # --- Step 4: Save the Enhanced Vocal Track ---
            logger.info("Saving enhanced vocal track...")
            try:
                # Use NamedTemporaryFile to create a file that Gradio can access
                # Ensure it has a .wav extension and delete=False so it persists
                # after the 'with' block until Gradio handles it.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir=OUTPUT_DIR) as temp_out_file:
                    enhanced_vocal_path = temp_out_file.name
                
                sf.write(enhanced_vocal_path, reduced_noise_vocals, sr, subtype='PCM_16') # Save as 16-bit WAV
                logger.info(f"Enhanced vocal track saved to: {enhanced_vocal_path}")
                
                processing_duration = time.time() - processing_start_time
                logger.info(f"Total processing time: {processing_duration:.2f} seconds.")

                # Return the path to the enhanced audio file
                return enhanced_vocal_path, f"Processing successful! Total time: {processing_duration:.2f}s" # Return success message

            except Exception as write_exc:
                logger.error(f"Error saving enhanced vocal file: {write_exc}", exc_info=True)
                return None, f"Error saving enhanced vocal file: {str(write_exc)}"

        # The temporary directory 'temp_sep_dir' is automatically cleaned up here

    except Exception as e:
        processing_duration = time.time() - processing_start_time
        logger.error(f"An unexpected error occurred during processing: {e}", exc_info=True)
        return None, f"An unexpected error occurred after {processing_duration:.2f}s: {str(e)}"


# --- Gradio Interface Definition ---

# Custom CSS for better layout (Optional)
css = """
#status_textbox textarea {
    font-style: italic;
    color: grey;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(
        """
        # Vocal Enhancement App (UVR5 + NoiseReduce) 🎤✨

        Upload an audio file (e.g., MP3, WAV, FLAC) containing vocals and background music/noise.
        The app will:
        1.  Use a **UVR5 (MDX-Net)** model (`""" + MODEL_NAME + """`) via `audio-separator` to isolate the vocals.
        2.  Apply **noise reduction** using the `noisereduce` library to the isolated vocals.
        3.  Return the enhanced vocal track.

        **Note:** Processing, especially the vocal separation step, can take some time depending on the audio length and your computer's performance (GPU acceleration helps significantly if configured). The first run might take longer as the separation model needs to be downloaded.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(type="filepath", label="Input Audio File")
            submit_button = gr.Button("Enhance Vocals", variant="primary")
        with gr.Column(scale=1):
            audio_output = gr.Audio(type="filepath", label="Enhanced Vocals")
            status_output = gr.Textbox(label="Status", elem_id="status_textbox", interactive=False)

    submit_button.click(
        fn=enhance_vocal,
        inputs=audio_input,
        outputs=[audio_output, status_output],
        api_name="enhance_vocals" # For API usage if needed
    )

    gr.Examples(
        examples=[
            # Add paths to local example audio files if you have them
            # os.path.join(os.path.dirname(__file__), "audio_example_1.mp3"),
            # os.path.join(os.path.dirname(__file__), "audio/example_2.wav"),
        ],
        inputs=audio_input,
        outputs=[audio_output, status_output],
        fn=enhance_vocal,
        cache_examples=False, # Set to True if examples are static and processing is slow
    )

# --- Launch the App ---

logger.info("Starting Gradio application...")
# share=True would create a public link (use with caution).
demo.launch(
    server_name="0.0.0.0",  # Allow access from other devices on the network
    server_port=7860,       # Standard Gradio port
)
logger.info("Gradio application stopped.")