"""Vocal-enhancement Gradio app: UVR5 separation + noisereduce denoising."""

import logging
import os
import tempfile
import time

import gradio as gr
import noisereduce as nr
import numpy as np
import soundfile as sf
from audio_separator.separator import Separator

# --- Configuration ---
OUTPUT_DIR = "output_audio"  # Final outputs live here until Gradio serves them
TEMP_SEP_DIR_PREFIX = "sep_temp_"  # Prefix for audio-separator scratch dirs

# UVR5 model used for separation. Check the audio-separator docs/repo for the
# available models (e.g. 'UVR-MDX-NET Voc FT', 'UVR_MDXNET_KARA_2',
# 'UVR-MDX-NET-Inst_1'). A local model file path can also be used.
MODEL_NAME = "UVR-MDX-NET Voc FT"

# --- Setup ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

os.makedirs(OUTPUT_DIR, exist_ok=True)

logger.info("Initializing audio separator with model: %s...", MODEL_NAME)
try:
    # NOTE(review): the model is passed per-call to separate() below; newer
    # audio-separator releases instead require separator.load_model(...) at
    # startup — confirm against the installed version's API.
    separator = Separator(log_level='INFO')
    logger.info("Separator initialized successfully.")
except Exception as e:
    logger.error("Failed to initialize audio separator: %s", e, exc_info=True)
    raise RuntimeError(f"Failed to initialize audio separator: {e}")


def _classify_separation_outputs(output_paths):
    """Return (vocal_path, instrumental_path) from separation output paths.

    Either element may be None if no matching file is found. Instrumental
    naming is tested FIRST because 'No_Vocals' contains the substring
    'vocals' and would otherwise be misclassified as the vocal stem.
    """
    vocal_path = None
    instrumental_path = None
    for path in output_paths:
        name = os.path.basename(path).lower()
        if "instrumental" in name or "no_vocals" in name:
            if instrumental_path is None:
                instrumental_path = path
        elif "vocals" in name:
            if vocal_path is None:
                vocal_path = path
    return vocal_path, instrumental_path


def _to_mono(audio):
    """Downmix an (n_samples, n_channels) array to mono by channel average."""
    if audio.ndim > 1 and audio.shape[1] > 1:
        return np.mean(audio, axis=1)
    return audio


def enhance_vocal(input_audio_path):
    """Separate vocals with UVR5 (via audio-separator) and denoise them.

    Args:
        input_audio_path: Path to the uploaded audio file, or None.

    Returns:
        (enhanced_vocal_path, status_message) on success;
        (None, error_message) on any failure.
    """
    if input_audio_path is None:
        logger.warning("No input audio file provided.")
        return None, "Error: No input audio file provided. Please upload an audio file."

    logger.info("Processing audio file: %s", input_audio_path)
    processing_start_time = time.time()

    try:
        # --- Step 1: Vocal separation using audio-separator (UVR5) ---
        logger.info("Starting vocal separation using model: %s...", MODEL_NAME)
        separation_start_time = time.time()

        with tempfile.TemporaryDirectory(prefix=TEMP_SEP_DIR_PREFIX) as temp_sep_dir:
            logger.info("Using temporary directory for separation: %s", temp_sep_dir)
            try:
                # NOTE(review): this keyword-based call matches one
                # audio-separator release; other versions expect
                # load_model(...) followed by separate(path). Check the
                # installed version if this exact call fails.
                output_paths = separator.separate(
                    input_audio_path,
                    output_dir=temp_sep_dir,
                    model_name=MODEL_NAME,
                )
                logger.info("Separation completed in %.2f seconds.",
                            time.time() - separation_start_time)
                logger.info("Separation output files: %s", output_paths)

                # Some versions return paths relative to output_dir;
                # normalize so the existence check below is meaningful.
                output_paths = [
                    p if os.path.isabs(p) else os.path.join(temp_sep_dir, p)
                    for p in output_paths
                ]

                vocal_track_path, instrumental_track_path = \
                    _classify_separation_outputs(output_paths)

                if vocal_track_path is None:
                    logger.error("Could not find the vocal track in separation results: %s",
                                 output_paths)
                    return None, "Error: Vocal separation failed to produce a recognizable vocal track."
                logger.info("Found vocal track: %s", vocal_track_path)
                if instrumental_track_path is not None:
                    logger.info("Found instrumental track: %s", instrumental_track_path)

                if not os.path.exists(vocal_track_path):
                    logger.error("Identified vocal track path does not exist: %s",
                                 vocal_track_path)
                    return None, "Error: Separated vocal track file is missing."
            except Exception as sep_exc:
                logger.error("Error during audio separation process: %s", sep_exc, exc_info=True)
                return None, f"Error during vocal separation: {str(sep_exc)}"

            # --- Step 2: Load the separated vocal track ---
            logger.info("Loading separated vocal track: %s", vocal_track_path)
            try:
                vocal_data, sr = sf.read(vocal_track_path, dtype='float32')
                logger.info("Vocal track loaded successfully. Sample Rate: %s, Duration: %.2fs",
                            sr, len(vocal_data) / sr)
                # noisereduce is applied on a mono signal here.
                vocal_data = _to_mono(vocal_data)
            except Exception as read_exc:
                logger.error("Error reading separated vocal file %s: %s",
                             vocal_track_path, read_exc, exc_info=True)
                return None, f"Error reading separated vocal file: {str(read_exc)}"

            # --- Step 3: Noise reduction using noisereduce ---
            logger.info("Applying noise reduction to the vocal track...")
            nr_start_time = time.time()
            try:
                # noisereduce estimates the noise profile from the signal
                # itself; stationary=False copes better with the
                # non-stationary background bleed typical of music.
                # (An experimental alternative is to pass the instrumental
                # stem as y_noise — risky if separation is imperfect.)
                reduced_noise_vocals = nr.reduce_noise(
                    y=vocal_data,
                    sr=sr,
                    prop_decrease=1.0,  # full-strength reduction
                    stationary=False,
                    n_jobs=-1,  # use all available CPU cores
                )
                logger.info("Noise reduction completed in %.2f seconds.",
                            time.time() - nr_start_time)
            except Exception as nr_exc:
                logger.error("Error during noise reduction: %s", nr_exc, exc_info=True)
                return None, f"Error during noise reduction: {str(nr_exc)}"

            # --- Step 4: Save the enhanced vocal track ---
            logger.info("Saving enhanced vocal track...")
            try:
                # delete=False so the file outlives this function for Gradio
                # to serve. Write AFTER the handle is closed so the path is
                # writable on platforms with exclusive open (e.g. Windows).
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False,
                                                 dir=OUTPUT_DIR) as temp_out_file:
                    enhanced_vocal_path = temp_out_file.name
                sf.write(enhanced_vocal_path, reduced_noise_vocals, sr,
                         subtype='PCM_16')  # 16-bit WAV
                logger.info("Enhanced vocal track saved to: %s", enhanced_vocal_path)

                processing_duration = time.time() - processing_start_time
                logger.info("Total processing time: %.2f seconds.", processing_duration)
                return enhanced_vocal_path, f"Processing successful! Total time: {processing_duration:.2f}s"
            except Exception as write_exc:
                logger.error("Error saving enhanced vocal file: %s", write_exc, exc_info=True)
                return None, f"Error saving enhanced vocal file: {str(write_exc)}"
        # temp_sep_dir is cleaned up automatically when the 'with' exits.
    except Exception as e:
        processing_duration = time.time() - processing_start_time
        logger.error("An unexpected error occurred during processing: %s", e, exc_info=True)
        return None, f"An unexpected error occurred after {processing_duration:.2f}s: {str(e)}"


# --- Gradio Interface Definition ---
# Custom CSS for the status box (optional).
css = """
#status_textbox textarea { font-style: italic; color: grey; }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(
        """
        # Vocal Enhancement App (UVR5 + NoiseReduce) 🎤✨

        Upload an audio file (e.g., MP3, WAV, FLAC) containing vocals and background music/noise.
        The app will:
        1. Use a **UVR5 (MDX-Net)** model (`""" + MODEL_NAME + """`) via `audio-separator` to isolate the vocals.
        2. Apply **noise reduction** using the `noisereduce` library to the isolated vocals.
        3. Return the enhanced vocal track.

        **Note:** Processing, especially the vocal separation step, can take some time depending on
        the audio length and your computer's performance (GPU acceleration helps significantly if configured).
        The first run might take longer as the separation model needs to be downloaded.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(type="filepath", label="Input Audio File")
            submit_button = gr.Button("Enhance Vocals", variant="primary")
        with gr.Column(scale=1):
            audio_output = gr.Audio(type="filepath", label="Enhanced Vocals")
            status_output = gr.Textbox(label="Status", elem_id="status_textbox",
                                       interactive=False)

    submit_button.click(
        fn=enhance_vocal,
        inputs=audio_input,
        outputs=[audio_output, status_output],
        api_name="enhance_vocals",  # For API usage if needed
    )

    gr.Examples(
        examples=[
            # Add paths to local example audio files if you have them, e.g.:
            # os.path.join(os.path.dirname(__file__), "audio_example_1.mp3"),
        ],
        inputs=audio_input,
        outputs=[audio_output, status_output],
        fn=enhance_vocal,
        cache_examples=False,  # Set True only for static examples + slow processing
    )

# --- Launch the App ---
logger.info("Starting Gradio application...")
# share=True would create a public link (use with caution).
demo.launch(server_name="0.0.0.0",  # Allow access from other devices on the network
            server_port=7860)  # Standard Gradio port
logger.info("Gradio application stopped.")