TemplateREPO / Spectrograms.py

Upload 2 files

b2a4a92 verified 5 months ago

15.3 kB

	import io
	import tempfile
	from typing import Sequence

	import gradio as gr
	import librosa
	import librosa.display
	import matplotlib.pyplot as plt
	import numpy as np
	import pydub
	import soundfile as sf
	import torch
	import torchaudio
	from PIL import Image
	from scipy.io import wavfile


	# Step 1: Encode Audio to Mel-Spectrogram
	def encode_audio_to_mel_spectrogram(audio_file, n_mels=128):
	    """
	    Load an audio file and compute its mel-spectrogram on a decibel scale.

	    Parameters:
	    - audio_file: Path to the audio file.
	    - n_mels: Number of mel bands (default: 128).

	    Returns:
	    - A tuple ``(mel_db, native_rate)``: the mel power spectrogram converted
	      to dB relative to its own maximum (``ref=np.max``), and the sample rate
	      reported by the loader.
	    """
	    # sr=None: do not resample while loading
	    waveform, native_rate = librosa.load(audio_file, sr=None)
	    mel_power = librosa.feature.melspectrogram(y=waveform, sr=native_rate, n_mels=n_mels)
	    return librosa.power_to_db(mel_power, ref=np.max), native_rate

	# Improved Step 2: Save Mel-Spectrogram as Image
	def save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image='mel_spectrogram.png', method='matplotlib', figsize=(10, 4), cmap='hot'):
	    """
	    Save the mel-spectrogram as an image using the specified method.

	    Parameters:
	    - mel_spectrogram_db: Mel-spectrogram in dB scale.
	    - sample_rate: Sample rate of the audio file (used for the matplotlib axes).
	    - output_image: Path to save the image.
	    - method: Method for saving ('matplotlib' or 'custom').
	    - figsize: Size of the figure for matplotlib (default: (10, 4)).
	    - cmap: Colormap for the spectrogram (default: 'hot').

	    Raises:
	    - ValueError: If ``method`` is neither 'matplotlib' nor 'custom'.

	    Notes:
	    - The 'matplotlib' rendering is a full figure: it also draws time/mel
	      axes, a colorbar and a title around the spectrogram.
	    - The 'custom' rendering converts the dB values back to power and
	      delegates to ``image_from_spectrogram``.
	    """
	    if method == 'matplotlib':
	        fig = plt.figure(figsize=figsize)
	        try:
	            librosa.display.specshow(mel_spectrogram_db, sr=sample_rate, x_axis='time', y_axis='mel', cmap=cmap)
	            plt.colorbar(format='%+2.0f dB')
	            plt.title('Mel-Spectrogram')
	            plt.savefig(output_image)
	        finally:
	            # Always release the figure, even when drawing or saving fails,
	            # so repeated calls do not accumulate open figures.
	            plt.close(fig)
	        print(f"Mel-spectrogram image saved using matplotlib as '{output_image}'")

	    elif method == 'custom':
	        # Convert dB scale to linear scale for image generation
	        mel_spectrogram_linear = librosa.db_to_power(mel_spectrogram_db)
	        # Add a leading channel axis: image_from_spectrogram expects
	        # (channels, frequency, time)
	        image = image_from_spectrogram(mel_spectrogram_linear[np.newaxis, ...])
	        image.save(output_image)
	        print(f"Mel-spectrogram image saved using custom method as '{output_image}'")

	    else:
	        raise ValueError("Invalid method. Choose 'matplotlib' or 'custom'.")


	# Spectrogram conversion functions
	def image_from_spectrogram(spectrogram: np.ndarray, power: float = 0.25) -> Image.Image:
	    """
	    Compute a spectrogram image from a spectrogram magnitude array.

	    Args:
	        spectrogram: (channels, frequency, time) array of non-negative values.
	        power: A power curve to apply to the spectrogram to preserve contrast

	    Returns:
	        image: (frequency, time, channels) RGB image. Intensities are inverted
	        (larger magnitudes are darker) and the frequency axis is flipped so
	        that row 0 of the input ends up at the bottom.

	    Raises:
	        NotImplementedError: If the array has a channel count other than 1 or 2.
	    """
	    # Rescale to 0-1. An all-zero (silent) input has max 0; dividing by it
	    # would produce NaN, so treat that case as an all-zero image instead.
	    max_value = np.max(spectrogram)
	    if max_value > 0:
	        data = spectrogram / max_value
	    else:
	        data = np.zeros_like(spectrogram, dtype=np.float64)

	    # Negative entries would turn into NaN under a fractional power;
	    # clamp to the valid range (a no-op for well-formed input).
	    data = np.clip(data, 0.0, 1.0)

	    # Apply the power curve
	    data = np.power(data, power)

	    # Rescale to 0-255 and invert
	    data = 255 - (data * 255).astype(np.uint8)

	    # Convert to a PIL image
	    if data.shape[0] == 1:
	        image = Image.fromarray(data[0], mode="L").convert("RGB")
	    elif data.shape[0] == 2:
	        # Two channels go to G and B; R is left at zero
	        data = np.array([np.zeros_like(data[0]), data[0], data[1]]).transpose(1, 2, 0)
	        image = Image.fromarray(data, mode="RGB")
	    else:
	        raise NotImplementedError(f"Unsupported number of channels: {data.shape[0]}")

	    # Flip Y
	    image = image.transpose(Image.FLIP_TOP_BOTTOM)
	    return image


	# Step 3: Extract Mel-Spectrogram from Image (Direct Pixel Manipulation)
	def extract_mel_spectrogram_from_image(image_path):
	    """
	    Read an image file and map its grayscale pixel values onto a dB range.

	    Parameters:
	    - image_path: Path to the spectrogram image file.

	    Returns:
	    - An array with one value per pixel of the image, where pixel value 0
	      maps to 0 dB and pixel value 255 maps to -80 dB.

	    NOTE(review): every pixel of the file is used, so any axes, title or
	    colorbar present in the image become part of the result - confirm the
	    image was written without decorations. Also confirm that "bright = quiet"
	    is the intended polarity for the colormap used when saving.
	    """
	    grayscale = Image.open(image_path).convert('L')
	    pixels = np.array(grayscale)
	    # 0..255 -> 0..1 -> 0..-80 dB
	    return pixels / 255.0 * -80

	# Alternative Spectrogram Extraction (IFFT Method)
	def extract_spectrogram_with_ifft(mel_spectrogram_db, sample_rate=22050):
	    """
	    Reconstruct an audio signal from a dB-scaled mel-spectrogram.

	    Despite the function name, the return value is a waveform, not a
	    spectrogram. The inversion is delegated to
	    ``librosa.feature.inverse.mel_to_audio``, which estimates the missing
	    phase iteratively rather than performing a plain inverse FFT.

	    Parameters:
	    - mel_spectrogram_db: The mel-spectrogram in dB scale.
	    - sample_rate: Sample rate the mel-spectrogram was computed at. Defaults
	      to 22050, which is the value librosa assumed when this argument did
	      not exist, so existing callers are unaffected.

	    Returns:
	    - audio: The reconstructed audio signal.
	    """
	    # Convert dB mel-spectrogram back to linear (power) scale
	    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)

	    # The mel filterbank depends on the sample rate, so the inversion must
	    # use the same rate that produced the spectrogram.
	    audio = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=sample_rate)

	    return audio

	# Step 4: Decode Mel-Spectrogram with Griffin-Lim
	def decode_mel_spectrogram_to_audio(mel_spectrogram_db, sample_rate, output_audio='griffin_reconstructed_audio.wav'):
	    """
	    Decode a mel-spectrogram into audio using Griffin-Lim algorithm.

	    Parameters:
	    - mel_spectrogram_db: The mel-spectrogram in dB scale.
	    - sample_rate: The sample rate for the audio file.
	    - output_audio: Path to save the reconstructed audio file.

	    Returns:
	    - audio: The reconstructed audio signal (also written to ``output_audio``).
	    """
	    # Convert dB mel-spectrogram back to linear (power) scale
	    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)
	    # Griffin-Lim operates on a linear-frequency STFT magnitude, not on mel
	    # bands. mel_to_audio first maps the mel power spectrogram back to an
	    # STFT magnitude and then runs Griffin-Lim on that.
	    audio = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=sample_rate)
	    # Save the generated audio
	    sf.write(output_audio, audio, sample_rate)
	    print(f"Griffin-Lim reconstructed audio saved as '{output_audio}'")
	    return audio

	# Step 5: Load MelGAN Vocoder
	def load_melgan_vocoder():
	    """
	    Build the MelGAN vocoder used for decoding mel-spectrograms.

	    Instantiates ``torchaudio.models.MelGAN`` with its default arguments and
	    switches it to evaluation mode. No checkpoint or weights are loaded here.

	    NOTE(review): confirm that the installed torchaudio actually provides
	    ``models.MelGAN``; if it does not, this call raises AttributeError.

	    Returns the model instance.
	    """
	    vocoder = torchaudio.models.MelGAN()
	    # Inference only: put the module in evaluation mode
	    vocoder.eval()
	    return vocoder

	# Step 6: Decode Mel-Spectrogram with MelGAN
	def decode_mel_spectrogram_with_melgan(mel_spectrogram_db, sample_rate, output_audio='melgan_reconstructed_audio.wav'):
	    """
	    Decode a mel-spectrogram into audio using MelGAN vocoder.

	    A new vocoder is obtained from ``load_melgan_vocoder`` on every call.

	    Parameters:
	    - mel_spectrogram_db: The mel-spectrogram in dB scale.
	    - sample_rate: The sample rate for the audio file.
	    - output_audio: Path to save the reconstructed audio file.

	    Returns:
	    - audio: The reconstructed audio signal (also written to ``output_audio``).
	    """
	    # Convert dB mel-spectrogram back to linear scale
	    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)
	    # NumPy arrays are typically float64 while torch modules default to
	    # float32 parameters; cast explicitly to avoid a dtype mismatch.
	    mel_spectrogram_tensor = torch.as_tensor(mel_spectrogram, dtype=torch.float32).unsqueeze(0)  # Shape: [1, mel_bins, time_frames]

	    # Load the MelGAN vocoder model
	    melgan = load_melgan_vocoder()

	    # Pass the mel-spectrogram through MelGAN to generate audio
	    with torch.no_grad():
	        audio = melgan(mel_spectrogram_tensor).squeeze().numpy()  # Squeeze to remove batch dimension

	    # Save the generated audio
	    sf.write(output_audio, audio, sample_rate)
	    print(f"MelGAN reconstructed audio saved as '{output_audio}'")
	    return audio

	def audio_from_waveform(samples: np.ndarray, sample_rate: int, normalize: bool = False) -> pydub.AudioSegment:
	    """
	    Convert a numpy array of samples of a waveform to an audio segment.

	    The input array is never modified.

	    Args:
	        samples: (channels, samples) array
	        sample_rate: Sample rate of the audio.
	        normalize: Flag to scale the waveform so its peak uses the full
	            int16 range.

	    Returns:
	        pydub.AudioSegment
	    """
	    # Normalize volume to fit in int16
	    if normalize:
	        peak = np.max(np.abs(samples))
	        # A silent signal has peak 0: skip scaling instead of dividing by zero
	        if peak > 0:
	            # Build a new array rather than using `*=`: that would overwrite
	            # the caller's data and fails outright for integer dtypes.
	            samples = samples * (np.iinfo(np.int16).max / peak)

	    # Transpose to (samples, channels) and convert to int16
	    samples = samples.transpose(1, 0).astype(np.int16)

	    # Write to the bytes of a WAV file
	    wav_bytes = io.BytesIO()
	    wavfile.write(wav_bytes, sample_rate, samples)
	    wav_bytes.seek(0)

	    # Read into pydub
	    return pydub.AudioSegment.from_wav(wav_bytes)


	def apply_filters(segment: pydub.AudioSegment, compression: bool = False) -> pydub.AudioSegment:
	    """
	    Post-process an audio segment: optional compression, then levelling.

	    With ``compression`` enabled the segment is normalized, brought to
	    -10 dBFS and run through a dynamic range compressor. In every case the
	    segment is then gained to -12 dBFS and normalized with 0.1 dB headroom.

	    Args:
	        segment: The audio segment to filter.
	        compression: Flag to apply dynamic range compression.

	    Returns:
	        pydub.AudioSegment
	    """
	    if compression:
	        segment = pydub.effects.normalize(segment, headroom=0.1)
	        segment = segment.apply_gain(-10 - segment.dBFS)
	        compressor_settings = dict(threshold=-20.0, ratio=4.0, attack=5.0, release=50.0)
	        segment = pydub.effects.compress_dynamic_range(segment, **compressor_settings)

	    # Bring the segment to the target loudness, then normalize once more
	    target_dbfs = -12
	    levelled = segment.apply_gain(target_dbfs - segment.dBFS)
	    return pydub.effects.normalize(levelled, headroom=0.1)


	def stitch_segments(segments: Sequence[pydub.AudioSegment], crossfade_s: float) -> pydub.AudioSegment:
	    """
	    Join audio segments end to end, crossfading at every junction.

	    Args:
	        segments: Sequence of audio segments to stitch; the first element is
	            used as the starting point, so it must exist.
	        crossfade_s: Duration of crossfade in seconds.

	    Returns:
	        pydub.AudioSegment
	    """
	    # The append call takes the crossfade in whole milliseconds
	    fade_ms = int(crossfade_s * 1000)
	    stitched = segments[0]
	    for position in range(1, len(segments)):
	        stitched = stitched.append(segments[position], crossfade=fade_ms)
	    return stitched


	def overlay_segments(segments: Sequence[pydub.AudioSegment]) -> pydub.AudioSegment:
	    """
	    Mix a sequence of audio segments by overlaying each onto the first.

	    Args:
	        segments: Sequence of audio segments to overlay; must not be empty.

	    Returns:
	        pydub.AudioSegment
	    """
	    assert len(segments) > 0
	    mixed = segments[0]
	    for position in range(1, len(segments)):
	        mixed = mixed.overlay(segments[position])
	    return mixed



	# Step 7: Full Pipeline for Audio Processing with Customization
	def mel_spectrogram_pipeline(audio_file, output_image='mel_spectrogram.png',
	                             output_audio_griffin='griffin_reconstructed_audio.wav',
	                             output_audio_melgan='melgan_reconstructed_audio.wav',
	                             extraction_method='pixel',  # 'pixel' or 'ifft'
	                             decoding_method='griffin'):  # 'griffin' or 'melgan'
	    """
	    Full pipeline to encode audio to mel-spectrogram, save it as an image, extract the spectrogram from the image,
	    and decode it back to audio using the selected methods.

	    Parameters:
	    - audio_file: Path to the audio file to be processed.
	    - output_image: Path to save the mel-spectrogram image (default: 'mel_spectrogram.png').
	    - output_audio_griffin: Path to save the Griffin-Lim reconstructed audio.
	    - output_audio_melgan: Path to save the MelGAN reconstructed audio.
	    - extraction_method: Method for extraction ('pixel' or 'ifft').
	    - decoding_method: Method for decoding ('griffin' or 'melgan').

	    Raises:
	    - ValueError: If ``extraction_method`` or ``decoding_method`` is not one
	      of the supported values. This is checked before any file is touched.
	    """
	    # Reject bad options up front so that no image or audio is produced for
	    # a request that cannot be completed.
	    if extraction_method not in ('pixel', 'ifft'):
	        raise ValueError("Invalid extraction method. Choose 'pixel' or 'ifft'.")
	    if decoding_method not in ('griffin', 'melgan'):
	        raise ValueError("Invalid decoding method. Choose 'griffin' or 'melgan'.")

	    # Step 1: Encode (Audio -> Mel-Spectrogram)
	    mel_spectrogram_db, sample_rate = encode_audio_to_mel_spectrogram(audio_file)

	    # Step 2: Convert Mel-Spectrogram to Image and save it
	    save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image)

	    # Step 3: Obtain the spectrogram to decode, based on chosen method
	    if extraction_method == 'pixel':
	        extracted_mel_spectrogram_db = extract_mel_spectrogram_from_image(output_image)
	    else:
	        # extract_spectrogram_with_ifft returns a waveform, while the
	        # decoders below expect a dB mel-spectrogram. Re-encode the waveform
	        # with the same number of mel bands before decoding.
	        waveform = extract_spectrogram_with_ifft(mel_spectrogram_db)
	        remade_mel = librosa.feature.melspectrogram(
	            y=waveform, sr=sample_rate, n_mels=mel_spectrogram_db.shape[0]
	        )
	        extracted_mel_spectrogram_db = librosa.power_to_db(remade_mel, ref=np.max)

	    # Step 4: Decode based on the chosen decoding method
	    if decoding_method == 'griffin':
	        decode_mel_spectrogram_to_audio(extracted_mel_spectrogram_db, sample_rate, output_audio_griffin)
	    else:
	        decode_mel_spectrogram_with_melgan(extracted_mel_spectrogram_db, sample_rate, output_audio_melgan)


	def process_audio(audio_file, extraction_method, decoding_method):
	    """
	    Gradio callback: run the encode/decode pipeline into temporary files.

	    Parameters:
	    - audio_file: Path to the uploaded audio file.
	    - extraction_method: 'pixel' or 'ifft'.
	    - decoding_method: 'griffin' or 'melgan'; only this decoder is run.

	    Returns:
	    - A tuple ``(image_path, audio_path)`` with the spectrogram image and
	      the reconstructed audio produced by the selected decoder.

	    Raises:
	    - ValueError: Propagated from ``mel_spectrogram_pipeline`` when either
	      method name is not supported.

	    The temporary files are intentionally not deleted here: the returned
	    paths have to stay readable after this function returns.
	    """
	    # delete=False keeps the files on disk after the handles are closed.
	    # Closing them right away also lets other writers reopen the paths on
	    # platforms that forbid opening a file twice.
	    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_image, \
	            tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
	        image_path = temp_image.name
	        audio_path = temp_audio.name

	    # The pipeline writes to exactly one of the two audio paths, depending
	    # on decoding_method, so the same temporary file can serve both.
	    mel_spectrogram_pipeline(
	        audio_file,
	        output_image=image_path,
	        output_audio_griffin=audio_path,
	        output_audio_melgan=audio_path,
	        extraction_method=extraction_method,
	        decoding_method=decoding_method,
	    )

	    return image_path, audio_path

	# Create Gradio interface: one audio upload plus two method selectors in,
	# the spectrogram image and the reconstructed audio out.
	iface = gr.Interface(
	    fn=process_audio,
	    inputs=[
	        gr.Audio(type="filepath", label="Upload Audio"),
	        gr.Radio(["pixel", "ifft"], label="Extraction Method", value="pixel"),
	        gr.Radio(["griffin", "melgan"], label="Decoding Method", value="griffin")
	    ],
	    outputs=[
	        gr.Image(type="filepath", label="Mel-Spectrogram"),
	        gr.Audio(type="filepath", label="Reconstructed Audio")
	    ],
	    title="Audio Encoder-Decoder",
	    description="Upload an audio file to encode it to a mel-spectrogram and then decode it back to audio."
	)


	if __name__ == "__main__":
	    # Launch the app only when executed as a script, so that importing this
	    # module for its functions does not start a web server.
	    iface.launch()

	    # Example usage(TEST): runs once the app has stopped serving
	    audio_file_path = 'your_audio_file.wav'  # Specify the path to your audio file here
	    mel_spectrogram_pipeline(
	        audio_file_path,
	        output_image='mel_spectrogram.png',
	        output_audio_griffin='griffin_reconstructed_audio.wav',
	        output_audio_melgan='melgan_reconstructed_audio.wav',
	        extraction_method='pixel',  # Choose 'pixel' or 'ifft'
	        decoding_method='griffin'  # Choose 'griffin' or 'melgan'
	    )