|
import os |
|
import numpy as np |
|
import librosa |
|
import tensorflow as tf |
|
from tensorflow.keras.models import Sequential |
|
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout |
|
from sklearn.preprocessing import LabelEncoder |
|
from sklearn.model_selection import train_test_split |
|
import re |
|
|
|
|
|
# Sampling rate (Hz) every audio file is resampled to, for both training
# and inference.
SAMPLE_RATE = 44100

# Number of MFCC coefficients extracted per frame.
NUM_MFCC = 13

# Fixed number of MFCC frames per example: shorter clips are zero-padded on
# the right, longer ones truncated.
# NOTE(review): a 1 s window at 44.1 kHz yields far fewer than 1000 frames
# with librosa's default hop length, so inference windows are mostly
# zero padding — confirm this is intended.
MAX_LEN = 1000

# Sliding-window length and hop, in seconds, used by classify_audio.
WINDOW_SIZE = 1

HOP_SIZE = 1
|
|
|
|
|
def extract_features(file_path, n_mfcc=NUM_MFCC, max_len=MAX_LEN):
    """Load an audio file and return a fixed-size MFCC feature matrix.

    Parameters
    ----------
    file_path : str
        Path to an audio file (anything ``librosa.load`` accepts).
    n_mfcc : int, optional
        Number of MFCC coefficients per frame. Defaults to ``NUM_MFCC``.
    max_len : int, optional
        Number of frames in the returned matrix. Defaults to ``MAX_LEN``.
        Shorter clips are zero-padded on the right; longer ones truncated.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_mfcc, max_len)``.
    """
    # Resample to the training rate so features are comparable across files.
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Normalize the time axis to exactly max_len frames.
    frames = mfccs.shape[1]
    if frames < max_len:
        mfccs = np.pad(mfccs, ((0, 0), (0, max_len - frames)), mode='constant')
    else:
        mfccs = mfccs[:, :max_len]

    return mfccs
|
|
|
|
|
def load_data(dataset_path):
    """Scan *dataset_path* for .wav files and return ``(features, labels)``.

    The label is derived from the file name: everything before an optional
    trailing ``" <number>"`` and the ``.wav`` extension, e.g.
    ``"Debris Wood 02.wav"`` -> ``"Debris Wood"``.

    Parameters
    ----------
    dataset_path : str
        Directory containing the labelled .wav files.

    Returns
    -------
    tuple of numpy.ndarray
        Features of shape ``(n_samples, NUM_MFCC, MAX_LEN)`` and the
        corresponding string labels.

    Raises
    ------
    ValueError
        If no matching .wav files are found under *dataset_path*.
    """
    features = []
    labels = []

    # "<label> <optional index>.wav" -> capture just the label part.
    pattern = re.compile(r'^(.*?)(?: \d+)?\.wav$')

    # Sort the listing so the example order — and hence the downstream
    # train/test split — is deterministic; os.listdir order is arbitrary
    # and platform-dependent.
    for file_name in sorted(os.listdir(dataset_path)):
        if not file_name.endswith('.wav'):
            continue
        match = pattern.match(file_name)
        if match:
            mfccs = extract_features(os.path.join(dataset_path, file_name))
            features.append(mfccs)
            labels.append(match.group(1))

    if not features:
        raise ValueError("No data found. Ensure the dataset path is correct and contains .wav files.")

    return np.array(features), np.array(labels)
|
|
|
|
|
# Directory holding the labelled training .wav files.
dataset_path = 'dataset'

# X: (n_samples, NUM_MFCC, MAX_LEN) MFCC matrices; y: string labels.
X, y = load_data(dataset_path)


# Map string labels to integer ids, then one-hot encode them for
# categorical_crossentropy.
label_encoder = LabelEncoder()

y_encoded = label_encoder.fit_transform(y)

y_categorical = tf.keras.utils.to_categorical(y_encoded)


# Persist the class-name ordering so inference can decode predictions
# (reloaded in load_model_and_encoder).
np.save('label_encoder.npy', label_encoder.classes_)


# Fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified — with few examples per class,
# some classes may be absent from the test set; consider stratify=y_encoded.
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)
|
|
|
|
|
# CNN over the (NUM_MFCC, MAX_LEN, 1) MFCC "image": two conv/pool stages
# followed by a dense head sized to the number of distinct labels.
model = Sequential()
model.add(tf.keras.layers.Input(shape=(NUM_MFCC, MAX_LEN, 1)))
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(np.unique(y_encoded)), activation='softmax'))
|
|
|
|
|
from tensorflow.keras.optimizers import Adam


# Small learning rate for stable convergence on a small dataset.
learning_rate = 0.0001
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
|
|
|
|
|
# Append a channel axis: (n, NUM_MFCC, MAX_LEN) -> (n, NUM_MFCC, MAX_LEN, 1)
# to match the Conv2D input shape declared in the model.
X_train = np.expand_dims(X_train, axis=-1)

X_test = np.expand_dims(X_test, axis=-1)


# NOTE(review): the held-out test set is also used as validation data during
# training, so the accuracy reported below is not on fully unseen data.
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))


# Legacy HDF5 format (reloaded in load_model_and_encoder below).
model.save('sound_classification_model.h5')


loss, accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy: {accuracy}")
|
|
|
|
|
def classify_audio(file_path, model, label_encoder, window_size=WINDOW_SIZE, hop_size=HOP_SIZE):
    """Slide a window over an audio file and classify each window.

    Parameters
    ----------
    file_path : str
        Path to the audio file to analyse.
    model : keras.Model
        Trained classifier expecting input of shape (1, NUM_MFCC, MAX_LEN, 1).
    label_encoder : sklearn.preprocessing.LabelEncoder
        Fitted encoder used to turn class indices back into label strings.
    window_size : float, optional
        Analysis window length in seconds.
    hop_size : float, optional
        Step between consecutive windows in seconds.

    Returns
    -------
    list of (float, str)
        ``(start_time_seconds, class_label)`` per accepted window. Windows
        that overlap an earlier accepted window are suppressed.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    window_samples = int(window_size * sr)
    hop_samples = int(hop_size * sr)

    results = []
    # (start_sample, end_sample, label) for every accepted window.
    detected_windows = []

    for start in range(0, len(y) - window_samples + 1, hop_samples):
        end = start + window_samples
        segment = y[start:end]
        mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=NUM_MFCC)

        # Pad/truncate to the fixed training length, mirroring extract_features.
        if mfccs.shape[1] < MAX_LEN:
            padding = MAX_LEN - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, padding)), mode='constant')
        else:
            mfccs = mfccs[:, :MAX_LEN]

        # Batch and channel axes: (NUM_MFCC, MAX_LEN) -> (1, NUM_MFCC, MAX_LEN, 1).
        mfccs = np.expand_dims(mfccs, axis=0)
        mfccs = np.expand_dims(mfccs, axis=-1)
        prediction = model.predict(mfccs)
        predicted_class = np.argmax(prediction, axis=1)
        time = start / sr
        class_label = label_encoder.inverse_transform(predicted_class)[0]

        # Suppress any window that overlaps an already-accepted one
        # (regardless of its label) to avoid duplicate detections.
        detected = any(
            start < det_end and end > det_start
            for det_start, det_end, _ in detected_windows
        )

        if not detected:
            results.append((time, class_label))
            detected_windows.append((start, end, class_label))

    return results
|
|
|
|
|
if __name__ == "__main__":

    def load_model_and_encoder(model_path, label_encoder_path):
        """Restore the trained Keras model and the fitted label encoder."""
        trained_model = tf.keras.models.load_model(model_path)
        encoder = LabelEncoder()
        # Re-attach the persisted class ordering so inverse_transform works.
        encoder.classes_ = np.load(label_encoder_path, allow_pickle=True)
        return trained_model, encoder

    model_path = 'sound_classification_model.h5'
    label_encoder_path = 'label_encoder.npy'
    audio_path = 'dataset/Debris Wood 02.wav'

    model, label_encoder = load_model_and_encoder(model_path, label_encoder_path)

    # Run sliding-window classification and report one line per detection.
    sound_identifications = classify_audio(audio_path, model, label_encoder)
    for time, label in sound_identifications:
        print(f'[{time:.2f} seconds] Class: {label}')
|
|