# Estella / Estella.py
# Ocillus's picture
# Upload 6 files
# 4133ed9 verified
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import re
# Audio and feature-extraction settings shared by training and inference.
SAMPLE_RATE = 44_100  # Hz; passed as `sr` to every librosa.load call
NUM_MFCC = 13  # number of MFCC coefficients requested per frame
MAX_LEN = 1_000  # MFCC frames kept per example (shorter inputs are zero-padded)
WINDOW_SIZE = 1  # sliding-window length, in seconds
HOP_SIZE = 1  # step between window starts, in seconds (equal to WINDOW_SIZE: no overlap)
# Function to extract MFCC features
def extract_features(file_path):
    """Return a fixed-width MFCC matrix for the audio file at *file_path*.

    The file is loaded with ``sr=SAMPLE_RATE`` and ``NUM_MFCC`` coefficients
    are requested. The second axis of the result is then forced to exactly
    ``MAX_LEN`` columns: longer matrices are cut off, shorter ones are
    right-padded with zeros.
    """
    signal, rate = librosa.load(file_path, sr=SAMPLE_RATE)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=NUM_MFCC)

    n_frames = coeffs.shape[1]
    if n_frames >= MAX_LEN:
        # Already long enough: keep only the leading MAX_LEN columns.
        return coeffs[:, :MAX_LEN]

    # Too short: append zero columns on the right until MAX_LEN is reached.
    return np.pad(coeffs, ((0, 0), (0, MAX_LEN - n_frames)), mode='constant')
# Load dataset
def load_data(dataset_path):
    """Load every ``.wav`` file in *dataset_path* as an MFCC matrix plus label.

    The label of a file is its name with the ``.wav`` extension and an
    optional trailing ``" <digits>"`` suffix removed, e.g.
    ``"Debris Wood 02.wav"`` is labelled ``"Debris Wood"``.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, so without sorting the row order of the returned arrays
    (and therefore any seeded shuffle or split applied to them) could differ
    from one machine or run to the next.

    Args:
        dataset_path: Directory to scan; sub-directories are not descended.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays with one entry per
        loaded file, in matching order.

    Raises:
        ValueError: If no usable ``.wav`` file is found in *dataset_path*.
    """
    # The optional " <digits>" suffix is dropped so numbered takes of the
    # same sound share one class name.
    pattern = re.compile(r'^(.*?)(?: \d+)?\.wav$')

    features = []
    labels = []
    for file_name in sorted(os.listdir(dataset_path)):
        if not file_name.endswith('.wav'):
            continue
        match = pattern.match(file_name)
        if match is None:
            continue
        features.append(extract_features(os.path.join(dataset_path, file_name)))
        labels.append(match.group(1))

    if not features:
        raise ValueError("No data found. Ensure the dataset path is correct and contains .wav files.")
    return np.array(features), np.array(labels)
# Read every labelled clip from the dataset directory.
dataset_path = 'dataset'
X, y = load_data(dataset_path)

# Turn the string class names into integer ids, then into one-hot rows.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)
y_categorical = tf.keras.utils.to_categorical(y_encoded)

# Persist the class names so the encoder can be rebuilt at inference time.
np.save('label_encoder.npy', label_encoder.classes_)

# Hold out 20% of the examples, using a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_categorical,
    test_size=0.2,
    random_state=42,
)
# Assemble the classifier layer by layer: two Conv2D/MaxPooling2D stages,
# then a dense head with dropout and a softmax over the classes.
model = Sequential()
model.add(tf.keras.layers.Input(shape=(NUM_MFCC, MAX_LEN, 1)))
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# One output unit per distinct encoded label.
model.add(Dense(len(np.unique(y_encoded)), activation='softmax'))

# Adam with an explicit learning rate; tune the value below as needed.
from tensorflow.keras.optimizers import Adam
learning_rate = 0.0001
optimizer = Adam(learning_rate=learning_rate)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Append a trailing axis of size 1 so the arrays match the model's
# (NUM_MFCC, MAX_LEN, 1) input shape.
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

# Fit for 20 epochs; the held-out split doubles as validation data.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    validation_data=(X_test, y_test),
)

# Write the trained network to disk.
model.save('sound_classification_model.h5')

# Report accuracy on the held-out split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy}")
# Function to classify audio in sliding windows with overlapping handling
def classify_audio(file_path, model, label_encoder, window_size=WINDOW_SIZE, hop_size=HOP_SIZE):
    """Classify an audio file window by window.

    The file is loaded with ``sr=SAMPLE_RATE`` and scanned with windows of
    ``window_size`` seconds whose starts are ``hop_size`` seconds apart. A
    window is classified only if it does not overlap a window that was
    already classified; overlapping windows are skipped entirely. Each
    classified window's MFCC matrix is cut or zero-padded to ``MAX_LEN``
    columns before being passed to ``model.predict``.

    Only full-length windows are considered: a trailing remainder shorter
    than one window, or a clip shorter than one window, produces no entry.

    NOTE(review): because overlapping windows are dropped regardless of their
    label, a ``hop_size`` smaller than ``window_size`` does not yield extra
    detections -- confirm this is the intended behaviour.

    Args:
        file_path: Path of the audio file to analyse.
        model: Object whose ``predict`` returns one row of class scores per
            input example.
        label_encoder: Object whose ``inverse_transform`` maps class indices
            back to labels.
        window_size: Window length in seconds.
        hop_size: Distance between consecutive window starts in seconds.

    Returns:
        A list of ``(start_time_seconds, class_label)`` tuples, one per
        classified window, in chronological order.

    Raises:
        ValueError: If ``window_size`` or ``hop_size`` is shorter than one
            sample (including zero or negative values).
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    window_samples = int(window_size * sr)
    hop_samples = int(hop_size * sr)
    if window_samples <= 0 or hop_samples <= 0:
        # Fail with a clear message instead of a cryptic range() error
        # (zero hop) or erratic slicing (negative sizes).
        raise ValueError(
            "window_size and hop_size must each span at least one sample "
            f"(got window_size={window_size!r}, hop_size={hop_size!r})."
        )

    results = []
    # End sample of the most recently classified window. Starts are visited in
    # increasing order and all windows have the same length, so a window
    # overlaps *some* earlier classified window exactly when it starts before
    # this value.
    last_detected_end = 0
    for start in range(0, len(y) - window_samples + 1, hop_samples):
        if start < last_detected_end:
            # Overlaps an already-classified window: skip before doing any
            # feature extraction or prediction work.
            continue
        end = start + window_samples

        mfccs = librosa.feature.mfcc(y=y[start:end], sr=sr, n_mfcc=NUM_MFCC)
        # Force the second axis to exactly MAX_LEN columns.
        if mfccs.shape[1] < MAX_LEN:
            padding = MAX_LEN - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, padding)), mode='constant')
        else:
            mfccs = mfccs[:, :MAX_LEN]

        # Add a leading batch axis and a trailing channel axis for the model.
        batch = mfccs[np.newaxis, ..., np.newaxis]
        prediction = model.predict(batch)
        predicted_class = np.argmax(prediction, axis=1)
        class_label = label_encoder.inverse_transform(predicted_class)[0]

        results.append((start / sr, class_label))
        last_detected_end = end
    return results
# Example usage
if __name__ == "__main__":
    def load_model_and_encoder(model_path, label_encoder_path):
        """Restore the saved Keras model and rebuild its label encoder.

        The model is read from *model_path*. The encoder is a fresh
        ``LabelEncoder`` whose ``classes_`` attribute is set to the array
        stored at *label_encoder_path*.

        Returns:
            A ``(model, label_encoder)`` tuple.
        """
        restored_model = tf.keras.models.load_model(model_path)
        # NOTE(review): allow_pickle=True permits unpickling of arbitrary
        # objects from the .npy file -- only load files from a trusted source.
        class_names = np.load(label_encoder_path, allow_pickle=True)
        encoder = LabelEncoder()
        encoder.classes_ = class_names
        return restored_model, encoder

    # Artefacts written by the training section, plus a sample clip.
    model_path = 'sound_classification_model.h5'
    label_encoder_path = 'label_encoder.npy'
    audio_path = 'dataset/Debris Wood 02.wav'

    model, label_encoder = load_model_and_encoder(model_path, label_encoder_path)
    sound_identifications = classify_audio(audio_path, model, label_encoder)

    # Print one line per classified window.
    for time, label in sound_identifications:
        print(f'[{time:.2f} seconds] Class: {label}')