|
import os |
|
import numpy as np |
|
import librosa |
|
import tensorflow as tf |
|
from tensorflow.keras.models import Sequential |
|
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout |
|
from sklearn.preprocessing import LabelEncoder |
|
from sklearn.model_selection import train_test_split |
|
import re |
|
|
|
|
|
# Sampling rate (Hz) every audio file is resampled to, for both training
# and inference.
SAMPLE_RATE = 44100

# Number of MFCC coefficients extracted per frame.
NUM_MFCC = 13

# Fixed number of MFCC frames per example: shorter clips are zero-padded on
# the right, longer ones truncated.
# NOTE(review): a 1 s window at 44.1 kHz yields far fewer than 1000 frames
# with librosa's default hop length, so inference windows are mostly
# zero padding — confirm this is intended.
MAX_LEN = 1000

# Sliding-window length and hop, in seconds, used by classify_audio.
WINDOW_SIZE = 1

HOP_SIZE = 1
|
|
|
|
|
def extract_features(file_path, n_mfcc=NUM_MFCC, max_len=MAX_LEN):
    """Load an audio file and return a fixed-size MFCC feature matrix.

    Parameters
    ----------
    file_path : str
        Path to an audio file (anything ``librosa.load`` accepts).
    n_mfcc : int, optional
        Number of MFCC coefficients per frame. Defaults to ``NUM_MFCC``.
    max_len : int, optional
        Number of frames in the returned matrix. Defaults to ``MAX_LEN``.
        Shorter clips are zero-padded on the right; longer ones truncated.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_mfcc, max_len)``.
    """
    # Resample to the training rate so features are comparable across files.
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Normalize the time axis to exactly max_len frames.
    frames = mfccs.shape[1]
    if frames < max_len:
        mfccs = np.pad(mfccs, ((0, 0), (0, max_len - frames)), mode='constant')
    else:
        mfccs = mfccs[:, :max_len]

    return mfccs
|
|
|
|
|
def load_data(dataset_path):
    """Scan *dataset_path* for .wav files and return ``(features, labels)``.

    The label is derived from the file name: everything before an optional
    trailing ``" <number>"`` and the ``.wav`` extension, e.g.
    ``"Debris Wood 02.wav"`` -> ``"Debris Wood"``.

    Parameters
    ----------
    dataset_path : str
        Directory containing the labelled .wav files.

    Returns
    -------
    tuple of numpy.ndarray
        Features of shape ``(n_samples, NUM_MFCC, MAX_LEN)`` and the
        corresponding string labels.

    Raises
    ------
    ValueError
        If no matching .wav files are found under *dataset_path*.
    """
    features = []
    labels = []

    # "<label> <optional index>.wav" -> capture just the label part.
    pattern = re.compile(r'^(.*?)(?: \d+)?\.wav$')

    # Sort the listing so the example order — and hence the downstream
    # train/test split — is deterministic; os.listdir order is arbitrary
    # and platform-dependent.
    for file_name in sorted(os.listdir(dataset_path)):
        if not file_name.endswith('.wav'):
            continue
        match = pattern.match(file_name)
        if match:
            mfccs = extract_features(os.path.join(dataset_path, file_name))
            features.append(mfccs)
            labels.append(match.group(1))

    if not features:
        raise ValueError("No data found. Ensure the dataset path is correct and contains .wav files.")

    return np.array(features), np.array(labels)
|
|
|
|
|
# Directory holding the labelled training .wav files.
dataset_path = 'dataset'

# X: (n_samples, NUM_MFCC, MAX_LEN) MFCC matrices; y: string labels.
X, y = load_data(dataset_path)


# Map string labels to integer ids, then one-hot encode them for
# categorical_crossentropy.
label_encoder = LabelEncoder()

y_encoded = label_encoder.fit_transform(y)

y_categorical = tf.keras.utils.to_categorical(y_encoded)


# Persist the class-name ordering so inference can decode predictions
# (reloaded in load_model_and_encoder).
np.save('label_encoder.npy', label_encoder.classes_)


# Fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified — with few examples per class,
# some classes may be absent from the test set; consider stratify=y_encoded.
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)
|
|
|
|
|
# CNN over the (NUM_MFCC, MAX_LEN, 1) MFCC "image": two conv/pool stages
# followed by a dense head sized to the number of distinct labels.
model = Sequential()
model.add(tf.keras.layers.Input(shape=(NUM_MFCC, MAX_LEN, 1)))
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(np.unique(y_encoded)), activation='softmax'))
|
|
|
|
|
from tensorflow.keras.optimizers import Adam


# Small learning rate for stable convergence on a small dataset.
learning_rate = 0.0001
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
|
|
|
|
|
# Append a channel axis: (n, NUM_MFCC, MAX_LEN) -> (n, NUM_MFCC, MAX_LEN, 1)
# to match the Conv2D input shape declared in the model.
X_train = np.expand_dims(X_train, axis=-1)

X_test = np.expand_dims(X_test, axis=-1)


# NOTE(review): the held-out test set is also used as validation data during
# training, so the accuracy reported below is not on fully unseen data.
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))


# Legacy HDF5 format (reloaded in load_model_and_encoder below).
model.save('sound_classification_model.h5')


loss, accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy: {accuracy}")
|
|
|
|
|
def classify_audio(file_path, model, label_encoder, window_size=WINDOW_SIZE, hop_size=HOP_SIZE):
    """Slide a window over an audio file and classify each window.

    Parameters
    ----------
    file_path : str
        Path to the audio file to analyse.
    model : keras.Model
        Trained classifier expecting input of shape (1, NUM_MFCC, MAX_LEN, 1).
    label_encoder : sklearn.preprocessing.LabelEncoder
        Fitted encoder used to turn class indices back into label strings.
    window_size : float, optional
        Analysis window length in seconds.
    hop_size : float, optional
        Step between consecutive windows in seconds.

    Returns
    -------
    list of (float, str)
        ``(start_time_seconds, class_label)`` per accepted window. Windows
        that overlap an earlier accepted window are suppressed.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    window_samples = int(window_size * sr)
    hop_samples = int(hop_size * sr)

    results = []
    # (start_sample, end_sample, label) for every accepted window.
    detected_windows = []

    for start in range(0, len(y) - window_samples + 1, hop_samples):
        end = start + window_samples
        segment = y[start:end]
        mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=NUM_MFCC)

        # Pad/truncate to the fixed training length, mirroring extract_features.
        if mfccs.shape[1] < MAX_LEN:
            padding = MAX_LEN - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, padding)), mode='constant')
        else:
            mfccs = mfccs[:, :MAX_LEN]

        # Batch and channel axes: (NUM_MFCC, MAX_LEN) -> (1, NUM_MFCC, MAX_LEN, 1).
        mfccs = np.expand_dims(mfccs, axis=0)
        mfccs = np.expand_dims(mfccs, axis=-1)
        prediction = model.predict(mfccs)
        predicted_class = np.argmax(prediction, axis=1)
        time = start / sr
        class_label = label_encoder.inverse_transform(predicted_class)[0]

        # Suppress any window that overlaps an already-accepted one
        # (regardless of its label) to avoid duplicate detections.
        detected = any(
            start < det_end and end > det_start
            for det_start, det_end, _ in detected_windows
        )

        if not detected:
            results.append((time, class_label))
            detected_windows.append((start, end, class_label))

    return results
|
|
|
|
|
if __name__ == "__main__":

    def load_model_and_encoder(model_path, label_encoder_path):
        """Restore the trained Keras model and the fitted label encoder."""
        trained_model = tf.keras.models.load_model(model_path)
        encoder = LabelEncoder()
        # Re-attach the persisted class ordering so inverse_transform works.
        encoder.classes_ = np.load(label_encoder_path, allow_pickle=True)
        return trained_model, encoder

    model_path = 'sound_classification_model.h5'
    label_encoder_path = 'label_encoder.npy'
    audio_path = 'dataset/Debris Wood 02.wav'

    model, label_encoder = load_model_and_encoder(model_path, label_encoder_path)

    # Run sliding-window classification and report one line per detection.
    sound_identifications = classify_audio(audio_path, model, label_encoder)
    for time, label in sound_identifications:
        print(f'[{time:.2f} seconds] Class: {label}')
|
|