Ocillus
/

Estella

Model card Files Files and versions Community

File size: 5,610 Bytes

4133ed9

import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import re

# Constants
# Shared configuration for feature extraction (extract_features) and
# sliding-window inference (classify_audio).
SAMPLE_RATE = 44100  # Sample rate (Hz) that audio is resampled to by librosa.load
NUM_MFCC = 13  # Number of MFCC coefficients computed per frame
MAX_LEN = 1000  # Number of MFCC frames every feature matrix is padded/truncated to
WINDOW_SIZE = 1  # Default analysis window length in seconds
HOP_SIZE = 1   # Default step between window starts in seconds (equal to WINDOW_SIZE => windows do not overlap)

# Function to extract MFCC features
def extract_features(file_path):
    """Return a fixed-width MFCC matrix for the audio file at *file_path*.

    The file is loaded at SAMPLE_RATE and NUM_MFCC coefficients are computed
    per frame. The frame axis (axis 1) is then forced to exactly MAX_LEN
    columns: shorter clips are zero-padded on the right, longer clips are cut
    off after MAX_LEN frames.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The MFCC array with MAX_LEN columns along axis 1.
    """
    signal, rate = librosa.load(file_path, sr=SAMPLE_RATE)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=NUM_MFCC)

    frame_count = coeffs.shape[1]
    if frame_count >= MAX_LEN:
        # Long enough already: keep only the leading MAX_LEN frames.
        return coeffs[:, :MAX_LEN]

    # Too short: append zero frames until the width reaches MAX_LEN.
    missing = MAX_LEN - frame_count
    return np.pad(coeffs, ((0, 0), (0, missing)), mode='constant')

# Load dataset
def load_data(dataset_path):
    """Load every ``.wav`` file directly inside *dataset_path* with its label.

    The label of a file is its name without the ``.wav`` extension and
    without an optional trailing ``" <digits>"`` suffix, so
    ``"Debris Wood 02.wav"`` is labelled ``"Debris Wood"``.

    Args:
        dataset_path: Directory containing the ``.wav`` files.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays. ``features`` stacks
        the result of ``extract_features`` for each file and ``labels`` holds
        the matching class names, both in sorted file-name order.

    Raises:
        ValueError: If the directory contains no usable ``.wav`` file.
        OSError: Propagated from ``os.listdir`` when the directory cannot be
            read (for example, it does not exist).
    """
    # Regex pattern to extract class name from filename
    pattern = re.compile(r'^(.*?)(?: \d+)?\.wav$')

    features = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded train/test split) reproducible.
    for file_name in sorted(os.listdir(dataset_path)):
        if not file_name.endswith('.wav'):
            continue

        file_path = os.path.join(dataset_path, file_name)
        if not os.path.isfile(file_path):
            # A directory that merely ends in ".wav" is not audio data.
            continue

        match = pattern.match(file_name)
        if match is None:
            continue

        labels.append(match.group(1))  # Class name without the number
        features.append(extract_features(file_path))

    if not features:
        raise ValueError("No data found. Ensure the dataset path is correct and contains .wav files.")

    return np.array(features), np.array(labels)

# Load data
# NOTE(review): everything from here down to the evaluation print is at module
# top level, so it runs (including training and file writes) whenever this
# file is imported, not only when it is executed as a script.
dataset_path = 'dataset'
X, y = load_data(dataset_path)

# Encode labels
# String class names -> integer ids -> one-hot rows (needed by the
# categorical_crossentropy loss used below).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = tf.keras.utils.to_categorical(y_encoded)

# Save LabelEncoder
# Only the class-name array is persisted; the inference code rebuilds a
# LabelEncoder around it.
np.save('label_encoder.npy', label_encoder.classes_)

# Split data
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

# Build model
# Two conv/max-pool stages over the (NUM_MFCC, MAX_LEN, 1) MFCC "image",
# followed by a dense classifier with one softmax unit per distinct class.
model = Sequential([
    tf.keras.layers.Input(shape=(NUM_MFCC, MAX_LEN, 1)),
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])

# Adjust learning rate if necessary
from tensorflow.keras.optimizers import Adam

learning_rate = 0.0001  # Adjust as necessary
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Reshape data for the model
# Append a trailing channel axis so each sample matches the Input shape above.
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Train model
# NOTE(review): the test split doubles as validation data here and is also
# used for the final evaluation below, so the reported test accuracy is
# measured on data that was monitored during training.
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))

# Save model
model.save('sound_classification_model.h5')

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy}")

# Function to classify audio in sliding windows with overlapping handling
def classify_audio(file_path, model, label_encoder, window_size=WINDOW_SIZE, hop_size=HOP_SIZE):
    """Classify an audio file window by window, keeping non-overlapping hits.

    The file is loaded at SAMPLE_RATE and scanned with windows of
    *window_size* seconds whose starts are *hop_size* seconds apart. A window
    is classified only if it does not overlap the previously accepted window;
    with ``hop_size < window_size`` the overlapping windows in between are
    skipped before any feature extraction or inference is done for them.

    Args:
        file_path: Path of the audio file to analyse.
        model: Model exposing ``predict``; it receives one batch of shape
            ``(1, NUM_MFCC, MAX_LEN, 1)`` per classified window.
        label_encoder: Encoder exposing ``inverse_transform`` to map the
            predicted class index back to its label.
        window_size: Window length in seconds.
        hop_size: Step between consecutive window starts in seconds.

    Returns:
        A list of ``(start_time_seconds, class_label)`` tuples, one per
        accepted window, in chronological order. The list is empty when the
        audio is shorter than one window.

    Raises:
        ValueError: If *window_size* or *hop_size* is not positive or
            rounds down to zero samples.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    window_samples = int(window_size * sr)
    hop_samples = int(hop_size * sr)
    if window_samples <= 0 or hop_samples <= 0:
        raise ValueError(
            "window_size and hop_size must each cover at least one sample "
            f"(got window_size={window_size!r}, hop_size={hop_size!r})."
        )

    results = []
    # Accepted windows are produced in increasing start order and all have the
    # same length, so a candidate overlaps an accepted window exactly when it
    # starts before the end of the most recently accepted one.
    last_accepted_end = 0

    for start in range(0, len(y) - window_samples + 1, hop_samples):
        if start < last_accepted_end:
            # Overlaps an accepted window: its result would be discarded, so
            # skip the MFCC computation and the model call altogether.
            continue

        end = start + window_samples
        mfccs = librosa.feature.mfcc(y=y[start:end], sr=sr, n_mfcc=NUM_MFCC)

        # Pad or truncate MFCCs to the fixed width the model was built for
        if mfccs.shape[1] < MAX_LEN:
            padding = MAX_LEN - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, padding)), mode='constant')
        else:
            mfccs = mfccs[:, :MAX_LEN]

        # Add the batch axis (front) and the channel axis (back).
        batch = np.expand_dims(np.expand_dims(mfccs, axis=0), axis=-1)
        prediction = model.predict(batch)
        predicted_class = np.argmax(prediction, axis=1)
        class_label = label_encoder.inverse_transform(predicted_class)[0]

        results.append((start / sr, class_label))
        last_accepted_end = end

    return results

# Example usage
if __name__ == "__main__":
    def load_model_and_encoder(model_path, label_encoder_path):
        """Restore the saved Keras model and rebuild its LabelEncoder.

        Args:
            model_path: Path of the saved model file.
            label_encoder_path: Path of the ``.npy`` file holding the class
                names.

        Returns:
            A ``(model, label_encoder)`` tuple; the encoder's ``classes_`` is
            set to the array loaded from *label_encoder_path*.
        """
        restored_model = tf.keras.models.load_model(model_path)
        restored_encoder = LabelEncoder()
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects, so this must only be pointed at trusted files.
        restored_encoder.classes_ = np.load(label_encoder_path, allow_pickle=True)
        return restored_model, restored_encoder

    # Files written by the training section above, plus a sample clip.
    model_path, label_encoder_path = 'sound_classification_model.h5', 'label_encoder.npy'
    audio_path = 'dataset/Debris Wood 02.wav'

    model, label_encoder = load_model_and_encoder(model_path, label_encoder_path)

    # Print one line per detection: window start time and predicted class.
    sound_identifications = classify_audio(audio_path, model, label_encoder)
    for time, label in sound_identifications:
        print(f'[{time:.2f} seconds] Class: {label}')