# File size: 5,610 Bytes
# 4133ed9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import re

# Constants shared by feature extraction, model construction and inference.
SAMPLE_RATE = 44100  # Sampling rate (Hz) requested from librosa.load for every audio file
NUM_MFCC = 13  # Number of MFCC coefficients computed per frame
MAX_LEN = 1000  # Fixed number of MFCC frames: shorter inputs are zero-padded, longer ones truncated
WINDOW_SIZE = 1  # Default sliding-window length in seconds used by classify_audio
HOP_SIZE = 1   # Default step between window starts in seconds; equal to WINDOW_SIZE, so default windows do not overlap

# Function to extract MFCC features
def extract_features(file_path):
    """Load an audio file and return its MFCC matrix at a fixed width.

    The file is loaded at ``SAMPLE_RATE`` and ``NUM_MFCC`` coefficients are
    computed per frame. The frame axis is then forced to exactly ``MAX_LEN``
    columns: shorter results are right-padded with zeros, longer ones are cut.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The MFCC array with ``MAX_LEN`` columns along its second axis
        (``(NUM_MFCC, MAX_LEN)`` for a mono signal).
    """
    signal, rate = librosa.load(file_path, sr=SAMPLE_RATE)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=NUM_MFCC)

    n_frames = coeffs.shape[1]
    if n_frames >= MAX_LEN:
        # Enough frames already: keep only the first MAX_LEN of them.
        return coeffs[:, :MAX_LEN]

    # Too short: append zero-valued frames on the right.
    missing = MAX_LEN - n_frames
    return np.pad(coeffs, ((0, 0), (0, missing)), mode='constant')

# Load dataset
def load_data(dataset_path):
    """Load MFCC features and class labels for every ``.wav`` file in a folder.

    The class label is derived from the file name: the ``.wav`` extension and
    an optional trailing space-plus-digits suffix are removed, so
    ``Debris Wood 02.wav`` is labelled ``Debris Wood``.

    Directory entries are visited in sorted order. ``os.listdir`` returns
    names in arbitrary order, which would make the sample order (and therefore
    any seeded train/test split built on top of it) vary between machines.

    Args:
        dataset_path: Directory that directly contains the ``.wav`` files.
            Sub-directories are not searched.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays holding one feature
        matrix from ``extract_features`` and one label string per file.

    Raises:
        ValueError: If the directory contains no matching ``.wav`` file.
    """
    # Regex pattern to extract class name from filename
    pattern = re.compile(r'^(.*?)(?: \d+)?\.wav$')

    features = []
    labels = []

    for file_name in sorted(os.listdir(dataset_path)):
        if not file_name.endswith('.wav'):
            continue
        match = pattern.match(file_name)
        if not match:
            continue

        mfccs = extract_features(os.path.join(dataset_path, file_name))
        features.append(mfccs)
        labels.append(match.group(1))  # Class name without the numeric suffix

    # features and labels always grow together, so one check covers both.
    if not features:
        raise ValueError("No data found. Ensure the dataset path is correct and contains .wav files.")

    return np.array(features), np.array(labels)

# Load data
# NOTE(review): this whole section (load, train, save, evaluate) sits at
# module top level outside the __main__ guard, so it also runs on import.
dataset_path = 'dataset'
X, y = load_data(dataset_path)

# Encode labels: class-name strings -> integer ids -> one-hot rows
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = tf.keras.utils.to_categorical(y_encoded)

# Save LabelEncoder
# Only the array of class names is written; the __main__ section rebuilds a
# LabelEncoder from this file before classifying.
np.save('label_encoder.npy', label_encoder.classes_)

# Split data: 80% train / 20% test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

# Build model
# Two Conv2D + MaxPooling2D stages over the (NUM_MFCC, MAX_LEN, 1) input,
# then a dense head with dropout and one softmax output per distinct class.
model = Sequential([
    tf.keras.layers.Input(shape=(NUM_MFCC, MAX_LEN, 1)),
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])

# Adjust learning rate if necessary
from tensorflow.keras.optimizers import Adam

learning_rate = 0.0001  # Adjust as necessary
optimizer = Adam(learning_rate=learning_rate)
# categorical_crossentropy matches the one-hot labels produced above.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Reshape data for the model
# Appends a trailing channel axis so each sample matches the Input shape.
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Train model
# NOTE(review): the test split doubles as validation data here, so there is
# no separate held-out set for the evaluation below.
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))

# Save model
model.save('sound_classification_model.h5')

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy}")

# Function to classify audio in sliding windows with overlapping handling
def classify_audio(file_path, model, label_encoder, window_size=WINDOW_SIZE, hop_size=HOP_SIZE):
    """Classify an audio file one sliding window at a time.

    The file is loaded at ``SAMPLE_RATE`` and cut into windows of
    ``window_size`` seconds whose starts are ``hop_size`` seconds apart. A
    window that overlaps a window already reported is skipped, so the
    returned detections never overlap in time. Skipped windows are rejected
    before any feature extraction or prediction is done for them.

    Each remaining window is converted to MFCCs, padded or truncated to
    ``MAX_LEN`` frames (the same fixed width ``extract_features`` produces),
    and passed to ``model.predict`` as a batch of one.

    Args:
        file_path: Path of the audio file to analyse.
        model: Object whose ``predict`` accepts an array of shape
            ``(1, NUM_MFCC, MAX_LEN, 1)`` and returns per-class scores.
        label_encoder: Object whose ``inverse_transform`` maps predicted
            class indices back to labels.
        window_size: Window length in seconds.
        hop_size: Distance between consecutive window starts in seconds.

    Returns:
        A list of ``(start_time_seconds, class_label)`` tuples in
        chronological order. The list is empty when the audio is shorter
        than one window; a trailing partial window is never classified.

    Raises:
        ValueError: If ``window_size`` or ``hop_size`` amounts to less than
            one sample at the loaded sampling rate.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    window_samples = int(window_size * sr)
    hop_samples = int(hop_size * sr)
    if window_samples <= 0 or hop_samples <= 0:
        raise ValueError("window_size and hop_size must each span at least one sample.")

    results = []
    # Sample index at which the most recently reported window ends. Starts
    # only increase and all windows share one length, so a new window
    # overlaps some reported window exactly when it starts before this index.
    covered_until = 0

    for start in range(0, len(y) - window_samples + 1, hop_samples):
        if start < covered_until:
            continue  # Overlaps a reported window: skip before doing any work

        end = start + window_samples
        mfccs = librosa.feature.mfcc(y=y[start:end], sr=sr, n_mfcc=NUM_MFCC)

        # Pad or truncate MFCCs
        if mfccs.shape[1] < MAX_LEN:
            padding = MAX_LEN - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, padding)), mode='constant')
        else:
            mfccs = mfccs[:, :MAX_LEN]

        # Add the batch axis (front) and the channel axis (back).
        mfccs = np.expand_dims(mfccs, axis=0)
        mfccs = np.expand_dims(mfccs, axis=-1)

        prediction = model.predict(mfccs)
        predicted_class = np.argmax(prediction, axis=1)
        class_label = label_encoder.inverse_transform(predicted_class)[0]

        results.append((start / sr, class_label))
        covered_until = end

    return results

# Example usage
if __name__ == "__main__":
    def load_model_and_encoder(model_path, label_encoder_path):
        """Restore a saved model and a LabelEncoder for its class names.

        Args:
            model_path: Path of the saved Keras model file.
            label_encoder_path: Path of the ``.npy`` file holding the array
                of class names.

        Returns:
            A ``(model, label_encoder)`` tuple.
        """
        restored_model = tf.keras.models.load_model(model_path)
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects; only point this at label files you produced yourself.
        saved_classes = np.load(label_encoder_path, allow_pickle=True)
        # Assigning classes_ directly stands in for fitting the encoder.
        encoder = LabelEncoder()
        encoder.classes_ = saved_classes
        return restored_model, encoder

    # Files written by the training section of this script, plus a sample clip.
    audio_path = 'dataset/Debris Wood 02.wav'
    label_encoder_path = 'label_encoder.npy'
    model_path = 'sound_classification_model.h5'

    model, label_encoder = load_model_and_encoder(model_path, label_encoder_path)

    sound_identifications = classify_audio(audio_path, model, label_encoder)

    # One line per detection: window start time and predicted class.
    for time, label in sound_identifications:
        print(f'[{time:.2f} seconds] Class: {label}')