Ocillus committed
Commit 4133ed9 · verified · 1 Parent(s): a2f9082

Upload 6 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Emmi[[:space:]]Elliott[[:space:]]-[[:space:]]Face[[:space:]]to[[:space:]]Face.wav filter=lfs diff=lfs merge=lfs -text
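
The [[:space:]] escapes are how git lfs track encodes spaces in a .gitattributes pattern; a rule like this one is typically produced by running:

    git lfs track "Emmi Elliott - Face to Face.wav"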
Emmi Elliott - Face to Face.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9f77eb947cff98dce0ad0353faed3ac48566ac77e97ab94117131b55be2bf3d
+ size 2911547
Estella.py ADDED
@@ -0,0 +1,165 @@
+ import os
+ import numpy as np
+ import librosa
+ import tensorflow as tf
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
+ from tensorflow.keras.optimizers import Adam
+ from sklearn.preprocessing import LabelEncoder
+ from sklearn.model_selection import train_test_split
+ import re
+
+ # Constants
+ SAMPLE_RATE = 44100  # Sample rate in Hz
+ NUM_MFCC = 13        # Number of MFCC coefficients per frame
+ MAX_LEN = 1000       # Fixed number of MFCC frames per example
+ WINDOW_SIZE = 1      # Window size in seconds
+ HOP_SIZE = 1         # Hop size in seconds
+
+ # Function to extract MFCC features
+ def extract_features(file_path):
+     y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
+     mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=NUM_MFCC)
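+     # Note: with librosa's default hop_length of 512 at sr=44100, this yields
+     # roughly 86 frames per second, so MAX_LEN=1000 covers about 11.6 s of audio.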
+
+     # Pad or truncate MFCCs to a fixed length
+     if mfccs.shape[1] < MAX_LEN:
+         padding = MAX_LEN - mfccs.shape[1]
+         mfccs = np.pad(mfccs, ((0, 0), (0, padding)), mode='constant')
+     else:
+         mfccs = mfccs[:, :MAX_LEN]
+
+     return mfccs
+
+ # Load dataset
+ def load_data(dataset_path):
+     features = []
+     labels = []
+
+     # Regex pattern to extract the class name from a filename such as "Debris Wood 02.wav"
+     pattern = re.compile(r'^(.*?)(?: \d+)?\.wav$')
+
+     for file_name in os.listdir(dataset_path):
+         if file_name.endswith('.wav'):
+             file_path = os.path.join(dataset_path, file_name)
+             match = pattern.match(file_name)
+             if match:
+                 label = match.group(1)  # Class name without the trailing number
+                 mfccs = extract_features(file_path)
+                 features.append(mfccs)
+                 labels.append(label)
+
+     if len(features) == 0 or len(labels) == 0:
+         raise ValueError("No data found. Ensure the dataset path is correct and contains .wav files.")
+
+     return np.array(features), np.array(labels)
+
+ # Load data
+ dataset_path = 'dataset'
+ X, y = load_data(dataset_path)
+
+ # Encode labels
+ label_encoder = LabelEncoder()
+ y_encoded = label_encoder.fit_transform(y)
+ y_categorical = tf.keras.utils.to_categorical(y_encoded)
+
+ # Save the LabelEncoder classes for reuse at inference time
+ np.save('label_encoder.npy', label_encoder.classes_)
+
+ # Split data
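+ # (Suggestion: with only a handful of clips per class, passing stratify=y_encoded
+ # to train_test_split would keep class proportions balanced across the split.)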
+ X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)
+
+ # Build model
+ model = Sequential([
+     tf.keras.layers.Input(shape=(NUM_MFCC, MAX_LEN, 1)),
+     Conv2D(32, kernel_size=(3, 3), activation='relu'),
+     MaxPooling2D(pool_size=(2, 2)),
+     Conv2D(64, kernel_size=(3, 3), activation='relu'),
+     MaxPooling2D(pool_size=(2, 2)),
+     Flatten(),
+     Dense(128, activation='relu'),
+     Dropout(0.5),
+     Dense(len(np.unique(y_encoded)), activation='softmax')
+ ])
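+ # Rough shape math with 'valid' padding: (13, 1000, 1) -> Conv2D (11, 998, 32)
+ # -> pool (5, 499, 32) -> Conv2D (3, 497, 64) -> pool (1, 248, 64) -> Flatten 15872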
+
+ # Compile with a reduced learning rate (adjust as necessary)
+ learning_rate = 0.0001
+ optimizer = Adam(learning_rate=learning_rate)
+ model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
+
+ # Add a channel dimension for the Conv2D layers
+ X_train = np.expand_dims(X_train, axis=-1)
+ X_test = np.expand_dims(X_test, axis=-1)
+
+ # Train model
+ history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))
+
+ # Save model
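+ # (HDF5 .h5 is Keras's legacy save format; recent Keras versions prefer .keras.)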
+ model.save('sound_classification_model.h5')
+
+ # Evaluate model
+ loss, accuracy = model.evaluate(X_test, y_test)
+ print(f"Test accuracy: {accuracy}")
+
+ # Function to classify audio in sliding windows, suppressing overlapping detections
+ def classify_audio(file_path, model, label_encoder, window_size=WINDOW_SIZE, hop_size=HOP_SIZE):
+     y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
+     total_duration = librosa.get_duration(y=y, sr=sr)
+     window_samples = int(window_size * sr)
+     hop_samples = int(hop_size * sr)
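+     # With the defaults (window_size = hop_size = 1 s at 44100 Hz), each window is
+     # 44100 samples and consecutive windows abut without overlapping.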
+
+     results = []
+     detected_windows = []  # List to keep track of detected windows
+
+     for start in range(0, len(y) - window_samples + 1, hop_samples):
+         end = start + window_samples
+         segment = y[start:end]
+         mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=NUM_MFCC)
+
+         # Pad or truncate MFCCs to the length the model was trained on
+         if mfccs.shape[1] < MAX_LEN:
+             padding = MAX_LEN - mfccs.shape[1]
+             mfccs = np.pad(mfccs, ((0, 0), (0, padding)), mode='constant')
+         else:
+             mfccs = mfccs[:, :MAX_LEN]
+
+         mfccs = np.expand_dims(mfccs, axis=0)   # Batch dimension
+         mfccs = np.expand_dims(mfccs, axis=-1)  # Channel dimension
+         prediction = model.predict(mfccs)
+         predicted_class = np.argmax(prediction, axis=1)
+         time = start / sr
+         class_label = label_encoder.inverse_transform(predicted_class)[0]
+
+         # Check for overlaps and add detected regions
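+         # Two half-open intervals [start, end) and [det_start, det_end) overlap
+         # iff start < det_end and end > det_start; with hop_size == window_size
+         # the windows only abut, so nothing is suppressed here.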
+         detected = False
+         for (det_start, det_end, det_label) in detected_windows:
+             if start < det_end and end > det_start:  # Overlapping condition
+                 detected = True
+                 break
+
+         if not detected:
+             results.append((time, class_label))
+             detected_windows.append((start, end, class_label))
+
+     return results
+
+ # Example usage
+ if __name__ == "__main__":
+     # Load model and label encoder
+     def load_model_and_encoder(model_path, label_encoder_path):
+         model = tf.keras.models.load_model(model_path)
+         classes = np.load(label_encoder_path, allow_pickle=True)
+         label_encoder = LabelEncoder()
+         label_encoder.classes_ = classes
+         return model, label_encoder
+
+     model_path = 'sound_classification_model.h5'
+     label_encoder_path = 'label_encoder.npy'
+     audio_path = 'dataset/Debris Wood 02.wav'
+
+     model, label_encoder = load_model_and_encoder(model_path, label_encoder_path)
+
+     sound_identifications = classify_audio(audio_path, model, label_encoder)
+
+     for time, label in sound_identifications:
+         print(f'[{time:.2f} seconds] Class: {label}')
inference.py ADDED
@@ -0,0 +1,21 @@
+ import numpy as np
+ import tensorflow as tf
+ from sklearn.preprocessing import LabelEncoder
+
+ # classify_audio is defined in Estella.py; note that importing it as written
+ # will also execute that script's training code
+ from Estella import classify_audio
+
+ # Example usage
+ if __name__ == "__main__":
+     # Load model and label encoder
+     def load_model_and_encoder(model_path, label_encoder_path):
+         model = tf.keras.models.load_model(model_path)
+         classes = np.load(label_encoder_path, allow_pickle=True)
+         label_encoder = LabelEncoder()
+         label_encoder.classes_ = classes
+         return model, label_encoder
+
+     model_path = 'sound_classification_model.h5'
+     label_encoder_path = 'label_encoder.npy'
+     audio_path = 'Emmi Elliott - Face to Face.wav'
+
+     model, label_encoder = load_model_and_encoder(model_path, label_encoder_path)
+
+     sound_identifications = classify_audio(audio_path, model, label_encoder)
+
+     for time, label in sound_identifications:
+         print(f'[{time:.2f} seconds] Class: {label}')
label_encoder.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c20e529c407fd1ab8124e552250df89ef7e0cf8eeb731a178f9da40c8a334b1
+ size 98072
script.sh ADDED
@@ -0,0 +1,11 @@
+ #!/bin/bash
+
+ # Work inside the dataset directory (the shebang must come first)
+ cd dataset
+
+ # Loop through all .caf files in the current directory
+ for file in *.caf; do
+     # Extract the base name of the file (without extension)
+     base="${file%.*}"
+     # Convert .caf to .wav using ffmpeg
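+     # (ffmpeg prompts before overwriting an existing .wav; adding -n would
+     # skip files that were already converted)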
+     ffmpeg -i "$file" "${base}.wav"
+ done
sound_classification_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44d94b10c7a04909c1f866e0f502ae3b0649da9d9c8e1e71fd50b9c93fd4e5f1
+ size 25801576