File size: 1,665 Bytes
d894230 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import librosa
import numpy as np
from typing import Dict, Tuple
class AudioProcessor:
    """Extract summary acoustic features (MFCC, pitch, energy) from an audio file.

    Uses librosa for loading and feature extraction. All features are reduced
    to fixed-size summaries (means / per-feature statistics) so the output is
    independent of the clip's duration.
    """

    def __init__(self, sample_rate: int = 16000, n_mfcc: int = 13, n_mels: int = 128):
        """Configure extraction parameters.

        Args:
            sample_rate: Target sampling rate in Hz; audio is resampled to this on load.
            n_mfcc: Number of MFCC coefficients to extract.
            n_mels: Number of mel bands. NOTE(review): not used by any current
                method — presumably reserved for a mel-spectrogram feature; confirm.
        """
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.n_mels = n_mels

    def process_audio(self, audio_path: str) -> Tuple[np.ndarray, Dict]:
        """Load an audio file and extract its feature summary.

        Args:
            audio_path: Path to an audio file readable by librosa.

        Returns:
            Tuple of (waveform resampled to ``self.sample_rate``, feature dict
            with keys ``'mfcc'``, ``'pitch'``, ``'energy'``).
        """
        waveform, sr = librosa.load(audio_path, sr=self.sample_rate)
        features = {
            'mfcc': self._extract_mfcc(waveform),
            'pitch': self._extract_pitch(waveform),
            'energy': self._extract_energy(waveform)
        }
        return waveform, features

    def _extract_mfcc(self, waveform: np.ndarray) -> np.ndarray:
        """Return the per-coefficient time-average MFCC vector (shape ``(n_mfcc,)``)."""
        mfccs = librosa.feature.mfcc(
            y=waveform,
            sr=self.sample_rate,
            n_mfcc=self.n_mfcc
        )
        # Average over time frames (axis 1) to get a duration-independent vector.
        return mfccs.mean(axis=1)

    def _extract_pitch(self, waveform: np.ndarray) -> Dict:
        """Return mean/std/max/min of the F0 contour estimated by pYIN.

        pYIN marks unvoiced frames as NaN; statistics are computed over voiced
        frames only. For clips with no voiced frames at all (e.g. silence or
        pure noise), all statistics are 0.0 instead of NaN.
        """
        f0, voiced_flag, voiced_probs = librosa.pyin(
            waveform,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sample_rate
        )
        voiced = f0[~np.isnan(f0)]
        if voiced.size == 0:
            # All frames unvoiced: nanmean/nanmax/nanmin would warn and yield
            # NaN, which poisons downstream math — return zeros instead.
            return {'mean': 0.0, 'std': 0.0, 'max': 0.0, 'min': 0.0}
        return {
            'mean': float(voiced.mean()),
            'std': float(voiced.std()),
            'max': float(voiced.max()),
            'min': float(voiced.min())
        }

    def _extract_energy(self, waveform: np.ndarray) -> Dict:
        """Return mean/std/max/min of the frame-wise RMS energy."""
        rms = librosa.feature.rms(y=waveform)[0]
        return {
            'mean': float(np.mean(rms)),
            'std': float(np.std(rms)),
            'max': float(np.max(rms)),
            'min': float(np.min(rms))
        }