from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import librosa

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
model.config.forced_decoder_ids = None


def transcribe(audio: np.ndarray, sr: int):
    # Whisper's feature extractor expects 16 kHz mono audio; resample if needed.
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000
    input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    # Skip special tokens so markers like <|startoftranscript|> don't leak into the text.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcription = processor.tokenizer.normalize(transcription[0])
    return transcription


def audio_len(audio: np.ndarray, sr: int):
    return len(audio) / sr


def rms_energy(audio: np.ndarray):
    return np.sqrt(np.mean(audio ** 2))


def zero_crossing_rate(audio: np.ndarray):
    # |diff(sign)| equals 2 at each crossing, so this is twice the conventional ZCR.
    return np.mean(np.abs(np.diff(np.sign(audio))))


def spectral_centroid(audio: np.ndarray, sr: int):
    return librosa.feature.spectral_centroid(y=audio, sr=sr).mean()


def spectral_bandwidth(audio: np.ndarray, sr: int):
    return librosa.feature.spectral_bandwidth(y=audio, sr=sr).mean()


def mfccs(audio: np.ndarray, sr: int, n_mfcc: int = 13):
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).mean(axis=1)


def chroma_features(audio: np.ndarray, sr: int):
    return librosa.feature.chroma_stft(y=audio, sr=sr).mean(axis=1)


def signal_to_noise_ratio(audio: np.ndarray):
    # Crude global estimate: for zero-mean audio, mean power and variance are
    # nearly equal, so this stays close to 0 dB. Use a frame-based noise
    # estimate if a more meaningful SNR is required.
    signal_power = np.mean(audio ** 2)
    noise_power = np.var(audio)
    return 10 * np.log10(signal_power / noise_power)


def tempo(audio: np.ndarray, sr: int):
    onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
    return librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]


def silence_ratio(audio: np.ndarray, threshold: float = 0.01):
    # Fraction of samples whose absolute amplitude falls below the threshold.
    return np.mean(np.abs(audio) < threshold)


def estimate_audio_quality(audio: np.ndarray, sr: int):
    # Compute features
    snr = signal_to_noise_ratio(audio)
    rms = rms_energy(audio)
    silence = silence_ratio(audio)
    centroid = spectral_centroid(audio, sr)
    bandwidth = spectral_bandwidth(audio, sr)
    zcr = zero_crossing_rate(audio)

    # Normalize features to [0, 1] (example normalization, adjust as necessary)
    snr_norm = np.clip(snr / 50.0, 0, 1)                   # assuming 50 dB is very good
    rms_norm = np.clip(rms / np.max(np.abs(audio)), 0, 1)  # normalizing by peak amplitude
    silence_norm = 1 - silence                              # less silence is better
    spectral_centroid_norm = np.clip(centroid / sr, 0, 1)
    spectral_bandwidth_norm = np.clip(bandwidth / (sr / 2), 0, 1)
    zcr_norm = np.clip(zcr / 0.1, 0, 1)                    # assuming 0.1 as an acceptable ZCR

    features = {
        "snr_norm": snr_norm,
        "rms_norm": rms_norm,
        "silence_norm": silence_norm,
        "spectral_centroid_norm": spectral_centroid_norm,
        "spectral_bandwidth_norm": spectral_bandwidth_norm,
        "zcr_norm": zcr_norm,
    }

    # Feature weights (sum to 1)
    weights = {
        "snr": 0.25,
        "rms": 0.2,
        "silence": 0.2,
        "spectral_centroid": 0.1,
        "spectral_bandwidth": 0.15,
        "zcr": 0.1,
    }

    # Overall quality score is the weighted sum of the normalized features
    quality_score = (
        weights["snr"] * snr_norm
        + weights["rms"] * rms_norm
        + weights["silence"] * silence_norm
        + weights["spectral_centroid"] * spectral_centroid_norm
        + weights["spectral_bandwidth"] * spectral_bandwidth_norm
        + weights["zcr"] * zcr_norm
    )

    # Interpret the score
    if quality_score > 0.85:
        quality = "Excellent"
    elif quality_score > 0.7:
        quality = "Good"
    elif quality_score > 0.5:
        quality = "Fair"
    else:
        quality = "Poor"

    return quality, round(quality_score, 3), features
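

# Minimal usage sketch. Assumptions not in the original script: a local audio
# file named "example.wav" and the printed field layout are illustrative only.
if __name__ == "__main__":
    # librosa loads the file as a mono float signal and reports its native sample rate
    audio, sr = librosa.load("example.wav", sr=None)

    print(f"duration: {audio_len(audio, sr):.2f} s")
    print(f"transcription: {transcribe(audio, sr)}")

    quality, score, features = estimate_audio_quality(audio, sr)
    print(f"quality: {quality} (score={score})")
    for name, value in features.items():
        print(f"  {name}: {value:.3f}")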