virtual_character2 / src /utils /audio_util.py
zejunyang
init
2e4e201
raw
history blame
982 Bytes
import os
import math
import librosa
import numpy as np
from transformers import Wav2Vec2FeatureExtractor
class DataProcessor:
    """Turn an audio file into the input values produced by a wav2vec feature extractor."""

    def __init__(self, sampling_rate, wav2vec_model_path):
        """Load the feature extractor and remember the target sampling rate.

        Args:
            sampling_rate: Rate passed to ``librosa.load`` as ``sr`` when audio is read.
            wav2vec_model_path: Location given to
                ``Wav2Vec2FeatureExtractor.from_pretrained``; ``local_files_only=True``
                is set, so the extractor is resolved from local files only.
        """
        self._processor = Wav2Vec2FeatureExtractor.from_pretrained(
            wav2vec_model_path,
            local_files_only=True,
        )
        self._sampling_rate = sampling_rate

    def extract_feature(self, audio_path):
        """Load ``audio_path`` and return the extractor's squeezed ``input_values``.

        Args:
            audio_path: Audio file handed to ``librosa.load``.

        Returns:
            The ``input_values`` of the feature extractor output after
            ``np.squeeze`` has removed every length-1 axis.
        """
        waveform, loaded_rate = librosa.load(audio_path, sr=self._sampling_rate)
        # The rate reported by librosa (not self._sampling_rate directly) is
        # what gets forwarded to the feature extractor.
        extracted = self._processor(waveform, sampling_rate=loaded_rate)
        return np.squeeze(extracted.input_values)
def prepare_audio_feature(wav_file, fps=30, sampling_rate=16000, wav2vec_model_path=None):
    """Extract wav2vec input values from an audio file and derive its frame count.

    Args:
        wav_file: Audio file forwarded to ``DataProcessor.extract_feature``.
        fps: Frames per second used to convert the audio length into ``seq_len``.
        sampling_rate: Rate passed to ``DataProcessor`` and used as the
            samples-per-second divisor when computing ``seq_len``.
        wav2vec_model_path: Location handed to ``DataProcessor`` for loading
            the feature extractor.

    Returns:
        A dict with two keys:
            ``"audio_feature"``: the value returned by ``extract_feature``.
            ``"seq_len"``: ``ceil(len(audio_feature) * fps / sampling_rate)``.
    """
    data_preprocessor = DataProcessor(sampling_rate, wav2vec_model_path)
    input_value = data_preprocessor.extract_feature(wav_file)

    # Multiply before dividing. With integer fps and sampling_rate the product
    # is an exact int and only the single division is rounded, so an exactly
    # integral frame count stays integral. Dividing first amplified the
    # quotient's rounding error by fps: 132800 samples at 16000 Hz and 30 fps
    # evaluated to 249.00000000000003 and was ceiled to 250 instead of 249.
    seq_len = math.ceil(len(input_value) * fps / sampling_rate)

    return {
        "audio_feature": input_value,
        "seq_len": seq_len
    }