'''
@Project :EMO_digitalhuman
@File :wav_clip.py
@Author :juzhen.czy
@Date :2024/3/4 19:04
'''

from transformers import Wav2Vec2Model, Wav2Vec2Processor
import torch
from torch import nn
import librosa
from diffusers.models.modeling_utils import ModelMixin


class Wav2Vec(ModelMixin):
    """Frozen wav2vec 2.0 wrapper used as an audio feature extractor."""

    def __init__(self, model_path):
        super(Wav2Vec, self).__init__()
        self.processor = Wav2Vec2Processor.from_pretrained(model_path)
        self.wav2Vec = Wav2Vec2Model.from_pretrained(model_path)
        self.wav2Vec.eval()  # inference only; the encoder is never fine-tuned here

    def forward(self, x):
        # last_hidden_state is (batch, seq_len, hidden); wav2vec2-base strides its
        # convolutional front end by 320 samples, i.e. roughly 50 frames/s at 16 kHz.
        with torch.no_grad():
            return self.wav2Vec(x).last_hidden_state

    def process(self, x):
        # Normalize the raw waveform and move the tensor to the model's device
        # (ModelMixin exposes a .device property).
        return self.processor(x, sampling_rate=16000, return_tensors="pt").input_values.to(self.device)
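

# A minimal sketch, not in the original file: it shows the feature rate of the
# Wav2Vec wrapper on synthetic audio. The Hub id "facebook/wav2vec2-base-960h"
# is an assumed stand-in for the local checkpoint used elsewhere in this file.
def wav2vec_frame_rate_example():
    model = Wav2Vec("facebook/wav2vec2-base-960h")
    one_second = torch.randn(16000).numpy()  # 1 s of noise at 16 kHz
    feats = model(model.process(one_second))
    # The conv front end strides by 320 samples, so expect ~49 frames of dim 768.
    print(feats.shape)  # e.g. torch.Size([1, 49, 768])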


class AudioFeatureMapper(ModelMixin):
    """Maps a variable audio token count to a fixed one with a single linear layer."""

    def __init__(self, input_num=15, output_num=77, model_path=None):
        super(AudioFeatureMapper, self).__init__()
        # The linear layer acts on the sequence axis: input_num frames -> output_num tokens.
        self.linear = nn.Linear(input_num, output_num)
        if model_path is not None:
            self.load_state_dict(torch.load(model_path))

    def forward(self, x):
        # x: (batch, input_num, dim). Swap the last two axes so nn.Linear mixes
        # the sequence dimension, then swap back: (batch, output_num, dim).
        result = self.linear(x.permute(0, 2, 1))
        result = result.permute(0, 2, 1)
        return result
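

# A minimal sketch, not in the original file: it demonstrates the shape contract
# of AudioFeatureMapper. The hidden size 768 matches wav2vec2-base; the 15-frame
# input is the class's default and otherwise arbitrary.
def mapper_example():
    mapper = AudioFeatureMapper(input_num=15, output_num=77)
    feats = torch.randn(1, 15, 768)  # (batch, frames, dim)
    tokens = mapper(feats)
    # 77 tokens match the CLIP text length that diffusion cross-attention
    # typically expects, which appears to be why output_num defaults to 77.
    print(tokens.shape)  # torch.Size([1, 77, 768])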


def test():
    # Environment-specific paths from the original file; adjust before running.
    model_path = "/ossfs/workspace/projects/model_weights/Moore-AnimateAnyone/wav2vec2-base-960h"
    model = Wav2Vec(model_path)
    print("### model loaded ###")

    audio_path = "/ossfs/workspace/projects/Moore-AnimateAnyone-master/assets/taken_clip.wav"
    # wav2vec 2.0 expects 16 kHz mono audio, so resample at load time.
    input_audio, rate = librosa.load(audio_path, sr=16000)
    print(f"input shape: {input_audio.shape}, rate: {rate}")

    input_v = model.process(input_audio)
    out = model(input_v)
    print(f"input shape: {input_v.shape}, output shape: {out.shape}")