from functools import lru_cache
from typing import Any, List, Optional

import numpy
import scipy

from facefusion.ffmpeg import read_audio_buffer
from facefusion.filesystem import is_audio
from facefusion.typing import Audio, AudioFrame, Fps, MelFilterBank, Spectrogram
from facefusion.voice_extractor import batch_extract_voice
@lru_cache(maxsize = None)
def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Cached variant of read_audio(): lru_cache was imported but never applied,
	# so every per-frame lookup re-decoded and re-transformed the whole file.
	# Caching keyed on (audio_path, fps) makes repeated lookups reuse the result.
	return read_audio(audio_path, fps)
def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Decode an audio file at 48 kHz stereo and turn it into per-video-frame
	# mel spectrogram slices; None when the path is not a readable audio file.
	sample_rate = 48000
	channel_total = 2

	if not is_audio(audio_path):
		return None
	raw_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
	samples = numpy.frombuffer(raw_buffer, dtype = numpy.int16).reshape(-1, 2)
	prepared_audio = prepare_audio(samples)
	return extract_audio_frames(create_spectrogram(prepared_audio), fps)
@lru_cache(maxsize = None)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Cached variant of read_voice(): lru_cache was imported but never applied,
	# so every per-frame lookup repeated the expensive voice extraction.
	# Caching keyed on (audio_path, fps) makes repeated lookups reuse the result.
	return read_voice(audio_path, fps)
def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Like read_audio(), but isolates the voice track in overlapping chunks
	# before building the spectrogram; None when the path is not audio.
	sample_rate = 48000
	channel_total = 2
	chunk_size = 1024 * 240
	step_size = 1024 * 180

	if not is_audio(audio_path):
		return None
	raw_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
	samples = numpy.frombuffer(raw_buffer, dtype = numpy.int16).reshape(-1, 2)
	voice_audio = batch_extract_voice(samples, chunk_size, step_size)
	prepared_voice = prepare_voice(voice_audio)
	return extract_audio_frames(create_spectrogram(prepared_voice), fps)
def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
	# Look up one audio frame by video frame number; None when the path is
	# not audio or the frame number is out of range.
	if not is_audio(audio_path):
		return None
	audio_frames = read_static_audio(audio_path, fps)
	if 0 <= frame_number < len(audio_frames):
		return audio_frames[frame_number]
	return None
def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
	# Look up one voice frame by video frame number; None when the path is
	# not audio or the frame number is out of range.
	if not is_audio(audio_path):
		return None
	voice_frames = read_static_voice(audio_path, fps)
	if 0 <= frame_number < len(voice_frames):
		return voice_frames[frame_number]
	return None
def create_empty_audio_frame() -> AudioFrame:
	# A silent placeholder frame: 80 mel filters by 16 spectrogram steps,
	# matching the slices produced by extract_audio_frames().
	mel_filter_total = 80
	step_size = 16
	return numpy.zeros((mel_filter_total, step_size), dtype = numpy.int16)
def prepare_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
	# Collapse multi-channel audio to mono, peak-normalize to [-1, 1] and
	# apply a 0.97 pre-emphasis filter.
	if audio.ndim > 1:
		audio = numpy.mean(audio, axis = 1)
	# Fix: guard against an all-silent signal — dividing by a zero peak
	# produced NaN/inf (with a runtime warning) in the original code.
	peak = numpy.max(numpy.abs(audio), axis = 0)
	if peak > 0:
		audio = audio / peak
	audio = scipy.signal.lfilter([ 1.0, -0.97 ], [ 1.0 ], audio)
	return audio
def prepare_voice(audio : numpy.ndarray[Any, Any]) -> Audio:
	# Downsample 48 kHz voice audio to 16 kHz, then normalize and pre-emphasise
	# via prepare_audio().
	sample_rate = 48000
	resample_rate = 16000
	target_length = int(len(audio) * resample_rate / sample_rate)
	resampled_audio = scipy.signal.resample(audio, target_length)
	return prepare_audio(resampled_audio)
def convert_hertz_to_mel(hertz : float) -> float:
	# Map a frequency in hertz onto the mel scale (O'Shaughnessy formula).
	return 2595 * numpy.log10(hertz / 700 + 1)
def convert_mel_to_hertz(mel : numpy.ndarray[Any, Any]) -> numpy.ndarray[Any, Any]:
	# Invert the mel mapping back to frequencies in hertz.
	return (numpy.power(10, mel / 2595) - 1) * 700
def create_mel_filter_bank() -> MelFilterBank:
	# Build 80 triangular mel filters spanning 55-7600 Hz for a 16 kHz signal,
	# each filter covering one column range of the 401-bin spectrum.
	mel_filter_total = 80
	mel_bin_total = 800
	sample_rate = 16000
	min_frequency = 55.0
	max_frequency = 7600.0
	mel_points = numpy.linspace(convert_hertz_to_mel(min_frequency), convert_hertz_to_mel(max_frequency), mel_filter_total + 2)
	bin_indices = numpy.floor((mel_bin_total + 1) * convert_mel_to_hertz(mel_points) / sample_rate).astype(numpy.int16)
	mel_filter_bank = numpy.zeros((mel_filter_total, mel_bin_total // 2 + 1))

	# Consecutive index pairs delimit each triangular window.
	for filter_index, (start, end) in enumerate(zip(bin_indices[:-2], bin_indices[1:-1])):
		mel_filter_bank[filter_index, start:end] = scipy.signal.windows.triang(end - start)
	return mel_filter_bank
def create_spectrogram(audio : Audio) -> Spectrogram:
	# Short-time Fourier transform of the prepared audio, projected onto the
	# mel filter bank (magnitudes only).
	mel_bin_total = 800
	mel_bin_overlap = 600
	_, _, stft_matrix = scipy.signal.stft(audio, nperseg = mel_bin_total, nfft = mel_bin_total, noverlap = mel_bin_overlap)
	return create_mel_filter_bank() @ numpy.abs(stft_matrix)
def extract_audio_frames(spectrogram : Spectrogram, fps : Fps) -> List[AudioFrame]:
	# Slice the spectrogram into fixed-width windows, one trailing window of
	# 16 columns per video frame position.
	mel_filter_total = 80
	step_size = 16
	frame_indices = numpy.arange(0, spectrogram.shape[1], mel_filter_total / fps).astype(numpy.int16)
	frame_indices = frame_indices[frame_indices >= step_size]
	return [ spectrogram[:, max(0, frame_index - step_size):frame_index] for frame_index in frame_indices ]