from typing import Optional, Any, List
from functools import lru_cache
import numpy
import scipy.signal

from facefusion.filesystem import is_audio
from facefusion.ffmpeg import read_audio_buffer
from facefusion.typing import Fps, Audio, AudioFrame, Spectrogram, MelFilterBank
from facefusion.voice_extractor import batch_extract_voice


@lru_cache(maxsize = 128)
def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
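	# memoized variant of read_audio() so repeated calls with the same path and fps reuse the decoded frames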
	return read_audio(audio_path, fps)


def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
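	# decode the audio with ffmpeg and convert it into mel spectrogram frames aligned to the video fps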
	sample_rate = 48000
	channel_total = 2

	if is_audio(audio_path):
		audio_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
		audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, channel_total)
		audio = prepare_audio(audio)
		spectrogram = create_spectrogram(audio)
		audio_frames = extract_audio_frames(spectrogram, fps)
		return audio_frames
	return None


@lru_cache(maxsize = 128)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
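	# memoized variant of read_voice()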
	return read_voice(audio_path, fps)


def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
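	# like read_audio(), but isolates the voice before building the spectrogram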
	sample_rate = 48000
	channel_total = 2
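	# the voice extractor consumes overlapping chunks; the step size sets how far each chunk advances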
	chunk_size = 1024 * 240
	step_size = 1024 * 180

	if is_audio(audio_path):
		audio_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
		audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, channel_total)
		audio = batch_extract_voice(audio, chunk_size, step_size)
		audio = prepare_voice(audio)
		spectrogram = create_spectrogram(audio)
		audio_frames = extract_audio_frames(spectrogram, fps)
		return audio_frames
	return None


def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
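	# return the cached audio frame for a given video frame number, or None when out of range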
	if is_audio(audio_path):
		audio_frames = read_static_audio(audio_path, fps)
		if frame_number in range(len(audio_frames)):
			return audio_frames[frame_number]
	return None


def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
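	# voice counterpart of get_audio_frame()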
	if is_audio(audio_path):
		voice_frames = read_static_voice(audio_path, fps)
		if frame_number in range(len(voice_frames)):
			return voice_frames[frame_number]
	return None


def create_empty_audio_frame() -> AudioFrame:
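	# silent placeholder frame matching the shape produced by extract_audio_frames()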
	mel_filter_total = 80
	step_size = 16
	audio_frame = numpy.zeros((mel_filter_total, step_size), dtype = numpy.int16)
	return audio_frame


def prepare_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
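	# mix down to mono, peak-normalize, then apply a pre-emphasis filter to boost high frequencies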
	if audio.ndim > 1:
		audio = numpy.mean(audio, axis = 1)
	peak = numpy.max(numpy.abs(audio), axis = 0)
	if peak > 0: # guard against division by zero on silent audio
		audio = audio / peak
	audio = scipy.signal.lfilter([ 1.0, -0.97 ], [ 1.0 ], audio)
	return audio


def prepare_voice(audio : numpy.ndarray[Any, Any]) -> Audio:
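	# downsample from 48 kHz to 16 kHz before the usual preparation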
	sample_rate = 48000
	resample_rate = 16000

	audio = scipy.signal.resample(audio, int(len(audio) * resample_rate / sample_rate))
	audio = prepare_audio(audio)
	return audio


def convert_hertz_to_mel(hertz : float) -> float:
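	# HTK-style mel scale conversion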
	return 2595 * numpy.log10(1 + hertz / 700)


def convert_mel_to_hertz(mel : numpy.ndarray[Any, Any]) -> numpy.ndarray[Any, Any]:
	return 700 * (10 ** (mel / 2595) - 1)


def create_mel_filter_bank() -> MelFilterBank:
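	# build 80 triangular filters between 55 Hz and 7600 Hz over the positive STFT frequency bins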
	mel_filter_total = 80
	mel_bin_total = 800
	sample_rate = 16000
	min_frequency = 55.0
	max_frequency = 7600.0
	mel_filter_bank = numpy.zeros((mel_filter_total, mel_bin_total // 2 + 1))
	mel_frequency_range = numpy.linspace(convert_hertz_to_mel(min_frequency), convert_hertz_to_mel(max_frequency), mel_filter_total + 2)
	indices = numpy.floor((mel_bin_total + 1) * convert_mel_to_hertz(mel_frequency_range) / sample_rate).astype(numpy.int16)

	for index in range(mel_filter_total):
		start = indices[index]
		end = indices[index + 1]
		mel_filter_bank[index, start:end] = scipy.signal.windows.triang(end - start)
	return mel_filter_bank


def create_spectrogram(audio : Audio) -> Spectrogram:
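	# short-time Fourier transform with a 800 sample window and a 200 sample hop, projected onto the mel filter bank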
	mel_bin_total = 800
	mel_bin_overlap = 600
	mel_filter_bank = create_mel_filter_bank()
	spectrogram = scipy.signal.stft(audio, nperseg = mel_bin_total, nfft = mel_bin_total, noverlap = mel_bin_overlap)[2]
	spectrogram = numpy.dot(mel_filter_bank, numpy.abs(spectrogram))
	return spectrogram


def extract_audio_frames(spectrogram : Spectrogram, fps : Fps) -> List[AudioFrame]:
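	# slice the spectrogram into fixed 16 column windows, one window per target video frame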
	mel_filter_total = 80
	step_size = 16
	audio_frames = []
	# use int32 indices to avoid overflow on long inputs
	indices = numpy.arange(0, spectrogram.shape[1], mel_filter_total / fps).astype(numpy.int32)
	indices = indices[indices >= step_size]

	for index in indices:
		start = max(0, index - step_size)
		audio_frames.append(spectrogram[:, start:index])
	return audio_frames
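

# Example usage (hypothetical path, assuming a 25 fps target video):
#
#   audio_frames = read_static_audio('source.wav', 25.0)
#   audio_frame = get_audio_frame('source.wav', 25.0, frame_number = 10)
#   fallback_frame = create_empty_audio_frame()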