import os

import gradio as gr
import librosa
import numpy as np
import pyworld
import soundfile as sf
from numpy.fft import fft, ifft
from scipy.io import wavfile
from scipy.signal import filtfilt, firwin, lfilter, resample

from wsola import WSOLA


##########################
# Pitch shifting (PSOLA) #
##########################


def shift_pitch(signal, fs, f_ratio):
    """Pitch-shift *signal* with time-domain PSOLA.

    Args:
        signal: 1-D array of audio samples.
        fs: Sampling rate in Hz.
        f_ratio: Ratio applied to the number of pitch marks; see `psola`.

    Returns:
        A new array with the same length as *signal*.
    """
    peaks = find_peaks(signal, fs)
    return psola(signal, peaks, f_ratio)


def find_peaks(signal, fs, max_hz=950, min_hz=75, analysis_win_ms=40,
               max_change=1.005, min_change=0.995):
    """Locate pitch marks (one sample index per pitch period) in *signal*.

    The period is estimated per analysis window by autocorrelation, then
    peaks are picked one after another, each searched for in a narrow range
    around "previous peak + local period".

    Args:
        signal: 1-D array of audio samples.
        fs: Sampling rate in Hz.
        max_hz: Highest pitch considered, in Hz.
        min_hz: Lowest pitch considered, in Hz.
        analysis_win_ms: Length of one autocorrelation window in ms.
        max_change: Upper bound of the search range, as a factor of the
            local period.
        min_change: Lower bound of the search range, as a factor of the
            local period.

    Returns:
        Integer array of peak positions; empty when *signal* is empty.
    """
    n_samples = len(signal)
    if n_samples == 0:
        # Nothing to analyse; np.mean([]) below would otherwise yield NaN.
        return np.array([], dtype=int)

    min_period = fs // max_hz
    max_period = fs // min_hz

    # Analysis window length in samples.
    sequence = int(analysis_win_ms / 1000 * fs)
    periods = compute_periods_per_sequence(
        signal, sequence, min_period, max_period)

    # Simple hack against octave errors: assume the pitch does not vary much
    # and re-run the estimate restricted to +/-10 % around the mean period.
    mean_period = np.mean(periods)
    max_period = int(mean_period * 1.1)
    min_period = int(mean_period * 0.9)
    periods = compute_periods_per_sequence(
        signal, sequence, min_period, max_period)

    # The first peak is the maximum within roughly the first period.
    peaks = [int(np.argmax(signal[:int(periods[0] * 1.1)]))]
    while True:
        prev = peaks[-1]
        # Period estimate of the analysis window that contains `prev`.
        period = periods[prev // sequence]
        lo = prev + int(period * min_change)
        hi = prev + int(period * max_change)
        if hi >= n_samples or hi <= lo:
            break
        # Maximum near the expected location of the next peak.
        nxt = lo + int(np.argmax(signal[lo:hi]))
        if nxt <= prev:
            # A peak that does not advance would make this loop endless.
            break
        peaks.append(nxt)
    return np.array(peaks)


def compute_periods_per_sequence(signal, sequence, min_period, max_period):
    """Estimate one pitch period per analysis window via autocorrelation.

    Args:
        signal: 1-D array of audio samples.
        sequence: Window length in samples; must be positive.
        min_period: Smallest lag (in samples) considered.
        max_period: Lag (in samples) at which the search stops (exclusive).

    Returns:
        A list with one period (in samples) per window of *signal*.

    Raises:
        ValueError: If *sequence* is not positive.
    """
    if sequence <= 0:
        raise ValueError("sequence must be a positive number of samples")

    periods = []
    for offset in range(0, len(signal), sequence):
        spectrum = fft(signal[offset:offset + sequence])
        spectrum[0] = 0  # remove the DC component
        autoc = ifft(spectrum * np.conj(spectrum)).real
        search = autoc[min_period:max_period]
        if search.size:
            periods.append(min_period + int(np.argmax(search)))
        else:
            # The window (typically the short tail of the signal) has no lag
            # in the search range; reuse the previous estimate instead of
            # failing on argmax of an empty slice.
            periods.append(periods[-1] if periods else max(int(min_period), 1))
    return periods


def psola(signal, peaks, f_ratio):
    """Re-synthesise *signal* by overlap-adding windows at re-spaced peaks.

    ``int(len(peaks) * f_ratio)`` new peak positions are interpolated over
    the span of the original peaks; for each one, a triangular window cut
    around the nearest original peak is added at the new position.

    Args:
        signal: 1-D array of audio samples.
        peaks: Integer array of pitch-mark positions in *signal*.
        f_ratio: Factor applied to the number of peaks.

    Returns:
        A float array with the same length as *signal*.
    """
    n_samples = len(signal)
    peaks = np.asarray(peaks)
    new_signal = np.zeros(n_samples)
    if peaks.size == 0:
        return new_signal

    # Linearly interpolate the new peak positions between the old ones.
    ref = np.linspace(0, len(peaks) - 1, int(len(peaks) * f_ratio))
    weight = ref % 1
    left = np.floor(ref).astype(int)
    right = np.ceil(ref).astype(int)
    new_peaks = (peaks[left] * (1 - weight) + peaks[right] * weight).astype(int)

    last = len(new_peaks) - 1
    for j, new_peak in enumerate(new_peaks):
        # Original peak closest to the new position.
        old_peak = int(peaks[np.argmin(np.abs(peaks - new_peak))])
        new_peak = int(new_peak)

        # Distances to the neighbouring new peaks (or to the signal edges).
        if j == 0:
            span_left = new_peak
        else:
            span_left = new_peak - int(new_peaks[j - 1])
        if j == last:
            span_right = n_samples - 1 - new_peak
        else:
            span_right = int(new_peaks[j + 1]) - new_peak

        # Truncate so the source slice stays inside the original signal.
        span_left = min(span_left, old_peak)
        span_right = min(span_right, n_samples - 1 - old_peak)

        # Linear (triangular) overlap-add window: ramp up, then ramp down.
        window = np.concatenate((
            np.linspace(0, 1, span_left + 1)[1:],
            np.linspace(1, 0, span_right + 1)[1:],
        ))
        # Centre the windowed original segment on the new peak.
        new_signal[new_peak - span_left:new_peak + span_right] += (
            window * signal[old_peak - span_left:old_peak + span_right])
    return new_signal


########################
# Frequency conversion #
########################


def low_cut_filter(x, fs, cutoff=70):
    """Remove content below *cutoff* Hz with a 255-tap FIR high-pass filter.

    Args:
        x: 1-D array of audio samples.
        fs: Sampling rate in Hz.
        cutoff: Cut-off frequency in Hz.

    Returns:
        The filtered signal (causal `lfilter`, same length as *x*).
    """
    nyquist = fs // 2
    norm_cutoff = cutoff / nyquist
    # pass_zero=False makes firwin design a high-pass (low-cut) filter.
    fil = firwin(255, norm_cutoff, pass_zero=False)
    return lfilter(fil, 1, x)


def high_frequency_completion(x, transformed, f0rate, par):
    """Add high-frequency detail of *x* back onto *transformed*.

    *x* is analysed with WORLD (harvest / cheaptrick / d4c), re-synthesised
    with an all-zero F0 contour, high-pass filtered, and the result is added
    to *transformed*.

    Args:
        x: Source signal.
        transformed: Pitch-converted signal to complete. It is modified in
            place when it is at least as long as the synthesised signal.
        f0rate: Used directly as the normalised cut-off of the high-pass
            filter, so it must lie strictly between 0 and 1.
        par: Mapping providing 'fs', 'minf0', 'maxf0', 'shiftms' and 'fftl'.

    Returns:
        The completed signal, with the length of *transformed*.
    """
    x = np.array(x, dtype=np.float64)
    f0, time_axis = pyworld.harvest(
        x, par['fs'], f0_floor=par['minf0'], f0_ceil=par['maxf0'],
        frame_period=par['shiftms'])
    spc = pyworld.cheaptrick(x, f0, time_axis, par['fs'], fft_size=par['fftl'])
    ap = pyworld.d4c(x, f0, time_axis, par['fs'], fft_size=par['fftl'])

    # Re-synthesise the speech with an all-zero (unvoiced) F0 contour.
    uf0 = np.zeros(len(f0))
    unvoice_anasyn = pyworld.synthesize(
        uf0, spc, ap, par['fs'], frame_period=par['shiftms'])

    # High-pass filter it to keep the high-frequency detail of the original.
    fil = firwin(255, f0rate, pass_zero=False)
    hpf_unvoice_anasyn = filtfilt(fil, 1, unvoice_anasyn)

    if len(hpf_unvoice_anasyn) > len(transformed):
        return transformed + hpf_unvoice_anasyn[:len(transformed)]
    transformed[:len(hpf_unvoice_anasyn)] += hpf_unvoice_anasyn
    return transformed


def transform_f0(x, f0rate, config):
    """Scale the pitch of *x* by *f0rate* while keeping its length.

    The signal is low-cut filtered, passed through WSOLA constructed with
    rate ``1 / f0rate``, and resampled back to the original number of
    samples. When *f0rate* is below 1, `high_frequency_completion` is applied
    to the result.

    Args:
        x: 1-D array of audio samples.
        f0rate: Pitch scaling factor.
        config: Mapping providing 'fs' plus the keys read by
            `high_frequency_completion`.

    Returns:
        The converted signal, ``len(x)`` samples long.
    """
    completion = f0rate < 1.0

    fs = config["fs"]
    x = low_cut_filter(x, fs, cutoff=70)

    # Change the speed with WSOLA ...
    wsola = WSOLA(config["fs"], 1 / f0rate, shiftms=10)
    wsolaed = wsola.duration_modification(x)

    # ... then resample back to the original length, which changes the pitch.
    transformed = resample(wsolaed, len(x))

    # Lowering the pitch loses high frequencies; restore them.
    if completion:
        transformed = high_frequency_completion(x, transformed, f0rate, config)
    return transformed


def _to_int16_scale(samples):
    """Return WAV samples as float64 on the int16 amplitude scale.

    `scipy.io.wavfile.read` yields uint8, int16, int32 or float arrays
    depending on the file; the rest of the pipeline writes int16, so every
    supported type is brought to the same range first. int16 input is
    returned numerically unchanged.
    """
    samples = np.asarray(samples)
    data = samples.astype(np.float64)
    if samples.dtype.kind == 'f':
        return data * 32767.0           # float WAV data spans [-1.0, 1.0]
    if samples.dtype == np.uint8:
        return (data - 128.0) * 256.0   # 8-bit WAV data is unsigned
    if samples.dtype == np.int32:
        return data / 65536.0
    return data


with gr.Blocks() as interface:
    with gr.Row():
        # NOTE(review): `source=` looks like the Gradio 3.x keyword; confirm
        # against the pinned Gradio version before upgrading.
        wav_path = gr.Audio(source='microphone', type='filepath')
        with gr.Column():
            minf0 = gr.Slider(50, 300, 70, step=10, label="minf0")
            turn_tune = gr.Slider(0.2, 3, 1.5, step=0.1, label="turn_tune")
        with gr.Column():
            maxf0 = gr.Slider(500, 1100, 700, step=10, label="maxf0")
            shiftms = gr.Slider(1, 50, 10, step=1, label="shiftms")
        with gr.Column():
            fr = gr.Slider(0.1, 15, 1, step=0.1, label="fr")
    with gr.Row():
        audio_output = gr.Audio(type='filepath')
        section_btn1 = gr.Button("change")

    def change(wav_path, turn_tune, minf0, maxf0, shiftms, fr):
        """Convert the recorded audio and return the path of the result.

        Args:
            wav_path: Path of the recorded WAV file, or None if nothing was
                recorded.
            turn_tune: Pitch scaling factor passed to `transform_f0`.
            minf0: F0 floor passed to the WORLD analysis.
            maxf0: F0 ceiling passed to the WORLD analysis.
            shiftms: Frame period passed to the WORLD analysis.
            fr: When different from 1, a PSOLA pass with ratio
                ``fr ** (-2 / 12)`` is applied to the converted file.

        Returns:
            Path of the written ``*-output.wav`` file, or None when there is
            no input.
        """
        if wav_path is None:
            return None

        fs, x = wavfile.read(wav_path)
        x = _to_int16_scale(x)
        if x.ndim > 1:
            # The DSP code works on 1-D signals; down-mix multi-channel input.
            x = x.mean(axis=1)

        # splitext only strips the extension, so dots elsewhere in the path
        # (e.g. "./rec.wav" or a dotted temp directory) are preserved.
        outfile = os.path.splitext(str(wav_path))[0] + '-output.wav'

        config = {
            "fs": fs,
            "minf0": minf0,
            "maxf0": maxf0,
            "shiftms": shiftms,
            "fftl": 1024,
        }
        wav_slow = transform_f0(x, turn_tune, config)
        # Clip before casting: out-of-range values would otherwise wrap.
        wavfile.write(
            outfile, fs, np.clip(wav_slow, -32768, 32767).astype(np.int16))

        fr = float(fr)
        print('fr->', fr)
        if fr != 1:
            orig_signal, fs = librosa.load(outfile, sr=None)
            f_ratio = fr ** (-2 / 12)
            new_signal = shift_pitch(orig_signal, fs, f_ratio)
            sf.write(outfile, new_signal, fs)
        return outfile

    section_btn1.click(
        change,
        inputs=[wav_path, turn_tune, minf0, maxf0, shiftms, fr],
        outputs=[audio_output],
    )
# Start the web server only when this file is executed as a script, so that
# importing the module does not block on a running Gradio app.
if __name__ == "__main__":
    interface.launch(show_api=False)