"""
| Description: libf0 salience-based F0 estimation implementation
| Author: Sebastian Rosenzweig, Simon Schwär, Meinard Müller
| License: The MIT license

This file is part of libf0.
"""
import numpy as np
from librosa import stft
from scipy import ndimage, linalg
from numba import njit
def salience(x, Fs=22050, N=2048, H=256, F_min=55.0, F_max=1760.0, R=10.0, num_harm=10, freq_smooth_len=11,
             alpha=0.9, gamma=0.0, constraint_region=None, tol=5, score_low=0.01, score_high=1.0):
    """
    Implementation of a salience-based F0-estimation algorithm using pitch contours, inspired by Melodia.

    A salience representation is computed from the signal, a frequency
    trajectory is tracked through it with dynamic programming (optionally
    restricted to constraint regions), and the salience along that trajectory
    is returned normalised by the per-frame maximum.

    Parameters
    ----------
    x : ndarray
        Audio signal
    Fs : int
        Sampling rate
    N : int
        Window size
    H : int
        Hop size
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : float
        Frequency resolution given in cents
    num_harm : int
        Number of harmonics (Default value = 10)
    freq_smooth_len : int
        Filter length for vertical smoothing (Default value = 11)
    alpha : float
        Weighting parameter for harmonics (Default value = 0.9)
    gamma : float
        Logarithmic compression factor (Default value = 0.0)
    constraint_region : None or ndarray
        Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
        (Default value = None)
    tol : int
        Tolerance parameter for transition matrix (Default value = 5)
    score_low : float
        Score (low) for transition matrix (Default value = 0.01)
    score_high : float
        Score (high) for transition matrix (Default value = 1.0)

    Returns
    -------
    f0 : ndarray
        Estimated F0-trajectory in Hz; frames marked as unvoiced (index -1) are set to 0
    T_coef : ndarray
        Time axis in seconds (frame index * H / Fs)
    sal : ndarray
        Salience value of estimated F0, normalised by the maximum salience of
        the respective frame; 0 for unvoiced frames and for frames whose
        salience is zero everywhere

    See also
    --------
    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
    """
    # compute salience representation via instantaneous frequency and harmonic summation
    Z, F_coef_hertz = compute_salience_rep(x, Fs, N=N, H=H, F_min=F_min, F_max=F_max, R=R,
                                           num_harm=num_harm, freq_smooth_len=freq_smooth_len,
                                           alpha=alpha, gamma=gamma)

    # compute trajectory via dynamic programming
    T_coef = (np.arange(Z.shape[1]) * H) / Fs
    index_CR = compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region,
                                     tol=tol, score_low=score_low, score_high=score_high)

    # index -1 (unvoiced) temporarily picks the last frequency bin and is overwritten with 0 right after
    traj = F_coef_hertz[index_CR]
    traj[index_CR == -1] = 0

    # compute salience value: salience on the trajectory divided by the frame maximum.
    # Only the values on the trajectory are needed, so the full normalised matrix is not built.
    Z_max = np.max(Z, axis=0)
    on_trajectory = Z[index_CR, np.arange(Z.shape[1])]
    sal = np.zeros(on_trajectory.shape, dtype=float)
    # frames with an all-zero salience column (e.g. digital silence) would yield 0/0 = NaN;
    # they keep the initial value 0 instead
    np.divide(on_trajectory, Z_max, out=sal, where=Z_max != 0)
    sal[traj == 0] = 0

    return traj, T_coef, sal
def compute_salience_rep(x, Fs, N, H, F_min, F_max, R, num_harm, freq_smooth_len, alpha, gamma):
    """
    Compute salience representation [FMP, Eq. (8.56)]

    Pipeline: STFT -> binned log-frequency spectrogram based on instantaneous
    frequency -> smoothing along the frequency axis with a Hann kernel ->
    harmonic summation.

    Parameters
    ----------
    x : ndarray
        Audio signal
    Fs : int
        Sampling rate
    N : int
        Window size (used as both FFT size and window length)
    H : int
        Hop size
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : float
        Frequency resolution given in cents
    num_harm : int
        Number of harmonics
    freq_smooth_len : int
        Filter length for vertical smoothing
    alpha : float
        Weighting parameter for harmonics
    gamma : float
        Logarithmic compression factor

    Returns
    -------
    Z : ndarray
        Salience representation
    F_coef_hertz : ndarray
        Frequency axis in Hz

    See also
    --------
    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
    """
    # complex spectrogram of the input signal
    spectrogram = stft(x, n_fft=N, hop_length=H, win_length=N, pad_mode='constant')

    # log-frequency spectrogram with bins assigned via instantaneous frequency
    logfreq_spec, F_coef_hertz = compute_y_lf_if_bin_eff(spectrogram, Fs, N, H, F_min, F_max, R)

    # smoothing along the frequency axis (axis 0), zero-padded at the borders
    smoothing_kernel = np.hanning(freq_smooth_len)
    logfreq_spec = ndimage.convolve1d(logfreq_spec, smoothing_kernel, axis=0, mode='constant')

    # harmonic summation; beta is fixed to 1 here
    salience_rep = compute_salience_from_logfreq_spec(logfreq_spec, R, n_harmonics=num_harm,
                                                      alpha=alpha, beta=1, gamma=gamma)
    return salience_rep, F_coef_hertz
def compute_y_lf_if_bin_eff(X, Fs, N, H, F_min, F_max, R):
    """
    Binned Log-frequency Spectrogram with variable frequency resolution based on instantaneous frequency,
    more efficient implementation than FMP

    For every STFT coefficient the instantaneous frequency is estimated from
    the phase difference between consecutive frames. The power of each
    coefficient is then accumulated into the log-frequency bin that its
    instantaneous frequency falls into.

    Parameters
    ----------
    X : ndarray
        Complex spectrogram (shape: [freq, time])
    Fs : int
        Sampling rate in Hz
    N : int
        Window size
    H : int
        Hop size
    F_min : float or int
        Minimal frequency (also the reference frequency of bin 0)
    F_max : float or int
        Maximal frequency
    R : float
        Frequency resolution given in cents

    Returns
    -------
    Y_LF_IF_bin : ndarray
        Binned log-frequency spectrogram using instantaneous frequency (shape: [freq, time])
    F_coef_hertz : ndarray
        Frequency axis in Hz
    """
    # calculate number of bins on log frequency axis
    B = frequency_to_bin_index(F_max, R, F_min) + 1

    # center frequencies of the final bins
    F_coef_hertz = F_min * np.power(2, (np.arange(0, B) * R / 1200))

    # calculate heterodyned phase increment (hpi)
    k = np.arange(X.shape[0]).reshape(-1, 1)
    omega = 2 * np.pi * k / N  # center frequency for each bin in rad
    hpi = (np.angle(X[:, 1:]) - np.angle(X[:, 0:-1])) - omega * H

    # reduce hpi to -pi:pi range
    # this is much faster than np.mod(hpi + np.pi, 2 * np.pi) - np.pi, but gives the same result
    hpi = hpi - 2 * np.pi * (np.around((hpi / (2 * np.pi)) + 1) - 1)

    # calculate instantaneous frequencies in Hz
    inst_f = (omega + hpi / H) * Fs / (2 * np.pi)

    # repeat the first time frame to match dimensions of X
    inst_f = np.hstack((inst_f[:, [0]], inst_f))

    # only frequencies inside [F_min, F_max) are relevant; everything else is mapped to an
    # extra bin (index B) that is removed before returning the binned spectrogram.
    # Assigning through a boolean mask avoids sending NaN through an integer cast.
    relevant = np.logical_and(inst_f >= F_min, inst_f < F_max)
    bin_assignment = np.full(inst_f.shape, B, dtype=np.int64)
    bin_assignment[relevant] = frequency_to_bin_index(inst_f[relevant], R, F_min)

    # perform binning on power spectrogram for each time frame separately
    Y = np.abs(X) ** 2
    Y_LF_IF_bin = np.zeros((B + 1, Y.shape[1]))
    for t in range(Y.shape[1]):
        # unbuffered accumulation: several STFT coefficients may fall into the same bin
        np.add.at(Y_LF_IF_bin[:, t], bin_assignment[:, t], Y[:, t])

    return Y_LF_IF_bin[:B, :], F_coef_hertz
def compute_salience_from_logfreq_spec(lf_spec, R, n_harmonics, alpha, beta, gamma, harmonic_win_len=11):
    """
    Compute salience representation using harmonic summation

    Each output bin is the weighted sum of the input spectrogram at the bin
    itself and at the bins of its harmonics. Every harmonic contributes
    through a cosine^2 window centred on it, scaled by alpha**idx, where idx
    is the zero-based harmonic index.

    Parameters
    ----------
    lf_spec : ndarray
        (F, T) log-spectrogram
    R : float
        Frequency resolution given in cents
    n_harmonics : int
        Number of harmonics
    alpha : float
        Weighting parameter for harmonics
    beta : float
        Compression parameter for spectrogram magnitudes (exponent)
    gamma : float
        Magnitude threshold in dB relative to the global maximum. Only entries
        whose level is strictly below `gamma` are kept; the rest is set to
        zero. Applied to the input and again to the result.
    harmonic_win_len : int
        Length of a frequency weighting window in bins. The window is
        symmetric, so its effective length is 2 * (harmonic_win_len // 2) + 1
        (even values are rounded up to the next odd number).

    Returns
    -------
    Z : ndarray
        (F, T) salience representation of the input spectrogram
    """
    eps = np.finfo(np.float32).eps

    # magnitude thresholding and compression
    # NOTE(review): with gamma = 0 the global maximum itself fails the "< gamma" test
    # (20*log10(1 + eps) > 0) and is zeroed -- confirm this is intended.
    # An all-zero input yields 0/0 = NaN, which compares False and therefore results in zeros;
    # the warnings for that case are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        threshold_mask = (20 * np.log10(lf_spec / np.max(lf_spec) + eps)) < gamma
    lf_spec = lf_spec**beta * threshold_mask

    # compute window; slices below use the window's real length so that even
    # values of harmonic_win_len do not cause a shape mismatch
    max_diff_bins = harmonic_win_len // 2
    win_len = 2 * max_diff_bins + 1
    window = np.cos(np.linspace(-1, 1, win_len) * np.pi / 2)**2  # cosine^2 window

    # compute indices of harmonics (offset in bins relative to the fundamental)
    harmonics = np.round(np.log2(np.arange(1, n_harmonics + 1)) * 1200 / R).astype(int)

    # weighting_vec[j] is the weight for an offset of (j - max_diff_bins) bins
    weighting_vec = np.zeros(lf_spec.shape[0] + max_diff_bins)

    # compute weights
    for idx, h in enumerate(harmonics):
        if h + win_len > len(weighting_vec):
            break  # we reached the maximum length available
        weighting_vec[h:h + win_len] += window * alpha**idx

    # correlate lf_spec with the weighting vector on the frequency axis.
    # correlate1d places kernel element (len // 2 + origin) on the current bin; this must be
    # element max_diff_bins (the centre of the fundamental's window). The parentheses-free
    # form "-len // 2" is (-len) // 2 and is off by one bin for odd kernel lengths.
    origin = max_diff_bins - len(weighting_vec) // 2
    Z = ndimage.correlate1d(lf_spec, weighting_vec, axis=0, mode='constant', cval=0, origin=origin)

    # magnitude thresholding and compression
    with np.errstate(divide='ignore', invalid='ignore'):
        threshold_mask = (20 * np.log10(Z / np.max(Z) + eps)) < gamma
    Z = Z**beta * threshold_mask

    return Z
def define_transition_matrix(B, tol=0, score_low=0.01, score_high=1.0):
    """
    Generate transition matrix for dynamic programming

    The matrix is a symmetric Toeplitz matrix: entries whose row and column
    index differ by at most `tol` get `score_high`, all others `score_low`.

    Parameters
    ----------
    B : int
        Number of bins
    tol : int
        Tolerance parameter for transition matrix (Default value = 0).
        Values of B - 1 or larger make every transition score `score_high`.
    score_low : float
        Score (low) for transition matrix (Default value = 0.01)
    score_high : float
        Score (high) for transition matrix (Default value = 1.0)

    Returns
    -------
    T : ndarray
        (B, B) Transition matrix

    See also
    --------
    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
    """
    col = np.full(B, score_low, dtype=float)
    # slice assignment clamps at the array end, so tol + 1 > B (e.g. a constraint region
    # spanning only a few bins) no longer raises a shape-mismatch error
    col[:max(tol + 1, 0)] = score_high
    return linalg.toeplitz(col)
def compute_trajectory_dp(Z, T):
    """
    Trajectory tracking using dynamic programming

    Finds the bin sequence that maximises the accumulated log-salience plus
    log-transition score over all frames (computed in the log domain, with a
    small epsilon added before taking logarithms).

    Parameters
    ----------
    Z : ndarray
        (B, N) salience representation (bins x frames)
    T : ndarray
        (B, B) transition matrix; T[b, c] scores the step from bin c in the
        previous frame to bin b in the current frame

    Returns
    -------
    eta_DP : ndarray
        Trajectory indices (int64, one bin index per frame); empty if Z has no frames

    See also
    --------
    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
    """
    # NOTE(review): the file imports numba's njit; this vectorised form is written for plain
    # NumPy execution and has not been checked against nopython compilation.
    B, N = Z.shape
    if N == 0:
        return np.zeros(0, dtype=np.int64)

    eps_machine = np.finfo(np.float32).eps
    Z_log = np.log(Z + eps_machine)
    T_log = np.log(T + eps_machine)

    D = np.zeros((B, N))                        # accumulated scores
    E = np.zeros((B, N - 1), dtype=np.int64)    # back-pointers (best predecessor bin)
    D[:, 0] = Z_log[:, 0]

    for n in range(1, N):
        # scores[b, c] = T_log[b, c] + D[c, n-1]; one array operation replaces the loop over b
        scores = T_log + D[:, n - 1]
        E[:, n - 1] = np.argmax(scores, axis=1)
        D[:, n] = np.max(scores, axis=1) + Z_log[:, n]

    # backtracking
    eta_DP = np.zeros(N, dtype=np.int64)
    eta_DP[N - 1] = np.argmax(D[:, N - 1])
    for n in range(N - 2, -1, -1):
        eta_DP[n] = E[eta_DP[n + 1], n]

    return eta_DP
def compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region=None,
                          tol=5, score_low=0.01, score_high=1.0):
    """
    Trajectory tracking with constraint regions

    Without constraint regions the trajectory is tracked through the whole
    salience representation. With constraint regions, tracking is done
    separately inside the time-frequency rectangle of every region, and all
    frames not covered by any region are marked as unvoiced.

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Parameters
    ----------
    Z : ndarray
        Salience representation
    T_coef : ndarray
        Time axis
    F_coef_hertz : ndarray
        Frequency axis in Hz
    constraint_region : ndarray or None
        Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
        (Default value = None)
    tol : int
        Tolerance parameter for transition matrix (Default value = 5)
    score_low : float
        Score (low) for transition matrix (Default value = 0.01)
    score_high : float
        Score (high) for transition matrix (Default value = 1.0)

    Returns
    -------
    eta : ndarray
        Trajectory indices, unvoiced frames are indicated with -1

    See also
    --------
    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
    """
    # no constraints: track through the complete salience representation.
    # This must be an alternative to the region-wise tracking below, otherwise the
    # region results (and the -1 markers) would always be overwritten.
    if constraint_region is None:
        T = define_transition_matrix(Z.shape[0], tol=tol, score_low=score_low, score_high=score_high)
        return compute_trajectory_dp(Z, T)

    # initialize contour, unvoiced frames are indicated with -1
    eta = np.full(len(T_coef), -1, dtype=np.int64)

    # do tracking within every constraint region
    for row_idx in range(constraint_region.shape[0]):
        t_start = constraint_region[row_idx, 0]  # sec
        t_end = constraint_region[row_idx, 1]  # sec
        f_start = constraint_region[row_idx, 2]  # Hz
        f_end = constraint_region[row_idx, 3]  # Hz

        # convert start/end values to the indices of the closest axis entries
        t_start_idx = np.argmin(np.abs(T_coef - t_start))
        t_end_idx = np.argmin(np.abs(T_coef - t_end))
        f_start_idx = np.argmin(np.abs(F_coef_hertz - f_start))
        f_end_idx = np.argmin(np.abs(F_coef_hertz - f_end))

        # track in salience part (both end indices inclusive)
        cur_Z = Z[f_start_idx:f_end_idx+1, t_start_idx:t_end_idx+1]
        T = define_transition_matrix(cur_Z.shape[0], tol=tol,
                                     score_low=score_low, score_high=score_high)
        cur_eta = compute_trajectory_dp(cur_Z, T)

        # fill contour; indices are shifted back from region-local to global bin indices
        eta[t_start_idx:t_end_idx+1] = f_start_idx + cur_eta

    return eta
def frequency_to_bin_index(F, R, F_ref):
    """
    Binning function with variable frequency resolution
    Note: Indexing starts with 0 (opposed to [FMP, Eq. (8.49)])

    Parameters
    ----------
    F : float or ndarray
        Frequency in Hz
    R : float
        Frequency resolution in cents
    F_ref : float
        Reference frequency in Hz (mapped to bin index 0)

    Returns
    -------
    bin_index : int or ndarray of int
        Index for bin (starting with index 0)

    See also
    --------
    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
    """
    bins_per_octave = 1200 / R
    octaves_above_ref = np.log2(F / F_ref)
    # adding 0.5 before flooring assigns each frequency to the nearest bin centre
    nearest_bin = np.floor(bins_per_octave * octaves_above_ref + 0.5)
    return nearest_bin.astype(np.int64)