Text-to-Speech
English

The generated audio is a bit quiet. How can I increase the volume in code?

#74
by mayixb - opened

# Synthesize `text` and unpack the two values returned by `kokoro.create`.
# NOTE(review): presumably `samples` is the raw audio array and `sample_rate`
# its rate in Hz -- confirm against the Kokoro API.
samples, sample_rate = kokoro.create(
text, voice=voices, speed=0.9, lang="en-us"
)

Hi, I fixed that by processing audio data like this:

import numpy as np

def improve_volume(audio_data):
    """Make audio louder with a normalize -> compress -> normalize chain.

    Args:
        audio_data: Audio samples, passed straight to ``normalize_audio``.

    Returns:
        The processed samples, with the peak scaled to -1 dB.
    """
    # Bring the peak up to 0 dB so the compressor threshold below is
    # applied to a signal of known level.
    peak_normalized = normalize_audio(data=audio_data, target_db=0)

    # Dynamic range compression: reduce the loudest parts of the signal.
    squeezed = compress_audio(
        data=peak_normalized, threshold_db=-4, ratio=2, knee_width=5
    )

    # Compression lowered the peak; normalizing again raises the whole
    # signal, which is where the extra volume comes from.
    return normalize_audio(data=squeezed, target_db=-1)

def normalize_audio(data, target_db=0):
    """Scale audio so its largest absolute sample reaches ``target_db``.

    Args:
        data: Audio samples as a NumPy array or any array-like (e.g. a list).
        target_db: Desired peak level in decibels, where 0 dB corresponds
            to an amplitude of 1.0.

    Returns:
        A new array holding the scaled samples. Empty or completely silent
        input is returned unscaled, because there is no peak to scale from.
    """
    samples = np.asarray(data)
    target_amplitude = 10 ** (target_db / 20)

    # np.max raises on an empty array, so treat "no samples" as peak 0.
    peak = np.max(np.abs(samples)) if samples.size else 0

    if peak == 0:
        # Dividing by a zero peak would fill the output with NaN/inf.
        # Multiplying by 1.0 yields the same dtype a real scale would.
        return samples * 1.0

    return samples * (target_amplitude / peak)
    
def compress_audio(data, threshold_db, ratio, knee_width):
    """Apply dynamic range compression to audio.

    Samples whose magnitude is below the threshold pass through unchanged.
    For louder samples, the part of the magnitude above the threshold is
    divided by ``ratio``; inside the knee region just above the threshold
    that reduced excess is additionally scaled by a 0..1 blend factor.

    The whole computation is vectorized, so ``data`` may have any shape
    (e.g. multi-channel audio).

    Args:
        data: Audio samples as a NumPy array or array-like.
        threshold_db: Level in decibels (0 dB == amplitude 1.0) above
            which compression starts.
        ratio: Compression ratio; must be positive.
        knee_width: Knee width in decibels.

    Returns:
        A new floating-point array with the same shape as ``data``.

    Raises:
        ValueError: If ``ratio`` is not positive.
    """
    if ratio <= 0:
        raise ValueError("ratio must be positive")

    samples = np.asarray(data)
    if not np.issubdtype(samples.dtype, np.inexact):
        # Integer input would otherwise truncate the compressed values.
        samples = samples.astype(np.float64)

    threshold = 10 ** (threshold_db / 20)
    # NOTE(review): the knee width is converted from dB to a linear factor
    # and then *added* to the linear threshold. With knee_width=5 this puts
    # the end of the knee at threshold + ~1.78, so for signals peaking at
    # 1.0 every sample above the threshold lands in the knee branch. Kept
    # as-is to preserve existing output -- confirm whether
    # threshold * 10 ** (knee_width / 20) was intended.
    knee = 10 ** (knee_width / 20)
    knee_end = threshold + knee

    amplitude = np.abs(samples)
    sign = np.sign(samples)
    reduced_excess = (amplitude - threshold) / ratio

    # Above the knee: threshold plus the excess divided by the ratio.
    hard = sign * (threshold + reduced_excess)

    # Inside the knee: the reduced excess is scaled by a blend factor that
    # runs from 0 at the threshold to 1 at the end of the knee.
    blend = (amplitude - threshold) / (knee_end - threshold)
    soft = sign * (threshold + blend * reduced_excess)

    return np.where(
        amplitude < threshold,
        samples,  # no compression for quiet parts
        np.where(amplitude > knee_end, hard, soft),
    )

Sign up or log in to comment