File size: 2,563 Bytes
01e655b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import sys
from pydub import AudioSegment
import soundfile as sf
import pyrubberband as pyrb
import numpy as np
from io import BytesIO


def audiosegment_to_librosawav(audiosegment):
    channel_sounds = audiosegment.split_to_mono()
    samples = [s.get_array_of_samples() for s in channel_sounds]

    fp_arr = np.array(samples).T.astype(np.float32)
    fp_arr /= np.iinfo(samples[0].typecode).max
    fp_arr = fp_arr.reshape(-1)

    return fp_arr


def ndarray_to_segment(ndarray, frame_rate):
    buffer = BytesIO()
    sf.write(buffer, ndarray, frame_rate, format="wav")
    buffer.seek(0)
    sound = AudioSegment.from_wav(
        buffer,
    )
    return sound


def time_stretch(input_segment: AudioSegment, time_factor: float) -> AudioSegment:
    """
    factor range -> [0.2,10]
    """
    time_factor = np.clip(time_factor, 0.2, 10)
    sr = input_segment.frame_rate
    y = audiosegment_to_librosawav(input_segment)
    y_stretch = pyrb.time_stretch(y, sr, time_factor)

    sound = ndarray_to_segment(
        y_stretch,
        frame_rate=sr,
    )
    return sound


def pitch_shift(
    input_segment: AudioSegment,
    pitch_shift_factor: float,
) -> AudioSegment:
    """
    factor range -> [-12,12]
    """
    pitch_shift_factor = np.clip(pitch_shift_factor, -12, 12)
    sr = input_segment.frame_rate
    y = audiosegment_to_librosawav(input_segment)
    y_shift = pyrb.pitch_shift(y, sr, pitch_shift_factor)

    sound = ndarray_to_segment(
        y_shift,
        frame_rate=sr,
    )
    return sound


def apply_prosody_to_audio_data(
    audio_data: np.ndarray, rate: float, volume: float, pitch: float, sr: int
) -> np.ndarray:
    if rate != 1:
        audio_data = pyrb.time_stretch(audio_data, sr=sr, rate=rate)

    if volume != 0:
        audio_data = audio_data * volume

    if pitch != 0:
        audio_data = pyrb.pitch_shift(audio_data, sr=sr, n_steps=pitch)

    return audio_data


if __name__ == "__main__":
    input_file = sys.argv[1]

    time_stretch_factors = [0.5, 0.75, 1.5, 1.0]
    pitch_shift_factors = [-12, -5, 0, 5, 12]

    input_sound = AudioSegment.from_mp3(input_file)

    for time_factor in time_stretch_factors:
        output_wav = f"time_stretched_{int(time_factor * 100)}.wav"
        sound = time_stretch(input_sound, time_factor)
        sound.export(output_wav, format="wav")

    for pitch_factor in pitch_shift_factors:
        output_wav = f"pitch_shifted_{int(pitch_factor * 100)}.wav"
        sound = pitch_shift(input_sound, pitch_factor)
        sound.export(output_wav, format="wav")