File size: 4,582 Bytes
ce79f00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import io
from typing_extensions import Literal

import gradio as gr
import tempfile
import numpy as np

from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs
from elevenlabs import play, stream, save
from elevenlabs import Voice, VoiceSettings
from pydub import AudioSegment
from pydub.playback import play
import imageio_ffmpeg as ffmpeg
import requests
from audiostretchy.stretch import AudioStretch


# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv() lookups further down can see them.
# NOTE: called with default arguments, python-dotenv does not override
# variables that are already set in the environment.
load_dotenv()

def verify_auth(username, password):
    """Return True only when both credentials match the configured ones.

    The supplied values are checked against the module-level ``USER`` and
    ``PASSWORD`` settings.

    Args:
        username: User name supplied by the client.
        password: Password supplied by the client.

    Returns:
        ``True`` when both values match, ``False`` otherwise. Authentication
        is always refused when either configured value or either supplied
        value is not a string (for example when the settings are unset and
        therefore ``None``).
    """
    import hmac  # local import: stdlib helper for constant-time comparison

    # Without this guard, unset settings (None) would compare equal to a
    # None input and let the request through.
    candidates = (username, password, USER, PASSWORD)
    if not all(isinstance(value, str) for value in candidates):
        return False

    # compare_digest only accepts ASCII str, so compare the UTF-8 bytes.
    user_ok = hmac.compare_digest(username.encode("utf-8"), USER.encode("utf-8"))
    password_ok = hmac.compare_digest(password.encode("utf-8"), PASSWORD.encode("utf-8"))

    # Both comparisons are evaluated before combining, so the time taken does
    # not depend on which of the two fields was wrong.
    return user_ok and password_ok

# API key and login credentials are read from the process environment.
ELE_API_KEY = os.getenv("ELE_API_KEY")
# NOTE(review): "USER" is commonly pre-set by Unix shells to the login name,
# so the value read here may not be the one intended for this app - confirm.
USER = os.getenv("USER")
PASSWORD = os.getenv("PASSWORD")

# Model identifier passed to ele_client.generate() in predict().
MODEL = "eleven_multilingual_v2"

# Shared ElevenLabs client used by predict().
ele_client = ElevenLabs(api_key=ELE_API_KEY)

# Voice names offered in the UI dropdown; predict() looks each one up in
# KEY_MAPPING, so every entry here needs a matching key there.
# NOTE(review): the first name looks like mis-encoded Korean text - verify
# the file encoding before editing these strings.
VOICE = [
    "μŠΉν˜„",
    "우승"
]

# Maps a voice name to the voice_id handed to elevenlabs.Voice in predict().
KEY_MAPPING = {
    "μŠΉν˜„": "0RBbbgk6KUJxHmWzPiHz", # Seunghyun + Jessica (2:1) - presumably a voice blend ratio; confirm
    "우승": "ASwOiisDbuaP2R1jUQU6", # Wooseung + TTS_KKC (1:1) - presumably a voice blend ratio; confirm
}


# Point pydub at the ffmpeg executable located by imageio_ffmpeg.
AudioSegment.converter = ffmpeg.get_ffmpeg_exe()


def change_pitch(audio_segment, pitch_shift):
    """Return a copy of ``audio_segment`` with its pitch shifted.

    The raw samples are first re-labelled with a frame rate scaled by
    ``2 ** pitch_shift`` (so ``1.0`` doubles the rate and ``-1.0`` halves
    it), and the result is then converted back to the original frame rate
    with ``set_frame_rate``.

    Args:
        audio_segment: Segment to transform; it must expose ``frame_rate``,
            ``raw_data``, ``_spawn`` and ``set_frame_rate`` (presumably a
            pydub ``AudioSegment``).
        pitch_shift: Exponent applied to 2 to obtain the frame-rate factor.

    Returns:
        The segment returned by ``set_frame_rate`` at the original rate.
    """
    rate_factor = 2.0 ** pitch_shift
    shifted_rate = int(audio_segment.frame_rate * rate_factor)

    # Same sample data, only the declared frame rate differs.
    relabelled = audio_segment._spawn(
        audio_segment.raw_data,
        overrides={'frame_rate': shifted_rate},
    )

    original_rate = audio_segment.frame_rate
    return relabelled.set_frame_rate(original_rate)


def predict(
        text: str,
        voice: str,
        output_file_format: Literal["mp3"] = "",
        speed: float = 1.0,
        pitch_shift: float = 0.0,
        stability: float = 0.5,
        similarity: float = 0.7,
        style_exaggeration: float = 0.,
        speaker_boost: bool = True
):
    """Synthesize ``text`` with ElevenLabs and return the path of the audio file.

    The generated audio is time-stretched according to ``speed``, optionally
    pitch-shifted, and exported to a temporary file that is NOT deleted
    automatically; the caller owns the returned path.

    Args:
        text: Text to convert to speech.
        voice: Voice name; looked up in ``KEY_MAPPING`` to obtain the voice id.
        output_file_format: Export format and file suffix. An empty value
            (the signature default) is treated as ``"mp3"``.
        speed: Playback speed factor; the stretch ratio is ``1 / speed``.
            Must be greater than 0.
        pitch_shift: Passed to ``change_pitch`` when non-zero.
        stability: Forwarded to ``VoiceSettings(stability=...)``.
        similarity: Forwarded to ``VoiceSettings(similarity_boost=...)``.
        style_exaggeration: Forwarded to ``VoiceSettings(style=...)``.
        speaker_boost: Forwarded to ``VoiceSettings(use_speaker_boost=...)``.

    Returns:
        Path of the temporary file holding the processed audio.

    Raises:
        ValueError: If ``speed`` is not greater than 0.
        requests.exceptions.RequestException: If anything fails while the
            voice is being set up or the speech is being generated (this
            includes an unknown ``voice`` name). The original exception is
            attached as ``__cause__``.
    """
    # Validate first so an unusable value never costs an API request.
    if speed <= 0:
        raise ValueError(f"speed must be greater than 0, got {speed!r}")

    # The empty default would produce a "." suffix and an empty export format.
    file_format = output_file_format or "mp3"

    try:
        voice_setup = Voice(
            voice_id=KEY_MAPPING[voice],
            settings=VoiceSettings(
                stability=stability,
                similarity_boost=similarity,
                style=style_exaggeration,
                use_speaker_boost=speaker_boost,
            ),
        )

        audio_chunks = ele_client.generate(
            text=text,
            voice=voice_setup,
            model=MODEL,
        )
        # generate() yields the audio in chunks; collect them into one buffer.
        audio_data = b''.join(audio_chunks)

    except Exception as e:
        raise requests.exceptions.RequestException(
            f"An error occurred while generating speech. Please check your API key and come back try again. {str(e)}"
        ) from e

    print(f"[Text] {text}")

    audio_stretch = AudioStretch()
    audio_stretch.open_mp3(io.BytesIO(audio_data))
    audio_stretch.stretch(ratio=1 / speed)  # e.g. speed 0.5 -> ratio 2.0

    # Reserve a temporary file name, then close the handle before anything
    # writes to it: re-opening a NamedTemporaryFile by name while it is still
    # open is not portable across platforms.
    with tempfile.NamedTemporaryFile(suffix=f".{file_format}", delete=False) as temp_file:
        temp_file_path = temp_file.name

    try:
        audio_stretch.save(path=temp_file_path)
        segment = AudioSegment.from_file(temp_file_path)

        # Adjust pitch if needed
        if pitch_shift != 0.0:
            segment = change_pitch(segment, pitch_shift)

        segment.export(temp_file_path, format=file_format)
    except Exception:
        # delete=False means nothing else will clean up after a failure.
        try:
            os.remove(temp_file_path)
        except OSError:
            pass
        raise

    return temp_file_path

          
# Build the Gradio UI. Components are created inside the `with` contexts in
# the order they should appear on the page, so statement order matters here.
with gr.Blocks() as demo:
    gr.Markdown("# <center> Letsur Text-To-Speech API with Gradio </center>")
    # Top row: voice selection and output format side by side.
    with gr.Row(variant="panel"):        
        voice = gr.Dropdown(choices=VOICE, label="Voice Options", value="μŠΉν˜„")
        output_file_format = gr.Dropdown(choices=["mp3"], label="Output Options", value="mp3")

    text = gr.Textbox(label="Input text",
                      value="μ•ˆλ…•ν•˜μ„Έμš”.",
                      placeholder="μ•ˆλ…•ν•˜μ„Έμš”.")
    
    # Additional parameters
    with gr.Accordion("Advanced Settings", open=False):
        speed = gr.Slider(label="speed", minimum=0.8, maximum=1.2, step=0.1, value=1.0)
        pitch_shift = gr.Slider(label="pitch_shift", minimum=-0.1, maximum=0.1, step=0.05, value=0.0) # range: -0.1~0.1
        stability = gr.Slider(label="stability", minimum=0., maximum=1., step=0.1, value=1.0) # range: 0~1
        similarity = gr.Slider(label="similarity", minimum=0., maximum=1., step=0.1, value=1.0) # range: 0~1
        style_exaggeration = gr.Slider(label="style_exaggeration", minimum=0., maximum=1., step=0.1, value=0.) # range: 0~1
        speaker_boost = gr.Checkbox(label="speaker_boost", value=True) # True or False            
    
    btn = gr.Button("Text-To-Speech")
    output_audio = gr.Audio(label="Speech Output")

    # Component order must match the positional parameter order of predict().
    inputs = [text, voice, output_file_format] + [speed, pitch_shift, stability, similarity, style_exaggeration, speaker_boost]
    
    # Both pressing Enter in the textbox and clicking the button run predict();
    # only the textbox submit handler is given an API name.
    text.submit(fn=predict, inputs=inputs, outputs=output_audio, api_name="predict")
    btn.click(fn=predict, inputs=inputs, outputs=output_audio, api_name=False)

# NOTE(review): verify_auth() is defined in this file but is not passed to
# launch(), so the app starts without a login check - confirm this is intended.
demo.queue().launch()