import io
import os
import tempfile

import gradio as gr
import imageio_ffmpeg as ffmpeg
import requests
from audiostretchy.stretch import AudioStretch
from dotenv import load_dotenv
from elevenlabs import Voice, VoiceSettings
from elevenlabs.client import ElevenLabs
from pydub import AudioSegment
from typing_extensions import Literal

load_dotenv()  # expects a .env file providing ELE_API_KEY, USER, and PASSWORD

ELE_API_KEY = os.getenv("ELE_API_KEY")
USER = os.getenv("USER")
PASSWORD = os.getenv("PASSWORD")
MODEL = "eleven_multilingual_v2"

ele_client = ElevenLabs(api_key=ELE_API_KEY)


def verify_auth(username, password):
    """Basic credential check; pass `auth=verify_auth` to `launch()` to enable it."""
    return username == USER and password == PASSWORD


VOICE = ["승현", "우승"]

KEY_MAPPING = {
    "승현": "0RBbbgk6KUJxHmWzPiHz",  # 승현 + Jessica (2:1 mix)
    "우승": "ASwOiisDbuaP2R1jUQU6",  # 우승 + TTS_KKC (1:1 mix)
}

# Point pydub at the bundled ffmpeg binary so mp3 decoding/encoding works.
AudioSegment.converter = ffmpeg.get_ffmpeg_exe()


def change_pitch(audio_segment, pitch_shift):
    """Shift pitch by resampling: `pitch_shift` is in octaves (e.g. 0.05 raises pitch slightly)."""
    new_sample_rate = int(audio_segment.frame_rate * (2.0 ** pitch_shift))
    pitched_audio = audio_segment._spawn(
        audio_segment.raw_data, overrides={"frame_rate": new_sample_rate}
    )
    return pitched_audio.set_frame_rate(audio_segment.frame_rate)


def predict(
    text: str,
    voice: str,
    output_file_format: Literal["mp3"] = "mp3",
    speed: float = 1.0,
    pitch_shift: float = 0.0,
    stability: float = 0.5,
    similarity: float = 0.7,
    style_exaggeration: float = 0.0,
    speaker_boost: bool = True,
):
    try:
        voice_setup = Voice(
            voice_id=KEY_MAPPING[voice],
            settings=VoiceSettings(
                stability=stability,
                similarity_boost=similarity,
                style=style_exaggeration,
                use_speaker_boost=speaker_boost,
            ),
        )
        audio = ele_client.generate(text=text, voice=voice_setup, model=MODEL)
        audio_data = b"".join(audio)
    except Exception as e:
        raise requests.exceptions.RequestException(
            f"An error occurred while generating speech. "
            f"Please check your API key and try again. {str(e)}"
        )

    print(f"[Text] {text}")

    # Time-stretch without changing pitch; the ratio is the inverse of speed
    # (e.g. speed 0.5 -> ratio 2.0).
    audio_stretch = AudioStretch()
    audio_stretch.open_mp3(io.BytesIO(audio_data))
    audio_stretch.stretch(ratio=1 / speed)

    # Export the final audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=f".{output_file_format}", delete=False) as temp_file:
        audio_stretch.save(path=temp_file.name)
        audio = AudioSegment.from_file(temp_file.name)
        # Adjust pitch if needed
        if pitch_shift != 0.0:
            audio = change_pitch(audio, pitch_shift)
        audio.export(temp_file.name, format=output_file_format)
        temp_file_path = temp_file.name

    return temp_file_path


with gr.Blocks() as demo:
    gr.Markdown("# Letsur Text-To-Speech API with Gradio")
") with gr.Row(variant="panel"): voice = gr.Dropdown(choices=VOICE, label="Voice Options", value="승현") output_file_format = gr.Dropdown(choices=["mp3"], label="Output Options", value="mp3") text = gr.Textbox(label="Input text", value="안녕하세요.", placeholder="안녕하세요.") # Additional parameters with gr.Accordion("Advanced Settings", open=False): speed = gr.Slider(label="speed", minimum=0.8, maximum=1.2, step=0.1, value=1.0) pitch_shift = gr.Slider(label="pitch_shift", minimum=-0.1, maximum=0.1, step=0.05, value=0.0) # 범위: 0~1 stability = gr.Slider(label="stability", minimum=0., maximum=1., step=0.1, value=1.0) # 범위: 0~1 similarity = gr.Slider(label="similarity", minimum=0., maximum=1., step=0.1, value=1.0) # 범위: 0~1 style_exaggeration = gr.Slider(label="style_exaggeration", minimum=0., maximum=1., step=0.1, value=0.) # 범위: 0~1 speaker_boost = gr.Checkbox(label="speaker_boost", value=True) # True or False btn = gr.Button("Text-To-Speech") output_audio = gr.Audio(label="Speech Output") inputs = [text, voice, output_file_format] + [speed, pitch_shift, stability, similarity, style_exaggeration, speaker_boost] text.submit(fn=predict, inputs=inputs, outputs=output_audio, api_name="predict") btn.click(fn=predict, inputs=inputs, outputs=output_audio, api_name=False) demo.queue().launch()