Spaces:
Runtime error
Runtime error
File size: 4,582 Bytes
ce79f00 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import os
import io
from typing_extensions import Literal
import gradio as gr
import tempfile
import numpy as np
from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs
from elevenlabs import play, stream, save
from elevenlabs import Voice, VoiceSettings
from pydub import AudioSegment
from pydub.playback import play
import imageio_ffmpeg as ffmpeg
import requests
from audiostretchy.stretch import AudioStretch
load_dotenv()
def verify_auth(username, password):
    """Check Gradio login credentials against the USER/PASSWORD env values.

    Returns True only when both fields match the values loaded from .env.
    NOTE(review): if the USER/PASSWORD environment variables are unset,
    both are None and no credential pair can ever match (login is
    effectively disabled) — confirm that lockout is intended.
    """
    return username == USER and password == PASSWORD
# Credentials and configuration, all read from .env via load_dotenv()
# (each may be None when the corresponding variable is unset).
ELE_API_KEY = os.getenv("ELE_API_KEY")  # ElevenLabs API key
USER = os.getenv("USER")        # Gradio login username (compared in verify_auth)
PASSWORD = os.getenv("PASSWORD")  # Gradio login password (compared in verify_auth)
# TTS model id passed to every ele_client.generate() call.
MODEL = "eleven_multilingual_v2"
ele_client = ElevenLabs(api_key=ELE_API_KEY)
# Display names offered in the UI voice dropdown.
# NOTE(review): these strings look like mojibake of Korean names — keep them
# byte-identical to the KEY_MAPPING keys below or voice lookup breaks.
VOICE = [
    "μΉν",
    "μ°μΉ"
]
# UI display name -> ElevenLabs voice_id. Keys must equal the VOICE entries above.
# (Trailing comments are garbled Korean from the original source; left as-is.)
KEY_MAPPING = {
    "μΉν": "0RBbbgk6KUJxHmWzPiHz", # μΉν+μ μμΉ΄(2:1)
    "μ°μΉ": "ASwOiisDbuaP2R1jUQU6", # μ°μΉ+TTS_KKC(1:1)
}
# Point pydub at the ffmpeg binary bundled with imageio-ffmpeg so no
# system-wide ffmpeg install is required.
AudioSegment.converter = ffmpeg.get_ffmpeg_exe()
def change_pitch(audio_segment, pitch_shift):
    """Shift the pitch of *audio_segment* by *pitch_shift* octaves.

    The raw samples are re-tagged with a frame rate scaled by
    2 ** pitch_shift (doubling the rate raises pitch one octave), then
    the segment is resampled back to its original frame rate so the
    result plays at the native rate with the pitch change baked in.
    """
    original_rate = audio_segment.frame_rate
    shifted_rate = int(original_rate * (2.0 ** pitch_shift))
    shifted = audio_segment._spawn(
        audio_segment.raw_data,
        overrides={'frame_rate': shifted_rate},
    )
    return shifted.set_frame_rate(original_rate)
def predict(
    text: str,
    voice: str,
    output_file_format: Literal["mp3"] = "mp3",  # was "" — produced a bare "." suffix and format="" export
    speed: float = 1.0,
    pitch_shift: float = 0.0,
    stability: float = 0.5,
    similarity: float = 0.7,
    style_exaggeration: float = 0.,
    speaker_boost: bool = True
):
    """Generate speech for *text* with the selected ElevenLabs voice.

    Args:
        text: Text to synthesize.
        voice: UI display name; resolved to a voice_id via KEY_MAPPING.
        output_file_format: Audio container for the result (UI only offers "mp3").
        speed: Playback speed factor applied via time-stretching.
        pitch_shift: Pitch shift in octaves, applied after stretching.
        stability / similarity / style_exaggeration / speaker_boost:
            Passed straight through to ElevenLabs VoiceSettings.

    Returns:
        Path to a temporary audio file (left on disk so Gradio can serve it).

    Raises:
        requests.exceptions.RequestException: when the ElevenLabs call fails.
    """
    try:
        voice_setup = Voice(
            voice_id=KEY_MAPPING[voice],
            settings=VoiceSettings(
                stability=stability,
                similarity_boost=similarity,
                style=style_exaggeration,
                use_speaker_boost=speaker_boost,
            ),
        )
        audio = ele_client.generate(
            text=text,
            voice=voice_setup,
            model=MODEL,
        )
        audio_data = b''.join(audio)  # generate() yields chunks; collect them all
    except Exception as e:
        # Chain the original cause so the real API error stays visible.
        raise requests.exceptions.RequestException(f"An error occurred while generating speech. Please check your API key and come back try again. {str(e)}") from e
    print(f"[Text] {text}")
    # Time-stretch with the inverse ratio: speed 0.5 -> ratio 2.0 (twice as long).
    audio_stretch = AudioStretch()
    audio_stretch.open_mp3(io.BytesIO(audio_data))
    audio_stretch.stretch(ratio=1 / speed)
    # Export the final audio to a temporary file. delete=False is deliberate:
    # Gradio reads the file after this function returns.
    with tempfile.NamedTemporaryFile(suffix=f".{output_file_format}", delete=False) as temp_file:
        audio_stretch.save(path=temp_file.name)
        audio = AudioSegment.from_file(temp_file.name)
        # Adjust pitch if needed, then re-export over the same temp file.
        if pitch_shift != 0.0:
            audio = change_pitch(audio, pitch_shift)
        audio.export(temp_file.name, format=output_file_format)
        temp_file_path = temp_file.name
    return temp_file_path
# Gradio UI: voice/format pickers, text input, advanced TTS sliders, and a
# button (or Enter in the textbox) that routes everything through predict().
with gr.Blocks() as demo:
    gr.Markdown("# <center> Letsur Text-To-Speech API with Gradio </center>")
    with gr.Row(variant="panel"):
        voice = gr.Dropdown(choices=VOICE, label="Voice Options", value="μΉν")
        output_file_format = gr.Dropdown(choices=["mp3"], label="Output Options", value="mp3")
    # NOTE(review): the default/placeholder text is mojibake of Korean
    # "안녕하세요." ("Hello."). The corrupted source split this literal across
    # physical lines (a SyntaxError) — it is rejoined onto one line here;
    # confirm the exact text against the original UTF-8 file.
    text = gr.Textbox(label="Input text",
                      value="μλνμΈμ.",
                      placeholder="μλνμΈμ.")
    # Additional parameters
    with gr.Accordion("Advanced Settings", open=False):
        speed = gr.Slider(label="speed", minimum=0.8, maximum=1.2, step=0.1, value=1.0)
        pitch_shift = gr.Slider(label="pitch_shift", minimum=-0.1, maximum=0.1, step=0.05, value=0.0)  # range: -0.1~0.1 (octaves)
        stability = gr.Slider(label="stability", minimum=0., maximum=1., step=0.1, value=1.0)  # range: 0~1
        similarity = gr.Slider(label="similarity", minimum=0., maximum=1., step=0.1, value=1.0)  # range: 0~1
        style_exaggeration = gr.Slider(label="style_exaggeration", minimum=0., maximum=1., step=0.1, value=0.)  # range: 0~1
        speaker_boost = gr.Checkbox(label="speaker_boost", value=True)  # True or False
    btn = gr.Button("Text-To-Speech")
    output_audio = gr.Audio(label="Speech Output")
    inputs = [text, voice, output_file_format] + [speed, pitch_shift, stability, similarity, style_exaggeration, speaker_boost]
    text.submit(fn=predict, inputs=inputs, outputs=output_audio, api_name="predict")
    btn.click(fn=predict, inputs=inputs, outputs=output_audio, api_name=False)
# NOTE(review): verify_auth is defined but never used — launch() likely was
# meant to pass auth=verify_auth. Not added here to avoid changing behavior;
# confirm whether the app should require login.
demo.queue().launch()