import gradio as gr
import os
import soundfile as sf
import numpy as np
import torch
import traceback
from inference import StyleTTS2
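
# NOTE: StyleTTS2 here is the wrapper class from this repo's inference.py,
# which loads the StyleTTS2-lite checkpoint and exposes get_styles()/generate().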

# Model setup: config and weights are expected under ./Models
repo_dir = './'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config_path = os.path.join(repo_dir, "Models", "config.yaml")
models_path = os.path.join(repo_dir, "Models", "model.pth")
model = StyleTTS2(config_path, models_path).eval().to(device)

# Bundled reference voices used by the clickable examples below
voice_path = os.path.join(repo_dir, "reference_audio")
eg_voices = [os.path.join(voice_path, "vn_1.wav"), os.path.join(voice_path, "vn_2.wav")]
# Example prompts (Vietnamese). Roughly: "With only about 90 million parameters,
# StyleTTS2-lite can easily generate speech at high speed." and "[id_1] With
# StyleTTS2-lite you can use a language tag so the model reliably reads in
# English, [id_2] as well as use a speaker tag to switch quickly between voices."
eg_texts = [
    "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
    "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
]
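
# Prompt tag syntax used by these examples (and described in the UI text below):
#   [en-us]{...}  eSpeak NG language tag; forces the wrapped words to be read as
#                 US English (other eSpeak NG language codes should work the same way).
#   [id_N]        speaker tag; switches to the N-th uploaded reference voice.
#                 main() below passes "[id_1]" to generate() as the default tag.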

# Core inference function
def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
    try:
        # Map each uploaded reference file to a speaker tag: [id_1], [id_2], ...
        speakers = {}
        for i, path in enumerate(reference_paths, 1):
            speaker_id = f"id_{i}"
            speakers[speaker_id] = {
                "path": path,
                "lang": "vi",
                "speed": 1.0,
            }
        with torch.no_grad():
            styles = model.get_styles(speakers, denoise, avg_style)
            r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")
        # Peak-normalize and save as a 24 kHz wav file
        r = r / np.abs(r).max()
        sf.write("output.wav", r, samplerate=24000)
        return "output.wav", "Audio generated successfully!"
    except Exception:
        # Surface the full traceback in the UI's status box
        error_message = traceback.format_exc()
        return None, error_message
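
# A minimal sketch of calling main() outside the UI, assuming the bundled
# reference clip exists; the settings mirror the gen_example() defaults below:
#   audio_path, status = main([os.path.join(voice_path, "vn_1.wav")], eg_texts[0],
#                             denoise=0.6, avg_style=True, stabilize=True)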

def on_file_upload(file_list):
    if not file_list:
        return None, "No file uploaded yet."
    # Keep the last path seen for each file name, dropping duplicate uploads
    unique_files = {}
    for file_path in file_list:
        file_name = os.path.basename(file_path)
        unique_files[file_name] = file_path
    # Report which speaker tag maps to which uploaded file
    uploaded_infos = [f"[id_{i}]: {name}" for i, name in enumerate(unique_files, 1)]
    summary = "\n".join(uploaded_infos)
    return list(unique_files.values()), f"Current reference audios:\n{summary}"
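
# For example, uploading vn_1.wav and vn_2.wav would produce the status text:
#   Current reference audios:
#   [id_1]: vn_1.wav
#   [id_2]: vn_2.wav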

# Wrapper used by gr.Examples: runs main() with the default settings
def gen_example(reference_paths, text_prompt):
    output, status = main(reference_paths, text_prompt, 0.6, True, True)
    return output, reference_paths, status

# Gradio UI
with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>StyleTTS2-Lite Demo</h1>")
    gr.Markdown(
        "Download the local inference package from Hugging Face: "
        "[StyleTTS2-Lite (Vietnamese)]"
        "(https://huggingface.co/dangtr0408/StyleTTS2-lite-vi/)."
    )
    gr.Markdown(
        "Annotate any non-Vietnamese words with the appropriate language tag, e.g., [en-us]{ } for English. "
        "For more information, see the "
        "[eSpeak NG docs]"
        "(https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)."
    )

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", lines=4)
        with gr.Column(scale=1):
            avg_style = gr.Checkbox(label="Use Average Styles", value=True)
            stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
            denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3"], file_count="multiple", height=150)
            gen_button = gr.Button("Generate")
        with gr.Column(scale=1):
            synthesized_audio = gr.Audio(label="Generated Audio", type="filepath")
            status = gr.Textbox(label="Status", interactive=False, lines=3)

    reference_audios.change(
        on_file_upload,
        inputs=[reference_audios],
        outputs=[reference_audios, status]
    )

    gen_button.click(
        fn=main,
        inputs=[
            reference_audios,
            text_prompt,
            denoise,
            avg_style,
            stabilize
        ],
        outputs=[synthesized_audio, status]
    )

    gr.Examples(
        examples=[[[eg_voices[0]], eg_texts[0]], [eg_voices, eg_texts[1]]],
        inputs=[reference_audios, text_prompt],
        outputs=[synthesized_audio, reference_audios, status],
        fn=gen_example,
        cache_examples=False,
        label="Examples",
        run_on_click=True
    )

demo.launch()