Spaces:

copyvara
/

Spark-TTS

Runtime error

App Files Files Community

spark-tts committed on Feb 26

Commit

ad5cf60

1 Parent(s): 6f15685

add webui

Browse files

Files changed (3) hide show

.gitignore +1 -1
sparktts/utils/token_parser.py +8 -0
webui.py +192 -0

.gitignore CHANGED Viewed

@@ -7,7 +7,7 @@ results/
 demo/
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/

 demo/
 # C extensions
 *.so
+.gradio/
 # Distribution / packaging
 .Python
 build/

sparktts/utils/token_parser.py CHANGED Viewed

@@ -19,6 +19,14 @@ LEVELS_MAP = {
     "very_high": 4,
 }
 GENDER_MAP = {
     "female": 0,
     "male": 1,

     "very_high": 4,
 }
+# Integer level (1-5) -> level-name string. webui.py indexes this with the
+# integer value of its pitch/speed sliders to obtain the level name.
+LEVELS_MAP_UI = {
+    1: 'very_low',
+    2: 'low',
+    3: 'moderate',
+    4: 'high',
+    5: 'very_high'
+}
 GENDER_MAP = {
     "female": 0,
     "male": 1,

webui.py ADDED Viewed

	@@ -0,0 +1,192 @@

+# Copyright (c) 2025 SparkAudio
+#               2025 Xinsheng Wang ([email protected])
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import torch
+import soundfile as sf
+import logging
+import gradio as gr
+from datetime import datetime
+from cli.SparkTTS import SparkTTS
+from sparktts.utils.token_parser import LEVELS_MAP_UI
+def initialize_model(model_dir="pretrained_models/Spark-TTS-0.5B", device=0):
+    """Load the model once at the beginning."""
+    logging.info(f"Loading model from: {model_dir}")
+    device = torch.device(f"cuda:{device}")
+    model = SparkTTS(model_dir, device)
+    return model
+def run_tts(
+    text,
+    model,
+    prompt_text=None,
+    prompt_speech=None,
+    gender=None,
+    pitch=None,
+    speed=None,
+    save_dir="example/results",
+):
+    """Perform TTS inference and save the generated audio."""
+    logging.info(f"Saving audio to: {save_dir}")
+    if prompt_text is not None:
+        prompt_text = None if len(prompt_text) <= 1 else prompt_text
+    # Ensure the save directory exists
+    os.makedirs(save_dir, exist_ok=True)
+    # Generate unique filename using timestamp
+    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+    save_path = os.path.join(save_dir, f"{timestamp}.wav")
+    logging.info("Starting inference...")
+    # Perform inference and save the output audio
+    with torch.no_grad():
+        wav = model.inference(
+            text,
+            prompt_speech,
+            prompt_text,
+            gender,
+            pitch,
+            speed,
+        )
+        sf.write(save_path, wav, samplerate=16000)
+    logging.info(f"Audio saved at: {save_path}")
+    return save_path, model  # Return model along with audio path
+def voice_clone(text, model, prompt_text, prompt_wav_upload, prompt_wav_record):
+    """Gradio interface for TTS with prompt speech input."""
+    # Determine prompt speech (from audio file or recording)
+    prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record
+    prompt_text = None if len(prompt_text) < 2 else prompt_text
+    audio_output_path, model = run_tts(
+        text, model, prompt_text=prompt_text, prompt_speech=prompt_speech
+    )
+    return audio_output_path, model
+def voice_creation(text, model, gender, pitch, speed):
+    """Gradio interface for TTS with control over voice attributes."""
+    pitch = LEVELS_MAP_UI[int(pitch)]
+    speed = LEVELS_MAP_UI[int(speed)]
+    audio_output_path, model = run_tts(
+        text, model, gender=gender, pitch=pitch, speed=speed
+    )
+    return audio_output_path, model
+def build_ui(model_dir, device=0):
+    with gr.Blocks() as demo:
+        # Initialize model
+        model = initialize_model(model_dir, device=device)
+        # Use HTML for centered title
+        gr.HTML('<h1 style="text-align: center;">Spark-TTS by SparkAudio</h1>')
+        with gr.Tabs():
+            # Voice Clone Tab
+            with gr.TabItem("Voice Clone"):
+                gr.Markdown(
+                    "### Upload reference audio or recording （上传参考音频或者录音）"
+                )
+                with gr.Row():
+                    prompt_wav_upload = gr.Audio(
+                        sources="upload",
+                        type="filepath",
+                        label="Choose the prompt audio file, ensuring the sampling rate is no lower than 16kHz.",
+                    )
+                    prompt_wav_record = gr.Audio(
+                        sources="microphone",
+                        type="filepath",
+                        label="Record the prompt audio file.",
+                    )
+                with gr.Row():
+                    text_input = gr.Textbox(
+                        label="Text", lines=3, placeholder="Enter text here"
+                    )
+                    prompt_text_input = gr.Textbox(
+                        label="Text of prompt speech (Optional; recommended for cloning in the same language.)",
+                        lines=3,
+                        placeholder="Enter text of the prompt speech.",
+                    )
+                audio_output = gr.Audio(
+                    label="Generated Audio", autoplay=True, streaming=True
+                )
+                generate_buttom_clone = gr.Button("Generate")
+                generate_buttom_clone.click(
+                    voice_clone,
+                    inputs=[
+                        text_input,
+                        gr.State(model),
+                        prompt_text_input,
+                        prompt_wav_upload,
+                        prompt_wav_record,
+                    ],
+                    outputs=[audio_output, gr.State(model)],
+                )
+            # Voice Creation Tab
+            with gr.TabItem("Voice Creation"):
+                gr.Markdown(
+                    "### Create your own voice based on the following parameters"
+                )
+                with gr.Row():
+                    with gr.Column():
+                        gender = gr.Radio(
+                            choices=["male", "female"], value="male", label="Gender"
+                        )
+                        pitch = gr.Slider(
+                            minimum=1, maximum=5, step=1, value=3, label="Pitch"
+                        )
+                        speed = gr.Slider(
+                            minimum=1, maximum=5, step=1, value=3, label="Speed"
+                        )
+                    with gr.Column():
+                        text_input_creation = gr.Textbox(
+                            label="Input Text",
+                            lines=3,
+                            placeholder="Enter text here",
+                            value="You can generate a customized voice by adjusting parameters such as pitch and speed.",
+                        )
+                        create_button = gr.Button("Create Voice")
+                audio_output = gr.Audio(
+                    label="Generated Audio", autoplay=True, streaming=True
+                )
+                create_button.click(
+                    voice_creation,
+                    inputs=[text_input_creation, gr.State(model), gender, pitch, speed],
+                    outputs=[audio_output, gr.State(model)],
+                )
+    return demo
+if __name__ == "__main__":
+    demo = build_ui(model_dir="pretrained_models/Spark-TTS-0.5B", device=5)
+    demo.launch()