# Page-capture residue (not part of the program), condensed into a comment:
#   Spaces status: Sleeping
#   File size: 8,707 bytes
#   Blame gutter: revisions 3dba9d4 and b4f1e5a alternate over lines 1-247.
import gradio as gr
import numpy as np
import os
import time
import torch
from scipy.io import wavfile
# Bark imports
from bark import generate_audio, SAMPLE_RATE
from bark.generation import preload_models
# Hugging Face Transformers
from transformers import AutoModelForTextToSpeech, AutoProcessor, AutoTokenizer
from transformers import SpeechT5HifiGan, SpeechT5ForTextToSpeech, SpeechT5Processor
class VoiceSynthesizer:
    """Text-to-speech front end that synthesizes with Bark or SpeechT5.

    Generated audio is written as WAV files into a ``working_files``
    directory next to this module.  ``generate_speech`` reports failures as
    a ``(None, message)`` tuple instead of raising, so a UI callback can
    forward the pair straight to its outputs.
    """

    # Bark preset used when the caller does not pick one.
    DEFAULT_BARK_PRESET = "v2/en_speaker_6"
    # Sample rate handed to the WAV writer for SpeechT5 output.
    SPEECHT5_SAMPLE_RATE = 16000

    def __init__(self):
        """Create the output directory and try to preload the Bark models.

        A Bark preload failure is printed rather than raised, so the object
        is still constructed and the SpeechT5 path remains usable.
        """
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)

        # Model name -> zero-argument initializer.
        self.models = {
            "bark": self._initialize_bark,
            "speecht5": self._initialize_speecht5,
        }
        self.current_model = "bark"

        # Filled by the first successful _initialize_speecht5() call.
        self._speecht5_bundle = None

        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Bark model loading error: {e}")

    def _initialize_bark(self):
        """Return None; Bark needs no per-call setup (preloaded in __init__)."""
        return None

    def _initialize_speecht5(self):
        """Load (once) and return the SpeechT5 components.

        Returns:
            A dict with keys ``"model"``, ``"processor"``, ``"vocoder"`` and
            ``"speaker_embeddings"``, or ``None`` if anything failed to load
            (the error is printed).  A successful result is cached on the
            instance so later calls do not reload the weights.
        """
        # getattr keeps this safe on instances created without __init__.
        cached = getattr(self, "_speecht5_bundle", None)
        if cached is not None:
            return cached

        try:
            # Imported here because the module-level imports do not provide
            # load_dataset; a missing package is then reported like any
            # other load failure instead of surfacing as a NameError.
            from datasets import load_dataset

            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

            # Use the first x-vector of the validation split as the speaker.
            embeddings_dataset = load_dataset(
                "Matthijs/cmu-arctic-xvectors", split="validation"
            )
            speaker_embeddings = torch.tensor(
                embeddings_dataset[0]["xvector"]
            ).unsqueeze(0)

            bundle = {
                "model": model,
                "processor": processor,
                "vocoder": vocoder,
                "speaker_embeddings": speaker_embeddings,
            }
        except Exception as e:
            print(f"SpeechT5 model loading error: {e}")
            return None

        self._speecht5_bundle = bundle
        return bundle

    def set_model(self, model_name):
        """Select the model used when generate_speech gets no model_name.

        Raises:
            ValueError: if ``model_name`` is not a key of ``self.models``.
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not supported")
        self.current_model = model_name

    def generate_speech(self, text, model_name=None, voice_preset=None):
        """Synthesize ``text`` and return ``(wav_path, error_message)``.

        Args:
            text: Text to speak; empty or whitespace-only text is rejected.
            model_name: ``"bark"`` or ``"speecht5"``; falls back to
                ``self.current_model`` when falsy.
            voice_preset: Forwarded to the model-specific generator.

        Returns:
            ``(filepath, None)`` on success, ``(None, message)`` on failure.
            Exceptions from the generators are caught, printed with a
            traceback, and converted into the failure tuple.
        """
        if not text or not text.strip():
            return None, "Please enter some text to speak"

        current_model = model_name or self.current_model
        try:
            if current_model == "bark":
                return self._generate_bark_speech(text, voice_preset)
            elif current_model == "speecht5":
                return self._generate_speecht5_speech(text, voice_preset)
            else:
                raise ValueError(f"Unsupported model: {current_model}")
        except Exception as e:
            print(f"Speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error generating speech: {str(e)}"

    @staticmethod
    def _normalize_voice_preset(voice_preset):
        """Strip a trailing `` (Label)`` suffix from a preset string.

        UI choices such as ``"v2/en_speaker_6 (Female)"`` carry a display
        label that is not part of the preset id.  Non-string values are
        returned unchanged; ``None`` and blank strings yield ``None``.
        """
        if voice_preset is None:
            return None
        if not isinstance(voice_preset, str):
            return voice_preset
        cleaned = voice_preset.strip()
        if cleaned.endswith(")") and " (" in cleaned:
            cleaned = cleaned.rpartition(" (")[0].rstrip()
        return cleaned or None

    def _new_output_path(self, prefix):
        """Return a WAV path in the working directory for ``prefix``.

        A nanosecond timestamp is used so that two requests arriving within
        the same second do not overwrite each other's file.
        """
        return os.path.join(self.working_dir, f"{prefix}_{time.time_ns()}.wav")

    def _generate_bark_speech(self, text, voice_preset=None):
        """Generate speech with Bark and return ``(wav_path, None)``.

        ``voice_preset`` is passed to Bark as ``history_prompt`` after any
        display-label suffix is removed; ``DEFAULT_BARK_PRESET`` is used
        when no preset is given.
        """
        history_prompt = (
            self._normalize_voice_preset(voice_preset) or self.DEFAULT_BARK_PRESET
        )
        audio_array = generate_audio(text, history_prompt=history_prompt)

        filepath = self._new_output_path("bark_speech")
        wavfile.write(filepath, SAMPLE_RATE, audio_array)
        return filepath, None

    def _generate_speecht5_speech(self, text, speaker_id=None):
        """Generate speech with SpeechT5 and return ``(wav_path, error)``.

        ``speaker_id`` is accepted for signature parity with the Bark
        generator but is currently ignored: the speaker embedding loaded by
        ``_initialize_speecht5`` is always used.
        """
        speecht5_models = self.models["speecht5"]()
        if not speecht5_models:
            return None, "SpeechT5 model not loaded"

        model = speecht5_models["model"]
        processor = speecht5_models["processor"]
        vocoder = speecht5_models["vocoder"]
        speaker_embeddings = speecht5_models["speaker_embeddings"]

        inputs = processor(text=text, return_tensors="pt")

        # The vocoder must be supplied to get a waveform back; without it
        # generate_speech returns a spectrogram, which is not playable audio.
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder,
        )
        audio_array = speech.detach().cpu().numpy()

        filepath = self._new_output_path("speecht5_speech")
        wavfile.write(filepath, self.SPEECHT5_SAMPLE_RATE, audio_array)
        return filepath, None
def create_interface():
    """Build the Gradio Blocks UI around a ``VoiceSynthesizer``.

    The UI has a text box, a model selector, one preset dropdown per model
    (only the one matching the selected model is visible), a generate
    button, an audio player and an error text box.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    synthesizer = VoiceSynthesizer()

    # Dropdown label -> identifier understood by VoiceSynthesizer.
    model_ids = {
        "bark (Suno AI)": "bark",
        "speecht5 (Microsoft)": "speecht5",
    }
    # Dropdown label -> preset id; the "(Female)" style suffix is display
    # text only and must not be forwarded as the preset.
    bark_preset_ids = {
        "v2/en_speaker_6 (Female)": "v2/en_speaker_6",
        "v2/en_speaker_3 (Male)": "v2/en_speaker_3",
        "v2/en_speaker_9 (Neutral)": "v2/en_speaker_9",
    }

    with gr.Blocks() as interface:
        gr.Markdown("# 🎙️ Advanced Voice Synthesis")
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Speech Generation")
                text_input = gr.Textbox(label="Enter Text to Speak")

                # Model selection
                model_dropdown = gr.Dropdown(
                    choices=list(model_ids),
                    label="Select TTS Model",
                    value="bark (Suno AI)",
                )

                # Voice preset dropdowns (one per model)
                with gr.Row():
                    bark_preset = gr.Dropdown(
                        choices=list(bark_preset_ids),
                        label="Bark Voice Preset",
                        visible=True,
                    )
                    speecht5_preset = gr.Dropdown(
                        choices=[
                            "Default Speaker",
                        ],
                        label="SpeechT5 Speaker",
                        visible=False,
                    )

                generate_btn = gr.Button("Generate Speech")
                audio_output = gr.Audio(label="Generated Speech")
                error_output = gr.Textbox(label="Errors", visible=True)

        def update_model_visibility(model):
            """Show only the preset dropdown that matches ``model``."""
            # A cleared dropdown yields None; treat it as "not bark".
            is_bark = "bark" in (model or "").lower()
            return {
                bark_preset: gr.update(visible=is_bark),
                speecht5_preset: gr.update(visible=not is_bark),
            }

        model_dropdown.change(
            fn=update_model_visibility,
            inputs=model_dropdown,
            outputs=[bark_preset, speecht5_preset],
        )

        def generate_speech_wrapper(text, model, bark_choice, speecht5_choice):
            """Translate UI selections and call the synthesizer.

            Returns the synthesizer's ``(audio_path, error_message)`` pair,
            which maps onto the audio and error outputs.
            """
            model_name = model_ids.get(model)
            if model_name is None:
                return None, "Please select a TTS model"

            if model_name == "bark":
                # Unknown labels are passed through unchanged.
                preset = bark_preset_ids.get(bark_choice, bark_choice)
            else:
                preset = speecht5_choice

            return synthesizer.generate_speech(
                text,
                model_name=model_name,
                voice_preset=preset,
            )

        generate_btn.click(
            fn=generate_speech_wrapper,
            inputs=[text_input, model_dropdown, bark_preset, speecht5_preset],
            outputs=[audio_output, error_output],
        )

    return interface
if __name__ == "__main__":
    # Script entry point: build the UI and serve it.
    interface = create_interface()
    interface.launch(
        share=False,
        debug=True,
        show_error=True,
        # Listen on all network interfaces rather than loopback only.
        server_name='0.0.0.0',
        server_port=7860,
    )