Update app.py

app.py CHANGED

@@ -10,7 +10,40 @@ from fastapi.responses import StreamingResponse, Response, HTMLResponse
 from fastapi.middleware import Middleware
 from fastapi.middleware.gzip import GZipMiddleware
 
-from
+from misaki import en
+
+import os
+import numpy as np
+from onnxruntime import InferenceSession
+from huggingface_hub import snapshot_download
+
+import json
+
+# Load the configuration file
+config_file_path = 'config.json'  # update this with the path to your config file
+
+with open(config_file_path, 'r') as f:
+    config = json.load(f)
+
+# Extract the phoneme vocabulary
+phoneme_vocab = config['vocab']
+
+# Download the model and voice files from the Hugging Face Hub
+model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
+model_name = "onnx/model_q8f16.onnx"
+voice_file = "voices/*"  # glob pattern so the per-voice .bin files are fetched
+local_dir = "."
+
+# Fetch only the model and the voice files
+snapshot_download(
+    repo_id=model_repo,
+    local_dir=local_dir,
+    allow_patterns=[model_name, voice_file],
+)
+
+# Load the ONNX model
+model_path = os.path.join(local_dir, model_name)
+sess = InferenceSession(model_path)
 
 app = FastAPI(
     title="Kokoro TTS FastAPI",
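Note: a quick way to sanity-check the model load above is to list the
session's input signature via onnxruntime's introspection API. A minimal
sketch (the names and shapes shown are assumptions about the Kokoro ONNX
export, not part of this commit):

    for inp in sess.get_inputs():
        print(inp.name, inp.shape, inp.type)
    # expected, roughly:
    #   input_ids [batch_size, sequence_length] tensor(int64)
    #   style     [1, 256]                      tensor(float)
    #   speed     [1]                           tensor(float)
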
@@ -23,7 +56,7 @@ app = FastAPI(
 # Global Pipeline Instance
 # ------------------------------------------------------------------------------
 # Create one pipeline instance for the entire app.
-
+
 
 
 # ------------------------------------------------------------------------------
@@ -126,57 +159,70 @@ def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24
 
     return encoded_data
 
+g2p = en.G2P(trf=False, british=False, fallback=None)  # no transformer, American English
+
+def tokenizer(text):
+    """Convert text to a list of Kokoro token ids."""
+    # G2P returns a phoneme string (plus alignment info we do not need here).
+    phonemes_string, _ = g2p(text)
+    # Keep only phonemes present in the vocabulary and map them to ids.
+    tokens = [phoneme_vocab[p] for p in phonemes_string if p in phoneme_vocab]
+    return tokens
+
+
+
 
 # ------------------------------------------------------------------------------
 # Endpoints
 # ------------------------------------------------------------------------------
 
-@app.get("/tts/streaming", summary="Streaming TTS")
-def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
-    """
-    Streaming TTS endpoint that returns a continuous audio stream.
-    Supports WAV (PCM) and Opus formats. Opus offers significantly better compression.
-
-    The endpoint first yields a WAV header (with a dummy length) for WAV,
-    then yields encoded audio data for each text chunk as soon as it is generated.
-    """
-    # Split the input text using the custom doubling strategy.
-    chunks = custom_split_text(text)
-    sample_rate = 24000
-    num_channels = 1
-    sample_width = 2  # 16-bit PCM
-
-    def audio_generator():
-        if format.lower() == "wav":
-            # Yield the WAV header first.
-            header = generate_wav_header(sample_rate, num_channels, sample_width)
-            yield header
-        # Process and yield each chunk's audio data.
-        for i, chunk in enumerate(chunks):
-            print(f"Processing chunk {i}: {chunk}")  # Debugging
-            try:
-                results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
-                for result in results:
-                    if result.audio is not None:
-                        if format.lower() == "wav":
-                            yield audio_tensor_to_pcm_bytes(result.audio)
-                        elif format.lower() == "opus":
-                            yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
-                        else:
-                            raise ValueError(f"Unsupported audio format: {format}")
-                    else:
-                        print(f"Chunk {i}: No audio generated")
-            except Exception as e:
-                print(f"Error processing chunk {i}: {e}")
-                yield b''  # important so that streaming continues. Consider returning an error sound.
-
-    media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
-
-    return StreamingResponse(
-        audio_generator(),
-        media_type=media_type,
-        headers={"Cache-Control": "no-cache"},
-    )
+# @app.get("/tts/streaming", summary="Streaming TTS")
+# def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
+#     """
+#     Streaming TTS endpoint that returns a continuous audio stream.
+#     Supports WAV (PCM) and Opus formats. Opus offers significantly better compression.
+
+#     The endpoint first yields a WAV header (with a dummy length) for WAV,
+#     then yields encoded audio data for each text chunk as soon as it is generated.
+#     """
+#     # Split the input text using the custom doubling strategy.
+#     chunks = custom_split_text(text)
+#     sample_rate = 24000
+#     num_channels = 1
+#     sample_width = 2  # 16-bit PCM
+
+
+#     def audio_generator():
+#         if format.lower() == "wav":
+#             # Yield the WAV header first.
+#             header = generate_wav_header(sample_rate, num_channels, sample_width)
+#             yield header
+#         # Process and yield each chunk's audio data.
+#         for i, chunk in enumerate(chunks):
+#             print(f"Processing chunk {i}: {chunk}")  # Debugging
+#             try:
+#                 results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
+#                 for result in results:
+#                     if result.audio is not None:
+#                         if format.lower() == "wav":
+#                             yield audio_tensor_to_pcm_bytes(result.audio)
+#                         elif format.lower() == "opus":
+#                             yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
+#                         else:
+#                             raise ValueError(f"Unsupported audio format: {format}")
+#                     else:
+#                         print(f"Chunk {i}: No audio generated")
+#             except Exception as e:
+#                 print(f"Error processing chunk {i}: {e}")
+#                 yield b''  # important so that streaming continues. Consider returning an error sound.
+
+#     media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
+
+#     return StreamingResponse(
+#         audio_generator(),
+#         media_type=media_type,
+#         headers={"Cache-Control": "no-cache"},
+#     )
 
 
 @app.get("/tts/full", summary="Full TTS")
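Note: a minimal usage sketch for the new tokenizer, assuming config.json's
vocab maps single phoneme characters to integer ids (as in the Kokoro
config):

    ids = tokenizer("Hello, world!")  # g2p -> phoneme string -> vocab ids
    print(ids)                        # a short list of ints, one per known phoneme
    assert all(isinstance(t, int) for t in ids)
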
@@ -185,21 +231,18 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str
     Full TTS endpoint that synthesizes the entire text, concatenates the audio,
     and returns a complete WAV or Opus file.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Concatenate all audio segments.
-    full_audio = np.concatenate(audio_segments)
+    voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
+    voices = np.fromfile(voice_path, dtype=np.float32).reshape(-1, 1, 256)
+
+    tokens = tokenizer(text)
+    ref_s = voices[len(tokens)]  # style vector is indexed by token count
+    final_token = [[0, *tokens, 0]]  # pad the id sequence with 0 at both ends
+
+    full_audio = sess.run(None, dict(
+        input_ids=np.array(final_token, dtype=np.int64),
+        style=ref_s,
+        speed=np.array([speed], dtype=np.float32),  # honor the speed parameter
+    ))[0]
 
     # Write the concatenated audio to an in-memory WAV or Opus file.
     sample_rate = 24000
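Note: with the ONNX path wired in, /tts/full can be exercised end to end. A
minimal client sketch (host and port are assumptions; 7860 is the usual
default for a Space running locally):

    import requests

    resp = requests.get(
        "http://localhost:7860/tts/full",
        params={"text": "Hello from Kokoro!", "voice": "af_heart",
                "speed": 1.0, "format": "wav"},
        timeout=120,
    )
    resp.raise_for_status()
    with open("out.wav", "wb") as f:
        f.write(resp.content)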