bcci committed on
Commit
f711016
·
verified ·
1 Parent(s): 6dfcc82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -63
app.py CHANGED
@@ -10,7 +10,40 @@ from fastapi.responses import StreamingResponse, Response, HTMLResponse
10
  from fastapi.middleware import Middleware
11
  from fastapi.middleware.gzip import GZipMiddleware
12
 
13
- from kokoro import KPipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  app = FastAPI(
16
  title="Kokoro TTS FastAPI",
@@ -23,7 +56,7 @@ app = FastAPI(
23
  # Global Pipeline Instance
24
  # ------------------------------------------------------------------------------
25
  # Create one pipeline instance for the entire app.
26
- pipeline = KPipeline(lang_code="a")
27
 
28
 
29
  # ------------------------------------------------------------------------------
@@ -126,57 +159,70 @@ def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24
126
 
127
  return encoded_data
128
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  # ------------------------------------------------------------------------------
131
  # Endpoints
132
  # ------------------------------------------------------------------------------
133
 
134
- @app.get("/tts/streaming", summary="Streaming TTS")
135
- def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
136
- """
137
- Streaming TTS endpoint that returns a continuous audio stream.
138
- Supports WAV (PCM) and Opus formats. Opus offers significantly better compression.
139
-
140
- The endpoint first yields a WAV header (with a dummy length) for WAV,
141
- then yields encoded audio data for each text chunk as soon as it is generated.
142
- """
143
- # Split the input text using the custom doubling strategy.
144
- chunks = custom_split_text(text)
145
- sample_rate = 24000
146
- num_channels = 1
147
- sample_width = 2 # 16-bit PCM
148
-
149
- def audio_generator():
150
- if format.lower() == "wav":
151
- # Yield the WAV header first.
152
- header = generate_wav_header(sample_rate, num_channels, sample_width)
153
- yield header
154
- # Process and yield each chunk's audio data.
155
- for i, chunk in enumerate(chunks):
156
- print(f"Processing chunk {i}: {chunk}") # Debugging
157
- try:
158
- results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
159
- for result in results:
160
- if result.audio is not None:
161
- if format.lower() == "wav":
162
- yield audio_tensor_to_pcm_bytes(result.audio)
163
- elif format.lower() == "opus":
164
- yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
165
- else:
166
- raise ValueError(f"Unsupported audio format: {format}")
167
- else:
168
- print(f"Chunk {i}: No audio generated")
169
- except Exception as e:
170
- print(f"Error processing chunk {i}: {e}")
171
- yield b'' # important so that streaming continues. Consider returning an error sound.
172
-
173
- media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
174
-
175
- return StreamingResponse(
176
- audio_generator(),
177
- media_type=media_type,
178
- headers={"Cache-Control": "no-cache"},
179
- )
 
180
 
181
 
182
  @app.get("/tts/full", summary="Full TTS")
@@ -185,21 +231,18 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str
185
  Full TTS endpoint that synthesizes the entire text, concatenates the audio,
186
  and returns a complete WAV or Opus file.
187
  """
188
- # Use newline-based splitting via the pipeline's split_pattern.
189
- results = list(pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+"))
190
- audio_segments = []
191
- for result in results:
192
- if result.audio is not None:
193
- audio_np = result.audio.cpu().numpy()
194
- if audio_np.ndim > 1:
195
- audio_np = audio_np.flatten()
196
- audio_segments.append(audio_np)
197
-
198
- if not audio_segments:
199
- raise HTTPException(status_code=500, detail="No audio generated.")
200
-
201
- # Concatenate all audio segments.
202
- full_audio = np.concatenate(audio_segments)
203
 
204
  # Write the concatenated audio to an in-memory WAV or Opus file.
205
  sample_rate = 24000
 
10
  from fastapi.middleware import Middleware
11
  from fastapi.middleware.gzip import GZipMiddleware
12
 
13
+ from misaki import en
14
+
15
+ import os
16
+ import numpy as np
17
+ from onnxruntime import InferenceSession
18
+ from huggingface_hub import snapshot_download
19
+
20
+ import json
21
+
22
+ # Load the configuration file
23
+ config_file_path = 'config.json' # Update this with the path to your config file
24
+
25
+ with open(config_file_path, 'r') as f:
26
+ config = json.load(f)
27
+
28
+ # Extract the phoneme vocabulary
29
+ phoneme_vocab = config['vocab']
30
+
31
+ # Download the ONNX model and the voices directory from the Hugging Face Hub
32
+ model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
33
+ model_name = "onnx/model_q8f16.onnx"
34
+ voice_file = "voices"
35
+ local_dir = "."
36
+
37
+ # Download the model and voice file
38
+ snapshot_download(
39
+ repo_id=model_repo,
40
+ local_dir=local_dir,
41
+ allow_patterns=[model_name, voice_file],
42
+ )
43
+
44
+ # Load the downloaded ONNX model into an inference session
45
+ model_path = os.path.join(local_dir, model_name)
46
+ sess = InferenceSession(model_path)
47
 
48
  app = FastAPI(
49
  title="Kokoro TTS FastAPI",
 
56
  # Global Pipeline Instance
57
  # ------------------------------------------------------------------------------
58
  # Create one pipeline instance for the entire app.
59
+
60
 
61
 
62
  # ------------------------------------------------------------------------------
 
159
 
160
  return encoded_data
161
 
162
+ g2p = en.G2P(trf=False, british=False, fallback=None) # no transformer, American English
163
+
164
+ def tokenizer(text):
165
+ phonemes_string, _ = g2p(text)
166
+ phonemes = []
167
+ for i in phonemes_string:
168
+ phonemes.append(i)
169
+ tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
170
+ return tokens
171
+
172
+
173
+
174
 
175
  # ------------------------------------------------------------------------------
176
  # Endpoints
177
  # ------------------------------------------------------------------------------
178
 
179
+ # @app.get("/tts/streaming", summary="Streaming TTS")
180
+ # def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
181
+ # """
182
+ # Streaming TTS endpoint that returns a continuous audio stream.
183
+ # Supports WAV (PCM) and Opus formats. Opus offers significantly better compression.
184
+
185
+ # The endpoint first yields a WAV header (with a dummy length) for WAV,
186
+ # then yields encoded audio data for each text chunk as soon as it is generated.
187
+ # """
188
+ # # Split the input text using the custom doubling strategy.
189
+ # chunks = custom_split_text(text)
190
+ # sample_rate = 24000
191
+ # num_channels = 1
192
+ # sample_width = 2 # 16-bit PCM
193
+
194
+
195
+ # def audio_generator():
196
+ # if format.lower() == "wav":
197
+ # # Yield the WAV header first.
198
+ # header = generate_wav_header(sample_rate, num_channels, sample_width)
199
+ # yield header
200
+ # # Process and yield each chunk's audio data.
201
+ # for i, chunk in enumerate(chunks):
202
+ # print(f"Processing chunk {i}: {chunk}") # Debugging
203
+ # try:
204
+ # results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
205
+ # for result in results:
206
+ # if result.audio is not None:
207
+ # if format.lower() == "wav":
208
+ # yield audio_tensor_to_pcm_bytes(result.audio)
209
+ # elif format.lower() == "opus":
210
+ # yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
211
+ # else:
212
+ # raise ValueError(f"Unsupported audio format: {format}")
213
+ # else:
214
+ # print(f"Chunk {i}: No audio generated")
215
+ # except Exception as e:
216
+ # print(f"Error processing chunk {i}: {e}")
217
+ # yield b'' # important so that streaming continues. Consider returning an error sound.
218
+
219
+ # media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
220
+
221
+ # return StreamingResponse(
222
+ # audio_generator(),
223
+ # media_type=media_type,
224
+ # headers={"Cache-Control": "no-cache"},
225
+ # )
226
 
227
 
228
  @app.get("/tts/full", summary="Full TTS")
 
231
  Full TTS endpoint that synthesizes the entire text, concatenates the audio,
232
  and returns a complete WAV or Opus file.
233
  """
234
+ voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
235
+ voices = np.fromfile(voice_path, dtype=np.float32).reshape(-1, 1, 256)
236
+
237
+ tokens = tokenizer(text)
238
+
239
+ final_token = [[0, *tokens]]
240
+
241
+ full_audio = sess.run(None, dict(
242
+ input_ids=tokens,
243
+ style=ref_s,
244
+ speed=np.ones(1, dtype=np.float32),
245
+ ))[0]
 
 
 
246
 
247
  # Write the concatenated audio to an in-memory WAV or Opus file.
248
  sample_rate = 24000