Update app.py

app.py CHANGED

@@ -10,7 +10,40 @@ from fastapi.responses import StreamingResponse, Response, HTMLResponse
 from fastapi.middleware import Middleware
 from fastapi.middleware.gzip import GZipMiddleware
 
-from
+from misaki import en
+
+import os
+import numpy as np
+from onnxruntime import InferenceSession
+from huggingface_hub import snapshot_download
+
+import json
+
+# Load the configuration file
+config_file_path = 'config.json'  # update this with the path to your config file
+
+with open(config_file_path, 'r') as f:
+    config = json.load(f)
+
+# Extract the phoneme vocabulary
+phoneme_vocab = config['vocab']
+
+# Download the model and voice files from the Hugging Face Hub
+model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
+model_name = "onnx/model_q8f16.onnx"
+voice_file = "voices/*"  # glob pattern so the per-voice .bin files are fetched
+local_dir = "."
+
+# Fetch only the model and the voice files
+snapshot_download(
+    repo_id=model_repo,
+    local_dir=local_dir,
+    allow_patterns=[model_name, voice_file],
+)
+
+# Load the ONNX model
+model_path = os.path.join(local_dir, model_name)
+sess = InferenceSession(model_path)
 
 app = FastAPI(
     title="Kokoro TTS FastAPI",
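Note: a quick way to sanity-check the model load above is to list the
session's input signature via onnxruntime's introspection API. A minimal
sketch (the names and shapes shown are assumptions about the Kokoro ONNX
export, not part of this commit):

    for inp in sess.get_inputs():
        print(inp.name, inp.shape, inp.type)
    # expected, roughly:
    #   input_ids [batch_size, sequence_length] tensor(int64)
    #   style     [1, 256]                      tensor(float)
    #   speed     [1]                           tensor(float)
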
@@ -23,7 +56,7 @@ app = FastAPI(
 # Global Pipeline Instance
 # ------------------------------------------------------------------------------
 # Create one pipeline instance for the entire app.
-
+
 
 
 # ------------------------------------------------------------------------------
@@ -126,57 +159,70 @@ def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24
 
     return encoded_data
 
+g2p = en.G2P(trf=False, british=False, fallback=None)  # no transformer, American English
+
+def tokenizer(text):
+    """Convert text to a list of Kokoro token ids."""
+    # G2P returns a phoneme string (plus alignment info we do not need here).
+    phonemes_string, _ = g2p(text)
+    # Keep only phonemes present in the vocabulary and map them to ids.
+    tokens = [phoneme_vocab[p] for p in phonemes_string if p in phoneme_vocab]
+    return tokens
+
+
+
 
 # ------------------------------------------------------------------------------
 # Endpoints
 # ------------------------------------------------------------------------------
 
-@app.get("/tts/streaming", summary="Streaming TTS")
-def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
-    """
-    Streaming TTS endpoint that returns a continuous audio stream.
-    Supports WAV (PCM) and Opus formats. Opus offers significantly better compression.
-
-    The endpoint first yields a WAV header (with a dummy length) for WAV,
-    then yields encoded audio data for each text chunk as soon as it is generated.
-    """
-    # Split the input text using the custom doubling strategy.
-    chunks = custom_split_text(text)
-    sample_rate = 24000
-    num_channels = 1
-    sample_width = 2  # 16-bit PCM
-
-    def audio_generator():
-        if format.lower() == "wav":
-            # Yield the WAV header first.
-            header = generate_wav_header(sample_rate, num_channels, sample_width)
-            yield header
-        # Process and yield each chunk's audio data.
-        for i, chunk in enumerate(chunks):
-            print(f"Processing chunk {i}: {chunk}")  # Debugging
-            try:
-                results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
-                for result in results:
-                    if result.audio is not None:
-                        if format.lower() == "wav":
-                            yield audio_tensor_to_pcm_bytes(result.audio)
-                        elif format.lower() == "opus":
-                            yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
-                        else:
-                            raise ValueError(f"Unsupported audio format: {format}")
-                    else:
-                        print(f"Chunk {i}: No audio generated")
-            except Exception as e:
-                print(f"Error processing chunk {i}: {e}")
-                yield b''  # important so that streaming continues. Consider returning an error sound.
-
-    media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
-
-    return StreamingResponse(
-        audio_generator(),
-        media_type=media_type,
-        headers={"Cache-Control": "no-cache"},
-    )
+# @app.get("/tts/streaming", summary="Streaming TTS")
+# def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
+#     """
+#     Streaming TTS endpoint that returns a continuous audio stream.
+#     Supports WAV (PCM) and Opus formats. Opus offers significantly better compression.
+
+#     The endpoint first yields a WAV header (with a dummy length) for WAV,
+#     then yields encoded audio data for each text chunk as soon as it is generated.
+#     """
+#     # Split the input text using the custom doubling strategy.
+#     chunks = custom_split_text(text)
+#     sample_rate = 24000
+#     num_channels = 1
+#     sample_width = 2  # 16-bit PCM
+
+
+#     def audio_generator():
+#         if format.lower() == "wav":
+#             # Yield the WAV header first.
+#             header = generate_wav_header(sample_rate, num_channels, sample_width)
+#             yield header
+#         # Process and yield each chunk's audio data.
+#         for i, chunk in enumerate(chunks):
+#             print(f"Processing chunk {i}: {chunk}")  # Debugging
+#             try:
+#                 results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
+#                 for result in results:
+#                     if result.audio is not None:
+#                         if format.lower() == "wav":
+#                             yield audio_tensor_to_pcm_bytes(result.audio)
+#                         elif format.lower() == "opus":
+#                             yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
+#                         else:
+#                             raise ValueError(f"Unsupported audio format: {format}")
+#                     else:
+#                         print(f"Chunk {i}: No audio generated")
+#             except Exception as e:
+#                 print(f"Error processing chunk {i}: {e}")
+#                 yield b''  # important so that streaming continues. Consider returning an error sound.
+
+#     media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
+
+#     return StreamingResponse(
+#         audio_generator(),
+#         media_type=media_type,
+#         headers={"Cache-Control": "no-cache"},
+#     )
 
 
 @app.get("/tts/full", summary="Full TTS")
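Note: a minimal usage sketch for the new tokenizer, assuming config.json's
vocab maps single phoneme characters to integer ids (as in the Kokoro
config):

    ids = tokenizer("Hello, world!")  # g2p -> phoneme string -> vocab ids
    print(ids)                        # a short list of ints, one per known phoneme
    assert all(isinstance(t, int) for t in ids)
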
@@ -185,21 +231,18 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str
     Full TTS endpoint that synthesizes the entire text, concatenates the audio,
     and returns a complete WAV or Opus file.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Concatenate all audio segments.
-    full_audio = np.concatenate(audio_segments)
+    voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
+    voices = np.fromfile(voice_path, dtype=np.float32).reshape(-1, 1, 256)
+
+    tokens = tokenizer(text)
+    ref_s = voices[len(tokens)]  # style vector is indexed by token count
+    final_token = [[0, *tokens, 0]]  # pad the id sequence with 0 at both ends
+
+    full_audio = sess.run(None, dict(
+        input_ids=np.array(final_token, dtype=np.int64),
+        style=ref_s,
+        speed=np.array([speed], dtype=np.float32),  # honor the speed parameter
+    ))[0]
 
     # Write the concatenated audio to an in-memory WAV or Opus file.
     sample_rate = 24000
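Note: with the ONNX path wired in, /tts/full can be exercised end to end. A
minimal client sketch (host and port are assumptions; 7860 is the usual
default for a Space running locally):

    import requests

    resp = requests.get(
        "http://localhost:7860/tts/full",
        params={"text": "Hello from Kokoro!", "voice": "af_heart",
                "speed": 1.0, "format": "wav"},
        timeout=120,
    )
    resp.raise_for_status()
    with open("out.wav", "wb") as f:
        f.write(resp.content)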