Update app.py

app.py CHANGED
@@ -2,6 +2,7 @@ import io
 import re
 import wave
 import struct
+import time
 
 import numpy as np
 import torch
@@ -10,7 +11,8 @@ from fastapi.responses import StreamingResponse, Response, HTMLResponse
 from fastapi.middleware import Middleware
 from fastapi.middleware.gzip import GZipMiddleware
 
-from kokoro import KPipeline
+from kokoro import KPipeline, StreamKPipeline
+from kokoro.model import KModel
 
 app = FastAPI(
     title="Kokoro TTS FastAPI",
@@ -23,9 +25,14 @@ app = FastAPI(
 # Global Pipeline Instance
 # ------------------------------------------------------------------------------
 # Create one pipeline instance for the entire app.
-
-
+model = KModel()  # Or however you initialize/load your model
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+#pipeline = KPipeline(lang_code="a",model=model)
+voice = "af_heart"
+speed = 1.0
 
+pipeline = StreamKPipeline(lang_code="a", model=model, voice=voice, device=device, speed=speed)
 
 # ------------------------------------------------------------------------------
 # Helper Functions
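The hunk above gives every request one shared model and one shared StreamKPipeline. A quick sanity check of that module-level setup, as a sketch that assumes only the StreamKPipeline constructor and the per-chunk `result.audio` tensor already shown in this diff:

```python
# Sketch: exercise the module-level pipeline once, outside FastAPI.
# Assumes the StreamKPipeline signature and the `result.audio` attribute
# used elsewhere in this diff; not part of the committed app.
if __name__ == "__main__":
    total_samples = 0
    for result in pipeline("Hello from Kokoro."):
        if result.audio is not None:
            total_samples += result.audio.numel()
    # Output is 24 kHz mono, so samples / 24000 gives seconds of audio.
    print(f"{total_samples} samples, about {total_samples / 24000:.2f} s of audio")
```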
@@ -48,40 +55,6 @@ def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int,
     return header + fmt_chunk + data_chunk_header
 
 
-def custom_split_text(text: str) -> list:
-    """
-    Custom splitting:
-      - Start with a chunk size of 2 words.
-      - For each chunk, if a period (".") is found in any word (except if it's the very last word),
-        then split the chunk at that word (include words up to that word).
-      - Otherwise, use the current chunk size.
-      - For subsequent chunks, increase the chunk size by 2.
-      - If there are fewer than the desired number of words for a full chunk, add all remaining words.
-    """
-    words = text.split()
-    chunks = []
-    chunk_size = 2
-    start = 0
-    while start < len(words):
-        candidate_end = start + chunk_size
-        if candidate_end > len(words):
-            candidate_end = len(words)
-        chunk_words = words[start:candidate_end]
-        # Look for a period in any word except the last one.
-        split_index = None
-        for i in range(len(chunk_words) - 1):
-            if '.' in chunk_words[i]:
-                split_index = i
-                break
-        if split_index is not None:
-            candidate_end = start + split_index + 1
-            chunk_words = words[start:candidate_end]
-        chunks.append(" ".join(chunk_words))
-        start = candidate_end
-        chunk_size += 2  # Increase the chunk size by 2 for the next iteration.
-    return chunks
-
-
 def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
     """
     Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
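For reference, here is how the deleted splitter behaves on a concrete input, traced by hand against the implementation above: chunks start at two words, grow by two each iteration, and end early at a period.

```python
# Worked example for the removed custom_split_text (traced by hand).
text = "one two three. four five six seven eight nine"
assert custom_split_text(text) == [
    "one two",                          # chunk size 2
    "three.",                           # size 4, cut short at the period
    "four five six seven eight nine",   # size 6 takes the rest
]
```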
@@ -101,12 +74,12 @@ def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
 # ------------------------------------------------------------------------------
 
 @app.get("/tts/streaming", summary="Streaming TTS")
-def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0):
+def tts_streaming(text: str):
     """
-    Streaming TTS endpoint that returns a continuous audio stream
+    Streaming TTS endpoint that returns a continuous audio stream.
 
-    The endpoint yields a WAV header (with a dummy length)
-    then yields
+    The endpoint yields a WAV header (with a dummy length) for WAV,
+    then yields encoded audio data for each phoneme as soon as it is generated.
     """
     sample_rate = 24000
     num_channels = 1
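The hunk header above names audio_tensor_to_pcm_bytes, whose body lies outside this diff. A minimal sketch of the conversion its docstring describes (float samples in [-1, 1] to raw 16-bit PCM); the clipping step is an assumption, not confirmed app code:

```python
import numpy as np
import torch

# Sketch of the docstring's contract: float samples in [-1, 1] -> 16-bit PCM.
def audio_tensor_to_pcm_bytes_sketch(audio_tensor: torch.Tensor) -> bytes:
    audio_np = audio_tensor.detach().cpu().numpy().flatten()
    audio_np = np.clip(audio_np, -1.0, 1.0)  # guard against overshoot
    return (audio_np * 32767.0).astype(np.int16).tobytes()
```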
@@ -117,16 +90,18 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0):
         header = generate_wav_header(sample_rate, num_channels, sample_width)
         yield header
 
-        #
+        # Process and yield each audio chunk.
         try:
-            for stream_result in ...:
-                if stream_result.audio_chunk is not None:
-                    pcm_bytes = audio_tensor_to_pcm_bytes(stream_result.audio_chunk)
-                    yield pcm_bytes
-        except Exception as e:
-            print(f"Streaming error: {e}")
-            yield b''  # Keep stream alive on error
-
+            for result in pipeline(text):  # Use StreamKPipeline
+
+                if result.audio is not None:
+                    yield audio_tensor_to_pcm_bytes(result.audio)
+
+                else:
+                    print("No audio generated for phoneme")
+        except Exception as e:
+            print(f"Error processing: {e}")
+            yield b''  # Important so that streaming continues.
 
     media_type = "audio/wav"
 
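The generator above leads with generate_wav_header's output; that function's body is also outside this diff. A minimal sketch of a streaming-friendly WAV header, assuming the common trick of writing 0xFFFFFFFF dummy sizes because the total length is unknown when streaming starts:

```python
import struct

# Sketch only: the app's actual generate_wav_header is not shown in this diff.
# RIFF and data sizes are 0xFFFFFFFF placeholders; most players accept this.
def generate_wav_header_sketch(sample_rate: int, num_channels: int, sample_width: int) -> bytes:
    byte_rate = sample_rate * num_channels * sample_width
    block_align = num_channels * sample_width
    header = b"RIFF" + struct.pack("<I", 0xFFFFFFFF) + b"WAVE"
    fmt_chunk = b"fmt " + struct.pack(
        "<IHHIIHH",
        16,                # fmt chunk payload size
        1,                 # PCM format tag
        num_channels,
        sample_rate,
        byte_rate,
        block_align,
        sample_width * 8,  # bits per sample
    )
    data_chunk_header = b"data" + struct.pack("<I", 0xFFFFFFFF)
    return header + fmt_chunk + data_chunk_header
```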
@@ -136,52 +111,13 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0):
         headers={"Cache-Control": "no-cache"},
     )
 
-
-@app.get("/tts/full", summary="Full TTS")
-def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0):
-    """
-    Full TTS endpoint that synthesizes the entire text using KPipeline,
-    concatenates the audio, and returns a complete WAV file.
-    """
-    # Use newline-based splitting via the pipeline's split_pattern.
-    results = list(full_pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+"))
-    audio_segments = []
-    for result in results:
-        if result.audio is not None:
-            audio_np = result.audio.cpu().numpy()
-            if audio_np.ndim > 1:
-                audio_np = audio_np.flatten()
-            audio_segments.append(audio_np)
-
-    if not audio_segments:
-        raise HTTPException(status_code=500, detail="No audio generated.")
-
-    # Concatenate all audio segments.
-    full_audio = np.concatenate(audio_segments)
-
-    # Write the concatenated audio to an in-memory WAV file.
-    sample_rate = 24000
-    num_channels = 1
-    sample_width = 2  # 16-bit PCM -> 2 bytes per sample
-    wav_io = io.BytesIO()
-    with wave.open(wav_io, "wb") as wav_file:
-        wav_file.setnchannels(num_channels)
-        wav_file.setsampwidth(sample_width)
-        wav_file.setframerate(sample_rate)
-        full_audio_int16 = np.int16(full_audio * 32767)
-        wav_file.writeframes(full_audio_int16.tobytes())
-    wav_io.seek(0)
-    return Response(content=wav_io.read(), media_type="audio/wav")
-
-
-
+#Remove full tts
 @app.get("/", response_class=HTMLResponse)
 def index():
     """
     HTML demo page for Kokoro TTS.
 
-    This page provides a simple UI to enter text
-    and play synthesized audio from both the streaming and full endpoints.
+    This page provides a simple UI to enter text and play synthesized audio from the streaming endpoint.
     """
     return """
     <!DOCTYPE html>
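With /tts/full removed, a caller can still save a complete WAV by draining the streaming endpoint. A hedged client sketch: the base URL and port 7860 (the usual Spaces default) are assumptions, and the saved file keeps the dummy RIFF/data sizes from the streamed header:

```python
import requests

# Sketch: save the streamed WAV to disk. URL and port are assumptions,
# not part of this commit.
with requests.get(
    "http://localhost:7860/tts/streaming",
    params={"text": "Hello from Kokoro."},
    stream=True,
    timeout=120,
) as resp:
    resp.raise_for_status()
    with open("out.wav", "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
```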
@@ -191,34 +127,15 @@ def index():
     </head>
     <body>
       <h1>Kokoro TTS Demo</h1>
-      <textarea id="text" rows="4" cols="50" placeholder="Enter text here"></textarea><br>
-      <label for="voice">Voice:</label>
-      <input type="text" id="voice" value="af_heart"><br>
-      <label for="speed">Speed:</label>
-      <input type="number" step="0.1" id="speed" value="1.0"><br>
-      <br><br>
+      <textarea id="text" rows="4" cols="50" placeholder="Enter text here"></textarea><br><br>
       <button onclick="playStreaming()">Play Streaming TTS</button>
-      <button onclick="playFull()">Play Full TTS (Download WAV)</button>
       <br><br>
       <audio id="audio" controls autoplay></audio>
       <script>
         function playStreaming() {
           const text = document.getElementById('text').value;
-          const voice = document.getElementById('voice').value;
-          const speed = document.getElementById('speed').value;
-          const audio = document.getElementById('audio');
-          // Set the audio element's source to the streaming endpoint.
-          audio.src = `/tts/streaming?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
-          audio.type = 'audio/wav';
-          audio.play();
-        }
-        function playFull() {
-          const text = document.getElementById('text').value;
-          const voice = document.getElementById('voice').value;
-          const speed = document.getElementById('speed').value;
           const audio = document.getElementById('audio');
-
-          audio.src = `/tts/full?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
+          audio.src = `/tts/streaming?text=${encodeURIComponent(text)}`;
           audio.type = 'audio/wav';
           audio.play();
         }