Spaces:

gaganyatri
/

tts-indic-f5

Paused

App Files Files Community

sachin committed 17 days ago

Commit

457fdad

1 Parent(s): 56c2e15

tet

Browse files

Files changed (2) hide show

requirements.txt +1 -0
tts_api.py +23 -11

requirements.txt CHANGED Viewed

@@ -9,6 +9,7 @@ anyio==4.9.0
 async-timeout==5.0.1
 attrs==25.3.0
 audioread==3.0.1
 bitsandbytes==0.45.5
 boto3==1.37.29
 botocore==1.37.29

 async-timeout==5.0.1
 attrs==25.3.0
 audioread==3.0.1
+flash-attn
 bitsandbytes==0.45.5
 boto3==1.37.29
 botocore==1.37.29

tts_api.py CHANGED Viewed

@@ -12,6 +12,14 @@ from typing import Optional, Dict
 from starlette.responses import StreamingResponse
 from fastapi.responses import RedirectResponse
 # Initialize FastAPI app
 app = FastAPI(title="IndicF5 Text-to-Speech API", description="High-quality TTS for Indian languages with Kannada output")
@@ -20,12 +28,12 @@ repo_id = "ai4bharat/IndicF5"
 model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = model.to(device)
-model.eval()  # Set model to evaluation mode for inference
 if torch.cuda.is_available():
-    torch.cuda.synchronize()  # Ensure CUDA is ready
 print("Device:", device)
-# Precompile model if possible (for PyTorch 2.0+)
 if hasattr(torch, "compile"):
     model = torch.compile(model)
@@ -48,12 +56,7 @@ class SynthesizeRequest(BaseModel):
 class KannadaSynthesizeRequest(BaseModel):
     text: str
-# Response model with timing
-class SynthesisResponse(BaseModel):
-    audio: bytes
-    timing: Dict[str, float]
-# Cache for reference audio to avoid repeated downloads
 audio_cache = {}
 def load_audio_from_url(url: str) -> tuple:
@@ -91,10 +94,19 @@ def synthesize_speech(text: str, ref_audio_name: str, ref_text: str) -> tuple[io
         sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
         temp_audio.flush()
-        # Inference with no_grad for optimization
         start_inference = time.time()
         with torch.no_grad():
-            audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
         timing["inference"] = time.time() - start_inference
     timing["temp_file"] = time.time() - start_temp

 from starlette.responses import StreamingResponse
 from fastapi.responses import RedirectResponse
+# Check if flash-attn is available
+try:
+    from flash_attn import flash_attention
+    FLASH_ATTENTION_AVAILABLE = True
+except ImportError:
+    FLASH_ATTENTION_AVAILABLE = False
+    print("Flash Attention not available. Install with 'pip install flash-attn' for better performance.")
 # Initialize FastAPI app
 app = FastAPI(title="IndicF5 Text-to-Speech API", description="High-quality TTS for Indian languages with Kannada output")
 model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = model.to(device)
+model.eval()  # Set model to evaluation mode
 if torch.cuda.is_available():
+    torch.cuda.synchronize()
 print("Device:", device)
+# Precompile model if possible (PyTorch 2.0+)
 if hasattr(torch, "compile"):
     model = torch.compile(model)
 class KannadaSynthesizeRequest(BaseModel):
     text: str
+# Cache for reference audio
 audio_cache = {}
 def load_audio_from_url(url: str) -> tuple:
         sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
         temp_audio.flush()
+        # Inference with Flash Attention
         start_inference = time.time()
         with torch.no_grad():
+            if FLASH_ATTENTION_AVAILABLE and torch.cuda.is_available():
+                # Assuming model has an attention mechanism we can override
+                # This is a placeholder; actual implementation depends on model internals
+                try:
+                    audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text, attention_impl="flash")
+                except AttributeError:
+                    print("Warning: Model does not support custom attention_impl. Using default.")
+                    audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
+            else:
+                audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
         timing["inference"] = time.time() - start_inference
     timing["temp_file"] = time.time() - start_temp