sachin committed on
Commit
56c2e15
·
1 Parent(s): e3b5e47

add- compile

Browse files
Files changed (1) hide show
  1. tts_api.py +69 -88
tts_api.py CHANGED
@@ -1,4 +1,5 @@
1
  import io
 
2
  import torch
3
  import requests
4
  import tempfile
@@ -7,45 +8,29 @@ import soundfile as sf
7
  from fastapi import FastAPI, HTTPException
8
  from transformers import AutoModel
9
  from pydantic import BaseModel
10
- from typing import Optional
11
  from starlette.responses import StreamingResponse
 
12
 
13
  # Initialize FastAPI app
14
  app = FastAPI(title="IndicF5 Text-to-Speech API", description="High-quality TTS for Indian languages with Kannada output")
15
 
16
- # Load TTS model
17
  repo_id = "ai4bharat/IndicF5"
18
  model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
19
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
- print("Device:", device)
21
  model = model.to(device)
 
 
 
 
22
 
23
- # Example Data (Multiple Examples with URLs)
 
 
 
 
24
  EXAMPLES = [
25
- {
26
- "audio_name": "PAN_F (Happy)",
27
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
28
- "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮਿਸਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰದ ಮਹಿಸೂਸ ਹੋਇਆ।",
29
- "synth_text": "ನಾನು ಯಾವುದೇ ಚಿಂತೆ ಇಲ್ಲದೆ ನನ್ನ ಸ್ನೇಹಿತರನ್ನು ನನ್ನ ಆಟೋಮೊಬೈಲ್ ತಜ್ಞರ ಬಳಿಗೆ ಕಳುಹಿಸುತ್ತೇನೆ ಏಕೆಂದರೆ ಅವರು ಖಂಡಿತವಾಗಿಯೂ ಅವರ ಎಲ್ಲಾ ಅಗತ್ಯಗಳನ್ನು ಪೂರೈಸುತ್ತಾರೆ ಎಂದು ನನಗೆ ಗೊತ್ತು."
30
- },
31
- {
32
- "audio_name": "TAM_F (Happy)",
33
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
34
- "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
35
- "synth_text": "ಊಟದ ನಂತರ ಮೊಸರು ಅನ್ನ ತಿಂದರೆ ಒಂದು ಉತ್ಸಾಹವಾಗುತ್ತದೆ!"
36
- },
37
- {
38
- "audio_name": "MAR_F (WIKI)",
39
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
40
- "ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
41
- "synth_text": "ಪ್ರಾರಂಭಿಕ ಬೀಜ ಚಿಗುರೊಡೆಯುವಿಕೆ. ನಾನು ಸೋಲಾಪುರ ಜಿಲ್ಲೆಯ ಮಾಲಶಿರಸ್ ತಾಲೂಕಿನ ರೈತ ಗಣಪತ್ ಪಾಟೀಲ್ ಮಾತನಾಡುತ್ತಿದ್ದೇನೆ. ನನ್ನ ಕಬ್ಬಿನ ಬೆಳೆಯಲ್ಲಿ ಪ್ರಾರಂಭಿಕ ಬೀಜ ಚಿಗುರೊಡೆಯುವ ಕೀಟ ಕಂಡುಬರುತ್ತಿದೆ. ಕ್ಲೋರಂಟ್ರಾನಿಲಿಪ್ರೋಲ್ (ಕೊರಾಜೆನ್) ಬಳಸುವುದು ಸೂಕ್ತವೇ? ಅದರ ಪ್ರಮಾಣ ಎಷ್ಟಿರಬೇಕು?"
42
- },
43
- {
44
- "audio_name": "MAR_M (WIKI)",
45
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
46
- "ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अंतर्गत निषেধ केला.",
47
- "synth_text": "ಜೀವಾಣು ಕೊಳೆತ. ನಾನು ಅಹಮದ್‌ನಗರ ಜಿಲ್ಲೆಯ ರಾಹುರಿ ಗ್ರಾಮದಿಂದ ಬಾಳಾಸಾಹೇಬ್ ಜಾಧವ್ ಮಾತನಾಡುತ್ತಿದ್ದೇನೆ. ನನ್ನ ದಾಳಿಂಬೆ ತೋಟದಲ್ಲಿ ಜೀವಾಣು ಕೊಳೆತ ಹೆಚ್ಚಾಗಿ ಕಾಣಿಸುತ್ತಿದೆ. ಸ್ಟ್ರೆಪ್ಟೋಸೈಕ್ಲಿನ್ ಮತ್ತು ಕಾಪರ್ ಆಕ್ಸಿಕ್ಲೋರೈಡ್ ಸಿಂಪಡಣೆಗೆ ಸೂಕ್ತ ಪ್ರಮಾಣ ಎಷ್ಟು?"
48
- },
49
  {
50
  "audio_name": "KAN_F (Happy)",
51
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
@@ -54,112 +39,108 @@ EXAMPLES = [
54
  },
55
  ]
56
 
57
- # Pydantic models for request bodies
58
  class SynthesizeRequest(BaseModel):
59
- text: str # Text to synthesize (expected in Kannada)
60
- ref_audio_name: str # Dropdown of audio names from EXAMPLES
61
- ref_text: Optional[str] = None # Optional, defaults to example ref_text if not provided
62
 
63
  class KannadaSynthesizeRequest(BaseModel):
64
- text: str # Text to synthesize (must be in Kannada)
 
 
 
 
 
 
 
 
65
 
66
- # Function to load audio from URL
67
- def load_audio_from_url(url: str):
68
- response = requests.get(url)
 
 
 
69
  if response.status_code == 200:
70
  audio_data, sample_rate = sf.read(io.BytesIO(response.content))
 
71
  return sample_rate, audio_data
72
  raise HTTPException(status_code=500, detail="Failed to load reference audio from URL.")
73
 
74
- # Function to synthesize speech
75
- def synthesize_speech(text: str, ref_audio_name: str, ref_text: str):
76
- # Find the matching example
77
- ref_audio_url = None
78
- for example in EXAMPLES:
79
- if example["audio_name"] == ref_audio_name:
80
- ref_audio_url = example["audio_url"]
81
- if not ref_text:
82
- ref_text = example["ref_text"]
83
- break
84
-
85
  if not ref_audio_url:
86
  raise HTTPException(status_code=400, detail="Invalid reference audio name.")
87
-
88
- if not text.strip():
89
- raise HTTPException(status_code=400, detail="Text to synthesize cannot be empty.")
90
 
91
- if not ref_text or not ref_text.strip():
92
- raise HTTPException(status_code=400, detail="Reference text cannot be empty.")
93
 
94
- # Load reference audio from URL
 
95
  sample_rate, audio_data = load_audio_from_url(ref_audio_url)
 
96
 
97
- # Save reference audio to a temporary file
 
98
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
99
  sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
100
  temp_audio.flush()
101
 
102
- # Generate speech
103
- audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
 
 
 
 
 
104
 
105
- # Normalize output
 
106
  if audio.dtype == np.int16:
107
  audio = audio.astype(np.float32) / 32768.0
 
108
 
109
- # Save generated audio to a BytesIO buffer
 
110
  buffer = io.BytesIO()
111
  sf.write(buffer, audio, 24000, format='WAV')
112
  buffer.seek(0)
 
 
 
 
113
 
114
- return buffer
115
- '''
116
- # Original endpoint
117
- @app.post("/synthesize", response_class=StreamingResponse)
118
- async def synthesize(request: SynthesizeRequest):
119
- audio_buffer = synthesize_speech(request.text, request.ref_audio_name, request.ref_text)
120
- return StreamingResponse(
121
- audio_buffer,
122
- media_type="audio/wav",
123
- headers={"Content-Disposition": "attachment; filename=synthesized_speech.wav"}
124
- )
125
- '''
126
- # New endpoint for Kannada-only synthesis
127
  @app.post("/audio/speech", response_class=StreamingResponse)
128
  async def synthesize_kannada(request: KannadaSynthesizeRequest):
129
- # Use the Kannada example as fixed reference
130
  kannada_example = next(ex for ex in EXAMPLES if ex["audio_name"] == "KAN_F (Happy)")
131
 
132
  if not request.text.strip():
133
  raise HTTPException(status_code=400, detail="Text to synthesize cannot be empty.")
134
 
135
- # Use the fixed Kannada reference audio and text
136
- audio_buffer = synthesize_speech(
137
  text=request.text,
138
  ref_audio_name="KAN_F (Happy)",
139
  ref_text=kannada_example["ref_text"]
140
  )
141
 
 
 
142
  return StreamingResponse(
143
  audio_buffer,
144
  media_type="audio/wav",
145
  headers={"Content-Disposition": "attachment; filename=synthesized_kannada_speech.wav"}
146
  )
147
 
148
- # Root endpoint with basic info
149
  @app.get("/")
150
- async def root():
151
- return {
152
- "message": "Welcome to IndicF5 Text-to-Speech API",
153
- "description": "High-quality TTS for Indian languages with output in Kannada. Provide Kannada text for synthesis.",
154
- "endpoints": {
155
- "/synthesize": "General synthesis with customizable reference audio",
156
- "/synthesize_kannada": "Kannada-specific synthesis using KAN_F (Happy) as reference"
157
- },
158
- "available_ref_audio_names": [ex["audio_name"] for ex in EXAMPLES],
159
- "example_synth_texts_in_kannada": {ex["audio_name"]: ex["synth_text"] for ex in EXAMPLES}
160
- }
161
-
162
- # Run the app
163
  if __name__ == "__main__":
164
  import uvicorn
165
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  import io
2
+ import time
3
  import torch
4
  import requests
5
  import tempfile
 
8
  from fastapi import FastAPI, HTTPException
9
  from transformers import AutoModel
10
  from pydantic import BaseModel
11
+ from typing import Optional, Dict
12
  from starlette.responses import StreamingResponse
13
+ from fastapi.responses import RedirectResponse
14
 
# FastAPI application object that the route decorators below attach to.
app = FastAPI(
    title="IndicF5 Text-to-Speech API",
    description="High-quality TTS for Indian languages with Kannada output",
)

# Load the TTS model once, at import time, and prepare it for inference.
# trust_remote_code=True runs the model code published in the Hub repository.
repo_id = "ai4bharat/IndicF5"
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)
model.eval()  # evaluation mode for inference
if torch.cuda.is_available():
    # Wait for pending CUDA work before reporting the device.
    torch.cuda.synchronize()
print("Device:", device)

# torch.compile only exists on PyTorch 2.0+; older versions keep the eager model.
if hasattr(torch, "compile"):
    model = torch.compile(model)
31
+
32
+ # Example Data
33
  EXAMPLES = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  {
35
  "audio_name": "KAN_F (Happy)",
36
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
 
39
  },
40
  ]
41
 
42
+ # Pydantic models
class SynthesizeRequest(BaseModel):
    """Request body for synthesis with a caller-selected reference voice.

    NOTE(review): no endpoint visible in this file consumes this model.
    """

    # Text to synthesize.
    text: str
    # Name of the reference audio; presumably an "audio_name" from EXAMPLES.
    ref_audio_name: str
    # Transcript of the reference audio; may be omitted.
    ref_text: Optional[str] = None
47
 
class KannadaSynthesizeRequest(BaseModel):
    """Request body for the POST /audio/speech endpoint."""

    # Text to synthesize; the endpoint rejects blank input with a 400.
    text: str
50
+
51
+ # Response model with timing
class SynthesisResponse(BaseModel):
    """Synthesized audio together with per-stage timing measurements.

    NOTE(review): no endpoint visible in this file returns this model.
    """

    # Audio payload as raw bytes.
    audio: bytes
    # Stage name mapped to a float duration.
    timing: Dict[str, float]
55
+
# Decoded reference audio keyed by URL, so repeated requests skip the download.
audio_cache = {}


def load_audio_from_url(url: str) -> tuple:
    """Fetch and decode a reference audio clip, caching the result per URL.

    Args:
        url: Location of the audio file to download.

    Returns:
        A ``(sample_rate, audio_data)`` tuple. Note the order is the reverse
        of what ``sf.read`` itself returns.

    Raises:
        HTTPException: Status 500 when the request fails (connection error,
            timeout, ...) or the server answers with a non-200 status.
    """
    if url in audio_cache:
        return audio_cache[url]

    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as err:
        # Report transport failures the same way as a bad status code instead
        # of letting the raw requests exception escape to the caller.
        raise HTTPException(
            status_code=500,
            detail="Failed to load reference audio from URL.",
        ) from err

    if response.status_code != 200:
        raise HTTPException(status_code=500, detail="Failed to load reference audio from URL.")

    audio_data, sample_rate = sf.read(io.BytesIO(response.content))
    audio_cache[url] = (sample_rate, audio_data)
    return sample_rate, audio_data
70
 
def synthesize_speech(text: str, ref_audio_name: str, ref_text: str) -> tuple[io.BytesIO, Dict[str, float]]:
    """Synthesize ``text`` in the voice of a named reference clip.

    Args:
        text: Text to synthesize; must not be blank.
        ref_audio_name: ``audio_name`` of the entry in ``EXAMPLES`` whose
            audio is used as the reference.
        ref_text: Transcript passed to the model alongside the reference
            audio; must not be blank.

    Returns:
        ``(buffer, timing)``: ``buffer`` is a ``BytesIO`` positioned at 0 and
        holding a WAV file written at 24000 Hz; ``timing`` maps the stage
        names ``audio_load``, ``temp_file``, ``inference``, ``normalize``,
        ``buffer`` and ``total`` to elapsed seconds.

    Raises:
        HTTPException: Status 400 when ``ref_audio_name`` matches no example
            or when ``text`` / ``ref_text`` is empty; whatever
            ``load_audio_from_url`` raises is propagated.
    """
    import os  # local import: only needed to build the temporary file path

    timing: Dict[str, float] = {}
    # perf_counter is monotonic, so intervals are not skewed by clock changes.
    start_total = time.perf_counter()

    # Find the matching example.
    ref_audio_url = next(
        (ex["audio_url"] for ex in EXAMPLES if ex["audio_name"] == ref_audio_name),
        None,
    )
    if not ref_audio_url:
        raise HTTPException(status_code=400, detail="Invalid reference audio name.")

    if not text.strip() or not ref_text or not ref_text.strip():
        raise HTTPException(status_code=400, detail="Text fields cannot be empty.")

    # Load the reference audio (served from the cache after the first request).
    stage_start = time.perf_counter()
    sample_rate, audio_data = load_audio_from_url(ref_audio_url)
    timing["audio_load"] = time.perf_counter() - stage_start

    # The model takes the reference as a file path. A TemporaryDirectory
    # guarantees the file is removed afterwards, even if inference raises, so
    # requests no longer leave stray .wav files behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ref_audio_path = os.path.join(tmp_dir, "reference.wav")

        stage_start = time.perf_counter()
        sf.write(ref_audio_path, audio_data, samplerate=sample_rate, format='WAV')
        timing["temp_file"] = time.perf_counter() - stage_start

        # Inference without gradient tracking.
        stage_start = time.perf_counter()
        with torch.no_grad():
            audio = model(text, ref_audio_path=ref_audio_path, ref_text=ref_text)
        timing["inference"] = time.perf_counter() - stage_start

    # Normalize 16-bit integer output to float32 by dividing by 32768.
    stage_start = time.perf_counter()
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0
    timing["normalize"] = time.perf_counter() - stage_start

    # Encode the result as WAV into an in-memory buffer.
    stage_start = time.perf_counter()
    buffer = io.BytesIO()
    sf.write(buffer, audio, 24000, format='WAV')
    buffer.seek(0)
    timing["buffer"] = time.perf_counter() - stage_start

    timing["total"] = time.perf_counter() - start_total
    return buffer, timing
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
@app.post("/audio/speech", response_class=StreamingResponse)
async def synthesize_kannada(request: KannadaSynthesizeRequest):
    """Synthesize the request text using the fixed "KAN_F (Happy)" reference.

    Args:
        request: Body carrying the text to synthesize.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav``, sent as the
        attachment ``synthesized_kannada_speech.wav``.

    Raises:
        HTTPException: Status 400 when the text is blank; status 500 when the
            "KAN_F (Happy)" entry is missing from ``EXAMPLES``; anything
            raised by ``synthesize_speech`` is propagated.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text to synthesize cannot be empty.")

    # Use an explicit default: a bare next() raising StopIteration inside a
    # coroutine surfaces as an opaque RuntimeError.
    kannada_example = next(
        (ex for ex in EXAMPLES if ex["audio_name"] == "KAN_F (Happy)"),
        None,
    )
    if kannada_example is None:
        raise HTTPException(status_code=500, detail="Kannada reference example is not configured.")

    # NOTE(review): synthesize_speech is synchronous, so it runs on the event
    # loop here. Confirm serialized inference is intended before offloading it.
    audio_buffer, timing = synthesize_speech(
        text=request.text,
        ref_audio_name=kannada_example["audio_name"],
        ref_text=kannada_example["ref_text"],
    )

    print(f"Synthesis completed in {timing['total']:.2f} seconds: {timing}")

    return StreamingResponse(
        audio_buffer,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=synthesized_kannada_speech.wav"},
    )
139
 
 
@app.get("/")
async def home():
    """Redirect requests for the root path to "/docs"."""
    docs_redirect = RedirectResponse(url="/docs")
    return docs_redirect
143
+
 
 
 
 
 
 
 
 
 
 
if __name__ == "__main__":
    # Start the server only when this file is executed as a script.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)