hivecorp committed on
Commit
ecaad35
·
verified ·
1 Parent(s): 3851ab4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +327 -214
app.py CHANGED
@@ -5,16 +5,12 @@ import os
5
  import asyncio
6
  import uuid
7
  import re
 
 
8
  from concurrent.futures import ThreadPoolExecutor
9
- from typing import List, Tuple, Optional
10
  import math
11
  from dataclasses import dataclass
12
- import hashlib
13
- import json
14
- from pathlib import Path
15
- from tqdm.asyncio import tqdm
16
- from html import unescape
17
- import html
18
 
19
  class TimingManager:
20
  def __init__(self):
@@ -47,20 +43,6 @@ class Segment:
47
  audio: Optional[AudioSegment] = None
48
  lines: List[str] = None # Add lines field for display purposes only
49
 
50
- class TextCleaner:
51
- @staticmethod
52
- def clean_text(text: str) -> str:
53
- """Clean text from HTML and normalize for TTS"""
54
- # Remove HTML tags
55
- text = re.sub(r'<[^>]+>', '', text)
56
- # Convert HTML entities
57
- text = unescape(text)
58
- # Normalize whitespace
59
- text = ' '.join(text.split())
60
- # Fix common punctuation issues
61
- text = re.sub(r'\s+([.,!?;:])', r'\1', text)
62
- return text
63
-
64
  class TextProcessor:
65
  def __init__(self, words_per_line: int, lines_per_segment: int):
66
  self.words_per_line = words_per_line
@@ -124,9 +106,6 @@ class TextProcessor:
124
  return breaks
125
 
126
  def split_into_segments(self, text: str) -> List[Segment]:
127
- # Clean text before processing
128
- text = TextCleaner.clean_text(text)
129
-
130
  # Normalize text and add proper spacing around punctuation
131
  text = re.sub(r'\s+', ' ', text.strip())
132
  text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
@@ -205,188 +184,264 @@ class TextProcessor:
205
 
206
  return lines
207
 
208
- class AudioCache:
209
- def __init__(self, cache_dir="./cache"):
210
- self.cache_dir = Path(cache_dir)
211
- self.cache_dir.mkdir(exist_ok=True)
212
-
213
- def get_cache_key(self, text: str, voice: str, rate: str, pitch: str) -> str:
214
- data = f"{text}{voice}{rate}{pitch}".encode()
215
- return hashlib.md5(data).hexdigest()
216
-
217
- def get_cached_audio(self, cache_key: str) -> Optional[AudioSegment]:
218
- cache_file = self.cache_dir / f"{cache_key}.wav"
219
- if cache_file.exists():
220
- return AudioSegment.from_file(str(cache_file))
221
- return None
222
-
223
- def cache_audio(self, cache_key: str, audio: AudioSegment):
224
- cache_file = self.cache_dir / f"{cache_key}.wav"
225
- audio.export(str(cache_file), format="wav")
226
-
227
- class SSMLBuilder:
228
- def __init__(self):
229
- self.content = []
230
-
231
- def add_text(self, text: str):
232
- # Escape special characters for SSML
233
- text = html.escape(text, quote=True)
234
- self.content.append(text)
235
- return self
236
-
237
- def add_break(self, strength: str = "medium"):
238
- self.content.append(f'<break strength="{strength}"/>')
239
- return self
240
-
241
- def add_prosody(self, text: str, rate: str = "medium", pitch: str = "medium"):
242
- self.content.append(
243
- f'<prosody rate="{rate}" pitch="{pitch}">{text}</prosody>'
244
- )
245
- return self
246
-
247
- def add_sentence(self, text: str):
248
- self.content.append(f'<s>{text}</s>')
249
- return self
250
-
251
- def __str__(self):
252
- return (
253
- '<?xml version="1.0"?>'
254
- '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis">'
255
- f'{"".join(self.content)}'
256
- '</speak>'
257
- )
258
 
259
- class SpeechEnhancer:
260
- @staticmethod
261
- def add_speech_marks(text: str) -> str:
262
- """Add SSML marks for better speech control"""
263
- ssml = SSMLBuilder()
264
-
265
- # Split text and add appropriate SSML tags
266
- sentences = text.split('. ')
267
- for i, sentence in enumerate(sentences):
268
- sentence = sentence.strip()
269
- if not sentence:
270
- continue
271
-
272
- ssml.add_sentence(sentence)
273
-
274
- # Add appropriate breaks between sentences
275
- if i < len(sentences) - 1:
276
- ssml.add_break("strong")
277
-
278
- # Add breaks at commas
279
- if ',' in sentence:
280
- parts = sentence.split(',')
281
- for part in parts[:-1]:
282
- ssml.add_break("medium")
283
-
284
- return str(ssml)
285
-
286
- @staticmethod
287
- def enhance_timing(segment: Segment) -> Segment:
288
- """Add natural pauses based on punctuation"""
289
- if segment.audio:
290
- for punct, pause_ms in {'.': 400, '!': 400, '?': 400, ',': 200, ';': 300}.items():
291
- if punct in segment.text:
292
- silence = AudioSegment.silent(duration=pause_ms)
293
- segment.audio = segment.audio.append(silence, crossfade=50)
294
- return segment
295
-
296
- async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str, cache: AudioCache) -> Segment:
297
- """Process segment with enhanced speech features"""
298
  try:
299
- # Clean text before processing
300
- clean_text = TextCleaner.clean_text(segment.text)
301
- cache_key = cache.get_cache_key(clean_text, voice, rate, pitch)
302
- cached_audio = cache.get_cached_audio(cache_key)
303
-
304
- if cached_audio:
305
- segment.audio = cached_audio
306
- segment.duration = len(cached_audio)
307
- return segment
308
 
309
- # Create SSML with cleaned text
310
- enhanced_text = SpeechEnhancer.add_speech_marks(clean_text)
311
- tts = edge_tts.Communicate(enhanced_text, voice, rate=rate, pitch=pitch)
 
312
 
313
- audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
314
- await tts.save(audio_file)
315
 
316
- segment.audio = AudioSegment.from_file(audio_file)
317
- segment = SpeechEnhancer.enhance_timing(segment)
318
- segment.duration = len(segment.audio)
 
 
 
 
 
319
 
320
- cache.cache_audio(cache_key, segment.audio)
321
  return segment
322
  except Exception as e:
323
- print(f"Error processing segment {segment.id}: {str(e)}")
 
324
  raise
325
  finally:
326
  if os.path.exists(audio_file):
327
- os.remove(audio_file)
 
 
 
328
 
329
- async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int, enable_ssml: bool, use_cache: bool, pause_after_period: int, pause_after_comma: int) -> Tuple[str, str]:
330
- # Clean input text first
331
- text = TextCleaner.clean_text(text)
 
 
 
 
 
 
 
 
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  processor = TextProcessor(words_per_line, lines_per_segment)
334
  segments = processor.split_into_segments(text)
335
 
336
- # Process segments sequentially for better timing control
337
  processed_segments = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  current_time = 0
339
  final_audio = AudioSegment.empty()
340
  srt_content = ""
341
- cache = AudioCache() if use_cache else None
342
 
343
- for segment in tqdm(segments, desc="Processing segments"):
344
- # Process segment
345
- processed_segment = await process_segment_with_timing(segment, voice, rate, pitch, cache)
346
-
347
  # Calculate precise timing
348
- processed_segment.start_time = current_time
349
- processed_segment.end_time = current_time + processed_segment.duration
350
 
351
  # Add to SRT with precise timing
352
  srt_content += (
353
- f"{processed_segment.id}\n"
354
- f"{format_time_ms(processed_segment.start_time)} --> {format_time_ms(processed_segment.end_time)}\n"
355
- f"{processed_segment.text}\n\n"
356
  )
357
 
358
  # Add to final audio with precise positioning
359
- final_audio = final_audio.append(processed_segment.audio, crossfade=0)
360
 
361
  # Update timing with precise gap
362
- current_time = processed_segment.end_time
363
- processed_segments.append(processed_segment)
364
 
365
  # Export with high precision
366
- unique_id = uuid.uuid4()
367
- audio_path = f"final_audio_{unique_id}.mp3"
368
- srt_path = f"final_subtitles_{unique_id}.srt"
369
 
370
- # Export with high quality settings for precise timing
371
- final_audio.export(
372
- audio_path,
373
- format="mp3",
374
- bitrate="320k",
375
- parameters=["-ar", "48000", "-ac", "2"]
376
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
377
 
378
- with open(srt_path, "w", encoding='utf-8') as f:
379
- f.write(srt_content)
380
 
381
  return srt_path, audio_path
382
 
383
- async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment, enable_ssml, use_cache, pause_after_period, pause_after_comma):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  try:
385
- # Format pitch and rate strings
386
- pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
387
- rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
388
 
389
- status_html = "<p>Processing started...</p>"
 
390
 
391
  srt_path, audio_path = await generate_accurate_srt(
392
  text,
@@ -395,32 +450,20 @@ async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segme
395
  pitch_str,
396
  words_per_line,
397
  lines_per_segment,
398
- enable_ssml,
399
- use_cache,
400
- pause_after_period,
401
- pause_after_comma
402
  )
403
 
404
- status_html = """
405
- <div style='color: green; padding: 10px;'>
406
- <p>✓ Processing completed successfully!</p>
407
- <p>- SRT file generated</p>
408
- <p>- Audio file generated</p>
409
- </div>
410
- """
411
-
412
- return srt_path, audio_path, audio_path, status_html
413
-
414
  except Exception as e:
415
- error_html = f"""
416
- <div style='color: red; padding: 10px;'>
417
- <p>❌ Error during processing:</p>
418
- <p>{str(e)}</p>
419
- </div>
420
- """
421
- return None, None, None, error_html
422
 
423
- # Voice options dictionary (same as before)
424
  voice_options = {
425
  "Andrew Male": "en-US-AndrewNeural",
426
  "Jenny Female": "en-US-JennyNeural",
@@ -460,32 +503,102 @@ voice_options = {
460
  "Imani": "en-TZ-ImaniNeural",
461
  "Leah": "en-ZA-LeahNeural",
462
  "Luke": "en-ZA-LukeNeural"
463
- # Add other voices here...
464
  }
465
 
 
 
 
 
466
  # Create Gradio interface
467
- app = gr.Interface(
468
- fn=process_text,
469
- inputs=[
470
- gr.Textbox(label="Enter Text", lines=10),
471
- gr.Slider(label="Pitch Adjustment (Hz)", minimum=-10, maximum=10, value=0, step=1),
472
- gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
473
- gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
474
- gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
475
- gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1),
476
- gr.Checkbox(label="Enable SSML Enhancement", value=True),
477
- gr.Checkbox(label="Use Audio Cache", value=True),
478
- gr.Slider(label="Pause After Period (ms)", minimum=200, maximum=800, value=400, step=50),
479
- gr.Slider(label="Pause After Comma (ms)", minimum=100, maximum=400, value=200, step=50)
480
- ],
481
- outputs=[
482
- gr.File(label="Download SRT"),
483
- gr.File(label="Download Audio"),
484
- gr.Audio(label="Preview Audio"),
485
- gr.HTML(label="Processing Status")
486
- ],
487
- title="Advanced TTS with Configurable SRT Generation",
488
- description="Generate perfectly synchronized audio and subtitles with natural speech patterns."
489
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
 
491
- app.launch()
 
 
5
  import asyncio
6
  import uuid
7
  import re
8
+ import time
9
+ import tempfile
10
  from concurrent.futures import ThreadPoolExecutor
11
+ from typing import List, Tuple, Optional, Dict, Any
12
  import math
13
  from dataclasses import dataclass
 
 
 
 
 
 
14
 
15
  class TimingManager:
16
  def __init__(self):
 
43
  audio: Optional[AudioSegment] = None
44
  lines: List[str] = None # Add lines field for display purposes only
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  class TextProcessor:
47
  def __init__(self, words_per_line: int, lines_per_segment: int):
48
  self.words_per_line = words_per_line
 
106
  return breaks
107
 
108
  def split_into_segments(self, text: str) -> List[Segment]:
 
 
 
109
  # Normalize text and add proper spacing around punctuation
110
  text = re.sub(r'\s+', ' ', text.strip())
111
  text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
 
184
 
185
  return lines
186
 
187
+ # IMPROVEMENT 1: Enhanced Error Handling
188
class TTSError(Exception):
    """Signals a failure in any stage of text-to-speech processing."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
    """Synthesize one segment as a single TTS request and attach the audio to it.

    The segment's text is flattened to one line, sent to ``edge_tts``, saved
    to a uniquely named file in the system temp directory, loaded back with
    ``AudioSegment.from_file`` and padded with 30 ms of silence on each side.
    ``segment.audio`` and ``segment.duration`` are updated in place.

    Args:
        segment: Segment whose ``text`` is synthesized; mutated and returned.
        voice: Voice identifier passed through to ``edge_tts.Communicate``.
        rate: Rate string passed through to ``edge_tts.Communicate``.
        pitch: Pitch string passed through to ``edge_tts.Communicate``.

    Returns:
        The same ``segment`` object with ``audio`` and ``duration`` populated.

    Raises:
        TTSError: If synthesis fails, the output file is missing or empty, or
            the audio cannot be loaded. The original exception is attached
            as ``__cause__``.
    """
    audio_file = os.path.join(tempfile.gettempdir(), f"temp_segment_{segment.id}_{uuid.uuid4()}.wav")
    try:
        # Process the entire segment text as one unit, replacing newlines with spaces
        segment_text = ' '.join(segment.text.split('\n'))
        tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)

        try:
            await tts.save(audio_file)
        except Exception as e:
            raise TTSError(f"Failed to generate audio for segment {segment.id}: {str(e)}") from e

        if not os.path.exists(audio_file) or os.path.getsize(audio_file) == 0:
            raise TTSError(f"Generated audio file is empty or missing for segment {segment.id}")

        try:
            speech = AudioSegment.from_file(audio_file)
            # Short 30 ms padding on both sides keeps segments from running together
            silence = AudioSegment.silent(duration=30)
            segment.audio = silence + speech + silence
            # presumably milliseconds (pydub convention) - the SRT timing code relies on this
            segment.duration = len(segment.audio)
        except Exception as e:
            raise TTSError(f"Failed to process audio file for segment {segment.id}: {str(e)}") from e

        return segment
    except TTSError:
        # Already a domain error with its cause attached; pass through untouched
        raise
    except Exception as e:
        raise TTSError(f"Unexpected error processing segment {segment.id}: {str(e)}") from e
    finally:
        # Best-effort cleanup: the file may never have been created, or may
        # still be locked; neither case should mask the real outcome.
        try:
            os.remove(audio_file)
        except OSError:
            pass
228
 
229
+ # IMPROVEMENT 2: Better File Management with cleanup
230
class FileManager:
    """Manages temporary and output files with cleanup capabilities.

    All paths handed out live inside a private directory created with
    ``tempfile.mkdtemp``. Output pairs are tracked in creation order and only
    the ``max_files_to_keep`` most recent pairs are retained on disk.
    """

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp(prefix="tts_app_")
        self.output_files = []  # list of (srt_path, audio_path), oldest first
        self.max_files_to_keep = 5  # Keep only the 5 most recent output pairs

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` if possible; a missing or locked file is not an error."""
        try:
            os.remove(path)
        except OSError:
            pass

    def get_temp_path(self, prefix):
        """Return a unique path (not created) inside the managed directory."""
        return os.path.join(self.temp_dir, f"{prefix}_{uuid.uuid4()}")

    def create_output_paths(self):
        """Register and return a new ``(srt_path, audio_path)`` pair.

        Registering a pair may evict (delete) the oldest tracked pairs so that
        at most ``max_files_to_keep`` pairs remain.
        """
        unique_id = str(uuid.uuid4())
        audio_path = os.path.join(self.temp_dir, f"final_audio_{unique_id}.mp3")
        srt_path = os.path.join(self.temp_dir, f"final_subtitles_{unique_id}.srt")

        self.output_files.append((srt_path, audio_path))
        self.cleanup_old_files()

        return srt_path, audio_path

    def cleanup_old_files(self):
        """Delete the oldest tracked pairs beyond ``max_files_to_keep``."""
        # Counting the excess explicitly (instead of slicing with a negative
        # index) stays correct when max_files_to_keep is 0.
        excess = len(self.output_files) - self.max_files_to_keep
        if excess <= 0:
            return

        for srt_path, audio_path in self.output_files[:excess]:
            # Each file is removed independently so one failure cannot
            # leave its partner behind.
            self._remove_quietly(srt_path)
            self._remove_quietly(audio_path)

        self.output_files = self.output_files[excess:]

    def cleanup_all(self):
        """Delete every tracked file and the managed directory itself."""
        import shutil

        for srt_path, audio_path in self.output_files:
            self._remove_quietly(srt_path)
            self._remove_quietly(audio_path)
        self.output_files = []

        # rmtree also clears stray files (e.g. from get_temp_path) that would
        # make a plain rmdir fail and leak the directory.
        shutil.rmtree(self.temp_dir, ignore_errors=True)


# Create global file manager
file_manager = FileManager()
286
+
287
+ # IMPROVEMENT 3: Parallel Processing for Segments
288
async def generate_accurate_srt(
    text: str,
    voice: str,
    rate: str,
    pitch: str,
    words_per_line: int,
    lines_per_segment: int,
    progress_callback=None,
    parallel: bool = True,
    max_workers: int = 4
) -> Tuple[str, str]:
    """Synthesize ``text`` segment by segment and write matching MP3 and SRT files.

    The text is split with ``TextProcessor``, each segment is synthesized via
    ``process_segment_with_timing`` (concurrently when ``parallel`` is true and
    there is more than one segment), and the results are concatenated in
    segment-id order. Subtitle timings are the running sum of segment
    durations.

    Args:
        text: Input text to convert.
        voice: Voice identifier forwarded to the TTS call.
        rate: Rate string forwarded to the TTS call.
        pitch: Pitch string forwarded to the TTS call.
        words_per_line: Forwarded to ``TextProcessor``.
        lines_per_segment: Forwarded to ``TextProcessor``.
        progress_callback: Optional ``callback(fraction, message)``; called
            with 0.1 after segmentation, 0.1-0.9 during synthesis and 1.0 at
            the end (or on export failure).
        parallel: Synthesize segments concurrently when true.
        max_workers: Upper bound on concurrent syntheses; values below 1 are
            treated as 1.

    Returns:
        ``(srt_path, audio_path)`` as produced by ``file_manager``.

    Raises:
        TTSError: If any segment fails or the final files cannot be written.
            The underlying exception is attached as ``__cause__``.
    """
    def report(fraction, message):
        if progress_callback:
            progress_callback(fraction, message)

    processor = TextProcessor(words_per_line, lines_per_segment)
    segments = processor.split_into_segments(text)
    total_segments = len(segments)

    # Update progress to show segmentation is complete
    report(0.1, "Text segmentation complete")

    if parallel and total_segments > 1:
        processed_count = 0

        # Semaphore(0) would block forever and a negative value raises
        # ValueError, so clamp to at least one concurrent worker.
        semaphore = asyncio.Semaphore(max(1, max_workers))

        async def process_with_semaphore(segment):
            nonlocal processed_count
            async with semaphore:
                try:
                    result = await process_segment_with_timing(segment, voice, rate, pitch)
                except Exception as e:
                    # Failed segments still count towards progress
                    processed_count += 1
                    report(
                        0.1 + (0.8 * processed_count / total_segments),
                        f"Error in segment {segment.id}: {str(e)}",
                    )
                    raise
                processed_count += 1
                report(
                    0.1 + (0.8 * processed_count / total_segments),
                    f"Processed {processed_count}/{total_segments} segments",
                )
                return result

        tasks = [asyncio.ensure_future(process_with_semaphore(segment)) for segment in segments]
        try:
            processed_segments = list(await asyncio.gather(*tasks))
        except Exception as e:
            # gather() does not cancel siblings when one awaitable fails; stop
            # them explicitly and wait so their temp-file cleanup completes.
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            report(0.9, f"Error during parallel processing: {str(e)}")
            raise TTSError(f"Failed during parallel processing: {str(e)}") from e
    else:
        # Process segments sequentially (original method)
        processed_segments = []
        for i, segment in enumerate(segments):
            try:
                processed_segments.append(
                    await process_segment_with_timing(segment, voice, rate, pitch)
                )
            except Exception as e:
                report(0.9, f"Error processing segment {segment.id}: {str(e)}")
                raise TTSError(f"Failed to process segment {segment.id}: {str(e)}") from e
            report(
                0.1 + (0.8 * (i + 1) / total_segments),
                f"Processed {i + 1}/{total_segments} segments",
            )

    # Sort segments by ID to ensure correct order
    processed_segments.sort(key=lambda s: s.id)

    report(0.9, "Finalizing audio and subtitles")

    # Now combine the segments in the correct order
    current_time = 0
    final_audio = AudioSegment.empty()
    srt_blocks = []

    for segment in processed_segments:
        # Each segment starts exactly where the previous one ended
        segment.start_time = current_time
        segment.end_time = current_time + segment.duration

        srt_blocks.append(
            f"{segment.id}\n"
            f"{format_time_ms(segment.start_time)} --> {format_time_ms(segment.end_time)}\n"
            f"{segment.text}\n\n"
        )

        # crossfade=0 keeps the audio length equal to the sum of durations,
        # which is what the subtitle timings assume
        final_audio = final_audio.append(segment.audio, crossfade=0)

        current_time = segment.end_time

    srt_content = "".join(srt_blocks)

    srt_path, audio_path = file_manager.create_output_paths()

    try:
        # NOTE(review): '-qscale:a' requests VBR while 'bitrate' requests a
        # fixed rate, and '-compression_level 0' is an encoder effort setting
        # rather than "best compression" - confirm which one the encoder
        # actually honours. Values are left unchanged here.
        export_params = {
            'format': 'mp3',
            'bitrate': '192k',
            'parameters': [
                '-ar', '44100',  # Standard sample rate
                '-ac', '2',  # Stereo
                '-compression_level', '0',
                '-qscale:a', '2'
            ]
        }
        final_audio.export(audio_path, **export_params)

        with open(srt_path, "w", encoding='utf-8') as f:
            f.write(srt_content)
    except Exception as e:
        report(1.0, f"Error exporting final files: {str(e)}")
        raise TTSError(f"Failed to export final files: {str(e)}") from e

    report(1.0, "Complete!")

    return srt_path, audio_path
419
 
420
+ # IMPROVEMENT 4: Progress Reporting with proper error handling for older Gradio versions
421
+ async def process_text_with_progress(
422
+ text,
423
+ pitch,
424
+ rate,
425
+ voice,
426
+ words_per_line,
427
+ lines_per_segment,
428
+ parallel_processing,
429
+ progress=gr.Progress()
430
+ ):
431
+ # Input validation
432
+ if not text or text.strip() == "":
433
+ return None, None, None, True, "Please enter some text to convert to speech."
434
+
435
+ # Format pitch and rate strings
436
+ pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
437
+ rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
438
+
439
  try:
440
+ # Start progress tracking
441
+ progress(0, "Preparing text...")
 
442
 
443
+ def update_progress(value, status):
444
+ progress(value, status)
445
 
446
  srt_path, audio_path = await generate_accurate_srt(
447
  text,
 
450
  pitch_str,
451
  words_per_line,
452
  lines_per_segment,
453
+ progress_callback=update_progress,
454
+ parallel=parallel_processing
 
 
455
  )
456
 
457
+ # If successful, return results and hide error
458
+ return srt_path, audio_path, audio_path, False, ""
459
+ except TTSError as e:
460
+ # Return specific TTS error
461
+ return None, None, None, True, f"TTS Error: {str(e)}"
 
 
 
 
 
462
  except Exception as e:
463
+ # Return any other error
464
+ return None, None, None, True, f"Unexpected error: {str(e)}"
 
 
 
 
 
465
 
466
+ # Voice options dictionary
467
  voice_options = {
468
  "Andrew Male": "en-US-AndrewNeural",
469
  "Jenny Female": "en-US-JennyNeural",
 
503
  "Imani": "en-TZ-ImaniNeural",
504
  "Leah": "en-ZA-LeahNeural",
505
  "Luke": "en-ZA-LukeNeural"
506
+ # Add other voices as needed
507
  }
508
 
509
+ # Register cleanup on exit
510
+ import atexit
511
+ atexit.register(file_manager.cleanup_all)
512
+
513
  # Create Gradio interface
514
# Gradio UI: inputs on top, a status box that is hidden until there is
# something to report, and the preview/download outputs underneath.
with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
    gr.Markdown("# Advanced TTS with Configurable SRT Generation")
    gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")

    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")

        with gr.Column(scale=2):
            voice_dropdown = gr.Dropdown(
                label="Select Voice",
                choices=list(voice_options.keys()),
                value="Jenny Female"
            )
            pitch_slider = gr.Slider(
                label="Pitch Adjustment (Hz)",
                minimum=-10,
                maximum=10,
                value=0,
                step=1
            )
            rate_slider = gr.Slider(
                label="Rate Adjustment (%)",
                minimum=-25,
                maximum=25,
                value=0,
                step=1
            )

    with gr.Row():
        with gr.Column():
            words_per_line = gr.Slider(
                label="Words per Line",
                minimum=3,
                maximum=12,
                value=6,
                step=1,
                info="Controls how many words appear on each line of the subtitle"
            )
        with gr.Column():
            lines_per_segment = gr.Slider(
                label="Lines per Segment",
                minimum=1,
                maximum=4,
                value=2,
                step=1,
                info="Controls how many lines appear in each subtitle segment"
            )
        with gr.Column():
            parallel_processing = gr.Checkbox(
                label="Enable Parallel Processing",
                value=True,
                info="Process multiple segments simultaneously for faster conversion (recommended for longer texts)"
            )

    submit_btn = gr.Button("Generate Audio & Subtitles")

    # Status/error message; stays hidden unless the handler reports an error
    error_output = gr.Textbox(label="Status", visible=False)

    with gr.Row():
        with gr.Column():
            audio_output = gr.Audio(label="Preview Audio")
        with gr.Column():
            srt_file = gr.File(label="Download SRT")
            audio_file = gr.File(label="Download Audio")

    async def _generate_and_report(
        text,
        pitch,
        rate,
        voice,
        words,
        lines,
        parallel,
        progress=gr.Progress()
    ):
        """Adapt process_text_with_progress's (flag, message) pair to one component update.

        Listing the status textbox twice as an output only ever set its value,
        never its visibility, so error messages were never displayed.
        """
        srt_path, audio_path, preview_path, has_error, message = await process_text_with_progress(
            text, pitch, rate, voice, words, lines, parallel, progress=progress
        )
        status_update = gr.update(visible=bool(has_error), value=message)
        return srt_path, audio_path, preview_path, status_update

    # Handle button click with manual error handling instead of .catch()
    submit_btn.click(
        fn=_generate_and_report,
        inputs=[
            text_input,
            pitch_slider,
            rate_slider,
            voice_dropdown,
            words_per_line,
            lines_per_segment,
            parallel_processing
        ],
        outputs=[
            srt_file,
            audio_file,
            audio_output,
            error_output
        ],
        api_name="generate"
    )

if __name__ == "__main__":
    app.launch()