Update app.py
app.py CHANGED
@@ -39,6 +39,7 @@ class Segment:
     end_time: int = 0
     duration: int = 0
     audio: Optional[AudioSegment] = None
+    lines: List[str] = None  # Add lines field for display purposes only
 
 class TextProcessor:
     def __init__(self, words_per_line: int, lines_per_segment: int):
@@ -182,13 +183,18 @@ class TextProcessor:
         return lines
 
 async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
-    """Process a …
+    """Process a complete segment as a single TTS unit"""
     audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
     try:
-        …
+        # Process the entire segment text as one unit, replacing newlines with spaces
+        segment_text = ' '.join(segment.text.split('\n'))
+        tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
         await tts.save(audio_file)
 
         segment.audio = AudioSegment.from_file(audio_file)
+        # Add small silence at start and end for natural spacing
+        silence = AudioSegment.silent(duration=50)
+        segment.audio = silence + segment.audio + silence
         segment.duration = len(segment.audio)
 
         return segment
@@ -197,46 +203,50 @@ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
         os.remove(audio_file)
 
 async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int) -> Tuple[str, str]:
-    # Initialize text processor and split text
     processor = TextProcessor(words_per_line, lines_per_segment)
     segments = processor.split_into_segments(text)
 
-    # Process …
-    …
-        process_segment_with_timing(segment, voice, rate, pitch)
-        for segment in segments
-    ]
-    processed_segments = await asyncio.gather(*tasks)
-
-    # Calculate timing for each segment
+    # Process segments sequentially for better timing control
+    processed_segments = []
     current_time = 0
     final_audio = AudioSegment.empty()
     srt_content = ""
 
-    for segment in …
-        # …
-        …
-        …
+    for segment in segments:
+        # Process segment
+        processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
+
+        # Calculate precise timing
+        processed_segment.start_time = current_time
+        processed_segment.end_time = current_time + processed_segment.duration
 
-        # Add to SRT
+        # Add to SRT with precise timing
         srt_content += (
-            f"{…
-            f"{format_time_ms(…
-            f"{…
+            f"{processed_segment.id}\n"
+            f"{format_time_ms(processed_segment.start_time)} --> {format_time_ms(processed_segment.end_time)}\n"
+            f"{processed_segment.text}\n\n"
        )
 
-        # Add to final audio
-        final_audio …
+        # Add to final audio with precise positioning
+        final_audio = final_audio.append(processed_segment.audio, crossfade=0)
 
-        # Update timing
-        current_time = …
+        # Update timing with precise gap
+        current_time = processed_segment.end_time
+        processed_segments.append(processed_segment)
 
-    # Export
+    # Export with high precision
     unique_id = uuid.uuid4()
     audio_path = f"final_audio_{unique_id}.mp3"
     srt_path = f"final_subtitles_{unique_id}.srt"
 
-    …
+    # Export with high quality settings for precise timing
+    final_audio.export(
+        audio_path,
+        format="mp3",
+        bitrate="320k",
+        parameters=["-ar", "48000", "-ac", "2"]
+    )
+
     with open(srt_path, "w", encoding='utf-8') as f:
         f.write(srt_content)
 
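The updated SRT block calls a format_time_ms helper that is defined elsewhere in app.py and does not appear in this diff. Assuming it converts a millisecond offset into the HH:MM:SS,mmm timestamp form that SRT requires, a minimal sketch could look like the following; the body is an assumption for illustration, not the Space's actual implementation.

# Hypothetical sketch of the format_time_ms helper referenced in the diff.
# Assumption: it turns a millisecond offset into an SRT timestamp (HH:MM:SS,mmm).
def format_time_ms(ms: int) -> str:
    hours, remainder = divmod(int(ms), 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"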
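For context, a hedged driver sketch for the revised coroutine is shown below. It assumes it runs in the same module as generate_accurate_srt, the voice, rate, and pitch values are illustrative edge-tts-style settings rather than values taken from this Space's UI, and the unpacking assumes the function returns the (audio_path, srt_path) pair implied by its Tuple[str, str] annotation.

import asyncio

# Hypothetical usage of the coroutine changed in this commit.
# All argument values below are illustrative assumptions.
async def main() -> None:
    text = "Hello world. This is a subtitle timing test."
    audio_path, srt_path = await generate_accurate_srt(
        text,
        voice="en-US-AriaNeural",  # assumed edge-tts voice name
        rate="+0%",                # assumed neutral rate
        pitch="+0Hz",              # assumed neutral pitch
        words_per_line=6,
        lines_per_segment=2,
    )
    print(audio_path, srt_path)

if __name__ == "__main__":
    asyncio.run(main())

Because each segment is appended with crossfade=0 and current_time advances by exactly that segment's duration, the SRT timestamps stay aligned with positions in the concatenated audio, which appears to be the motivation for replacing the asyncio.gather call with the sequential loop.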