Update app.py
app.py CHANGED
@@ -39,6 +39,7 @@ class Segment:
     end_time: int = 0
     duration: int = 0
     audio: Optional[AudioSegment] = None
+    lines: List[str] = None  # Add lines field for display purposes only
 
 class TextProcessor:
     def __init__(self, words_per_line: int, lines_per_segment: int):
@@ -182,13 +183,18 @@ class TextProcessor:
         return lines
 
 async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
-    """Process a …
+    """Process a complete segment as a single TTS unit"""
     audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
     try:
-        …
+        # Process the entire segment text as one unit, replacing newlines with spaces
+        segment_text = ' '.join(segment.text.split('\n'))
+        tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
         await tts.save(audio_file)
 
         segment.audio = AudioSegment.from_file(audio_file)
+        # Add small silence at start and end for natural spacing
+        silence = AudioSegment.silent(duration=50)
+        segment.audio = silence + segment.audio + silence
         segment.duration = len(segment.audio)
 
         return segment
@@ -197,46 +203,50 @@ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
         os.remove(audio_file)
 
 async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int) -> Tuple[str, str]:
-    # Initialize text processor and split text
     processor = TextProcessor(words_per_line, lines_per_segment)
     segments = processor.split_into_segments(text)
 
-    # Process …
-    …
-        process_segment_with_timing(segment, voice, rate, pitch)
-        for segment in segments
-    ]
-    processed_segments = await asyncio.gather(*tasks)
-
-    # Calculate timing for each segment
+    # Process segments sequentially for better timing control
+    processed_segments = []
     current_time = 0
     final_audio = AudioSegment.empty()
     srt_content = ""
 
-    for segment in …
-        # …
-        …
-        …
+    for segment in segments:
+        # Process segment
+        processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
+
+        # Calculate precise timing
+        processed_segment.start_time = current_time
+        processed_segment.end_time = current_time + processed_segment.duration
 
-        # Add to SRT
+        # Add to SRT with precise timing
         srt_content += (
-            f"{…
-            f"{format_time_ms(…
-            f"{…
+            f"{processed_segment.id}\n"
+            f"{format_time_ms(processed_segment.start_time)} --> {format_time_ms(processed_segment.end_time)}\n"
+            f"{processed_segment.text}\n\n"
        )
 
-        # Add to final audio
-        final_audio …
+        # Add to final audio with precise positioning
+        final_audio = final_audio.append(processed_segment.audio, crossfade=0)
 
-        # Update timing
-        current_time = …
+        # Update timing with precise gap
+        current_time = processed_segment.end_time
+        processed_segments.append(processed_segment)
 
-    # Export
+    # Export with high precision
     unique_id = uuid.uuid4()
     audio_path = f"final_audio_{unique_id}.mp3"
     srt_path = f"final_subtitles_{unique_id}.srt"
 
-    …
+    # Export with high quality settings for precise timing
+    final_audio.export(
+        audio_path,
+        format="mp3",
+        bitrate="320k",
+        parameters=["-ar", "48000", "-ac", "2"]
+    )
+
     with open(srt_path, "w", encoding='utf-8') as f:
         f.write(srt_content)
 
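The updated SRT block calls a format_time_ms helper that is defined elsewhere in app.py and does not appear in this diff. Assuming it converts a millisecond offset into the HH:MM:SS,mmm timestamp form that SRT requires, a minimal sketch could look like the following; the body is an assumption for illustration, not the Space's actual implementation.

# Hypothetical sketch of the format_time_ms helper referenced in the diff.
# Assumption: it turns a millisecond offset into an SRT timestamp (HH:MM:SS,mmm).
def format_time_ms(ms: int) -> str:
    hours, remainder = divmod(int(ms), 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"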
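For context, a hedged driver sketch for the revised coroutine is shown below. It assumes it runs in the same module as generate_accurate_srt, the voice, rate, and pitch values are illustrative edge-tts-style settings rather than values taken from this Space's UI, and the unpacking assumes the function returns the (audio_path, srt_path) pair implied by its Tuple[str, str] annotation.

import asyncio

# Hypothetical usage of the coroutine changed in this commit.
# All argument values below are illustrative assumptions.
async def main() -> None:
    text = "Hello world. This is a subtitle timing test."
    audio_path, srt_path = await generate_accurate_srt(
        text,
        voice="en-US-AriaNeural",  # assumed edge-tts voice name
        rate="+0%",                # assumed neutral rate
        pitch="+0Hz",              # assumed neutral pitch
        words_per_line=6,
        lines_per_segment=2,
    )
    print(audio_path, srt_path)

if __name__ == "__main__":
    asyncio.run(main())

Because each segment is appended with crossfade=0 and current_time advances by exactly that segment's duration, the SRT timestamps stay aligned with positions in the concatenated audio, which appears to be the motivation for replacing the asyncio.gather call with the sequential loop.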