"""Convert a text script into speech with edge-tts and write matching SRT subtitles.

The script is split into batches, each batch is synthesised to its own audio
file, and one SRT cue is produced per non-blank line of text. A small Gradio
interface exposes the pipeline.
"""

import asyncio
import os
import textwrap
import wave
from datetime import timedelta

import edge_tts
import gradio as gr
import srt

# Speaking pace assumed when the real length of a batch's audio is unknown.
DEFAULT_SECONDS_PER_WORD = 0.3


def get_audio_length(audio_path):
    """Return the duration, in seconds, of the WAV file at *audio_path*.

    Raises:
        wave.Error: if the file is not a WAV file the ``wave`` module can read.
        EOFError: if the file is empty or truncated.
    """
    with wave.open(audio_path, 'rb') as audio:
        frames = audio.getnframes()
        rate = audio.getframerate()
    return frames / float(rate)


def generate_accurate_srt(text, start_time, batch_index,
                          seconds_per_word=DEFAULT_SECONDS_PER_WORD):
    """Build one SRT cue per non-blank line of *text*.

    Each line is given ``word_count * seconds_per_word`` seconds and cues are
    laid out back to back starting at *start_time*.

    Args:
        text: The batch of script text; split into cues on line breaks.
        start_time: Offset in seconds at which the first cue starts.
        batch_index: SRT index assigned to the first cue; later cues count up.
        seconds_per_word: Seconds allotted to each whitespace-separated word.

    Returns:
        A ``(srt_entries, end_time)`` tuple: the list of ``srt.Subtitle``
        objects and the time in seconds at which the last cue ends
        (*start_time* if no cue was produced).
    """
    srt_entries = []
    current_time = start_time

    for line in text.splitlines():
        # A blank line has no words, so it would become a zero-length cue
        # with empty content; skip it instead of emitting it.
        if not line.strip():
            continue

        duration = len(line.split()) * seconds_per_word
        end_time = current_time + duration
        srt_entries.append(
            srt.Subtitle(
                index=batch_index,
                start=timedelta(seconds=current_time),
                end=timedelta(seconds=end_time),
                content=line,
            )
        )
        current_time = end_time
        batch_index += 1

    return srt_entries, current_time


def _split_into_batches(script_text, batch_size):
    """Group *script_text* into newline-joined batches of about *batch_size* characters.

    Lines are never cut mid-word: a line longer than *batch_size* is wrapped at
    whitespace into several shorter lines first. A single word longer than
    *batch_size* is kept intact, so a batch can exceed the limit in that case.
    Blank lines are dropped.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    pieces = []
    for line in script_text.splitlines():
        pieces.extend(
            textwrap.wrap(
                line,
                width=batch_size,
                break_long_words=False,
                break_on_hyphens=False,
            )
        )

    batches = []
    current_lines = []
    current_length = 0
    for piece in pieces:
        piece_length = len(piece) + 1  # +1 for the joining newline
        if current_lines and current_length + piece_length > batch_size:
            batches.append("\n".join(current_lines))
            current_lines = []
            current_length = 0
        current_lines.append(piece)
        current_length += piece_length

    if current_lines:
        batches.append("\n".join(current_lines))
    return batches


def _measure_seconds_per_word(audio_file, batch_text):
    """Return the speaking pace of *audio_file* in seconds per word of *batch_text*.

    Falls back to ``DEFAULT_SECONDS_PER_WORD`` when the batch has no words or
    when the audio file cannot be read by ``get_audio_length``.
    """
    word_count = len(batch_text.split())
    if word_count == 0:
        return DEFAULT_SECONDS_PER_WORD

    try:
        batch_duration = get_audio_length(audio_file)
    except (wave.Error, EOFError):
        # NOTE(review): edge-tts is understood to emit MP3 data whatever the
        # file extension, which the stdlib ``wave`` module cannot parse. Until
        # the audio is converted to real WAV, timing stays an estimate.
        return DEFAULT_SECONDS_PER_WORD

    return batch_duration / word_count


def batch_process_srt_and_audio(script_text, batch_size=500,
                                voice="en-US-AndrewNeural", rate="-25%"):
    """Synthesise *script_text* batch by batch and write ``output.srt``.

    Each batch is saved to ``audio_batch_<n>.wav`` in the current directory and
    its lines are appended to the subtitle list, continuing from the end time
    of the previous batch.

    Args:
        script_text: The full script to convert.
        batch_size: Approximate maximum number of characters per batch.
        voice: edge-tts voice name passed to ``edge_tts.Communicate``.
        rate: edge-tts speaking-rate adjustment passed to ``edge_tts.Communicate``.

    Returns:
        The path of the SRT file that was written.

    Raises:
        ValueError: if *batch_size* is not positive.
    """
    total_srt_entries = []
    cumulative_time = 0.0
    batch_index = 1

    batches = _split_into_batches(script_text, batch_size)
    for batch_number, batch_text in enumerate(batches):
        audio_file = f"audio_batch_{batch_number}.wav"
        communicate = edge_tts.Communicate(text=batch_text, voice=voice, rate=rate)
        # ``Communicate.save`` is a coroutine: calling it without running it
        # on an event loop writes nothing. ``asyncio.run`` requires that no
        # event loop is already running in the calling thread.
        asyncio.run(communicate.save(audio_file))

        # Scale the per-line timing to the real audio length when it can be
        # measured, so cues do not drift away from the speech.
        seconds_per_word = _measure_seconds_per_word(audio_file, batch_text)

        srt_entries, cumulative_time = generate_accurate_srt(
            batch_text, cumulative_time, batch_index, seconds_per_word
        )
        total_srt_entries.extend(srt_entries)
        batch_index += len(srt_entries)

    srt_file = "output.srt"
    with open(srt_file, 'w', encoding='utf-8') as file:
        file.write(srt.compose(total_srt_entries))

    return srt_file


def validate_srt_against_audio(srt_file_path, audio_file_path):
    """Trim the SRT file so that no cue extends beyond the end of the audio.

    Cues that end after the audio are shortened to end with it. Cues that
    would start at or after the end of the audio are removed, because they
    could never be displayed. The file is rewritten in place.

    Args:
        srt_file_path: Path of the SRT file to validate and rewrite.
        audio_file_path: Path of the WAV file the subtitles belong to.

    Returns:
        *srt_file_path*, unchanged.

    Raises:
        wave.Error: if *audio_file_path* is not a readable WAV file.
    """
    audio_end = timedelta(seconds=get_audio_length(audio_file_path))

    with open(srt_file_path, 'r', encoding='utf-8') as file:
        subtitles = list(srt.parse(file.read()))

    validated = []
    for subtitle in subtitles:
        if subtitle.start >= audio_end:
            continue
        # Every overrunning cue is clamped, not just the first one found.
        if subtitle.end > audio_end:
            subtitle.end = audio_end
        validated.append(subtitle)

    with open(srt_file_path, 'w', encoding='utf-8') as file:
        file.write(srt.compose(validated))

    return srt_file_path


def process_text_to_srt(script_text):
    """Gradio callback: turn *script_text* into an SRT file plus audio.

    Returns:
        A ``(srt_file, audio_file)`` tuple. ``audio_file`` is ``None`` when the
        combined audio file is not present.
    """
    srt_file = batch_process_srt_and_audio(script_text)

    # NOTE(review): nothing in this module creates the combined file; it is
    # expected to be supplied separately. Without it, validation is skipped
    # rather than failing with FileNotFoundError.
    final_audio_file = "combined_audio.wav"
    if not os.path.exists(final_audio_file):
        return srt_file, None

    validate_srt_against_audio(srt_file, final_audio_file)
    return srt_file, final_audio_file


def main():
    """Build and launch the Gradio interface."""
    # NOTE(review): live=True is understood to re-run the callback whenever
    # the input changes, which means a synthesis request per edit - confirm
    # that this is intended.
    gr.Interface(
        fn=process_text_to_srt,
        inputs="textbox",
        outputs=["file", "audio"],
        live=True,
        title="Text-to-SRT with Accurate Timing",
        description="Enter text to convert it into audio with synchronized SRT subtitles. The SRT timings are validated against the total audio duration."
    ).launch()


if __name__ == "__main__":
    main()