import gradio as gr
from pydub import AudioSegment
import edge_tts
import os
import asyncio
import uuid
# Helper: duration (in seconds) of any audio file pydub can decode.
def get_audio_length(audio_file):
    """Return the duration of *audio_file* in seconds as a float."""
    return AudioSegment.from_file(audio_file).duration_seconds
# Helper: render a float second count as an SRT timestamp.
def format_time(seconds):
    """Format *seconds* as an SRT timestamp string ``HH:MM:SS,mmm``.

    The fractional part is truncated (not rounded) to whole milliseconds,
    matching SRT's comma-separated millisecond convention.
    """
    whole_seconds = int(seconds)
    milliseconds = int((seconds - whole_seconds) * 1000)
    minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"
# Generate TTS audio for one text batch and build SRT cues whose timings are
# derived from the actual rendered audio length (not an estimate).
async def generate_accurate_srt(batch_text, batch_num, start_offset):
    """Synthesize *batch_text* to a WAV file and build matching SRT cues.

    Args:
        batch_text: Text of this batch.
        batch_num: Zero-based batch index, used for the temp file name and
            to offset cue ids.
        start_offset: Cumulative start time (seconds) of this batch within
            the final combined audio.

    Returns:
        Tuple ``(srt_content, audio_file, end_offset)`` where *end_offset*
        is the cumulative time after this batch, to be passed as the next
        batch's *start_offset*.
    """
    words_per_segment = 15
    audio_file = f"batch_{batch_num}_audio.wav"
    # Generate the audio using edge-tts.
    tts = edge_tts.Communicate(batch_text, "en-US-AndrewNeural", rate="-25%")
    await tts.save(audio_file)
    # Timing is based on the real rendered duration of the batch audio.
    actual_length = get_audio_length(audio_file)
    srt_content = ""
    words = batch_text.split()
    # BUG FIX: use ceiling division so num_segments equals the number of
    # iterations of the loop below. The previous floor division
    # (len(words) // 15) over-estimated each segment's duration whenever the
    # word count was not a multiple of 15, so the final subtitle was clamped
    # to zero length.
    num_segments = max(1, (len(words) + words_per_segment - 1) // words_per_segment)
    segment_duration = actual_length / num_segments
    start_time = start_offset
    min_display_duration = 1.5  # minimum on-screen time per subtitle (seconds)
    for i in range(0, len(words), words_per_segment):
        segment_words = words[i:i + words_per_segment]
        end_time = start_time + max(segment_duration, min_display_duration)
        # Never let a cue run past the end of this batch's audio.
        if end_time > start_offset + actual_length:
            end_time = start_offset + actual_length
        # NOTE(review): cue ids assume fewer than 100 segments per batch
        # (true for ~500-char batches) and are not globally sequential.
        srt_content += f"{i // words_per_segment + 1 + (batch_num * 100)}\n"
        srt_content += f"{format_time(start_time)} --> {format_time(end_time)}\n"
        srt_content += " ".join(segment_words) + "\n\n"
        start_time = end_time  # next cue starts where this one ended
    return srt_content, audio_file, start_time
# Helper: split the script into word-boundary batches so TTS never receives
# half a word.
def _split_script_into_batches(script_text, max_chars=500):
    """Split *script_text* into whole-word batches of at most *max_chars*
    characters each (a single over-long word becomes its own batch)."""
    batches = []
    current = ""
    for word in script_text.split():
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_chars or not current:
            current = candidate
        else:
            batches.append(current)
            current = word
    if current:
        batches.append(current)
    return batches


# Batch processing with cumulative timing, a progress indicator, and a final
# SRT validation pass against the combined audio length.
async def batch_process_srt_and_audio(script_text, progress=gr.Progress()):
    """Generate combined audio plus a validated SRT file for *script_text*.

    Args:
        script_text: Full script to synthesize.
        progress: Gradio progress tracker (gradio's documented default-arg
            injection pattern).

    Returns:
        Tuple ``(final_srt_path, final_audio_path)`` of the written files.
    """
    # BUG FIX: the previous fixed 500-character slicing
    # (script_text[i:i+500]) could cut words in half at batch boundaries,
    # garbling both the TTS audio and the subtitle tokens. Split on
    # whitespace instead, keeping batches at roughly the same size.
    batches = _split_script_into_batches(script_text)
    all_srt_content = ""
    combined_audio = AudioSegment.empty()
    start_offset = 0.0  # cumulative time offset for SRT timing
    # Process batches sequentially so each batch's cues start exactly where
    # the previous batch's audio ended.
    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, end_offset = await generate_accurate_srt(batch_text, batch_num, start_offset)
        all_srt_content += srt_content
        # Append this batch's audio to the combined track.
        combined_audio += AudioSegment.from_file(audio_file)
        start_offset = end_offset
        os.remove(audio_file)  # per-batch temp file no longer needed
        progress((batch_num + 1) / len(batches))
    # Final cross-check: clamp any cue end time that exceeds the total
    # combined audio length.
    total_audio_length = combined_audio.duration_seconds
    validated_srt_content = ""
    for line in all_srt_content.strip().splitlines():
        if '-->' in line:
            start_str, end_str = line.split(' --> ')
            # Parse "HH:MM:SS,mmm" into seconds; swapping the comma for a
            # colon lets a single split yield (h, m, s, ms) fields.
            start_time = sum(x * float(t) for x, t in zip([3600, 60, 1, 0.001], start_str.replace(',', ':').split(':')))
            end_time = sum(x * float(t) for x, t in zip([3600, 60, 1, 0.001], end_str.replace(',', ':').split(':')))
            if end_time > total_audio_length:
                end_time = total_audio_length
                line = f"{format_time(start_time)} --> {format_time(end_time)}"
        validated_srt_content += line + "\n"
    # Unique names avoid collisions between concurrent requests.
    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.wav"
    final_srt_path = f"final_subtitles_{unique_id}.srt"
    combined_audio.export(final_audio_path, format="wav")
    with open(final_srt_path, "w") as srt_file:
        srt_file.write(validated_srt_content)
    return final_srt_path, final_audio_path
# Gradio handler: one script text in, three outputs out (SRT file, audio
# file download, audio player — the latter two share the same path).
async def process_script(script_text):
    """Run the full pipeline and map its results onto the Gradio outputs."""
    paths = await batch_process_srt_and_audio(script_text)
    srt_path, audio_path = paths
    return srt_path, audio_path, audio_path
# Gradio interface setup: textbox in; SRT download, audio download, and an
# inline audio player out.
interface_outputs = [
    gr.File(label="Download SRT File"),
    gr.File(label="Download Audio File"),
    gr.Audio(label="Play Audio"),
]
app = gr.Interface(
    fn=process_script,
    inputs=gr.Textbox(label="Enter Script Text", lines=10),
    outputs=interface_outputs,
    description="Upload your script text, and the app will generate audio with en-US-AndrewNeural voice (Rate: -25%) and an accurate SRT file for download.",
)
app.launch()