File size: 5,103 Bytes
077e0e7
f4b5c65
ea230c6
 
f4b5c65
22a64e1
3927c7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6926ae7
3927c7f
 
077e0e7
3927c7f
 
 
 
 
 
 
 
 
 
27bfe3b
22a64e1
3927c7f
27bfe3b
3927c7f
6926ae7
 
27bfe3b
6926ae7
 
 
 
 
 
27bfe3b
3927c7f
 
6926ae7
 
3927c7f
 
27bfe3b
3927c7f
6926ae7
3927c7f
 
 
f4b5c65
3927c7f
 
6926ae7
27bfe3b
 
3927c7f
 
 
 
f4b5c65
3927c7f
 
 
 
 
d9e730a
27bfe3b
d9e730a
6926ae7
 
 
 
 
 
 
 
 
 
 
 
 
22a64e1
 
 
 
 
6926ae7
22a64e1
 
6926ae7
f4b5c65
22a64e1
f4b5c65
3927c7f
 
 
 
 
 
 
 
 
 
f4b5c65
 
3927c7f
 
 
 
077e0e7
8428946
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import gradio as gr
from pydub import AudioSegment
import edge_tts
import os
import asyncio
import uuid

def get_audio_length(audio_file):
    """Return the playback duration of *audio_file* in seconds.

    The file is decoded with ``AudioSegment.from_file`` and the resulting
    segment's ``duration_seconds`` value is returned unchanged.
    """
    return AudioSegment.from_file(audio_file).duration_seconds

def format_time(seconds):
    """Format a time offset as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the audio, in seconds (int or
            float). Negative values are clamped to zero.

    Returns:
        The timestamp string, rounded to the nearest millisecond.
    """
    # Convert to whole milliseconds once, with rounding. Truncating the
    # fractional part instead turns values such as 1.001 (stored as
    # 1.000999...) into ",000" and loses a millisecond.
    total_millis = max(0, round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hrs, mins = divmod(total_mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"

async def generate_accurate_srt(batch_text, batch_num, start_offset):
    """Synthesize one text batch to audio and build its SRT entries.

    The batch is spoken with the ``en-US-AndrewNeural`` voice at ``-25%``
    rate, the resulting file's duration is measured, and that duration is
    divided evenly across subtitle cues of up to 15 words each.

    Args:
        batch_text: Text of this batch.
        batch_num: Zero-based batch index; used in the cue numbers and the
            temporary audio file name.
        start_offset: Time in seconds at which this batch starts within the
            combined audio.

    Returns:
        A tuple ``(srt_content, audio_file, end_offset)``: the SRT text for
        this batch, the path of the generated audio file (the caller is
        responsible for deleting it), and the time in seconds at which this
        batch's audio ends.
    """
    words_per_segment = 15
    min_display_duration = 1.5  # Minimum on-screen time per subtitle, in seconds

    # A random suffix keeps concurrent requests from overwriting each other's
    # temporary file when they process the same batch number.
    audio_file = f"batch_{batch_num}_{uuid.uuid4().hex}_audio.wav"

    # Generate the audio using edge-tts
    tts = edge_tts.Communicate(batch_text, "en-US-AndrewNeural", rate="-25%")
    await tts.save(audio_file)

    # Measure the real audio so subtitle timing follows the speech
    actual_length = get_audio_length(audio_file)
    batch_end = start_offset + actual_length

    words = batch_text.split()
    segments = [
        words[i:i + words_per_segment]
        for i in range(0, len(words), words_per_segment)
    ]
    # Divide by the number of cues actually emitted (a ceiling division of
    # the word count); flooring would hand the whole duration to the leading
    # cues and leave a trailing partial cue with zero length.
    segment_duration = actual_length / max(1, len(segments))

    entries = []
    start_time = start_offset
    for seg_index, segment_words in enumerate(segments):
        # Never let a cue run past the end of this batch's audio
        end_time = min(
            start_time + max(segment_duration, min_display_duration),
            batch_end,
        )
        text = " ".join(segment_words)
        entries.append(
            f"{seg_index + 1 + (batch_num * 100)}\n"
            f"{format_time(start_time)} --> {format_time(end_time)}\n"
            f"{text}\n\n"
        )
        start_time = end_time

    # Report the true end of the audio (not the last cue's end) so the next
    # batch's offset stays aligned with the concatenated audio.
    return "".join(entries), audio_file, batch_end

def _split_into_batches(script_text, max_chars=500):
    """Split text into chunks of at most *max_chars*, breaking at whitespace."""
    batches = []
    pos = 0
    text_len = len(script_text)
    while pos < text_len:
        end = min(pos + max_chars, text_len)
        if end < text_len and not script_text[end].isspace():
            # The window would cut a word in half: back up to just after the
            # last whitespace inside the window, if there is one.
            cut = end
            while cut > pos and not script_text[cut - 1].isspace():
                cut -= 1
            if cut > pos:
                end = cut
        chunk = script_text[pos:end]
        # Whitespace-only chunks contain nothing to speak or subtitle
        if chunk.strip():
            batches.append(chunk)
        pos = end
    return batches


def _srt_timestamp_to_seconds(timestamp):
    """Convert ``HH:MM:SS,mmm`` to seconds; raise ValueError if malformed."""
    clock, millis = timestamp.strip().split(",")
    hrs, mins, secs = clock.split(":")
    return int(hrs) * 3600 + int(mins) * 60 + int(secs) + int(millis) / 1000


def _clamp_srt_timings(srt_content, max_seconds):
    """Clamp any SRT timing that exceeds *max_seconds*; leave other lines as-is."""
    checked_lines = []
    for line in srt_content.strip().splitlines():
        if " --> " in line:
            try:
                start_str, end_str = line.split(" --> ")
                start_time = _srt_timestamp_to_seconds(start_str)
                end_time = _srt_timestamp_to_seconds(end_str)
            except ValueError:
                # Subtitle text that merely contains an arrow; keep verbatim
                pass
            else:
                # Only rewrite lines that need clamping, so valid timestamps
                # are not disturbed by a parse/format round trip.
                if start_time > max_seconds or end_time > max_seconds:
                    clamped_start = format_time(min(start_time, max_seconds))
                    clamped_end = format_time(min(end_time, max_seconds))
                    line = f"{clamped_start} --> {clamped_end}"
        checked_lines.append(line)
    return "".join(f"{line}\n" for line in checked_lines)


async def batch_process_srt_and_audio(script_text, progress=gr.Progress()):
    """Convert a script into one combined audio file and a matching SRT file.

    The script is split into batches of at most 500 characters (broken at
    whitespace so words stay intact). Each batch is synthesized in order via
    ``generate_accurate_srt`` and its audio appended to the combined track,
    with each batch's subtitles offset by the running end time. Finally, any
    subtitle timing beyond the total audio length is clamped.

    Args:
        script_text: The full script to narrate.
        progress: Progress callback, called with the completed fraction
            after each batch.

    Returns:
        A tuple ``(final_srt_path, final_audio_path)`` of the files written
        to the current working directory.
    """
    batches = _split_into_batches(script_text)
    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0  # Track cumulative time offset for SRT timing

    # Batches run sequentially because each one needs the previous end offset
    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, end_offset = await generate_accurate_srt(
            batch_text, batch_num, start_offset
        )
        srt_parts.append(srt_content)

        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Remove the per-batch file even if loading it failed
            os.remove(audio_file)

        start_offset = end_offset  # Update the start offset for the next batch
        progress((batch_num + 1) / len(batches))

    # Final cross-check against the real length of the combined audio
    validated_srt_content = _clamp_srt_timings(
        "".join(srt_parts), combined_audio.duration_seconds
    )

    # Generate unique names for the final files
    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.wav"
    final_srt_path = f"final_subtitles_{unique_id}.srt"

    combined_audio.export(final_audio_path, format="wav")
    # Explicit UTF-8 so non-ASCII script text is written the same way on
    # every platform, regardless of the locale's default encoding.
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(validated_srt_content)

    return final_srt_path, final_audio_path

async def process_script(script_text, progress=gr.Progress()):
    """Gradio event handler: turn script text into SRT and audio outputs.

    Args:
        script_text: Script text entered by the user.
        progress: Progress tracker. It is declared on this function because
            Gradio binds a tracker to the ``gr.Progress()`` default of the
            event handler itself; it is then forwarded so per-batch updates
            reach the UI.

    Returns:
        A tuple ``(srt_path, audio_path, audio_path)``; the audio path is
        repeated so it can feed both a download output and an audio player.
    """
    srt_path, audio_path = await batch_process_srt_and_audio(script_text, progress)
    return srt_path, audio_path, audio_path

# Gradio interface setup: one multi-line textbox feeds process_script, whose
# three return values (SRT path, audio path, audio path again) map in order
# onto the two download components and the audio player listed in `outputs`.
app = gr.Interface(
    fn=process_script,
    inputs=gr.Textbox(label="Enter Script Text", lines=10),
    outputs=[
        gr.File(label="Download SRT File"),
        gr.File(label="Download Audio File"),
        gr.Audio(label="Play Audio")
    ],
    description="Upload your script text, and the app will generate audio with en-US-AndrewNeural voice (Rate: -25%) and an accurate SRT file for download."
)

# Launched at module level, so the UI starts whenever this file is executed
# or imported.
app.launch()