File size: 3,661 Bytes
077e0e7
f4b5c65
ea230c6
 
f4b5c65
3927c7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
077e0e7
3927c7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4b5c65
3927c7f
 
 
 
 
 
 
c95cd5b
3927c7f
 
 
 
 
 
 
f4b5c65
3927c7f
 
 
 
 
f4b5c65
 
 
3927c7f
f4b5c65
 
 
3927c7f
 
 
 
 
 
 
 
 
 
f4b5c65
 
3927c7f
 
 
 
077e0e7
8428946
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import gradio as gr
from pydub import AudioSegment
import edge_tts
import os
import asyncio

# Audio duration helper
def get_audio_length(audio_file):
    """Return the duration, in seconds, of the audio stored at ``audio_file``.

    The file is decoded with ``AudioSegment.from_file`` and the segment's
    ``duration_seconds`` value is returned unchanged.
    """
    return AudioSegment.from_file(audio_file).duration_seconds

# Function to format time for SRT
def format_time(seconds):
    """Format a time offset as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero, since an SRT timestamp cannot be negative.

    Returns:
        The timestamp string, with the value rounded to the nearest
        millisecond.
    """
    # Round once to whole milliseconds *before* splitting into fields.
    # Truncating the fractional part separately loses a millisecond to float
    # error (e.g. 1.001 % 1 == 0.000999...), and rounding it separately could
    # yield 1000 ms without carrying into the seconds field.
    total_millis = max(0, round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hrs, mins = divmod(total_mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"

# Function to generate SRT with accurate timing per batch
async def generate_accurate_srt(batch_text, batch_num, start_offset):
    """Synthesize one batch of text and build SRT cues spanning its audio.

    The batch is spoken with edge-tts (voice ``en-US-AndrewNeural``, rate
    ``-25%``) and saved to ``batch_{batch_num}_audio.wav``. The measured
    length of that file is then divided evenly among the words of the batch,
    and the words are grouped into cues of up to 10 words each.

    Args:
        batch_text: Text to synthesize and subtitle.
        batch_num: Zero-based batch number; used in the audio file name and
            to offset cue indices by ``batch_num * 100``.
        start_offset: Time in seconds at which this batch's first cue starts.

    Returns:
        A tuple ``(srt_content, audio_file, end_time)``: the SRT text for the
        batch, the path of the generated audio file, and the time in seconds
        at which the batch ends (``start_offset`` plus the audio length).
    """
    audio_file = f"batch_{batch_num}_audio.wav"

    # Generate the audio using edge-tts
    # NOTE(review): the file is named .wav, but edge-tts may emit a different
    # container; pydub is left to detect the format when loading — confirm.
    tts = edge_tts.Communicate(batch_text, "en-US-AndrewNeural", rate="-25%")
    await tts.save(audio_file)

    # Get the actual length of the audio file
    actual_length = get_audio_length(audio_file)

    words = batch_text.split()
    if not words:
        # Nothing to subtitle: avoid dividing by zero, but still advance the
        # clock by the audio that was produced so later batches stay aligned.
        return "", audio_file, start_offset + actual_length

    words_per_cue = 10
    seconds_per_word = actual_length / len(words)
    start_time = start_offset
    cues = []

    # Build SRT content; each cue lasts in proportion to its word count, so a
    # short final cue does not run past the end of the batch audio.
    for i in range(0, len(words), words_per_cue):
        segment_words = words[i:i + words_per_cue]
        end_time = start_time + seconds_per_word * len(segment_words)
        # Indices stay unique across batches while a batch has < 100 cues.
        cue_index = i // words_per_cue + 1 + (batch_num * 100)
        cue_text = " ".join(segment_words)
        cues.append(
            f"{cue_index}\n"
            f"{format_time(start_time)} --> {format_time(end_time)}\n"
            f"{cue_text}\n\n"
        )
        start_time = end_time

    return "".join(cues), audio_file, start_time

# Split text into batches without cutting words in half
def _split_into_batches(text, max_chars=500):
    """Split ``text`` into chunks of at most ``max_chars`` characters.

    Each chunk ends just after the last whitespace character inside its
    window, so words are kept whole. A single token longer than ``max_chars``
    is hard-split. Chunks containing only whitespace are dropped.
    """
    batches = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = min(start + max_chars, text_len)
        if end < text_len and not text[end].isspace():
            # The window ends mid-word: back up to just after the last
            # whitespace character, if the window contains one.
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:
                end = cut
        chunk = text[start:end]
        if chunk.strip():
            batches.append(chunk)
        start = end
    return batches


# Batch processing function with in-order processing and progress indicator
async def batch_process_srt_and_audio(script_text, progress=gr.Progress()):
    """Convert a script into one combined audio file and one SRT file.

    The script is split into batches of up to 500 characters on word
    boundaries. Batches are processed one after another, in script order,
    because each batch's subtitle start time is the total duration of the
    audio produced before it; running them concurrently would leave every
    batch starting at 0 and could append results out of order.

    Args:
        script_text: Full script text to synthesize.
        progress: Gradio progress tracker used to report per-batch progress.

    Returns:
        A tuple ``("final_subtitles.srt", "final_audio.wav")`` naming the
        files written to the current working directory.
    """
    batches = _split_into_batches(script_text)
    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0  # Track cumulative time offset for SRT timing

    for batch_num, batch_text in progress.track(
        list(enumerate(batches)),
        total=len(batches),
        desc="Processing batches...",
    ):
        srt_content, audio_file, _ = await generate_accurate_srt(
            batch_text, batch_num, start_offset
        )
        try:
            # Append the audio of each batch to the combined audio
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Clean up the individual batch audio file even if loading fails
            os.remove(audio_file)
        srt_parts.append(srt_content)

        # Derive the next start time from the audio actually accumulated so
        # subtitle timing cannot drift away from the combined track.
        start_offset = combined_audio.duration_seconds

    # Export combined audio and SRT
    combined_audio.export("final_audio.wav", format="wav")
    with open("final_subtitles.srt", "w", encoding="utf-8") as srt_file:
        srt_file.write("".join(srt_parts))

    return "final_subtitles.srt", "final_audio.wav"

# Gradio interface function
async def process_script(script_text, progress=gr.Progress()):
    """Gradio event handler: turn script text into subtitle and audio files.

    Args:
        script_text: Script text entered by the user.
        progress: Gradio progress tracker. It is declared on the event handler
            itself, following Gradio's documented pattern, and forwarded to
            the batch processor so that per-batch progress is reported.

    Returns:
        A tuple ``(srt_path, audio_path, audio_path)``; the audio path is
        returned twice so it can feed both a download output and a player
        output.
    """
    srt_path, audio_path = await batch_process_srt_and_audio(
        script_text, progress=progress
    )
    return srt_path, audio_path, audio_path

# Gradio interface setup
# One multi-line text input; three outputs matching the
# (srt_path, audio_path, audio_path) tuple returned by process_script.
_script_input = gr.Textbox(label="Enter Script Text", lines=10)
_result_outputs = [
    gr.File(label="Download SRT File"),
    gr.File(label="Download Audio File"),
    gr.Audio(label="Play Audio"),
]

app = gr.Interface(
    fn=process_script,
    inputs=_script_input,
    outputs=_result_outputs,
    description="Upload your script text, and the app will generate audio with en-US-AndrewNeural voice (Rate: -25%) and an accurate SRT file for download.",
)

# Start the web server for the interface.
app.launch()