import asyncio
import os
import re
import uuid

import gradio as gr
import edge_tts
from pydub import AudioSegment

# Maximum characters of script text synthesized per edge-tts request.
BATCH_CHAR_LIMIT = 500
# Maximum words shown in a single subtitle segment.
MAX_WORDS_PER_SEGMENT = 8


def get_audio_length(audio_file):
    """Return the duration of *audio_file* in fractional seconds."""
    audio = AudioSegment.from_file(audio_file)
    return audio.duration_seconds


def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    millis = int((seconds % 1) * 1000)
    seconds = int(seconds)
    hrs = seconds // 3600
    mins = (seconds % 3600) // 60
    secs = seconds % 60
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"


def parse_srt_time(timestamp):
    """Inverse of format_time: convert 'HH:MM:SS,mmm' back to seconds."""
    hrs, mins, rest = timestamp.split(":")
    secs, millis = rest.split(",")
    return int(hrs) * 3600 + int(mins) * 60 + int(secs) + int(millis) / 1000.0


def split_text_into_segments(text):
    """Split *text* into subtitle-sized segments.

    Sentences are cut at '.', '!' or '?'; any sentence longer than
    MAX_WORDS_PER_SEGMENT words is further chunked so each subtitle
    stays readable.
    """
    segments = []
    # re.split with a capturing group alternates [sentence, punct, sentence, ...].
    raw_segments = re.split(r'([.!?])', text)
    for i in range(0, len(raw_segments) - 1, 2):
        sentence = raw_segments[i].strip() + raw_segments[i + 1]
        words = sentence.split()
        if len(words) > MAX_WORDS_PER_SEGMENT:
            for j in range(0, len(words), MAX_WORDS_PER_SEGMENT):
                segments.append(" ".join(words[j:j + MAX_WORDS_PER_SEGMENT]))
        else:
            segments.append(sentence.strip())
    # Trailing text with no closing punctuation.
    if len(raw_segments) % 2 == 1:
        words = raw_segments[-1].strip().split()
        for j in range(0, len(words), MAX_WORDS_PER_SEGMENT):
            segments.append(" ".join(words[j:j + MAX_WORDS_PER_SEGMENT]))
    return segments


def split_script_into_batches(script_text, limit=BATCH_CHAR_LIMIT):
    """Split the script into ~limit-character batches on word boundaries.

    Fix over the original: a raw 500-character slice could cut a word (or
    sentence) in half, producing garbled speech at every batch boundary.
    """
    batches = []
    current = ""
    for word in script_text.split():
        candidate = f"{current} {word}".strip()
        if len(candidate) > limit and current:
            batches.append(current)
            current = word
        else:
            current = candidate
    if current:
        batches.append(current)
    return batches


def validate_srt(srt_text, total_audio_length):
    """Clamp cue end times to the audio length and renumber entries 1..N.

    Renumbering fixes the original per-batch numbering scheme
    (1, 2, ..., 101, 102, ...), which is not valid sequential SRT indexing.
    """
    entries = []
    for number, block in enumerate(srt_text.strip().split("\n\n"), start=1):
        lines = block.splitlines()
        if len(lines) < 2 or "-->" not in lines[1]:
            continue  # skip malformed blocks defensively
        start_str, end_str = lines[1].split(" --> ")
        start_time = parse_srt_time(start_str)
        end_time = min(parse_srt_time(end_str), total_audio_length)
        entry = [str(number), f"{format_time(start_time)} --> {format_time(end_time)}"]
        entry.extend(lines[2:])
        entries.append("\n".join(entry))
    return "\n\n".join(entries) + "\n"


async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
    """Synthesize one batch of text and build SRT entries for it.

    Returns (srt_content, audio_file_path, end_offset), where end_offset
    is the timeline position at which the next batch should start.
    """
    # edge-tts emits MP3 data; name the temp file accordingly so pydub picks
    # the right decoder (the original wrote MP3 bytes into a ".wav" name).
    audio_file = f"batch_{batch_num}_audio.mp3"
    communicate = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
    await communicate.save(audio_file)

    actual_length = get_audio_length(audio_file)
    segments = split_text_into_segments(batch_text)
    if not segments:
        # Nothing to subtitle (e.g. whitespace-only batch); avoid dividing by zero.
        return "", audio_file, start_offset + actual_length

    # Distribute the measured batch duration evenly across its segments.
    segment_duration = actual_length / len(segments)
    batch_end = start_offset + actual_length

    srt_content = ""
    start_time = start_offset
    for index, segment in enumerate(segments):
        end_time = min(start_time + segment_duration, batch_end)
        # Entry numbers are provisional; validate_srt() renumbers them
        # sequentially when the final file is assembled.
        srt_content += f"{index + 1}\n"
        srt_content += f"{format_time(start_time)} --> {format_time(end_time)}\n"
        srt_content += segment + "\n\n"
        start_time = end_time
    return srt_content, audio_file, start_time


async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
    """Convert the whole script to one audio file plus one SRT file.

    Processes the script in batches (edge-tts handles short requests more
    reliably), concatenates the audio, and validates/renumbers the SRT.
    Returns (srt_path, audio_path).
    """
    batches = split_script_into_batches(script_text)
    all_srt_content = ""
    combined_audio = AudioSegment.empty()
    start_offset = 0.0

    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, end_offset = await generate_accurate_srt(
            batch_text, batch_num, start_offset, pitch, rate, voice
        )
        all_srt_content += srt_content
        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Always remove the temp batch file, even if decoding fails.
            os.remove(audio_file)
        start_offset = end_offset
        progress((batch_num + 1) / len(batches))

    total_audio_length = combined_audio.duration_seconds
    validated_srt_content = validate_srt(all_srt_content, total_audio_length)

    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.mp3"
    final_srt_path = f"final_subtitles_{unique_id}.srt"
    combined_audio.export(final_audio_path, format="mp3", bitrate="320k")
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(validated_srt_content)
    return final_srt_path, final_audio_path


async def process_script(script_text, pitch, rate, voice):
    """Gradio handler: format TTS parameters and run the pipeline.

    Returns (srt_path, audio_path, audio_path) — the audio path is emitted
    twice to feed both the File output and the Audio playback component.
    """
    # edge-tts requires explicitly *signed* values ("+5Hz", "-10%", "+0%").
    # The original emitted unsigned strings ("5Hz", "0%"), which edge-tts
    # rejects, and used an arbitrary "-1Hz" fallback for pitch == 0.
    pitch_str = f"{int(pitch):+d}Hz"
    rate_str = f"{int(rate):+d}%"
    srt_path, audio_path = await batch_process_srt_and_audio(
        script_text, pitch_str, rate_str, voice_options[voice]
    )
    return srt_path, audio_path, audio_path


# Display-name -> edge-tts voice identifier mapping.
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    "Guy Male": "en-US-GuyNeural",
    "Ana Female": "en-US-AnaNeural",
    "Aria Female": "en-US-AriaNeural",
    "Brian Male": "en-US-BrianNeural",
    "Christopher Male": "en-US-ChristopherNeural",
    "Eric Male": "en-US-EricNeural",
    "Michelle Male": "en-US-MichelleNeural",
    "Roger Male": "en-US-RogerNeural",
    "Natasha Female": "en-AU-NatashaNeural",
    "William Male": "en-AU-WilliamNeural",
    "Clara Female": "en-CA-ClaraNeural",
    "Liam Female ": "en-CA-LiamNeural",
    "Libby Female": "en-GB-LibbyNeural",
    "Maisie": "en-GB-MaisieNeural",
    "Ryan": "en-GB-RyanNeural",
    "Sonia": "en-GB-SoniaNeural",
    "Thomas": "en-GB-ThomasNeural",
    "Sam": "en-HK-SamNeural",
    "Yan": "en-HK-YanNeural",
    "Connor": "en-IE-ConnorNeural",
    "Emily": "en-IE-EmilyNeural",
    "Neerja": "en-IN-NeerjaNeural",
    "Prabhat": "en-IN-PrabhatNeural",
    "Asilia": "en-KE-AsiliaNeural",
    "Chilemba": "en-KE-ChilembaNeural",
    "Abeo": "en-NG-AbeoNeural",
    "Ezinne": "en-NG-EzinneNeural",
    "Mitchell": "en-NZ-MitchellNeural",
    "James": "en-PH-JamesNeural",
    "Rosa": "en-PH-RosaNeural",
    "Luna": "en-SG-LunaNeural",
    "Wayne": "en-SG-WayneNeural",
    "Elimu": "en-TZ-ElimuNeural",
    "Imani": "en-TZ-ImaniNeural",
    "Leah": "en-ZA-LeahNeural",
    "Luke": "en-ZA-LukeNeural",
}

app = gr.Interface(
    fn=process_script,
    inputs=[
        gr.Textbox(label="Enter Script Text", lines=10),
        gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
        gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=-1, step=1),
        gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Andrew Male"),
    ],
    outputs=[
        gr.File(label="Download SRT File"),
        gr.File(label="Download Audio File"),
        gr.Audio(label="Audio Playback"),
    ],
    title="HIVEcorp Text-to-Speech with SRT Generation",
    description="Convert your script into audio and generate subtitles.",
    theme="compact",
)

# Guard the launch so importing this module (e.g. for testing) has no side effects.
if __name__ == "__main__":
    app.launch()