# Extracted from a Spaces file view (status: Sleeping; file size: 5,644 bytes).
# The viewer's commit-hash column and line-number gutter were removed.
import gradio as gr
from pydub import AudioSegment
import edge_tts
import os
import asyncio
import uuid
import re
# Function to get the length of an audio file, expressed in seconds
def get_audio_length(audio_file):
    """Load *audio_file* with pydub and return its duration in seconds.

    The length of the loaded ``AudioSegment`` (as given by ``len()``) is
    divided by 1000 to produce the value in seconds.
    """
    duration_ms = len(AudioSegment.from_file(audio_file))
    return duration_ms / 1000
# Function to format a millisecond count as an SRT timestamp
def format_time_ms(milliseconds):
    """Format a duration given in milliseconds as ``HH:MM:SS,mmm``.

    Args:
        milliseconds: Duration in milliseconds (int or float). Floats are
            rounded to the nearest millisecond; negative values are clamped
            to zero.

    Returns:
        The timestamp string, e.g. ``"01:02:03,004"``.
    """
    # Round instead of truncating: values that went through float arithmetic
    # (1.001 * 1000 == 1000.9999999999999) must not lose a millisecond.
    total_ms = max(0, round(float(milliseconds)))
    seconds, ms = divmod(total_ms, 1000)
    mins, secs = divmod(seconds, 60)
    hrs, mins = divmod(mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{ms:03}"
# Helper: group a list of words into space-joined chunks of at most max_words words
def _chunk_words(words, max_words):
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]


# Function to split text into segments based on punctuation, ensuring no word is split
def split_text_into_segments(text, max_words=8):
    """Split *text* into short subtitle segments.

    The text is cut after each of ``. ! ? ,``; any piece longer than
    *max_words* words is further divided into chunks of at most *max_words*
    words. Words themselves are never cut.

    Args:
        text: The text to split.
        max_words: Maximum number of words per segment (default 8).

    Returns:
        A list of segment strings; empty when *text* has no words or
        punctuation.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    segments = []
    # With one capture group, re.split yields: text, mark, text, mark, ..., text
    parts = re.split(r'([.!?,])', text)
    for body, mark in zip(parts[0::2], parts[1::2]):
        words = body.split()
        if not words:
            # Punctuation run such as "..." or "?!": keep it attached to the
            # preceding segment instead of emitting a punctuation-only cue.
            if segments:
                segments[-1] += mark
            else:
                segments.append(mark)
            continue
        words[-1] += mark
        # Re-joining with single spaces also removes embedded newlines, so a
        # segment can never contain a blank line that would end an SRT cue.
        segments.extend(_chunk_words(words, max_words))

    # Text after the last punctuation mark obeys the same word limit.
    segments.extend(_chunk_words(parts[-1].split(), max_words))
    return segments
# Function to synthesise one batch of text and build its SRT cues
async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
    """Synthesise *batch_text* with edge-tts and build SRT cues for it.

    The measured audio length is divided evenly between the segments returned
    by ``split_text_into_segments``, so cue timings are an even split of the
    batch duration rather than per-word timings.

    Args:
        batch_text: Text of this batch.
        batch_num: Zero-based batch index; cue numbers are offset by
            ``batch_num * 100``.
        start_offset: Start time of this batch in milliseconds.
        pitch: Pitch string passed to ``edge_tts.Communicate``.
        rate: Rate string passed to ``edge_tts.Communicate``.
        voice: Voice identifier passed to ``edge_tts.Communicate``.

    Returns:
        A tuple ``(srt_content, audio_file, end_time)``: the SRT text for the
        batch, the path of the audio file written (the caller is responsible
        for deleting it), and the end time of the batch in milliseconds.
    """
    # A unique name keeps overlapping requests from overwriting each other's
    # batch file. edge-tts emits MP3 data, hence the extension.
    audio_file = f"batch_{batch_num}_audio_{uuid.uuid4().hex}.mp3"
    tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
    await tts.save(audio_file)

    actual_length = get_audio_length(audio_file) * 1000  # seconds -> milliseconds
    batch_end = start_offset + actual_length

    segments = split_text_into_segments(batch_text)
    if not segments:
        # Nothing to caption (avoids dividing by zero); still advance the
        # timeline by the audio that was produced.
        return "", audio_file, batch_end

    segment_duration = actual_length / len(segments)
    cues = []
    start_time = start_offset
    for index, segment in enumerate(segments):
        # Never let accumulated float error push a cue past the batch end.
        end_time = min(start_time + segment_duration, batch_end)
        cue_number = index + 1 + (batch_num * 100)
        cues.append(
            f"{cue_number}\n"
            f"{format_time_ms(start_time)} --> {format_time_ms(end_time)}\n"
            f"{segment}\n\n"
        )
        start_time = end_time
    return "".join(cues), audio_file, start_time
# Matches an SRT timing line such as "00:00:01,000 --> 00:00:02,500"
_SRT_TIMING_RE = re.compile(
    r'(\d+):(\d{2}):(\d{2}),(\d{3}) --> (\d+):(\d{2}):(\d{2}),(\d{3})'
)


# Helper: cut text into batches of at most `limit` characters without splitting words
def _split_into_batches(text, limit=500):
    batches = []
    pos = 0
    while pos < len(text):
        end = min(pos + limit, len(text))
        if end < len(text) and not text[end].isspace():
            # The cut may land inside a word: back up to the last whitespace
            # in the window. A single word longer than `limit` is cut hard.
            cut = next(
                (i for i in range(end - 1, pos, -1) if text[i].isspace()),
                None,
            )
            if cut is not None:
                end = cut + 1
        chunk = text[pos:end]
        if chunk.strip():  # skip whitespace-only batches: nothing to speak
            batches.append(chunk)
        pos = end
    return batches


# Helper: convert the four captured timestamp fields to integer milliseconds
def _timestamp_to_ms(hours, minutes, seconds, millis):
    return ((int(hours) * 60 + int(minutes)) * 60 + int(seconds)) * 1000 + int(millis)


# Batch processing function with millisecond accuracy
async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
    """Convert *script_text* to one MP3 file and one matching SRT file.

    The script is cut into batches of at most 500 characters (at whitespace,
    so words stay intact). Each batch is synthesised by
    ``generate_accurate_srt`` and the resulting audio is concatenated. Cue
    end times are then clamped to the total audio duration and cues are
    numbered sequentially from 1.

    Args:
        script_text: Full text to synthesise.
        pitch: Pitch string forwarded to ``generate_accurate_srt``.
        rate: Rate string forwarded to ``generate_accurate_srt``.
        voice: Voice identifier forwarded to ``generate_accurate_srt``.
        progress: Callable invoked with the completed fraction after each batch.

    Returns:
        A tuple ``(final_srt_path, final_audio_path)`` of files written to the
        current working directory.

    Raises:
        ValueError: If *script_text* contains no non-whitespace text.
    """
    batches = _split_into_batches(script_text, 500)
    if not batches:
        raise ValueError("script_text must contain some text to synthesise")

    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0
    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, end_offset = await generate_accurate_srt(
            batch_text, batch_num, start_offset, pitch, rate, voice
        )
        srt_parts.append(srt_content)
        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Remove the per-batch file even when loading it fails.
            os.remove(audio_file)
        start_offset = end_offset
        progress((batch_num + 1) / len(batches))

    # Work in integer milliseconds so timestamps do not drift through a
    # float-seconds round trip.
    total_audio_ms = round(combined_audio.duration_seconds * 1000)
    lines = "".join(srt_parts).strip().splitlines()
    cue_number = 0
    for i, line in enumerate(lines):
        match = _SRT_TIMING_RE.fullmatch(line)
        if match is None:
            continue
        start_ms = _timestamp_to_ms(*match.group(1, 2, 3, 4))
        end_ms = min(_timestamp_to_ms(*match.group(5, 6, 7, 8)), total_audio_ms)
        lines[i] = f"{format_time_ms(start_ms)} --> {format_time_ms(end_ms)}"
        # The line just above a timing line is the cue counter; renumber it so
        # the final file counts 1, 2, 3, ... across batches.
        if i > 0 and lines[i - 1].isdigit():
            cue_number += 1
            lines[i - 1] = str(cue_number)
    validated_srt_content = "".join(f"{line}\n" for line in lines)

    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.mp3"
    final_srt_path = f"final_subtitles_{unique_id}.srt"
    combined_audio.export(final_audio_path, format="mp3", bitrate="320k")
    # Explicit UTF-8 so non-ASCII subtitle text is written identically on
    # every platform.
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(validated_srt_content)
    return final_srt_path, final_audio_path
# Gradio interface function
async def process_script(script_text, pitch, rate, voice):
    """Gradio callback: turn the form values into SRT and audio files.

    Args:
        script_text: Text entered by the user.
        pitch: Pitch adjustment in Hz (slider value).
        rate: Speaking-rate adjustment in percent (slider value).
        voice: Key of ``voice_options`` selected in the dropdown.

    Returns:
        A tuple ``(srt_path, audio_path, audio_path)``; the audio path is
        returned twice, once per audio output component.
    """
    # edge-tts only accepts explicitly signed values ("+5Hz", "-3Hz", "+0%"),
    # so always emit the sign, including for zero.
    pitch_str = f"{int(pitch):+d}Hz"
    formatted_rate = f"{int(rate):+d}%"
    srt_path, audio_path = await batch_process_srt_and_audio(
        script_text, pitch_str, formatted_rate, voice_options[voice]
    )
    return srt_path, audio_path, audio_path
# Gradio interface setup
# Maps the label shown in the dropdown to the voice identifier handed to the TTS call.
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    # Add other voices here...
}

# Input components, in the order process_script receives them:
# script text, pitch, rate, voice.
_input_components = [
    gr.Textbox(label="Enter Script Text", lines=10),
    gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
    gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=-1, step=1),
    gr.Dropdown(label="Select Voice", choices=list(voice_options), value="Andrew Male"),
]

# Output components, matching the three values process_script returns.
_output_components = [
    gr.File(label="Download SRT File"),
    gr.File(label="Download Audio File"),
    gr.Audio(label="Audio Playback"),
]

app = gr.Interface(
    fn=process_script,
    inputs=_input_components,
    outputs=_output_components,
    title="HIVEcorp Text-to-Speech with Millisecond SRT Generation",
    description="Convert your script into audio and generate millisecond-accurate subtitles.",
    theme="compact",
)

# Start the web UI.
app.launch()
|