import gradio as gr
from pydub import AudioSegment
import edge_tts
import os
import asyncio
import uuid
import re
from typing import List, Tuple, Optional
from dataclasses import dataclass
class TimingManager:
    """Tracks a running timeline, leaving a fixed gap between segments."""

    def __init__(self):
        self.current_time = 0
        self.segment_gap = 100  # ms gap between segments

    def get_timing(self, duration):
        """Return (start, end) in ms for a segment of the given duration."""
        start_time = self.current_time
        end_time = start_time + duration
        self.current_time = end_time + self.segment_gap
        return start_time, end_time
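
# Illustrative use of TimingManager (the class is not currently wired into
# the app; the same gap logic is applied inline in generate_accurate_srt):
#   tm = TimingManager()
#   tm.get_timing(2000)  # -> (0, 2000); next segment starts at 2100
#   tm.get_timing(1500)  # -> (2100, 3600)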
def get_audio_length(audio_file):
    """Return the duration of an audio file in seconds."""
    audio = AudioSegment.from_file(audio_file)
    return len(audio) / 1000

def format_time_ms(milliseconds):
    """Format a millisecond offset as an SRT timestamp (HH:MM:SS,mmm)."""
    seconds, ms = divmod(int(milliseconds), 1000)
    mins, secs = divmod(seconds, 60)
    hrs, mins = divmod(mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{ms:03}"
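
# Example: format_time_ms(3723042) -> "01:02:03,042"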
@dataclass
class Segment:
    id: int
    text: str
    start_time: int = 0  # ms
    end_time: int = 0    # ms
    duration: int = 0    # ms
    audio: Optional[AudioSegment] = None
class TextProcessor:
    def __init__(self, words_per_line: int, lines_per_segment: int):
        self.words_per_line = words_per_line
        self.lines_per_segment = lines_per_segment
        # Reference patterns for break strength (the split logic below checks
        # the punctuation characters directly)
        self.break_patterns = {
            'strong': r'[.!?]+',
            'medium': r'[,;:]',
            'weak': r'[\s]+'
        }

    def split_into_segments(self, text: str) -> List[Segment]:
        # Clean and normalize text
        text = re.sub(r'\s+', ' ', text.strip())
        text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)

        # Split into natural segments
        segments = []
        current_lines = []
        current_words = []
        words = text.split()
        segment_id = 1

        for i, word in enumerate(words):
            current_words.append(word)

            # Check for natural breaks or line length
            is_break = (
                any(word.endswith(p) for p in '.!?') or        # Strong break
                (len(current_words) >= self.words_per_line and  # Line length
                 (any(word.endswith(p) for p in ',;:') or      # Medium break
                  i == len(words) - 1))                         # End of text
            )

            if is_break or len(current_words) >= self.words_per_line:
                current_lines.append(' '.join(current_words))
                current_words = []

                if len(current_lines) >= self.lines_per_segment or i == len(words) - 1:
                    segment_text = '\n'.join(current_lines)
                    segments.append(Segment(id=segment_id, text=segment_text))
                    segment_id += 1
                    current_lines = []

        # Handle remaining content
        if current_words:
            current_lines.append(' '.join(current_words))
        if current_lines:
            segment_text = '\n'.join(current_lines)
            segments.append(Segment(id=segment_id, text=segment_text))

        return segments
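
# Illustrative split (words_per_line=4, lines_per_segment=1):
#   TextProcessor(4, 1).split_into_segments("Hello world. This is a test.")
#   -> two segments: "Hello world." and "This is a test."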
async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
    """Synthesize a single segment and record its audio and duration."""
    # edge-tts emits MP3 audio by default, so use a matching extension
    audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.mp3"
    try:
        tts = edge_tts.Communicate(segment.text, voice, rate=rate, pitch=pitch)
        await tts.save(audio_file)
        segment.audio = AudioSegment.from_file(audio_file)
        segment.duration = len(segment.audio)  # pydub reports length in ms
        return segment
    finally:
        if os.path.exists(audio_file):
            os.remove(audio_file)
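
# Note: edge-tts expects rate and pitch as signed strings (e.g. "+10%",
# "-5Hz"); process_text below builds them in that form.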
async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int) -> Tuple[str, str]:
    # Initialize text processor and split text
    processor = TextProcessor(words_per_line, lines_per_segment)
    segments = processor.split_into_segments(text)

    # Process all segments in parallel
    tasks = [
        process_segment_with_timing(segment, voice, rate, pitch)
        for segment in segments
    ]
    processed_segments = await asyncio.gather(*tasks)

    # Calculate timing for each segment
    segment_gap = 100  # ms of silence between segments
    current_time = 0
    final_audio = AudioSegment.empty()
    srt_content = ""

    for segment in processed_segments:
        # Set segment timing
        segment.start_time = current_time
        segment.end_time = current_time + segment.duration

        # Add to SRT content
        srt_content += (
            f"{segment.id}\n"
            f"{format_time_ms(segment.start_time)} --> {format_time_ms(segment.end_time)}\n"
            f"{segment.text}\n\n"
        )

        # Append the segment audio plus the same gap as actual silence, so the
        # audio track stays in sync with the subtitle timeline
        final_audio += segment.audio + AudioSegment.silent(duration=segment_gap)

        # Update timing
        current_time = segment.end_time + segment_gap

    # Export files
    unique_id = uuid.uuid4()
    audio_path = f"final_audio_{unique_id}.mp3"
    srt_path = f"final_subtitles_{unique_id}.srt"

    final_audio.export(audio_path, format="mp3", bitrate="320k")
    with open(srt_path, "w", encoding='utf-8') as f:
        f.write(srt_content)

    return srt_path, audio_path
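
# Standalone usage sketch (outside the Gradio UI); "en-US-JennyNeural" is one
# of the voice IDs listed in voice_options below:
#   srt_path, audio_path = asyncio.run(generate_accurate_srt(
#       "Hello world. This is a test.", "en-US-JennyNeural", "+0%", "+0Hz", 6, 2))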
async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment):
    # Format pitch and rate as the signed strings edge-tts expects; cast to
    # int since Gradio sliders may deliver floats
    pitch_str = f"{int(pitch):+d}Hz"
    rate_str = f"{int(rate):+d}%"

    srt_path, audio_path = await generate_accurate_srt(
        text,
        voice_options[voice],
        rate_str,
        pitch_str,
        words_per_line,
        lines_per_segment
    )
    return srt_path, audio_path, audio_path
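
# Example: pitch=5.0 -> "+5Hz"; rate=-10.0 -> "-10%"; pitch=0 -> "+0Hz"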
# Voice options: display label -> edge-tts voice ID
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    "Guy Male": "en-US-GuyNeural",
    "Ana Female": "en-US-AnaNeural",
    "Aria Female": "en-US-AriaNeural",
    "Brian Male": "en-US-BrianNeural",
    "Christopher Male": "en-US-ChristopherNeural",
    "Eric Male": "en-US-EricNeural",
    "Michelle Female": "en-US-MichelleNeural",
    "Roger Male": "en-US-RogerNeural",
    "Natasha Female": "en-AU-NatashaNeural",
    "William Male": "en-AU-WilliamNeural",
    "Clara Female": "en-CA-ClaraNeural",
    "Liam Male": "en-CA-LiamNeural",
    "Libby Female": "en-GB-LibbyNeural",
    "Maisie Female": "en-GB-MaisieNeural",
    "Ryan Male": "en-GB-RyanNeural",
    "Sonia Female": "en-GB-SoniaNeural",
    "Thomas Male": "en-GB-ThomasNeural",
    "Sam Male": "en-HK-SamNeural",
    "Yan Female": "en-HK-YanNeural",
    "Connor Male": "en-IE-ConnorNeural",
    "Emily Female": "en-IE-EmilyNeural",
    "Neerja Female": "en-IN-NeerjaNeural",
    "Prabhat Male": "en-IN-PrabhatNeural",
    "Asilia Female": "en-KE-AsiliaNeural",
    "Chilemba Male": "en-KE-ChilembaNeural",
    "Abeo Male": "en-NG-AbeoNeural",
    "Ezinne Female": "en-NG-EzinneNeural",
    "Mitchell Male": "en-NZ-MitchellNeural",
    "James Male": "en-PH-JamesNeural",
    "Rosa Female": "en-PH-RosaNeural",
    "Luna Female": "en-SG-LunaNeural",
    "Wayne Male": "en-SG-WayneNeural",
    "Elimu Male": "en-TZ-ElimuNeural",
    "Imani Female": "en-TZ-ImaniNeural",
    "Leah Female": "en-ZA-LeahNeural",
    "Luke Male": "en-ZA-LukeNeural"
}
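
# More voices can be added above; the installed edge-tts package can list all
# available voice IDs via `edge-tts --list-voices` on the command line.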
# Create Gradio interface
app = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Enter Text", lines=10),
        gr.Slider(label="Pitch Adjustment (Hz)", minimum=-10, maximum=10, value=0, step=1),
        gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
        gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
        gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
        gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1)
    ],
    outputs=[
        gr.File(label="Download SRT"),
        gr.File(label="Download Audio"),
        gr.Audio(label="Preview Audio")
    ],
    title="Advanced TTS with Configurable SRT Generation",
    description="Generate synchronized audio and SRT subtitles with natural speech segmentation."
)

app.launch()