hivecorp committed on
Commit
b2e635f
·
verified ·
1 Parent(s): 8b3735e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -105
app.py CHANGED
@@ -6,124 +6,116 @@ import asyncio
6
  import uuid
7
  import re
8
 
9
def get_audio_length(audio_file):
    """Return the duration of ``audio_file`` in seconds.

    The file is decoded with ``AudioSegment.from_file``. The decoded
    segment's ``len()`` is in milliseconds, so it is divided by 1000 to
    report seconds.
    """
    segment = AudioSegment.from_file(audio_file)
    duration_ms = len(segment)
    return duration_ms / 1000
13
 
14
def format_time_ms(milliseconds):
    """Format a duration in milliseconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        milliseconds: Duration in milliseconds (int or float).

    Returns:
        The timestamp string, e.g. ``"01:01:01,001"``.
    """
    # Round instead of truncating: values that went through float arithmetic
    # (e.g. 1233.9999999999998) must not lose a millisecond.
    # Negative durations have no SRT representation, so they clamp to zero.
    total_ms = max(0, round(float(milliseconds)))
    seconds, ms = divmod(total_ms, 1000)
    mins, secs = divmod(seconds, 60)
    hrs, mins = divmod(mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{ms:03}"
20
 
21
def split_text_into_segments(text, max_words=8):
    """Split ``text`` into subtitle segments of at most ``max_words`` words.

    The text is first cut after each ``.``, ``!``, ``?`` or ``,`` (the
    punctuation mark stays attached to the text before it). Any piece that is
    still longer than ``max_words`` words is broken into consecutive runs of
    ``max_words`` words; words themselves are never split.

    Args:
        text: The text to segment.
        max_words: Maximum number of words per segment (default 8).

    Returns:
        A list of segment strings in reading order; empty for empty text.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    # With one capturing group, re.split returns
    # [text, punct, text, punct, ..., text]: always an odd number of items.
    pieces = re.split(r'([.!?,])', text)
    sentences = [
        pieces[i].strip() + pieces[i + 1]
        for i in range(0, len(pieces) - 1, 2)
    ]

    # Text after the last punctuation mark (or the whole text when there is
    # none) goes through the same word limit as every other piece.
    trailing = pieces[-1].strip()
    if trailing:
        sentences.append(trailing)

    segments = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_words:
            segments.append(sentence.strip())
        else:
            segments.extend(
                " ".join(words[i:i + max_words])
                for i in range(0, len(words), max_words)
            )
    return segments
49
 
50
async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
    """Synthesize one batch of text and build its SRT cues.

    The batch is rendered to a temporary audio file with edge-tts. The
    measured audio length is then divided evenly among the segments returned
    by ``split_text_into_segments``.

    Args:
        batch_text: Text of this batch.
        batch_num: Zero-based batch index; also offsets the cue numbers by
            ``batch_num * 100``.
        start_offset: Start of this batch on the overall timeline, in ms.
        pitch: Pitch string passed to ``edge_tts.Communicate``.
        rate: Rate string passed to ``edge_tts.Communicate``.
        voice: Voice identifier passed to ``edge_tts.Communicate``.

    Returns:
        ``(srt_content, audio_file, end_offset)``: the SRT text for the batch,
        the path of the temporary audio file (the caller must delete it), and
        the timeline position in ms where this batch's audio ends.
    """
    # A random suffix keeps concurrent requests from overwriting each other's
    # temporary file; the caller only ever uses the returned path.
    audio_file = f"batch_{batch_num}_{uuid.uuid4().hex}_audio.wav"

    tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
    await tts.save(audio_file)

    actual_length = get_audio_length(audio_file) * 1000  # seconds -> ms
    batch_end = start_offset + actual_length

    segments = split_text_into_segments(batch_text)
    segment_count = len(segments)

    srt_blocks = []
    for index, segment in enumerate(segments):
        # Compute each boundary from the batch start rather than accumulating
        # a per-segment duration, so float error cannot build up.
        start_time = start_offset + actual_length * index / segment_count
        end_time = min(
            batch_end,
            start_offset + actual_length * (index + 1) / segment_count,
        )
        srt_blocks.append(
            f"{index + 1 + (batch_num * 100)}\n"
            f"{format_time_ms(start_time)} --> {format_time_ms(end_time)}\n"
            f"{segment}\n\n"
        )

    # Always report the real end of the audio, even when the batch produced
    # no segments, so later batches stay aligned with the combined audio.
    return "".join(srt_blocks), audio_file, batch_end
77
-
78
# Matches a timing line exactly as format_time_ms writes it.
_SRT_TIMING_LINE = re.compile(
    r"(\d+):(\d{2}):(\d{2}),(\d{3}) --> (\d+):(\d{2}):(\d{2}),(\d{3})"
)


def _split_into_batches(text, limit=500):
    """Cut text into chunks of at most ``limit`` characters, breaking at whitespace when possible."""
    batches = []
    remaining = text.strip()
    while len(remaining) > limit:
        window = remaining[:limit + 1]
        cut = max(window.rfind(ch) for ch in " \n\t")
        if cut <= 0:
            # No whitespace in range: fall back to a hard cut.
            cut = limit
        batches.append(remaining[:cut])
        remaining = remaining[cut:].lstrip()
    if remaining:
        batches.append(remaining)
    return batches


def _srt_fields_to_ms(hours, minutes, seconds, millis):
    """Convert the four numeric fields of an SRT timestamp to integer milliseconds."""
    return ((int(hours) * 60 + int(minutes)) * 60 + int(seconds)) * 1000 + int(millis)


async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
    """Render a whole script to one MP3 file and one SRT file.

    The script is processed in batches of at most 500 characters. Each batch
    is synthesized by ``generate_accurate_srt``, its audio is appended to the
    combined track and its cues are appended to the subtitle text. Cue end
    times that run past the end of the combined audio are clamped to it.

    Args:
        script_text: Full text to synthesize.
        pitch: Pitch string forwarded to ``generate_accurate_srt``.
        rate: Rate string forwarded to ``generate_accurate_srt``.
        voice: Voice identifier forwarded to ``generate_accurate_srt``.
        progress: Callable invoked after each batch with the completed
            fraction (0-1).

    Returns:
        ``(final_srt_path, final_audio_path)``: relative paths of the files
        written.
    """
    # Break at whitespace so no word is cut in half between two batches.
    batches = _split_into_batches(script_text, 500)
    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0

    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, end_offset = await generate_accurate_srt(
            batch_text, batch_num, start_offset, pitch, rate, voice
        )
        srt_parts.append(srt_content)

        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Remove the temporary batch file even if decoding fails.
            os.remove(audio_file)

        start_offset = end_offset
        progress((batch_num + 1) / len(batches))

    # Clamp cue times to the audio length. Working in whole milliseconds
    # avoids the float round-trip that can shift a timestamp by 1 ms.
    total_audio_ms = len(combined_audio)
    validated_lines = []
    for line in "".join(srt_parts).strip().splitlines():
        timing = _SRT_TIMING_LINE.fullmatch(line)
        if timing:
            fields = timing.groups()
            start_ms = _srt_fields_to_ms(*fields[:4])
            end_ms = _srt_fields_to_ms(*fields[4:])
            if end_ms > total_audio_ms:
                start_ms = min(start_ms, total_audio_ms)
                line = f"{format_time_ms(start_ms)} --> {format_time_ms(total_audio_ms)}"
        validated_lines.append(line)
    validated_srt_content = "".join(f"{line}\n" for line in validated_lines)

    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.mp3"
    final_srt_path = f"final_subtitles_{unique_id}.srt"

    combined_audio.export(final_audio_path, format="mp3", bitrate="320k")

    # Explicit UTF-8 so non-ASCII subtitles do not depend on the platform
    # default encoding.
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(validated_srt_content)

    return final_srt_path, final_audio_path
118
 
119
async def process_script(script_text, pitch, rate, voice):
    """Gradio handler: turn the UI values into audio and subtitle files.

    Args:
        script_text: Text entered by the user.
        pitch: Pitch adjustment in Hz (numeric slider value).
        rate: Rate adjustment in percent (numeric slider value).
        voice: A key of ``voice_options``.

    Returns:
        ``(srt_path, audio_path, audio_path)``. The audio path is returned
        twice because the interface has both a download and a playback output.
    """
    # Always emit an explicit sign ("+0Hz", "+5%", "-3%"), the form edge-tts
    # uses for its own defaults. This also removes the "-1Hz" stand-in that
    # was previously sent for a pitch of 0.
    pitch_str = f"{int(pitch):+d}Hz"
    formatted_rate = f"{int(rate):+d}%"

    srt_path, audio_path = await batch_process_srt_and_audio(
        script_text, pitch_str, formatted_rate, voice_options[voice]
    )
    return srt_path, audio_path, audio_path
125
 
126
- # Gradio interface setup
127
  voice_options = {
128
  "Andrew Male": "en-US-AndrewNeural",
129
  "Jenny Female": "en-US-JennyNeural",
@@ -166,22 +158,24 @@ voice_options = {
166
  # Add other voices here...
167
  }
168
 
 
169
# Input components, in the positional order of process_script's parameters.
script_inputs = [
    gr.Textbox(label="Enter Script Text", lines=10),
    gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
    gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=-1, step=1),
    gr.Dropdown(label="Select Voice", choices=list(voice_options), value="Andrew Male"),
]

# Output components, in the order of process_script's returned tuple.
script_outputs = [
    gr.File(label="Download SRT File"),
    gr.File(label="Download Audio File"),
    gr.Audio(label="Audio Playback"),
]

app = gr.Interface(
    fn=process_script,
    inputs=script_inputs,
    outputs=script_outputs,
    title="HIVEcorp Text-to-Speech with Millisecond SRT Generation",
    description="Convert your script into Audio and generate Subtitles.",
    theme="compact",
)

app.launch()
 
6
  import uuid
7
  import re
8
 
 
9
def get_audio_length(audio_file):
    """Return the length of ``audio_file`` in seconds.

    The file is loaded with ``AudioSegment.from_file`` and the segment's
    ``len()`` is divided by 1000.
    """
    return len(AudioSegment.from_file(audio_file)) / 1000
12
 
 
13
def format_time_ms(milliseconds):
    """Format a duration in milliseconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        milliseconds: Duration in milliseconds (int or float).

    Returns:
        The timestamp string, e.g. ``"01:01:01,001"``.
    """
    # Round rather than truncate so float inputs such as 1233.9999999999998
    # do not lose a millisecond. Negative durations cannot be expressed in
    # SRT, so they clamp to zero.
    total_ms = max(0, round(float(milliseconds)))
    seconds, ms = divmod(total_ms, 1000)
    mins, secs = divmod(seconds, 60)
    hrs, mins = divmod(mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{ms:03}"
18
 
19
def smart_text_split(text, words_per_line, lines_per_segment):
    """Split ``text`` into subtitle segments of wrapped lines.

    The text is cut into sentences after ``.``, ``!`` or ``?`` when followed
    by whitespace. Each sentence is wrapped into lines of at most
    ``words_per_line`` words, and a sentence always ends its line. Consecutive
    lines are then grouped, ``lines_per_segment`` at a time, into segments
    whose lines are joined with ``"\\n"``.

    Args:
        text: The text to split.
        words_per_line: Maximum words on one subtitle line (values below 1
            are treated as 1).
        lines_per_segment: Maximum lines in one segment (values below 1 are
            treated as 1).

    Returns:
        A list of segment strings in reading order; empty when ``text`` is
        empty or only whitespace.
    """
    if not text or not text.strip():
        return []

    words_per_line = max(1, int(words_per_line))
    lines_per_segment = max(1, int(lines_per_segment))

    # Only split where sentence punctuation is followed by whitespace, so
    # tokens such as "3.14" or "example.com" stay in one piece.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    lines = []
    for sentence in sentences:
        words = sentence.split()
        lines.extend(
            ' '.join(words[i:i + words_per_line])
            for i in range(0, len(words), words_per_line)
        )

    return [
        '\n'.join(lines[i:i + lines_per_segment])
        for i in range(0, len(lines), lines_per_segment)
    ]
62
 
63
async def generate_accurate_srt(text, voice, rate, pitch, words_per_line, lines_per_segment):
    """Synthesize ``text`` segment by segment and write matching MP3 and SRT files.

    Each segment produced by ``smart_text_split`` is rendered on its own with
    edge-tts. The measured length of that audio becomes the duration of the
    segment's subtitle cue, and the audio is appended to the combined track.

    Args:
        text: Full text to synthesize.
        voice: Voice identifier passed to ``edge_tts.Communicate``.
        rate: Rate string passed to ``edge_tts.Communicate``.
        pitch: Pitch string passed to ``edge_tts.Communicate``.
        words_per_line: Forwarded to ``smart_text_split``.
        lines_per_segment: Forwarded to ``smart_text_split``.

    Returns:
        ``(srt_path, audio_path)``: relative paths of the files written.
    """
    segments = smart_text_split(text, words_per_line, lines_per_segment)

    # One id per call keeps temporary and final file names unique, so
    # concurrent requests cannot overwrite each other's files.
    unique_id = uuid.uuid4()

    srt_blocks = []
    combined_audio = AudioSegment.empty()
    current_time = 0  # position on the combined track, in ms

    for idx, segment in enumerate(segments, 1):
        audio_file = f"temp_segment_{unique_id}_{idx}.wav"
        try:
            tts = edge_tts.Communicate(segment, voice, rate=rate, pitch=pitch)
            await tts.save(audio_file)
            segment_audio = AudioSegment.from_file(audio_file)
        finally:
            # Remove the temporary file even when synthesis or decoding fails.
            if os.path.exists(audio_file):
                os.remove(audio_file)

        segment_duration = len(segment_audio)
        end_time = current_time + segment_duration

        srt_blocks.append(
            f"{idx}\n"
            f"{format_time_ms(current_time)} --> {format_time_ms(end_time)}\n"
            f"{segment}\n\n"
        )

        current_time = end_time
        combined_audio += segment_audio

    audio_path = f"final_audio_{unique_id}.mp3"
    srt_path = f"final_subtitles_{unique_id}.srt"

    combined_audio.export(audio_path, format="mp3", bitrate="320k")
    with open(srt_path, "w", encoding='utf-8') as f:
        f.write("".join(srt_blocks))

    return srt_path, audio_path
 
102
 
103
async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment):
    """Gradio handler: turn the UI values into audio and subtitle files.

    Args:
        text: Text entered by the user.
        pitch: Pitch adjustment in Hz (numeric slider value).
        rate: Rate adjustment in percent (numeric slider value).
        voice: A key of ``voice_options``.
        words_per_line: Maximum words per subtitle line.
        lines_per_segment: Maximum lines per subtitle segment.

    Returns:
        ``(srt_path, audio_path, audio_path)``. The audio path is returned
        twice because the interface has both a download and a preview output.
    """
    # Always emit an explicit sign ("+0Hz", "+5%", "-3%"), the form edge-tts
    # uses for its own defaults. int() also keeps a float slider value such
    # as 5.0 from being rendered as "5.0%".
    pitch_str = f"{int(pitch):+d}Hz"
    rate_str = f"{int(rate):+d}%"

    srt_path, audio_path = await generate_accurate_srt(
        text,
        voice_options[voice],
        rate_str,
        pitch_str,
        words_per_line,
        lines_per_segment,
    )

    return srt_path, audio_path, audio_path
117
 
118
+ # Maps the voice names shown in the dropdown to edge-tts voice identifiers
119
  voice_options = {
120
  "Andrew Male": "en-US-AndrewNeural",
121
  "Jenny Female": "en-US-JennyNeural",
 
158
  # Add other voices here...
159
  }
160
 
161
# Input components, in the positional order of process_text's parameters.
input_components = [
    gr.Textbox(label="Enter Text", lines=10),
    gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
    gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=0, step=1),
    gr.Dropdown(label="Select Voice", choices=list(voice_options), value="Jenny Female"),
    gr.Slider(label="Words per Line", minimum=1, maximum=15, value=8, step=1),
    gr.Slider(label="Lines per Segment", minimum=1, maximum=5, value=2, step=1),
]

# Output components, in the order of process_text's returned tuple.
output_components = [
    gr.File(label="Download SRT"),
    gr.File(label="Download Audio"),
    gr.Audio(label="Preview Audio"),
]

# Build the Gradio interface and start serving it.
app = gr.Interface(
    fn=process_text,
    inputs=input_components,
    outputs=output_components,
    title="Advanced TTS with Configurable SRT Generation",
    description="Generate perfectly synchronized audio and subtitles with custom segmentation control.",
)

app.launch()