hivecorp committed on
Commit
f7e1683
·
verified ·
1 Parent(s): 310bb28

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -180
app.py CHANGED
@@ -1,189 +1,100 @@
1
- import gradio as gr
2
- from pydub import AudioSegment
3
  import edge_tts
4
- import os
5
  import asyncio
6
- import uuid
7
- import re
8
-
9
# Measure how long an audio file plays, in seconds
def get_audio_length(audio_file):
    """Return the duration of *audio_file* in seconds.

    The file is decoded with ``pydub.AudioSegment.from_file`` and the
    segment's ``duration_seconds`` attribute is returned unchanged.
    """
    return AudioSegment.from_file(audio_file).duration_seconds
13
-
14
# Function to format time for SRT
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    The value is rounded once to whole milliseconds before being split into
    fields. Truncating the fractional part instead (``int((s % 1) * 1000)``)
    loses a millisecond to floating-point error, e.g. ``1.001`` would be
    rendered as ``00:00:01,000``.

    Args:
        seconds: Non-negative duration in seconds (int or float). Negative
            values are clamped to zero because SRT has no negative timestamps.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    total_millis = max(0, round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hrs, mins = divmod(total_mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"
22
-
23
# Split text into subtitle segments: sentence by sentence, then at most
# `max_words` words per segment, never cutting a word in half.
def split_text_into_segments(text, max_words=8):
    """Split *text* into short subtitle segments.

    The text is first cut into sentences at runs of ``.``, ``!`` or ``?``
    (the punctuation stays attached to its sentence). Each sentence is then
    cut into groups of at most *max_words* whitespace-separated words.

    Text after the last punctuation mark, or text with no punctuation at
    all, is kept as a final sentence rather than being discarded.

    Args:
        text: The script text to split.
        max_words: Maximum number of words per segment (default 8).

    Returns:
        A list of segment strings in reading order; empty for blank input.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    # With one capture group, re.split always returns an odd-length list:
    # text, punct, text, punct, ..., text. Pad the punctuation list so the
    # trailing text (which has no punctuation) is paired with "".
    pieces = re.split(r'([.!?]+)', text)
    bodies = pieces[0::2]
    marks = pieces[1::2] + [""]

    segments = []
    for body, mark in zip(bodies, marks):
        sentence = body.strip() + mark
        if not sentence:
            continue
        words = sentence.split()
        segments.extend(
            " ".join(words[start:start + max_words])
            for start in range(0, len(words), max_words)
        )
    return segments
43
-
44
# Function to generate SRT with accurate timing per batch
async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
    """Synthesize one batch of text and build its SRT entries.

    The batch is rendered to ``batch_<batch_num>_audio.wav`` with edge-tts,
    the real audio duration is measured, and that duration is divided
    evenly among the subtitle segments of the batch.

    Args:
        batch_text: Text of this batch.
        batch_num: Zero-based batch index; used for the temp file name and
            to offset subtitle numbers by ``batch_num * 100``.
        start_offset: Time in seconds at which this batch starts within the
            combined audio.
        pitch: Pitch string passed through to ``edge_tts.Communicate``.
        rate: Rate string passed through to ``edge_tts.Communicate``.
        voice: edge-tts voice identifier.

    Returns:
        ``(srt_content, audio_file, end_time)``: the SRT text for the batch,
        the path of the audio file that was written (the caller deletes it),
        and the time in seconds at which the batch ends.
    """
    audio_file = f"batch_{batch_num}_audio.wav"

    # Generate the audio using edge-tts
    tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
    await tts.save(audio_file)

    # Timing is based on the measured audio length, not an estimate.
    actual_length = get_audio_length(audio_file)
    batch_end = start_offset + actual_length

    segments = split_text_into_segments(batch_text)
    if not segments:
        # Nothing to caption (e.g. whitespace-only batch). The audio still
        # occupies time, so advance the offset instead of dividing by zero.
        return "", audio_file, batch_end

    segment_duration = actual_length / len(segments)
    start_time = start_offset
    entries = []
    for index, segment in enumerate(segments):
        # Clamp so accumulated float error never pushes a cue past the audio.
        end_time = min(start_time + segment_duration, batch_end)
        entries.append(
            f"{index + 1 + (batch_num * 100)}\n"
            f"{format_time(start_time)} --> {format_time(end_time)}\n"
            f"{segment}\n\n"
        )
        start_time = end_time

    return "".join(entries), audio_file, start_time
76
-
77
# Batch processing function
async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
    """Render a whole script to one MP3 plus a matching SRT file.

    The script is cut into 500-character batches. Each batch is synthesized
    and captioned by ``generate_accurate_srt``, the audio pieces are
    concatenated, and any subtitle end time that overshoots the combined
    audio is clamped to its length.

    Args:
        script_text: Full script text.
        pitch: Pitch string for edge-tts.
        rate: Rate string for edge-tts.
        voice: edge-tts voice identifier.
        progress: Gradio progress callback, called after each batch with the
            completed fraction.

    Returns:
        ``(final_srt_path, final_audio_path)``; both file names share a
        fresh UUID.
    """
    batches = [script_text[i:i + 500] for i in range(0, len(script_text), 500)]
    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0

    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, end_offset = await generate_accurate_srt(
            batch_text, batch_num, start_offset, pitch, rate, voice
        )
        srt_parts.append(srt_content)

        # Remove the per-batch file even if decoding it fails.
        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            os.remove(audio_file)

        start_offset = end_offset
        progress((batch_num + 1) / len(batches))

    # Clamp cue end times to the real length of the combined audio.
    total_audio_length = combined_audio.duration_seconds
    validated_lines = []
    for line in "".join(srt_parts).strip().splitlines():
        if '-->' in line:
            start_str, end_str = line.split(' --> ')
            hrs, mins, secs, millis = end_str.replace(',', ':').split(':')
            end_time = int(hrs) * 3600 + int(mins) * 60 + int(secs) + int(millis) / 1000

            # Only rewrite the line when clamping is needed. Round-tripping
            # every timestamp through float parsing can shift it.
            if end_time > total_audio_length:
                line = f"{start_str} --> {format_time(total_audio_length)}"
        validated_lines.append(line + "\n")

    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.mp3"
    final_srt_path = f"final_subtitles_{unique_id}.srt"

    combined_audio.export(final_audio_path, format="mp3", bitrate="320k")

    # Explicit UTF-8 so non-ASCII script text does not depend on the
    # platform's default encoding.
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write("".join(validated_lines))

    return final_srt_path, final_audio_path
121
 
122
  # Gradio interface function
123
async def process_script(script_text, pitch, rate, voice):
    """Gradio handler: turn the form values into an SRT file and an MP3.

    Args:
        script_text: Script entered by the user.
        pitch: Pitch adjustment in Hz from the slider (may be negative).
        rate: Rate adjustment in percent from the slider (may be negative).
        voice: Display name of the voice; must be a key of ``voice_options``.

    Returns:
        ``(srt_path, audio_path, audio_path)``: the audio path is returned
        twice, once for the download widget and once for the player.
    """
    # edge-tts expects explicitly signed values such as "+5Hz", "-3Hz",
    # "+0%". The ":+d" format always emits the sign, including for zero,
    # and int() guards against sliders that deliver floats ("5.0Hz").
    pitch_str = f"{int(pitch):+d}Hz"
    formatted_rate = f"{int(rate):+d}%"
    srt_path, audio_path = await batch_process_srt_and_audio(
        script_text, pitch_str, formatted_rate, voice_options[voice]
    )
    return srt_path, audio_path, audio_path
129
-
130
# Gradio interface setup
# Maps the label shown in the voice dropdown to the edge-tts voice identifier.
# "Andrew Male" must stay present: it is the dropdown's default value.
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    "Guy Male": "en-US-GuyNeural",
    "Ana Female": "en-US-AnaNeural",
    "Aria Female": "en-US-AriaNeural",
    "Brian Male": "en-US-BrianNeural",
    "Christopher Male": "en-US-ChristopherNeural",
    "Eric Male": "en-US-EricNeural",
    "Michelle Female": "en-US-MichelleNeural",  # was mislabeled "Male"
    "Roger Male": "en-US-RogerNeural",
    "Natasha Female": "en-AU-NatashaNeural",
    "William Male": "en-AU-WilliamNeural",
    "Clara Female": "en-CA-ClaraNeural",
    "Liam Male": "en-CA-LiamNeural",  # was mislabeled "Female " with a trailing space
    "Libby Female": "en-GB-LibbyNeural",
    "Maisie": "en-GB-MaisieNeural",
    "Ryan": "en-GB-RyanNeural",
    "Sonia": "en-GB-SoniaNeural",
    "Thomas": "en-GB-ThomasNeural",
    "Sam": "en-HK-SamNeural",
    "Yan": "en-HK-YanNeural",
    "Connor": "en-IE-ConnorNeural",
    "Emily": "en-IE-EmilyNeural",
    "Neerja": "en-IN-NeerjaNeural",
    "Prabhat": "en-IN-PrabhatNeural",
    "Asilia": "en-KE-AsiliaNeural",
    "Chilemba": "en-KE-ChilembaNeural",
    "Abeo": "en-NG-AbeoNeural",
    "Ezinne": "en-NG-EzinneNeural",
    "Mitchell": "en-NZ-MitchellNeural",
    "James": "en-PH-JamesNeural",
    "Rosa": "en-PH-RosaNeural",
    "Luna": "en-SG-LunaNeural",
    "Wayne": "en-SG-WayneNeural",
    "Elimu": "en-TZ-ElimuNeural",
    "Imani": "en-TZ-ImaniNeural",
    "Leah": "en-ZA-LeahNeural",
    "Luke": "en-ZA-LukeNeural",
}  # All voice options
171
 
172
# Form widgets, in the order process_script receives them:
# script text, pitch (Hz), rate (%), voice label.
_script_inputs = [
    gr.Textbox(label="Enter Script Text", lines=10),
    gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
    gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=-1, step=1),
    gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Andrew Male"),
]

# Result widgets, matching process_script's (srt, audio, audio) return value.
_script_outputs = [
    gr.File(label="Download SRT File"),
    gr.File(label="Download Audio File"),
    gr.Audio(label="Audio Playback"),
]

app = gr.Interface(
    fn=process_script,
    inputs=_script_inputs,
    outputs=_script_outputs,
    title="HIVEcorp Text-to-Speech with SRT Generation",
    description="Convert your script into speech and generate synchronized subtitles (SRT).",
)

# Start the web UI; share=True is passed through to Gradio unchanged.
app.launch(share=True)
 
 
1
+ import tempfile
 
2
  import edge_tts
3
+ import gradio as gr
4
  import asyncio
5
+ from concurrent.futures import ThreadPoolExecutor
6
+
7
# Voice tables, one per language: display name -> edge-tts voice identifier.
_HINDI_VOICES = {
    "Madhur": "hi-IN-MadhurNeural",
    "Swara": "hi-IN-SwaraNeural",
}

_ENGLISH_VOICES = {
    # United States
    "Jenny": "en-US-JennyNeural",
    "Guy": "en-US-GuyNeural",
    "Ana": "en-US-AnaNeural",
    "Aria": "en-US-AriaNeural",
    "Brian": "en-US-BrianNeural",
    "Christopher": "en-US-ChristopherNeural",
    "Eric": "en-US-EricNeural",
    "Michelle": "en-US-MichelleNeural",
    "Roger": "en-US-RogerNeural",
    # Australia
    "Natasha": "en-AU-NatashaNeural",
    "William": "en-AU-WilliamNeural",
    # Canada
    "Clara": "en-CA-ClaraNeural",
    "Liam": "en-CA-LiamNeural",
    # United Kingdom
    "Libby": "en-GB-LibbyNeural",
    "Maisie": "en-GB-MaisieNeural",
    "Ryan": "en-GB-RyanNeural",
    "Sonia": "en-GB-SoniaNeural",
    "Thomas": "en-GB-ThomasNeural",
    # Hong Kong
    "Sam": "en-HK-SamNeural",
    "Yan": "en-HK-YanNeural",
    # Ireland
    "Connor": "en-IE-ConnorNeural",
    "Emily": "en-IE-EmilyNeural",
    # India
    "Neerja": "en-IN-NeerjaNeural",
    "Prabhat": "en-IN-PrabhatNeural",
    # Kenya
    "Asilia": "en-KE-AsiliaNeural",
    "Chilemba": "en-KE-ChilembaNeural",
    # Nigeria
    "Abeo": "en-NG-AbeoNeural",
    "Ezinne": "en-NG-EzinneNeural",
    # New Zealand
    "Mitchell": "en-NZ-MitchellNeural",
    # Philippines
    "James": "en-PH-JamesNeural",
    "Rosa": "en-PH-RosaNeural",
    # Singapore
    "Luna": "en-SG-LunaNeural",
    "Wayne": "en-SG-WayneNeural",
    # Tanzania
    "Elimu": "en-TZ-ElimuNeural",
    "Imani": "en-TZ-ImaniNeural",
    # South Africa
    "Leah": "en-ZA-LeahNeural",
    "Luke": "en-ZA-LukeNeural",
}

# Language name -> voice table. Add other languages here.
language_dict = {
    "Hindi": _HINDI_VOICES,
    "English": _ENGLISH_VOICES,
}
54
+
55
# Function to chunk text into parts of max 5000 characters
def chunk_text(text, max_length=5000):
    """Split *text* into consecutive chunks of at most *max_length* characters.

    Each chunk is synthesized on its own, so a cut in the middle of a word
    would be audible. When a cut would land inside a word, the chunk is
    shortened to end just after the previous whitespace character. A single
    word longer than *max_length* is still cut hard, because there is no
    better place. Joining the chunks always reproduces *text* exactly.

    Args:
        text: Text to split.
        max_length: Maximum chunk size in characters; must be positive.

    Returns:
        List of chunks in order; empty when *text* is empty.

    Raises:
        ValueError: If *max_length* is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_length, total)
        # Only back off when the cut would really split a word, i.e. the
        # characters on both sides of the cut are non-whitespace.
        if end < total and not text[end].isspace():
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:
                end = cut
        chunks.append(text[start:end])
        start = end
    return chunks
58
+
59
# Function to generate speech for each chunk using edge_tts
async def generate_speech(text_chunk, language, voice):
    """Synthesize *text_chunk* and return the encoded audio as bytes.

    ``Communicate.save`` writes to a file path and returns nothing, so it
    cannot be used to obtain the audio in memory. The audio is collected
    from ``Communicate.stream`` instead, which yields messages; only those
    whose ``"type"`` is ``"audio"`` carry audio in ``"data"``.

    Args:
        text_chunk: Text to speak.
        language: Key of ``language_dict``.
        voice: Key of ``language_dict[language]``.

    Returns:
        The audio bytes produced by edge-tts for this chunk.

    Raises:
        KeyError: If *language* or *voice* is not present in ``language_dict``.
    """
    communicate = edge_tts.Communicate(text_chunk, voice=language_dict[language][voice])
    audio = bytearray()
    async for message in communicate.stream():
        # Non-audio messages (e.g. word-boundary metadata) are skipped.
        if message["type"] == "audio":
            audio.extend(message["data"])
    return bytes(audio)
64
+
65
# Turn a whole text into one audio file
async def process_text_to_speech(text, language, voice):
    """Synthesize *text* chunk by chunk and store the result in a temp file.

    Chunks come from ``chunk_text`` and are rendered one after another with
    ``generate_speech`` so the pieces stay in order. All pieces are then
    written back to back into a new ``.mp3`` temporary file, which is kept
    on disk (``delete=False``) so it can be served to the user.

    Returns:
        The path of the temporary audio file.
    """
    # Awaited one at a time: synthesis is sequential, not concurrent.
    audio_parts = [await generate_speech(piece, language, voice) for piece in chunk_text(text)]

    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as output_file:
        for part in audio_parts:
            output_file.write(part)
        return output_file.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  # Gradio interface function
83
async def gradio_interface(text, language, voice):
    """Gradio handler: synthesize *text* and return the audio file path.

    Delegates directly to ``process_text_to_speech`` with the same
    arguments and returns its result unchanged.
    """
    return await process_text_to_speech(text, language, voice)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
# Gradio UI setup
# Offer every voice declared in language_dict, not a hand-copied subset.
# dict.fromkeys removes duplicates while keeping first-seen order.
# NOTE(review): a voice is only valid together with the language it is listed
# under; picking a mismatched pair still raises KeyError in the handler.
_voice_choices = list(
    dict.fromkeys(name for voices in language_dict.values() for name in voices)
)

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Enter Text"),
        # Defaults form a valid language/voice pair, so the live interface
        # never calls the handler with None selections.
        gr.Dropdown(choices=list(language_dict.keys()), value="English", label="Select Language"),
        gr.Dropdown(choices=_voice_choices, value="Jenny", label="Select Voice"),
    ],
    outputs=gr.File(label="Download Audio File"),
    live=True  # To enable real-time input processing
)

# Launch the Gradio interface
iface.launch()