Update app.py

app.py CHANGED
@@ -17,6 +17,8 @@ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbed
 from pyannote.audio import Audio
 from pyannote.core import Segment
 
+from gpuinfo import GPUInfo
+
 import wave
 import contextlib
 
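The new `gpuinfo` import feeds the GPU stats reported later in this diff. As a minimal sketch of how it gets used (assuming the `gpuinfo` PyPI package and mirroring the diff's own guards, which suggest the lists come back empty on a machine without a visible GPU):

import gpuinfo
from gpuinfo import GPUInfo

# GPUInfo.gpu_usage() returns two lists (utilization %, memory MiB),
# one entry per visible GPU; read them defensively as the diff does.
gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
memory_used = gpu_memory[0] if len(gpu_memory) > 0 else 0
print(f"GPU utilization: {utilization}%, GPU memory: {memory_used}MiB")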
@@ -137,7 +139,7 @@ print("DEVICE IS: ")
 print(device)
 
 
-def
+def convert_time(secs):
     return datetime.timedelta(seconds=round(secs))
 
 def get_youtube(video_url):
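The restored `convert_time` helper leans on `datetime.timedelta`'s string form to turn raw second counts into H:MM:SS timestamps. A quick self-contained example of the behavior:

import datetime

def convert_time(secs):
    # round() first so the timedelta prints without fractional seconds
    return datetime.timedelta(seconds=round(secs))

print(convert_time(3725.6))   # 1:02:06
print(convert_time(42))       # 0:00:42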
@@ -161,6 +163,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
     """
 
     model = whisper.load_model(whisper_model)
+    time_start = time.time()
    if(video_file_path == None):
        raise ValueError("Error no video input")
    print(video_file_path)
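`time_start` is captured right after the model loads, so the reported processing time covers transcription and diarization but not the `whisper.load_model` call itself. The pattern in isolation (`time.perf_counter()` would be the higher-resolution alternative, but the diff uses `time.time()`):

import time

time_start = time.time()
# ... transcription and diarization work happens here ...
time_diff = time.time() - time_start
print(f"*Processing time: {time_diff:.5} seconds.*")  # same format as the diff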
@@ -222,17 +225,29 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
         text = ''
         for (i, segment) in enumerate(segments):
             if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-                objects['Start'].append(str(
+                objects['Start'].append(str(convert_time(segment["start"])))
                 objects['Speaker'].append(segment["speaker"])
                 if i != 0:
-                    objects['End'].append(str(
+                    objects['End'].append(str(convert_time(segments[i - 1]["end"])))
                     objects['Text'].append(text)
                     text = ''
             text += segment["text"] + ' '
-        objects['End'].append(str(
+        objects['End'].append(str(convert_time(segments[i - 1]["end"])))
         objects['Text'].append(text)
 
-        return pd.DataFrame(objects)
+        time_end = time.time()
+        time_diff = time_end - time_start
+        memory = psutil.virtual_memory()
+        gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+        gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+        gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+        system_info = f"""
+        *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+        *Processing time: {time_diff:.5} seconds.*
+        *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+        """
+
+        return pd.DataFrame(objects), system_info
 
     except Exception as e:
         raise RuntimeError("Error Running inference with local model", e)
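The function now returns a `(DataFrame, markdown_string)` pair instead of a bare DataFrame, and the markdown string folds in RAM figures from `psutil`. The GiB conversion it uses is just a division by 1024³; for example (`total`, `percent`, and `available` are real `psutil.virtual_memory()` attributes):

import psutil

memory = psutil.virtual_memory()
gib = 1024 * 1024 * 1024  # bytes per GiB, the divisor used in the diff
print(f"*Memory: {memory.total / gib:.2f}GB, "
      f"used: {memory.percent}%, "
      f"available: {memory.available / gib:.2f}GB.*")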
@@ -266,13 +281,13 @@ with demo:
     memory = psutil.virtual_memory()
 
     with gr.Row():
-        gr.Markdown(
+        gr.Markdown('''
            ### This space allows you to:
            ##### 1. Download youtube video with a given URL
            ##### 2. Watch it in the first video component
            ##### 3. Run automatic speech recognition and diarization (speaker identification)
-           *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*
            ''')
+        system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
 
     with gr.Row():
         gr.Markdown('''
@@ -307,7 +322,7 @@ with demo:
             selected_whisper_model.render()
             number_speakers.render()
             transcribe_btn = gr.Button("Transcribe audio and diarization")
-            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model, number_speakers], transcription_df)
+            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model, number_speakers], [transcription_df, system_info])
 
 
         with gr.Row():
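Because `speech_to_text` now returns two values, the `.click()` outputs list grows to two components: Gradio maps each position of the returned tuple to the component at the same position in the outputs list, which is also why the memory line was pulled out into its own `system_info = gr.Markdown(...)` component above. A stripped-down sketch of that contract (component and function names here are stand-ins, not the app's real ones):

import gradio as gr
import pandas as pd

def run():
    # Stand-in for speech_to_text: position 0 feeds the Dataframe
    # component, position 1 feeds the Markdown component.
    df = pd.DataFrame({"Speaker": ["SPEAKER_00"], "Text": ["hello"]})
    return df, "*Processing time: 1.2345 seconds.*"

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    table = gr.Dataframe()
    info = gr.Markdown()
    btn.click(run, [], [table, info])  # tuple order == outputs order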
@@ -319,4 +334,4 @@ with demo:
         with gr.Column():
             transcription_df.render()
 
-demo.launch(debug=True)
+demo.launch(debug=True, share=True)
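On the `demo.launch` change: `debug=True` keeps server logs streaming in the console, and `share=True` asks Gradio to open a temporary public *.gradio.live tunnel. A hosted Space already serves the app at a public URL, so the extra flag mostly matters when running the app locally:

# share=True only affects local runs; Spaces expose the app publicly anyway.
demo.launch(debug=True, share=True)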