Update app.py
app.py
CHANGED
@@ -192,7 +192,7 @@ def get_youtube(video_url):
     print(abs_video_path)
     return abs_video_path
 
-def speech_to_text(video_file_path, selected_source_lang, whisper_model,
+def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
     """
     # Transcribe youtube link using OpenAI Whisper
     1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
@@ -250,22 +250,19 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, min_num
         embeddings = np.nan_to_num(embeddings)
         print(f'Embedding shape: {embeddings.shape}')
 
+        if num_speakers == 0:
             # Find the best number of speakers
-
-
-
+            score_num_speakers = {}
+
+            for num_speakers in range(2, 10+1):
+                clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+                score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
+                score_num_speakers[num_speakers] = score
+            best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
+            print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
         else:
-
-
-            score_num_speakers = {}
-
-            for num_speakers in range(min_speakers, max_speakers+1):
-                clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-                score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
-                score_num_speakers[num_speakers] = score
-            best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
-            print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
-
+            best_num_speaker = num_speakers
+
         # Assign speaker label
         clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
         labels = clustering.labels_
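This hunk replaces the old min/max sweep with a single num_speakers argument: when it is 0, the code tries 2 to 10 clusters and keeps the count with the highest silhouette score, otherwise the user-supplied value is used as-is. The snippet below is a minimal, self-contained sketch of that selection step; the random embeddings are a made-up stand-in (not the 192-dim pyannote speaker embeddings app.py actually computes) so it can run on its own.

# Standalone sketch of the auto speaker-count selection used when num_speakers == 0.
# The synthetic embeddings below are a hypothetical stand-in for the 192-dim speaker
# embeddings that app.py builds earlier in speech_to_text.
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
# Three well-separated synthetic "speakers", 20 segment embeddings each.
embeddings = np.vstack([rng.normal(loc=c, scale=0.1, size=(20, 192)) for c in (0.0, 1.0, 2.0)])
embeddings = np.nan_to_num(embeddings)

score_num_speakers = {}
for num_speakers in range(2, 10 + 1):
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    score_num_speakers[num_speakers] = silhouette_score(embeddings, clustering.labels_, metric='euclidean')

best_num_speaker = max(score_num_speakers, key=lambda x: score_num_speakers[x])
# For this toy data the arg-max should land on 3.
print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]:.3f} score")

Since silhouette scores lie in [-1, 1] and reward compact, well-separated clusters, taking the arg-max over candidate counts picks the clustering that best matches the structure of the embeddings.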
@@ -320,8 +317,7 @@ df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
 memory = psutil.virtual_memory()
 selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
 selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
-
-input_max_number_speakers = gr.Number(precision=0, value=2, label="Select maximum number of speakers", interactive=True)
+number_speakers = gr.Number(precision=0, value=0, label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers", interactive=True)
 system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
 download_transcript = gr.File(label="Download transcript")
 transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
@@ -378,11 +374,10 @@ with demo:
             ''')
             selected_source_lang.render()
             selected_whisper_model.render()
-
-            input_max_number_speakers.render()
+            number_speakers.render()
             transcribe_btn = gr.Button("Transcribe audio and diarization")
             transcribe_btn.click(speech_to_text,
-                                 [video_in, selected_source_lang, selected_whisper_model,
+                                 [video_in, selected_source_lang, selected_whisper_model, number_speakers],
                                  [transcription_df, system_info, download_transcript]
                                  )
 
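For reference, a stripped-down sketch of the UI wiring in the last two hunks: number_speakers is rendered next to the language and model dropdowns and passed as the fourth input to the click handler. Everything except that wiring (component choices, labels, and the callback body) is a simplified placeholder, not the app's real code.

# Minimal Gradio sketch of the new wiring: number_speakers feeds speech_to_text.
# The callback body here is a placeholder; the real speech_to_text runs Whisper
# transcription plus speaker diarization.
import gradio as gr

def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
    mode = "auto-detect" if num_speakers == 0 else f"fixed ({num_speakers})"
    return f"video={video_file_path}, lang={selected_source_lang}, model={whisper_model}, speakers={mode}"

with gr.Blocks() as demo:
    video_in = gr.Video(label="Video file")
    selected_source_lang = gr.Dropdown(choices=["en", "ja", "de"], value="en", label="Spoken language in video")
    selected_whisper_model = gr.Dropdown(choices=["base", "small", "medium"], value="base", label="Selected Whisper model")
    number_speakers = gr.Number(precision=0, value=0, label="Number of speakers (0 = auto)")
    result_box = gr.Textbox(label="Result")
    transcribe_btn = gr.Button("Transcribe audio and diarization")
    transcribe_btn.click(speech_to_text,
                         [video_in, selected_source_lang, selected_whisper_model, number_speakers],
                         [result_box])

demo.launch()

With precision=0 the Number component delivers an integer, so the num_speakers == 0 branch in speech_to_text works without extra casting.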