vumichien committed on
Commit
494edc1
·
1 Parent(s): 301359c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -20
app.py CHANGED
@@ -192,7 +192,7 @@ def get_youtube(video_url):
192
  print(abs_video_path)
193
  return abs_video_path
194
 
195
- def speech_to_text(video_file_path, selected_source_lang, whisper_model, min_num_speakers, max_number_speakers):
196
  """
197
  # Transcribe youtube link using OpenAI Whisper
198
  1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
@@ -250,22 +250,19 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, min_num
250
  embeddings = np.nan_to_num(embeddings)
251
  print(f'Embedding shape: {embeddings.shape}')
252
 
 
253
  # Find the best number of speakers
254
- if min_num_speakers > max_number_speakers:
255
- min_speakers = max_number_speakers
256
- max_speakers = min_num_speakers
 
 
 
 
 
257
  else:
258
- min_speakers = min_num_speakers
259
- max_speakers = max_number_speakers
260
- score_num_speakers = {}
261
-
262
- for num_speakers in range(min_speakers, max_speakers+1):
263
- clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
264
- score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
265
- score_num_speakers[num_speakers] = score
266
- best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
267
- print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
268
-
269
  # Assign speaker label
270
  clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
271
  labels = clustering.labels_
@@ -320,8 +317,7 @@ df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
320
  memory = psutil.virtual_memory()
321
  selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
322
  selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
323
- input_min_number_speakers = gr.Number(precision=0, value=2, label="Select minimum number of speakers", interactive=True)
324
- input_max_number_speakers = gr.Number(precision=0, value=2, label="Select maximum number of speakers", interactive=True)
325
  system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
326
  download_transcript = gr.File(label="Download transcript")
327
  transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
@@ -378,11 +374,10 @@ with demo:
378
  ''')
379
  selected_source_lang.render()
380
  selected_whisper_model.render()
381
- input_min_number_speakers.render()
382
- input_max_number_speakers.render()
383
  transcribe_btn = gr.Button("Transcribe audio and diarization")
384
  transcribe_btn.click(speech_to_text,
385
- [video_in, selected_source_lang, selected_whisper_model, input_min_number_speakers, input_max_number_speakers],
386
  [transcription_df, system_info, download_transcript]
387
  )
388
 
 
192
  print(abs_video_path)
193
  return abs_video_path
194
 
195
+ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
196
  """
197
  # Transcribe youtube link using OpenAI Whisper
198
  1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
 
250
  embeddings = np.nan_to_num(embeddings)
251
  print(f'Embedding shape: {embeddings.shape}')
252
 
253
+ if num_speakers == 0:
254
  # Find the best number of speakers
255
+ score_num_speakers = {}
256
+
257
+ for num_speakers in range(2, 10+1):
258
+ clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
259
+ score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
260
+ score_num_speakers[num_speakers] = score
261
+ best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
262
+ print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
263
  else:
264
+ best_num_speaker = num_speakers
265
+
 
 
 
 
 
 
 
 
 
266
  # Assign speaker label
267
  clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
268
  labels = clustering.labels_
 
317
  memory = psutil.virtual_memory()
318
  selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
319
  selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
320
+ number_speakers = gr.Number(precision=0, value=0, label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers", interactive=True)
 
321
  system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
322
  download_transcript = gr.File(label="Download transcript")
323
  transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
 
374
  ''')
375
  selected_source_lang.render()
376
  selected_whisper_model.render()
377
+ number_speakers.render()
 
378
  transcribe_btn = gr.Button("Transcribe audio and diarization")
379
  transcribe_btn.click(speech_to_text,
380
+ [video_in, selected_source_lang, selected_whisper_model, number_speakers],
381
  [transcription_df, system_info, download_transcript]
382
  )
383