Cahlil committed on
Commit
8983ff3
·
1 Parent(s): d78cd77

input handling edit

Browse files
Files changed (1) hide show
  1. app.py +12 -6
app.py CHANGED
@@ -8,10 +8,12 @@ asr = pipeline(
8
  feature_extractor="facebook/wav2vec2-large-960h-lv60-self",
9
 
10
  )
11
- speaker_diarization = Pipeline.from_pretrained("pyannote/speaker-diarization")
12
 
13
- def diarization(audio):
14
- speaker_output = speaker_diarization(audio)
 
 
15
  text_output = asr(audio,return_timestamps="word")
16
 
17
  full_text = text_output['text'].lower()
@@ -35,9 +37,13 @@ def diarization(audio):
35
  title = "Speech Recognition with Speaker Diarization"
36
  description = "Speaker Diarization is the act of attributing parts of the audio recording to different speakers. This space aims to distinguish the speakers and apply speech-to-text from a given input audio file. Pre-trained models from Pyannote[1] for the Speaker Diarization and [2]."
37
  article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Diarization model</a></p>"
38
- inputs = gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:")
39
- outputs = [gr.outputs.Textbox(type="auto", label="Diarized Output"),gr.outputs.Textbox(type="auto",label="Full ASR Text for comparison")]
40
- examples = [["test_audio1.wav"]]
 
 
 
 
41
 
42
  app = gr.Interface(fn=diarization,
43
  inputs=inputs,
 
8
  feature_extractor="facebook/wav2vec2-large-960h-lv60-self",
9
 
10
  )
11
+ pipeline1 = Pipeline.from_pretrained("pyannote/speaker-segmentation")
12
 
13
+ def diarization(file_input,microphone_input,selection):
14
+ audio = file_input if str(selection) == "Upload" else Path(microphone_input)
15
+
16
+ speaker_output = pipeline1(audio)
17
  text_output = asr(audio,return_timestamps="word")
18
 
19
  full_text = text_output['text'].lower()
 
37
  title = "Speech Recognition with Speaker Diarization"
38
  description = "Speaker Diarization is the act of attributing parts of the audio recording to different speakers. This space aims to distinguish the speakers and apply speech-to-text from a given input audio file. Pre-trained models from Pyannote[1] for the Speaker Diarization and [2]."
39
  article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Diarization model</a></p>"
40
+ inputs = [gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:"),
41
+ gr.inputs.Audio(source="microphone", type="filepath",label="Or use your Microphone:"),
42
+ gr.inputs.Radio(["Upload","Microphone"],type="value",label="Select which input:")]
43
+ outputs = [gr.outputs.Textbox(type="auto", label="Diarized Output"),
44
+ gr.outputs.Textbox(type="auto",label="Full ASR Text for comparison")]
45
+ examples = [["test_audio1.wav","test_audio1.wav","Upload"],
46
+ ["test_audio2.wav","test_audio2.wav","Upload"]]
47
 
48
  app = gr.Interface(fn=diarization,
49
  inputs=inputs,