import gradio as gr
from pyannote.audio import Pipeline
from transformers import pipeline

# Speech-to-text pipeline. Note that facebook/s2t-wav2vec2-large-en-de is a
# speech *translation* checkpoint (English audio -> German text), so the
# returned transcript is in German.
asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/s2t-wav2vec2-large-en-de",
    feature_extractor="facebook/s2t-wav2vec2-large-en-de",
)

# Load the diarization pipeline once at startup rather than on every request.
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-segmentation")


def speech_to_text(audio):
    # `audio` is the filepath Gradio passes in; transcribe the file directly.
    return asr(audio)["text"]


def diarization(audio):
    output = diarization_pipeline(audio)
    # Transcribe the whole file once and reuse the text for every turn; see
    # diarization_per_segment below for a per-turn transcription sketch.
    text_result = speech_to_text(audio)
    result = ""
    for turn, _, speaker in output.itertracks(yield_label=True):
        result += "{} said '{}' from {:.3f} to {:.3f}\n".format(
            speaker, text_result, turn.start, turn.end
        )
    return "No output" if result == "" else result


title = "Speech Recognition with Speaker Diarization"
description = (
    "Speaker diarization is the task of attributing parts of an audio recording "
    "to different speakers. This space distinguishes the speakers in a given "
    "input audio file and applies speech-to-text, using pre-trained models from "
    "Pyannote [1] for speaker diarization and [2] for speech recognition."
)
article = "[1] Pyannote - Speaker Diarization model"
" app = gr.Interface(fn=diarization, inputs=gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:"), outputs=gr.outputs.Textbox(type="auto", label="OUTPUT"), examples=[["test_audio1.wav"]], title=title, description=description, article=article, allow_flagging=False) app.launch(enable_queue=True)