update app.py
Browse files
app.py
CHANGED
@@ -1,17 +1,38 @@
|
|
1 |
import gradio as gr
|
2 |
from pyannote.audio import Pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
def diarization(audio):
|
5 |
pipeline = Pipeline.from_pretrained("pyannote/speaker-segmentation")
|
6 |
-
#parameters = {"onset":0.7,"offset":0.3,"min_duration_on":0.0,"min_duration_off":0.0}
|
7 |
-
#pipeline.instantiate(parameters)
|
8 |
output = pipeline(audio)
|
9 |
-
|
10 |
for turn, _, speaker in output.itertracks(yield_label=True):
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
app = gr.Interface(fn=diarization,
|
15 |
-
inputs=gr.inputs.Audio(source="upload", type="filepath", label="audio"),
|
16 |
-
outputs="
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
from pyannote.audio import Pipeline
from datasets import load_dataset
from transformers import pipeline

# Dummy LibriSpeech validation split (small test fixture dataset).
librispeech_en = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
)

# ASR pipeline, built once at import time.  The model name suggests
# English-speech -> German-text speech translation — TODO confirm.
_S2T_MODEL = "facebook/s2t-wav2vec2-large-en-de"
asr = pipeline(
    "automatic-speech-recognition",
    model=_S2T_MODEL,
    feature_extractor=_S2T_MODEL,
)
def speech_to_text(audio):
    """Run the module-level ASR pipeline on an audio file and return its text.

    Args:
        audio: Path to an audio file (as supplied by the gradio upload widget).

    Returns:
        The recognized/translated text (the ``"text"`` field of the pipeline
        output), or the raw pipeline output if it is not a dict.
    """
    # BUG FIX: the original called asr(librispeech_en[0][audio]), i.e. it
    # indexed the dummy-dataset row with the uploaded file *path*, which
    # raises KeyError for any real upload.  Run ASR on the upload itself.
    translation = asr(audio)
    # transformers ASR pipelines return {"text": ...}; fall back to the raw
    # output defensively if the shape ever differs.
    if isinstance(translation, dict):
        return translation.get("text", translation)
    return translation
def diarization(audio):
    """Diarize an audio file and attach a transcript to each speaker turn.

    Args:
        audio: Path to an audio file (gradio "filepath" input).

    Returns:
        One line per speaker turn, formatted as
        "<speaker> said '<text>' from <start> to <end>", or the string
        "No output" when the diarizer finds no speech turns.
    """
    # Local name deliberately avoids shadowing transformers.pipeline,
    # which the original local variable `pipeline` did.
    dia_pipeline = Pipeline.from_pretrained("pyannote/speaker-segmentation")
    # Tuning parameters kept from the original for reference:
    # parameters = {"onset": 0.7, "offset": 0.3,
    #               "min_duration_on": 0.0, "min_duration_off": 0.0}
    # dia_pipeline.instantiate(parameters)
    annotation = dia_pipeline(audio)

    # The transcript is loop-invariant (it is always the whole file): compute
    # it once instead of re-running ASR for every speaker turn as the
    # original did.  True per-turn text would require cropping the audio.
    text_result = speech_to_text(audio)

    lines = [
        "{} said '{}' from {:.3f} to {:.3f}\n".format(
            speaker, text_result, turn.start, turn.end
        )
        for turn, _, speaker in annotation.itertracks(yield_label=True)
    ]
    result = "".join(lines)
    return "No output" if result == "" else result
# Static UI copy.
title = "Speech Recognition with Speaker Diarization"
description = "Speaker Diarization is the act of attributing parts of the audio recording to different speakers. This space aims to distinguish the speakers and apply speech-to-text from a given input audio file. Pre-trained models from Pyannote[1] for the Speaker Diarization and [2]."
article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Diarization model</a></p>"

# Gradio UI: a single audio upload in, a single text box out.
audio_input = gr.inputs.Audio(
    source="upload", type="filepath", label="Upload your audio file here:"
)
text_output = gr.outputs.Textbox(type="auto", label="OUTPUT")

app = gr.Interface(
    fn=diarization,
    inputs=audio_input,
    outputs=text_output,
    examples=[["test_audio1.wav"]],
    title=title,
    description=description,
    article=article,
    allow_flagging=False,
)
app.launch(enable_queue=True)