import os

import gradio as gr
import whisper
from pyannote.audio import Pipeline
from transformers import pipeline

# Hugging Face access token for the gated pyannote diarization model.
# Assumed to be supplied via the HF_TOKEN environment variable.
HF_TOKEN = os.environ.get("HF_TOKEN")

def load_models(model_size):
    """Return (transcriber, whisper_model); exactly one of the two is non-None."""
    if model_size == "transcriber":
        # Fine-tuned Yoruba ASR model served through the transformers pipeline.
        model_name = "clinifyemr/yoruba-model-finetuned"
        transcriber = pipeline("automatic-speech-recognition", model=model_name)
        return transcriber, None
    # Any other choice is treated as a stock Whisper checkpoint size.
    model = whisper.load_model(model_size)
    return None, model
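
# load_models reloads model weights on every call; a minimal caching sketch,
# assuming the same dropdown values repeat across requests (functools is stdlib):
#
#   from functools import lru_cache
#
#   @lru_cache(maxsize=None)
#   def load_models_cached(model_size):
#       return load_models(model_size)
#
# process_audio could then call load_models_cached instead.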

def process_audio(audio_file, num_speakers, model_size):
    transcriber, whisper_model = load_models(model_size)

    # Gradio's gr.Audio(type="filepath") hands us a path string, so the file
    # can be passed straight to both models without staging a temp copy.
    if audio_file is None:
        return None, None

    try:
        # Transcribe with whichever backend load_models selected.
        if transcriber is not None:
            result = transcriber(audio_file)
        else:
            result = whisper_model.transcribe(audio_file)
        transcription_text = result["text"]

        # Speaker diarization; this gated pyannote model requires a valid HF token.
        diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN
        )
        # The UI asks for the number of speakers, so pass it as an exact count.
        diarization = diarization_pipeline(audio_file, num_speakers=num_speakers)

        # Flatten the annotation into JSON-serializable speaker turns.
        segments = [
            {"start": round(turn.start, 3), "end": round(turn.end, 3), "speaker": speaker}
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]
        return transcription_text, segments

    except Exception as e:
        print(f"Error processing audio file: {e}")
        return None, None


def gradio_interface(audio_file, num_speakers, model_size):
    transcription, diarization = process_audio(audio_file, num_speakers, model_size)
    if transcription is None or diarization is None:
        return "Error in processing audio file", {"error": "no diarization result"}
    return transcription, diarization

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Dropdown(choices=[1,2,3,4,5], label="Number of Speakers"),
        gr.Dropdown(choices=['base', 'small', 'medium', 'large', 'transcriber'], label="Model Selection")
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.JSON(label="Diarization Output")
    ],
    title="Audio Transcription and Speaker Diarization",
    description="Upload your audio file to transcribe and analyze speaker diarization."
)

iface.launch()
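
# A quick local smoke test, bypassing the UI (assumes a hypothetical sample.wav
# exists alongside this script):
#
#   text, turns = process_audio("sample.wav", num_speakers=2, model_size="base")
#   print(text)
#   print(turns)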