import gradio as gr
import whisper
from whisper import tokenizer
import time

# Keep the loaded model in module-level globals so a single instance is
# shared across streamed calls; transcribe() swaps it when the size changes.
current_size = 'base'
model = whisper.load_model(current_size)
AUTO_DETECT_LANG = "Auto Detect"

def transcribe(audio, state=None, model_size='base', delay=1.2, lang=None, translate=False):
    # A mutable default ({}) would be shared across calls and lacks the keys
    # appended below; Gradio supplies the real state dict on each call.
    if state is None:
        state = {"transcription": "", "translation": ""}

    # Throttle how often streamed chunks are processed (delay is 1-5 s).
    time.sleep(delay - 1)

    # Reload the model only when the user picks a different size.
    global current_size
    global model
    if model_size != current_size:
        current_size = model_size
        model = whisper.load_model(current_size)

    # Transcribe the latest chunk; "Auto Detect" maps to language=None,
    # which lets Whisper detect the language itself.
    transcription = model.transcribe(
        audio,
        language=lang if lang != AUTO_DETECT_LANG else None
    )
    state['transcription'] += transcription['text'] + " "

    if translate:
        # Translation goes through Whisper's lower-level API: pad/trim the
        # audio to the model's 30-second window, build a log-Mel spectrogram,
        # then decode with the translation task.
        x = whisper.load_audio(audio)
        x = whisper.pad_or_trim(x)
        mel = whisper.log_mel_spectrogram(x).to(model.device)

        # Whisper's two tasks are "transcribe" and "translate" (into English).
        options = whisper.DecodingOptions(task="translate")
        translation = whisper.decode(model, mel, options)

        state['translation'] += translation.text + " "

    return state['transcription'], state['translation'], state, f"detected language: {transcription['language']}"
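
# Quick offline sanity check (a sketch; "sample.wav" is a hypothetical local
# file, and Whisper needs ffmpeg installed to decode it):
#
#     state = {"transcription": "", "translation": ""}
#     text, _, state, detected = transcribe("sample.wav", state, model_size='tiny', delay=1)
#     print(detected)
#     print(text)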
  

title = "OpenAI's Whisper Real-time Demo"
description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model. This demo runs on a CPU. For faster inference, choose the 'tiny' model size and set the language explicitly."

# Larger checkpoints are more accurate but markedly slower on CPU.
model_size = gr.Dropdown(label="Model size", choices=['base', 'tiny', 'small', 'medium', 'large'], value='base')

delay_slider = gr.Slider(minimum=1, maximum=5, value=1.2, label="Rate of transcription")

available_languages = sorted(tokenizer.TO_LANGUAGE_CODE.keys())
available_languages = [lang.capitalize() for lang in available_languages]
available_languages = [AUTO_DETECT_LANG] + available_languages
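# e.g. ["Auto Detect", "Afrikaans", "Albanian", "Amharic", ...]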

lang_dropdown = gr.Dropdown(choices=available_languages, label="Language", value=AUTO_DETECT_LANG)

translate_checkbox = gr.Checkbox(label="Translate to English", value=False)



transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
detected_lang = gr.HTML(label="Detected Language")

state = gr.State({"transcription": "", "translation": ""})
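# The state dict is passed into transcribe() and returned from it, so the
# accumulated transcription/translation persist across streamed chunks.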

gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        state,
        model_size,
        delay_slider,
        lang_dropdown,
        translate_checkbox
        ], 
    outputs=[
        transcription_tb,
        translation_tb,
        state,
        detected_lang
    ],
    live=True,
    allow_flagging='never',
    title=title,
    description=description,
).launch(
    # enable_queue=True,
    # debug=True
  )
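
# To try it locally (a sketch; assumes this file is saved as app.py, a
# hypothetical name): run `python app.py` and open the printed URL. With
# streaming=True on the Audio input and live=True on the Interface, Gradio
# calls transcribe() repeatedly with new microphone chunks while recording.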