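"""Real-time speech recognition demo built on OpenAI's Whisper and Gradio.

Streams microphone audio into Whisper, accumulating the transcription (and,
optionally, an English translation) across streaming callbacks.
"""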
import time

import gradio as gr
import whisper
from whisper import tokenizer

# Keep a single model loaded at a time; it is swapped out when the user
# selects a different size in the UI.
current_size = 'base'
model = whisper.load_model(current_size)

AUTO_DETECT_LANG = "Auto Detect"
def transcribe(audio, state=None, model_size='base', delay=1.2, lang=None, translate=False):
    # Throttle the streaming callback; a slider value of 1 adds no extra pause.
    time.sleep(delay - 1)

    # Reload the model only when the user picks a different size.
    global current_size
    global model
    if model_size != current_size:
        current_size = model_size
        model = whisper.load_model(current_size)

    # gr.State supplies this dict in the app below; guard against a bare call.
    if state is None:
        state = {"transcription": "", "translation": ""}

    transcription = model.transcribe(
        audio,
        language=lang if lang != AUTO_DETECT_LANG else None  # None -> auto-detect
    )
    state['transcription'] += transcription['text'] + " "

    if translate:
        # Decode the same chunk again with the "translate" task to get English text.
        x = whisper.load_audio(audio)
        x = whisper.pad_or_trim(x)
        mel = whisper.log_mel_spectrogram(x).to(model.device)
        options = whisper.DecodingOptions(task="translate")
        translation = whisper.decode(model, mel, options)
        state['translation'] += translation.text + " "

    return state['transcription'], state['translation'], state, f"detected language: {transcription['language']}"
title = "OpenAI's Whisper Real-time Demo"
description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model. This demo runs on a CPU; for faster inference, choose the 'tiny' model size and set the language explicitly."
model_size = gr.Dropdown(label="Model size", choices=['base', 'tiny', 'small', 'medium', 'large'], value='base')
delay_slider = gr.Slider(minimum=1, maximum=5, value=1.2, label="Rate of transcription")

# Build the language list from Whisper's tokenizer, with "Auto Detect" first.
available_languages = sorted(tokenizer.TO_LANGUAGE_CODE.keys())
available_languages = [lang.capitalize() for lang in available_languages]
available_languages = [AUTO_DETECT_LANG] + available_languages
lang_dropdown = gr.Dropdown(choices=available_languages, label="Language", value=AUTO_DETECT_LANG)

translate_checkbox = gr.Checkbox(label="Translate to English", value=False)

transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
detected_lang = gr.HTML(label="Detected Language")

# Carries the accumulated transcription/translation between streaming calls.
state = gr.State({"transcription": "", "translation": ""})
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        state,
        model_size,
        delay_slider,
        lang_dropdown,
        translate_checkbox
    ],
    outputs=[
        transcription_tb,
        translation_tb,
        state,
        detected_lang
    ],
    live=True,
    allow_flagging='never',
    title=title,
    description=description,
).launch(
    # enable_queue=True,
    # debug=True
)