Arnaudding001 committed
Commit 01c308f
1 Parent(s): 04dba48

Create app.py

Files changed (1)
  1. app.py +88 -0
app.py ADDED
@@ -0,0 +1,88 @@
+ import os
+ import gradio as gr
+ import whisper
+ from whisper import tokenizer
+ import time
+
+ current_size = 'base'
+ model = whisper.load_model(current_size)
+ AUTO_DETECT_LANG = "Auto Detect"
+
+ def transcribe(audio, state={}, model_size='base', delay=1.2, lang=None, translate=False):
+     time.sleep(delay - 1)
+
+     global current_size
+     global model
+     if model_size != current_size:
+         current_size = model_size
+         model = whisper.load_model(current_size)
+
+     transcription = model.transcribe(
+         audio,
+         language=lang if lang != AUTO_DETECT_LANG else None
+     )
+     state['transcription'] += transcription['text'] + " "
+
+     if translate:
+         x = whisper.load_audio(audio)
+         x = whisper.pad_or_trim(x)
+         mel = whisper.log_mel_spectrogram(x).to(model.device)
+
+         options = whisper.DecodingOptions(task="translate")  # Whisper's task values are "transcribe" and "translate"
+         translation = whisper.decode(model, mel, options)
+
+         state['translation'] += translation.text + " "
+
+     return state['transcription'], state['translation'], state, f"detected language: {transcription['language']}"
+
+
+ title = "OpenAI's Whisper Real-time Demo"
+ description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model. This demo runs on a CPU. For faster inference, choose the 'tiny' model size and set the language explicitly."
+
+ model_size = gr.Dropdown(label="Model size", choices=['base', 'tiny', 'small', 'medium', 'large'], value='base')
+
+ delay_slider = gr.inputs.Slider(minimum=1, maximum=5, default=1.2, label="Rate of transcription")
+
+ available_languages = sorted(tokenizer.TO_LANGUAGE_CODE.keys())
+ available_languages = [lang.capitalize() for lang in available_languages]
+ available_languages = [AUTO_DETECT_LANG] + available_languages
+
+ lang_dropdown = gr.inputs.Dropdown(choices=available_languages, label="Language", default=AUTO_DETECT_LANG, type="value")
+
+ # "Auto Detect" is mapped to language=None inside transcribe(), so the dropdown value
+ # can be passed to the interface as-is.
+
+ translate_checkbox = gr.inputs.Checkbox(label="Translate to English", default=False)
+
+
+
+ transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
+ translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
+ detected_lang = gr.outputs.HTML(label="Detected Language")
+
+ state = gr.State({"transcription": "", "translation": ""})
+
+ gr.Interface(
+     fn=transcribe,
+     inputs=[
+         gr.Audio(source="microphone", type="filepath", streaming=True),
+         state,
+         model_size,
+         delay_slider,
+         lang_dropdown,
+         translate_checkbox
+     ],
+     outputs=[
+         transcription_tb,
+         translation_tb,
+         state,
+         detected_lang
+     ],
+     live=True,
+     allow_flagging='never',
+     title=title,
+     description=description,
+ ).launch(
+     # enable_queue=True,
+     # debug=True
+ )
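
For reference, a minimal sketch of the two Whisper calls this app wraps (plain transcription and the optional translate-to-English pass), runnable outside Gradio; the audio path "sample.wav" is a placeholder and not part of this commit:

    import whisper

    model = whisper.load_model("base")            # same default size as the demo
    result = model.transcribe("sample.wav")       # language=None -> auto-detect
    print(result["language"], result["text"])

    # optional translate-to-English pass, mirroring the translate branch in transcribe()
    audio = whisper.pad_or_trim(whisper.load_audio("sample.wav"))
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    out = whisper.decode(model, mel, whisper.DecodingOptions(task="translate"))
    print(out.text)

With the gradio and openai-whisper packages installed (the commit does not pin requirements), running python app.py launches the interface locally.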