chingus committed on
Commit
c89eb2d
·
1 Parent(s): 9ca8828

added initial version

Browse files
Files changed (2) hide show
  1. README.md +0 -1
  2. app.py +189 -0
README.md CHANGED
@@ -9,4 +9,3 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  ---
11
 
 
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # # import os
2
+ # # os.system("pip install git+https://github.com/openai/whisper.git")
3
+ # import gradio as gr
4
+ # import whisper
5
+
6
+ import gradio as gr
7
+ import whisper
8
+ import io
9
+ import os
10
+ import numpy as np
11
+ from datetime import datetime
12
+
13
# Language codes mapped to their lowercase English names. The keys of this
# table populate the "Language" dropdown in the settings panel.
# NOTE(review): Hebrew is keyed as "iw" here; confirm this matches the language
# codes accepted by the installed whisper version.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "iw": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}
114
+
115
# Names of the original multilingual checkpoints. Retained so existing
# references keep working; sendToWhisper now asks the loaded model itself
# whether it is multilingual instead of consulting this list.
lang_detect = ['tiny', 'base', 'small', 'medium', 'large']


def sendToWhisper(audio_record, audio_upload, task, models_selected, language_toggle, language_selected, without_timestamps):
    """Run the chosen Whisper models on one audio clip and tabulate the results.

    Args:
        audio_record: File path of a microphone recording, or None.
        audio_upload: File path of an uploaded audio file, or None. Only used
            when ``audio_record`` is None.
        task: Decoding task passed to ``whisper.DecodingOptions`` (the UI
            offers "transcribe" and "translate").
        models_selected: Iterable of Whisper model names to run; None is
            treated as an empty selection.
        language_toggle: "Manual" (or ``True``) to force ``language_selected``;
            any other value, e.g. "Automatic", leaves language selection to
            the decoder.
        language_selected: Language code to force in manual mode. Ignored when
            empty or when the model is English-only.
        without_timestamps: Forwarded to ``whisper.DecodingOptions``.

    Returns:
        One row per model:
        ``[model name, text, language, language confidence, seconds]``.
        For multilingual models the language columns hold the detected
        language and its probability, even when a language was forced; for
        English-only models they hold "en" and "0". When no audio was
        supplied, a single row of "Invalid input" cells is returned.
    """
    # The microphone recording wins when both inputs are present.
    if audio_record is not None:
        audio_path = audio_record
    elif audio_upload is not None:
        audio_path = audio_upload
    else:
        return [["Invalid input"] * 5]

    # The UI sends the dropdown's string value, which is always truthy, so it
    # has to be compared explicitly rather than tested for truthiness.
    force_language = (
        (language_toggle is True or language_toggle == "Manual")
        and bool(language_selected)
    )

    # pad_or_trim fits the clip to the fixed-length window the decoder expects.
    audio = whisper.pad_or_trim(whisper.load_audio(audio_path))

    results = []
    for model_name in models_selected or []:
        # Timing deliberately includes model loading, as before.
        start = datetime.now()
        model = whisper.load_model(model_name)

        # Use the model's own mel-band count so checkpoints that do not use
        # the library default still get a correctly shaped spectrogram.
        mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

        decode_kwargs = {
            "fp16": False,
            "without_timestamps": without_timestamps,
            "task": task,
        }
        if model.is_multilingual:
            _, probs = model.detect_language(mel)
            language = max(probs, key=probs.get)
            prob = probs[language]
            if force_language:
                decode_kwargs["language"] = language_selected
        else:
            # English-only checkpoints cannot detect or decode other languages.
            language = "en"
            prob = 0
            decode_kwargs["language"] = "en"

        decoded = whisper.decode(model, mel, whisper.DecodingOptions(**decode_kwargs))
        elapsed = (datetime.now() - start).total_seconds()
        results.append([model_name, decoded.text, language, str(prob), str(elapsed)])
    return results
149
+
150
# NOTE(review): the layout nesting below was reconstructed from source whose
# indentation had been flattened; confirm it against the intended layout.

# Model names offered in the "Models to use" checkbox group.
avail_models = whisper.available_models()

# Cap the height of both audio widgets. Each widget has its own element id,
# because HTML ids must be unique within the page.
css = """
#audio_record, #audio_upload{
height:100px;
max-height:100px;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("This is a demo to use Open AI's Speech to Text (ASR) Model: Whisper. Learn more about the models here on [Github](https://github.com/openai/whisper/search?q=DecodingOptions&type=) FYI: The larger models take a lot longer to transcribe the text :)")
    gr.Markdown("Here are sample audio files to try out: [Sample Audio](https://drive.google.com/drive/folders/1qYek06ZVeKr9f5Jf35eqi-9CnjNIp98u?usp=sharing)")
    gr.Markdown("Built by:[@davidtsong](https://twitter.com/davidtsong)")

    with gr.Column():
        gr.Markdown("## Input")

        # Two alternative audio sources; sendToWhisper prefers the recording.
        with gr.Row():
            audio_record = gr.Audio(source="microphone", label="Audio to transcribe", type="filepath", elem_id="audio_record")
            audio_upload = gr.Audio(source="upload", type="filepath", interactive=True, elem_id="audio_upload")

        models_selected = gr.CheckboxGroup(avail_models, label="Models to use")
        with gr.Accordion("Settings", open=False):
            task = gr.Dropdown(["transcribe", "translate"], label="Task", value="transcribe")
            language_toggle = gr.Dropdown(["Automatic", "Manual"], label="Language Selection", value="Automatic")
            language_selected = gr.Dropdown(list(LANGUAGES.keys()), label="Language")
            without_timestamps = gr.Checkbox(label="Without timestamps", value=True)
        # Pass the button text as its value rather than as a label keyword.
        submit = gr.Button("Run")

    gr.Markdown("## Output")
    output = gr.Dataframe(headers=["Model", "Text", "Language", "Language Confidence", "Time(s)"], label="Results", wrap=True)

    # The input order must match sendToWhisper's positional parameters.
    submit.click(fn=sendToWhisper, inputs=[audio_record, audio_upload, task, models_selected, language_toggle, language_selected, without_timestamps], outputs=output)

demo.launch()