vladocar committed on
Commit b9fb0d0 · verified · 1 Parent(s): 126881b

Update app.py

Files changed (1):
  1. app.py +29 -171
app.py CHANGED
@@ -1,7 +1,5 @@
 import spaces
 import os
-# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
 import gc
 from functools import partial
 import gradio as gr
@@ -10,8 +8,6 @@ from speechbrain.inference.interfaces import Pretrained, foreign_class
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 import librosa
 import whisper_timestamped as whisper
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC, AutoProcessor
-
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 torch.backends.cuda.matmul.allow_tf32 = True
@@ -24,7 +20,6 @@ def clean_up_memory():
 
 @spaces.GPU(duration=15)
 def recap_sentence(string):
-    # Restore capitalization and punctuation using the model
     inputs = recap_tokenizer(["restore capitalization and punctuation: " + string], return_tensors="pt", padding=True).to(device)
     outputs = recap_model.generate(**inputs, max_length=768, num_beams=5, early_stopping=True).squeeze(0)
     recap_result = recap_tokenizer.decode(outputs, skip_special_tokens=True)
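For reference, the `recap_sentence` body retained above reduces to a prompt-prefixed mT5 call. A minimal standalone sketch of the same call sequence (CPU, full precision; the lowercase input string is a made-up example):

```python
# Standalone sketch of the recap_sentence call path, mirroring app.py.
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.eval()

text = "здраво како си денес"  # hypothetical lowercase ASR output
inputs = tokenizer(["restore capitalization and punctuation: " + text],
                   return_tensors="pt", padding=True)
ids = model.generate(**inputs, max_length=768, num_beams=5,
                     early_stopping=True).squeeze(0)
print(tokenizer.decode(ids, skip_special_tokens=True))
```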
@@ -32,206 +27,85 @@ def recap_sentence(string):
 
 
 @spaces.GPU(duration=30)
-def return_prediction_w2v2(mic=None, file=None, device=device):
-    if mic is not None:
-        waveform, sr = librosa.load(mic, sr=16000)
-        waveform = waveform[:60*sr]
-        w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)
-    elif file is not None:
-        waveform, sr = librosa.load(file, sr=16000)
-        waveform = waveform[:60*sr]
-        w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)
-    else:
-        return "You must either provide a mic recording or a file"
-
-    recap_result = recap_sentence(w2v2_result[0])
-
-    # If the letter after punct is small, recap it
-    for i, letter in enumerate(recap_result):
-        if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
-            recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
-
-    clean_up_memory()
-    return recap_result
-
-
-@spaces.GPU(duration=30)
-def return_prediction_whisper_mic(mic=None, device=device):
-    if mic is not None:
-        waveform, sr = librosa.load(mic, sr=16000)
-        waveform = waveform[:30*sr]
-        whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
-    else:
-        return "You must provide a mic recording"
-
-    recap_result = recap_sentence(whisper_result[0])
-
-    # If the letter after punct is small, recap it
-    for i, letter in enumerate(recap_result):
-        if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
-            recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
-
-    clean_up_memory()
-    return recap_result
-
-
-@spaces.GPU(duration=60)
 def return_prediction_whisper_file(file=None, device=device):
-    whisper_result = []
     if file is not None:
-        waveform, sr = librosa.load(file, sr=16000)
-        waveform = waveform[:3600*sr]
+        try:
+            waveform, sr = librosa.load(file.name, sr=16000)
+        except Exception as e:
+            return f"Error loading the audio file: {str(e)}"
+
+        waveform = waveform[:3600 * sr]
         whisper_result = whisper_classifier.classify_file_whisper_mkd_streaming(waveform, device)
     else:
-        yield "You must provide a mic recording"
-
+        return "You must provide an audio file."
+
     recap_result = ""
     prev_segment = ""
     prev_segment_len = 0
 
-    segment_counter = 0
     for segment in whisper_result:
-        segment_counter += 1
         if prev_segment == "":
-            recap_segment= recap_sentence(segment[0])
+            recap_segment = recap_sentence(segment[0])
         else:
             prev_segment_len = len(prev_segment.split())
            recap_segment = recap_sentence(prev_segment + " " + segment[0])
-        # remove prev_segment from the beginning of the recap_result
         recap_segment = recap_segment.split()
         recap_segment = recap_segment[prev_segment_len:]
         recap_segment = " ".join(recap_segment)
         prev_segment = segment[0]
         recap_result += recap_segment + " "
 
-    # If the letter after punct is small, recap it
     for i, letter in enumerate(recap_result):
-        if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
-            recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
-
-    yield recap_result
-
-
-def return_prediction_compare(mic=None, file=None, device=device):
-    # pipe_whisper.model.to(device)
-    # mms_model.to(device)
-    if mic is not None:
-        waveform, sr = librosa.load(mic, sr=16000)
-        waveform = waveform[:60*sr]
-        whisper_mkd_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
-        # result_generator_w2v2 = w2v2_classifier.classify_file_w2v2(mic, device)
-        whisper_result = whisper_classifier.classify_file_whisper(waveform, pipe_whisper, device)
-        mms_result_generator = whisper_classifier.classify_file_mms(waveform, processor_mms, mms_model, device)
-
-    elif file is not None:
-        waveform, sr = librosa.load(file, sr=16000)
-        waveform = waveform[:30*sr]
-        whisper_mkd_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
-        # result_generator_w2v2 = w2v2_classifier.classify_file_w2v2(file, device)
-        whisper_result = whisper_classifier.classify_file_whisper(waveform, pipe_whisper, device)
-        mms_result_generator = whisper_classifier.classify_file_mms(waveform, processor_mms, mms_model, device)
-    else:
-        return "You must either provide a mic recording or a file"
-    # pipe_whisper.model.to("cpu")
-    # mms_model.to("cpu")
-
-    segment_results_whisper = ""
-    prev_segment_whisper = ""
-    # segment_results_w2v2 = ""
-    # prev_segment_w2v2 = ""
-    segment_results_mms = ""
-    prev_segment_mms = ""
-
-    recap_result_whisper_mkd = recap_sentence(whisper_mkd_result[0])
-    recap_result_whisper = recap_sentence(whisper_result[0])
-    recap_result_mms = recap_sentence(mms_result_generator[0])
+        if i > 1 and recap_result[i - 2] in [".", "!", "?"] and letter.islower():
+            recap_result = recap_result[:i] + letter.upper() + recap_result[i + 1:]
 
-    # If the letter after punct is small, recap it
-    for i, letter in enumerate(recap_result_whisper_mkd):
-        if i > 1 and recap_result_whisper_mkd[i-2] in [".", "!", "?"] and letter.islower():
-            recap_result_whisper_mkd = recap_result_whisper_mkd[:i] + letter.upper() + recap_result_whisper_mkd[i+1:]
-
-    for i, letter in enumerate(recap_result_whisper):
-        if i > 1 and recap_result_whisper[i-2] in [".", "!", "?"] and letter.islower():
-            recap_result_whisper = recap_result_whisper[:i] + letter.upper() + recap_result_whisper[i+1:]
-
-    for i, letter in enumerate(recap_result_mms):
-        if i > 1 and recap_result_mms[i-2] in [".", "!", "?"] and letter.islower():
-            recap_result_mms = recap_result_mms[:i] + letter.upper() + recap_result_mms[i+1:]
-
     clean_up_memory()
-    return "Буки-Whisper:\n" + recap_result_whisper_mkd + "\n\n" + "MMS:\n" + recap_result_mms + "\n\n" + "OpenAI Whisper:\n" + recap_result_whisper
-    # yield "Our W2v2: \n" + segment_results_w2v2 + "\n\n" + "MMS transcript:\n" + segment_results_mms
-
+    return recap_result
 
-# Create a partial function with the device pre-applied
-return_prediction_whisper_mic_with_device = partial(return_prediction_whisper_mic, device=device)
-return_prediction_whisper_file_with_device = partial(return_prediction_whisper_file, device=device)
-return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
 
-# Load the ASR models
+# Load the models
 whisper_classifier = foreign_class(source="Macedonian-ASR/whisper-large-v3-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
 whisper_classifier = whisper_classifier.to(device)
 whisper_classifier.eval()
 
-w2v2_classifier = foreign_class(source="Macedonian-ASR/wav2vec2-aed-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
-w2v2_classifier = w2v2_classifier.to(device)
-w2v2_classifier.eval()
-
-# Load the T5 tokenizer and model for restoring capitalization
 recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
 recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
 recap_model = T5ForConditionalGeneration.from_pretrained(recap_model_name, torch_dtype=torch.float16)
 recap_model.to(device)
 recap_model.eval()
 
-
+# Interfaces
 mic_transcribe_whisper = gr.Interface(
-    fn=return_prediction_whisper_mic_with_device,
+    fn=return_prediction_whisper_file,
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs=gr.Textbox(),
     allow_flagging="never",
     live=False,
 )
 
-file_transcribe_whisper = gr.Interface(
-    fn=return_prediction_whisper_file_with_device,
-    inputs=gr.Audio(sources="upload", type="filepath"),
-    outputs=gr.Textbox(),
+file_transcribe_whisper_upload = gr.Interface(
+    fn=return_prediction_whisper_file,
+    inputs=gr.File(label="Upload an Audio File", type="file"),
+    outputs=gr.Textbox(label="Transcription"),
     allow_flagging="never",
     live=True
 )
 
-mic_transcribe_w2v2 = gr.Interface(
-    fn=return_prediction_w2v2_with_device,
-    inputs=gr.Audio(sources="microphone", type="filepath"),
-    outputs=gr.Textbox(),
-    allow_flagging="never",
-    live=False,
-)
-
 project_description = '''
 <img src="https://i.ibb.co/SKDfwn9/bookie.png"
      alt="Bookie logo"
      style="float: right; width: 130px; height: 110px; margin-left: 10px;" />
-
-## Автори:
+
+## Authors:
 1. **Дејан Порјазовски**
 2. **Илина Јакимовска**
 3. **Ордан Чукалиев**
 4. **Никола Стиков**
 
-Оваа колаборација е дел од активностите на **Центарот за напредни интердисциплинарни истражувања ([ЦеНИИс](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** при УКИМ.
-
-## Во тренирањето на овој модел се употребени податоци од:
-1. Дигитален архив за етнолошки и антрополошки ресурси ([ДАЕАР](https://iea.pmf.ukim.edu.mk/tabs/view/61f236ed7d95176b747c20566ddbda1a)) при Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
-2. Аудио верзија на меѓународното списание [„ЕтноАнтропоЗум“](https://etno.pmf.ukim.mk/index.php/eaz/issue/archive) на Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
-3. Аудио подкастот [„Обични луѓе“](https://obicniluge.mk/episodes/) на Илина Јакимовска
-4. Научните видеа од серијалот [„Наука за деца“](http://naukazadeca.mk), фондација [КАНТАРОТ](https://qantarot.substack.com/)
-5. Македонска верзија на [Mozilla Common Voice](https://commonvoice.mozilla.org/en/datasets) (верзија 18.0)
+This collaboration is part of the activities of the **Center for Advanced Interdisciplinary Research ([CeNIIs](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** at UKIM.
 
-## Како да придонесете за подобрување на македонските модели за препознавање на говор?
-На следниот [линк](https://drive.google.com/file/d/1YdZJz9o1X8AMc6J4MNPnVZjASyIXnvoZ/view?usp=sharing) ќе најдете инструкции за тоа како да донирате македонски говор преку платформата Mozilla Common Voice.
+## Contribute to improving Macedonian speech recognition models
+You can find instructions for donating Macedonian speech at the following [link](https://drive.google.com/file/d/1YdZJz9o1X8AMc6J4MNPnVZjASyIXnvoZ/view?usp=sharing).
 '''
 
 # Custom CSS
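The loop kept in this hunk recaps each streamed segment with the previous raw segment prepended as left context, trims the context words off the front, and finally re-uppercases any lowercase letter that follows sentence-final punctuation. A self-contained sketch of that logic, with a stand-in recapper in place of the mT5 model:

```python
def recap_streaming(segments, recap_fn):
    """Recapitalize streamed ASR segments, giving each recap_fn call the
    previous raw segment as left context and keeping only the new words."""
    recap_result = ""
    prev_segment = ""
    for segment in segments:
        if prev_segment == "":
            recap_segment = recap_fn(segment)
        else:
            prev_len = len(prev_segment.split())
            # Recap with context, then drop the context words from the front.
            words = recap_fn(prev_segment + " " + segment).split()
            recap_segment = " ".join(words[prev_len:])
        prev_segment = segment
        recap_result += recap_segment + " "
    # Same post-pass as app.py: uppercase a lowercase letter two positions
    # after ".", "!" or "?" (i.e. after the punctuation and a space).
    for i, letter in enumerate(recap_result):
        if i > 1 and recap_result[i - 2] in [".", "!", "?"] and letter.islower():
            recap_result = recap_result[:i] + letter.upper() + recap_result[i + 1:]
    return recap_result

# Identity recapper as a stand-in for the model:
print(recap_streaming(["здраво. како си", "добро сум"], lambda s: s))
```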
@@ -248,31 +122,15 @@ css = """
 }
 """
 
-transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))
-
+transcriber_app = gr.Blocks(css=css)
+
 with transcriber_app:
-    state = gr.State()
     gr.Markdown(project_description, elem_classes="custom-markdown")
-
-    # gr.TabbedInterface(
-    #     [mic_transcribe_whisper, mic_transcribe_compare],
-    #     ["Буки-Whisper транскрипција", "Споредба на модели"],
-    # )
-    # state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
-
     gr.TabbedInterface(
-        [mic_transcribe_whisper, mic_transcribe_w2v2],
-        ["Буки-Whisper транскрипција", "Буки-Wav2vec2 транскрипција"],
+        [mic_transcribe_whisper, file_transcribe_whisper_upload],
+        ["Microphone Transcription", "Upload File for Transcription"],
     )
-    state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
-
-    transcriber_app.unload(return_prediction_whisper_mic)
-    transcriber_app.unload(return_prediction_whisper_file)
-    transcriber_app.unload(return_prediction_compare)
-    transcriber_app.unload(return_prediction_w2v2)
-
 
-# transcriber_app.launch(debug=True, share=True, ssl_verify=False)
 if __name__ == "__main__":
     transcriber_app.queue()
-    transcriber_app.launch(share=True)
+    transcriber_app.launch(share=True)
 
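One note on the resulting wiring: the microphone tab (`gr.Audio` with `type="filepath"`) passes the handler a plain path string, while the new upload tab (`gr.File` with `type="file"`, Gradio 3.x behavior) passes a tempfile-like object, which is what the new `file.name` access expects. A hedged sketch of one way to accept both input shapes; `resolve_audio_path` is a hypothetical helper, not part of this commit:

```python
def resolve_audio_path(file):
    """Return a filesystem path for either input shape:
    a plain path string (gr.Audio with type="filepath") or a
    tempfile-like object with a .name attribute (gr.File with type="file")."""
    if isinstance(file, str):
        return file
    return getattr(file, "name", None)

# Usage inside the handler, e.g.:
# waveform, sr = librosa.load(resolve_audio_path(file), sr=16000)
```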