vladocar committed on
Commit
e2f3dc3
1 Parent(s): b9fb0d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -34
app.py CHANGED
@@ -8,16 +8,15 @@ from speechbrain.inference.interfaces import Pretrained, foreign_class
8
  from transformers import T5Tokenizer, T5ForConditionalGeneration
9
  import librosa
10
  import whisper_timestamped as whisper
 
11
 
12
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
  torch.backends.cuda.matmul.allow_tf32 = True
14
 
15
-
16
  def clean_up_memory():
17
  gc.collect()
18
  torch.cuda.empty_cache()
19
 
20
-
21
  @spaces.GPU(duration=15)
22
  def recap_sentence(string):
23
  inputs = recap_tokenizer(["restore capitalization and punctuation: " + string], return_tensors="pt", padding=True).to(device)
@@ -25,30 +24,69 @@ def recap_sentence(string):
25
  recap_result = recap_tokenizer.decode(outputs, skip_special_tokens=True)
26
  return recap_result
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  @spaces.GPU(duration=30)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def return_prediction_whisper_file(file=None, device=device):
 
31
  if file is not None:
32
- try:
33
- waveform, sr = librosa.load(file.name, sr=16000)
34
- except Exception as e:
35
- return f"Error loading the audio file: {str(e)}"
36
-
37
- waveform = waveform[:3600 * sr]
38
  whisper_result = whisper_classifier.classify_file_whisper_mkd_streaming(waveform, device)
39
  else:
40
- return "You must provide an audio file."
41
-
42
  recap_result = ""
43
  prev_segment = ""
44
  prev_segment_len = 0
45
 
 
46
  for segment in whisper_result:
 
47
  if prev_segment == "":
48
  recap_segment = recap_sentence(segment[0])
49
  else:
50
  prev_segment_len = len(prev_segment.split())
51
  recap_segment = recap_sentence(prev_segment + " " + segment[0])
 
52
  recap_segment = recap_segment.split()
53
  recap_segment = recap_segment[prev_segment_len:]
54
  recap_segment = " ".join(recap_segment)
@@ -56,62 +94,89 @@ def return_prediction_whisper_file(file=None, device=device):
56
  recap_result += recap_segment + " "
57
 
58
  for i, letter in enumerate(recap_result):
59
- if i > 1 and recap_result[i - 2] in [".", "!", "?"] and letter.islower():
60
- recap_result = recap_result[:i] + letter.upper() + recap_result[i + 1:]
61
 
62
- clean_up_memory()
63
- return recap_result
64
 
 
 
 
65
 
66
- # Load the models
67
  whisper_classifier = foreign_class(source="Macedonian-ASR/whisper-large-v3-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
68
  whisper_classifier = whisper_classifier.to(device)
69
  whisper_classifier.eval()
70
 
 
 
 
 
 
71
  recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
72
  recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
73
  recap_model = T5ForConditionalGeneration.from_pretrained(recap_model_name, torch_dtype=torch.float16)
74
  recap_model.to(device)
75
  recap_model.eval()
76
 
77
- # Interfaces
78
  mic_transcribe_whisper = gr.Interface(
79
- fn=return_prediction_whisper_file,
80
  inputs=gr.Audio(sources="microphone", type="filepath"),
81
  outputs=gr.Textbox(),
82
  allow_flagging="never",
83
  live=False,
84
  )
85
 
86
- file_transcribe_whisper_upload = gr.Interface(
87
- fn=return_prediction_whisper_file,
88
- inputs=gr.File(label="Upload an Audio File", type="file"),
89
- outputs=gr.Textbox(label="Transcription"),
90
  allow_flagging="never",
91
  live=True
92
  )
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  project_description = '''
95
  <img src="https://i.ibb.co/SKDfwn9/bookie.png"
96
  alt="Bookie logo"
97
  style="float: right; width: 130px; height: 110px; margin-left: 10px;" />
98
-
99
- ## Authors:
100
  1. **Дејан Порјазовски**
101
  2. **Илина Јакимовска**
102
  3. **Ордан Чукалиев**
103
  4. **Никола Стиков**
104
-
105
- This collaboration is part of the activities of the **Center for Advanced Interdisciplinary Research ([CeNIIs](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** at UKIM.
106
-
107
- ## Contribute to improving Macedonian speech recognition models
108
- You can find instructions for donating Macedonian speech at the following [link](https://drive.google.com/file/d/1YdZJz9o1X8AMc6J4MNPnVZjASyIXnvoZ/view?usp=sharing).
 
 
 
 
109
  '''
110
 
111
  # Custom CSS
112
  css = """
113
  .gradio-container {
114
- background-color: #f0f0f0; /* Set your desired background color */
115
  }
116
  .custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
117
  font-size: 15px !important;
@@ -122,15 +187,22 @@ css = """
122
  }
123
  """
124
 
125
- transcriber_app = gr.Blocks(css=css)
126
-
127
  with transcriber_app:
 
128
  gr.Markdown(project_description, elem_classes="custom-markdown")
 
129
  gr.TabbedInterface(
130
- [mic_transcribe_whisper, file_transcribe_whisper_upload],
131
- ["Microphone Transcription", "Upload File for Transcription"],
132
  )
 
 
 
 
 
133
 
134
  if __name__ == "__main__":
135
  transcriber_app.queue()
136
- transcriber_app.launch(share=True)
 
8
  from transformers import T5Tokenizer, T5ForConditionalGeneration
9
  import librosa
10
  import whisper_timestamped as whisper
11
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC, AutoProcessor
12
 
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
  torch.backends.cuda.matmul.allow_tf32 = True
15
 
 
16
  def clean_up_memory():
17
  gc.collect()
18
  torch.cuda.empty_cache()
19
 
 
20
  @spaces.GPU(duration=15)
21
  def recap_sentence(string):
22
  inputs = recap_tokenizer(["restore capitalization and punctuation: " + string], return_tensors="pt", padding=True).to(device)
 
24
  recap_result = recap_tokenizer.decode(outputs, skip_special_tokens=True)
25
  return recap_result
26
 
27
@spaces.GPU(duration=30)
def return_prediction_w2v2(mic=None, file=None, device=device):
    """Transcribe audio with the wav2vec2 classifier and restore casing.

    Args:
        mic: Path to a microphone recording. Takes precedence over ``file``
            when both are given.
        file: Path to an audio file, used only when ``mic`` is None.
        device: Device handed to ``w2v2_classifier.classify_file_w2v2``.

    Returns:
        The first element of the classifier result after it has been passed
        through ``recap_sentence`` and the capitalization pass below, or an
        error message when neither ``mic`` nor ``file`` is provided.
    """
    # Both inputs go through the same pipeline; a mic recording wins.
    source = mic if mic is not None else file
    if source is None:
        return "You must either provide a mic recording or a file"

    waveform, sr = librosa.load(source, sr=16000)
    waveform = waveform[:60 * sr]  # keep at most the first 60 seconds
    w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)

    recap_result = recap_sentence(w2v2_result[0])

    # Upper-case any lowercase character located two positions after ".",
    # "!" or "?" (i.e. the first letter after "<punctuation><one char>").
    # Working on a list keeps this linear instead of rebuilding the whole
    # string for every replacement.
    chars = list(recap_result)
    for i in range(2, len(chars)):
        if chars[i - 2] in (".", "!", "?") and chars[i].islower():
            chars[i] = chars[i].upper()
    recap_result = "".join(chars)

    clean_up_memory()
    return recap_result
48
 
49
  @spaces.GPU(duration=30)
50
+ def return_prediction_whisper_mic(mic=None, device=device):
51
+ if mic is not None:
52
+ waveform, sr = librosa.load(mic, sr=16000)
53
+ waveform = waveform[:30*sr]
54
+ whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
55
+ else:
56
+ return "You must provide a mic recording"
57
+
58
+ recap_result = recap_sentence(whisper_result[0])
59
+
60
+ for i, letter in enumerate(recap_result):
61
+ if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
62
+ recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
63
+
64
+ clean_up_memory()
65
+ return recap_result
66
+
67
+ @spaces.GPU(duration=60)
68
  def return_prediction_whisper_file(file=None, device=device):
69
+ whisper_result = []
70
  if file is not None:
71
+ waveform, sr = librosa.load(file, sr=16000)
72
+ waveform = waveform[:3600*sr]
 
 
 
 
73
  whisper_result = whisper_classifier.classify_file_whisper_mkd_streaming(waveform, device)
74
  else:
75
+ yield "You must provide a file"
76
+
77
  recap_result = ""
78
  prev_segment = ""
79
  prev_segment_len = 0
80
 
81
+ segment_counter = 0
82
  for segment in whisper_result:
83
+ segment_counter += 1
84
  if prev_segment == "":
85
  recap_segment = recap_sentence(segment[0])
86
  else:
87
  prev_segment_len = len(prev_segment.split())
88
  recap_segment = recap_sentence(prev_segment + " " + segment[0])
89
+
90
  recap_segment = recap_segment.split()
91
  recap_segment = recap_segment[prev_segment_len:]
92
  recap_segment = " ".join(recap_segment)
 
94
  recap_result += recap_segment + " "
95
 
96
  for i, letter in enumerate(recap_result):
97
+ if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
98
+ recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
99
 
100
+ yield recap_result
 
101
 
102
# Callables handed to the Gradio interfaces, with ``device`` bound explicitly.
return_prediction_whisper_mic_with_device = partial(return_prediction_whisper_mic, device=device)
return_prediction_whisper_file_with_device = partial(return_prediction_whisper_file, device=device)
return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)

# Load the ASR models: each is fetched through SpeechBrain's foreign_class,
# moved to the selected device and switched to eval mode.
whisper_classifier = foreign_class(
    source="Macedonian-ASR/whisper-large-v3-macedonian-asr",
    pymodule_file="custom_interface_app.py",
    classname="ASR",
).to(device)
whisper_classifier.eval()

w2v2_classifier = foreign_class(
    source="Macedonian-ASR/wav2vec2-aed-macedonian-asr",
    pymodule_file="custom_interface_app.py",
    classname="ASR",
).to(device)
w2v2_classifier.eval()

# Load the T5 tokenizer and model used by recap_sentence; the model weights
# are loaded in float16.
recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
recap_model = T5ForConditionalGeneration.from_pretrained(
    recap_model_name,
    torch_dtype=torch.float16,
)
recap_model.to(device)
recap_model.eval()
121
 
122
# Interface definitions: one Gradio interface per (model, input source) pair.
# Every audio input is delivered to the callback as a file path.

# Whisper, microphone input.
mic_transcribe_whisper = gr.Interface(
    fn=return_prediction_whisper_mic_with_device,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(),
    allow_flagging="never",
    live=False,
)

# Whisper, uploaded file; the only interface created with live=True.
file_transcribe_whisper = gr.Interface(
    fn=return_prediction_whisper_file_with_device,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(),
    allow_flagging="never",
    live=True,
)

# wav2vec2, microphone input.
mic_transcribe_w2v2 = gr.Interface(
    fn=return_prediction_w2v2_with_device,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(),
    allow_flagging="never",
    live=False,
)

# wav2vec2, uploaded file; the path arrives as the callback's first
# positional argument.
file_transcribe_w2v2 = gr.Interface(
    fn=return_prediction_w2v2_with_device,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(),
    allow_flagging="never",
    live=False,
)
154
+
155
  project_description = '''
156
  <img src="https://i.ibb.co/SKDfwn9/bookie.png"
157
  alt="Bookie logo"
158
  style="float: right; width: 130px; height: 110px; margin-left: 10px;" />
159
+
160
+ ## Автори:
161
  1. **Дејан Порјазовски**
162
  2. **Илина Јакимовска**
163
  3. **Ордан Чукалиев**
164
  4. **Никола Стиков**
165
+ Оваа колаборација е дел од активностите на **Центарот за напредни интердисциплинарни истражувања ([ЦеНИИс](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** при УКИМ.
166
+ ## Во тренирањето на овој модел се употребени податоци од:
167
+ 1. Дигитален архив за етнолошки и антрополошки ресурси ([ДАЕАР](https://iea.pmf.ukim.edu.mk/tabs/view/61f236ed7d95176b747c20566ddbda1a)) при Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
168
+ 2. Аудио верзија на меѓународното списание [„ЕтноАнтропоЗум"](https://etno.pmf.ukim.mk/index.php/eaz/issue/archive) на Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
169
+ 3. Аудио подкастот [„Обични луѓе"](https://obicniluge.mk/episodes/) на Илина Јакимовска
170
+ 4. Научните видеа од серијалот [„Наука за деца"](http://naukazadeca.mk), фондација [КАНТАРОТ](https://qantarot.substack.com/)
171
+ 5. Македонска верзија на [Mozilla Common Voice](https://commonvoice.mozilla.org/en/datasets) (верзија 18.0)
172
+ ## Како да придонесете за подобрување на македонските модели за препознавање на говор?
173
+ На следниот [линк](https://drive.google.com/file/d/1YdZJz9o1X8AMc6J4MNPnVZjASyIXnvoZ/view?usp=sharing) ќе најдете инструкции за тоа како да донирате македонски говор преку платформата Mozilla Common Voice.
174
  '''
175
 
176
  # Custom CSS
177
  css = """
178
  .gradio-container {
179
+ background-color: #f0f0f0;
180
  }
181
  .custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
182
  font-size: 15px !important;
 
187
  }
188
  """
189
 
190
# Top-level Gradio app. delete_cache=(60, 120) is Gradio's
# (frequency, age) pair in seconds for purging cached files.
transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))

with transcriber_app:
    gr.Markdown(project_description, elem_classes="custom-markdown")

    gr.TabbedInterface(
        [mic_transcribe_whisper, file_transcribe_whisper, mic_transcribe_w2v2, file_transcribe_w2v2],
        ["Буки-Whisper микрофон", "Буки-Whisper датотека", "Буки-Wav2vec2 микрофон", "Буки-Wav2vec2 датотека"],
    )

    # Per-session state; the callback logs when Gradio discards it.
    state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))

    # An unload handler is invoked without arguments when a user closes the
    # page. Registering the GPU-decorated prediction functions here would
    # run each of them with no input on every disconnect, so the memory
    # clean-up routine is registered instead.
    transcriber_app.unload(clean_up_memory)
205
 
206
  if __name__ == "__main__":
207
  transcriber_app.queue()
208
+ transcriber_app.launch(share=True)