import os
# Optional: reduce CUDA memory fragmentation. Must be set before CUDA is
# initialized, hence before torch allocates anything.
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import gc
from functools import partial

import gradio as gr
import librosa
import torch
from speechbrain.inference.interfaces import foreign_class
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Wav2Vec2ForCTC,
    pipeline,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cuda.matmul.allow_tf32 = True
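# TF32 matmuls trade a little precision for extra speed on Ampere+ GPUs.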


def clean_up_memory():
    # Release Python garbage and, when a GPU is present, cached CUDA blocks.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def recap_sentence(string):
    # Restore capitalization and punctuation with the mT5 model loaded below.
    inputs = recap_tokenizer(
        ["restore capitalization and punctuation: " + string],
        return_tensors="pt",
        padding=True,
    ).to(device)
    outputs = recap_model.generate(**inputs, max_length=768, num_beams=5, early_stopping=True).squeeze(0)
    return recap_tokenizer.decode(outputs, skip_special_tokens=True)
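

# Shared post-processing helper, factored out of the identical loops used by
# the three prediction functions below. The recap model sometimes leaves the
# first letter of a sentence lowercase: any letter sitting two positions
# after '.', '!' or '?' (the punctuation plus the following space) is
# uppercased.
def capitalize_after_punct(text):
    chars = list(text)
    for i in range(2, len(chars)):
        if chars[i - 2] in ".!?" and chars[i].islower():
            chars[i] = chars[i].upper()
    return "".join(chars)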


def return_prediction_w2v2(mic=None, file=None, device=device):
    # NOTE: w2v2_classifier is not loaded anywhere in this script, so this
    # path is inactive; it is kept for the (commented-out) w2v2 comparison.
    audio_path = mic if mic is not None else file
    if audio_path is None:
        return "You must either provide a mic recording or a file"
    waveform, sr = librosa.load(audio_path, sr=16000)
    waveform = waveform[:30 * sr]  # keep at most the first 30 seconds
    w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)

    recap_result = recap_sentence(w2v2_result[0])

    # Capitalize any lowercase letter that starts a new sentence.
    recap_result = capitalize_after_punct(recap_result)

    clean_up_memory()
    return recap_result



def return_prediction_whisper(mic=None, file=None, device=device):
    audio_path = mic if mic is not None else file
    if audio_path is None:
        return "You must either provide a mic recording or a file"
    waveform, sr = librosa.load(audio_path, sr=16000)
    waveform = waveform[:30 * sr]  # keep at most the first 30 seconds
    whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)

    recap_result = recap_sentence(whisper_result[0])

    # Capitalize any lowercase letter that starts a new sentence.
    recap_result = capitalize_after_punct(recap_result)

    clean_up_memory()
    return recap_result


def return_prediction_compare(mic=None, file=None, device=device):
    audio_path = mic if mic is not None else file
    if audio_path is None:
        return "You must either provide a mic recording or a file"
    waveform, sr = librosa.load(audio_path, sr=16000)
    waveform = waveform[:30 * sr]  # keep at most the first 30 seconds

    # Run the same clip through all three models.
    whisper_mkd_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
    whisper_result = whisper_classifier.classify_file_whisper(waveform, pipe_whisper, device)
    mms_result = whisper_classifier.classify_file_mms(waveform, processor_mms, mms_model, device)

    recap_result_whisper_mkd = recap_sentence(whisper_mkd_result[0])
    recap_result_whisper = recap_sentence(whisper_result[0])
    recap_result_mms = recap_sentence(mms_result[0])

    # Capitalize any lowercase letter that starts a new sentence.
    recap_result_whisper_mkd = capitalize_after_punct(recap_result_whisper_mkd)
    recap_result_whisper = capitalize_after_punct(recap_result_whisper)
    recap_result_mms = capitalize_after_punct(recap_result_mms)
    
    clean_up_memory()
    return "Буки-Whisper:\n" + recap_result_whisper_mkd + "\n\n" + "MMS:\n" + recap_result_mms + "\n\n" + "OpenAI Whisper:\n" + recap_result_whisper
        # yield "Our W2v2: \n" + segment_results_w2v2 + "\n\n" + "MMS transcript:\n" + segment_results_mms



# Load the multilingual OpenAI Whisper model used in the comparison tab.
model_id = "openai/whisper-large-v3"
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # half precision; assumes a CUDA device
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="sdpa",
)
whisper_model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe_whisper = pipeline(
    "automatic-speech-recognition",
    model=whisper_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch.float16,
    return_timestamps=True,
    device=device,
)
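
# Minimal usage sketch for the pipeline above (not executed here; the app
# calls it through whisper_classifier.classify_file_whisper). The file name
# and the "language" kwarg are illustrative assumptions:
#   result = pipe_whisper("clip.wav", generate_kwargs={"language": "macedonian"})
#   print(result["text"])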


# Load MMS model
model_id = "facebook/mms-1b-all"
processor_mms = AutoProcessor.from_pretrained(model_id)
mms_model = Wav2Vec2ForCTC.from_pretrained(model_id)
mms_model = mms_model.to(device)
mms_model.eval()
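# MMS ships per-language adapters: set_target_lang() switches the tokenizer
# vocabulary and load_adapter() loads the matching adapter weights
# ("mkd" is the ISO 639-3 code for Macedonian).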
processor_mms.tokenizer.set_target_lang("mkd")
mms_model.load_adapter("mkd")



# Pre-bind the device: gr.Interface passes only the declared audio input.
return_prediction_whisper_with_device = partial(return_prediction_whisper, device=device)
# return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
return_prediction_with_device_compare = partial(return_prediction_compare, device=device)


# Load the fine-tuned Macedonian Whisper model through its custom SpeechBrain
# interface, which provides the classify_file_* methods used above.
whisper_classifier = foreign_class(
    source="Macedonian-ASR/whisper-large-v3-macedonian-asr",
    pymodule_file="custom_interface_app.py",
    classname="ASR",
)
whisper_classifier = whisper_classifier.to(device)
whisper_classifier.eval()


# Load the T5 tokenizer and model for restoring capitalization
recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
recap_model = T5ForConditionalGeneration.from_pretrained(recap_model_name, torch_dtype=torch.float16)
recap_model.to(device)
recap_model.eval()
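
# Hypothetical illustration of the recapitalization step (the exact output
# depends on the model):
#   recap_sentence("како си ти")  # -> e.g. "Како си ти?"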



mic_transcribe_compare = gr.Interface(
    fn=return_prediction_with_device_compare,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(),
    allow_flagging="never",
    live=False,
)

# file_transcribe_compare = gr.Interface(
#     fn=return_prediction_with_device_compare,
#     inputs=gr.Audio(sources="upload", type="filepath"),
#     outputs=gr.Textbox(),
#     allow_flagging="never",
#     live=False
# )


project_description = '''
## Authors:
1. **Dejan Porjazovski**
2. **Ilina Jakimovska**
3. **Ordan Chukaliev**
4. **Nikola Stikov**

This collaboration is part of the activities of the **Center for Advanced Interdisciplinary Research ([CeNIIs](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** at UKIM.

## Data used for training this model:
1. The Digital Archive for Ethnological and Anthropological Resources ([DAEAR](https://iea.pmf.ukim.edu.mk/tabs/view/61f236ed7d95176b747c20566ddbda1a)) at the Institute of Ethnology and Anthropology, Faculty of Natural Sciences and Mathematics at UKIM.
2. The audio version of the international journal ["EtnoAntropoZum"](https://etno.pmf.ukim.mk/index.php/eaz/issue/archive) of the Institute of Ethnology and Anthropology, Faculty of Natural Sciences and Mathematics at UKIM.
3. The audio podcast ["Obicni lugje" (Ordinary People)](https://obicniluge.mk/episodes/) by Ilina Jakimovska.
4. The science videos from the series ["Nauka za deca" (Science for Kids)](http://naukazadeca.mk), [KANTAROT](https://qantarot.substack.com/) foundation.
5. The Macedonian part of [Mozilla Common Voice](https://commonvoice.mozilla.org/en/datasets) (version 18.0).

## How can you contribute to improving Macedonian speech-recognition models?
At the following [link](https://drive.google.com/file/d/18sXbX8OABlMS5uYXfC4y9qlI6YZIJ1fQ/view?usp=sharing) you will find instructions on how to donate Macedonian speech through the Mozilla Common Voice platform.
'''

# Custom CSS
css = """
.gradio-container {
    background-color: #f0f0f0;  /* Set your desired background color */
}
.custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
    font-size: 15px !important;
    font-family: Arial, sans-serif !important;
}
.gradio-container {
    background-color: #f3f3f3 !important;
}
"""

transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))

with transcriber_app:
    gr.Markdown(project_description, elem_classes="custom-markdown")

    gr.TabbedInterface(
        [mic_transcribe_compare],
        ["Model comparison"],
    )
    state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))

    # Free model memory when a browser session ends.
    transcriber_app.unload(clean_up_memory)


# transcriber_app.launch(debug=True, share=True, ssl_verify=False)
if __name__ == "__main__":
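    # Queue incoming requests so concurrent users do not overload the models.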
    transcriber_app.queue()
    transcriber_app.launch(share=True)