Bookie-Whisper-Macedonian-ASR

Sleeping

App Files Files Community

vladocar committed on Dec 3, 2024

Commit

e2f3dc3

verified ·

1 Parent(s): b9fb0d0

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -34

app.py CHANGED Viewed

@@ -8,16 +8,15 @@ from speechbrain.inference.interfaces import Pretrained, foreign_class
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 import librosa
 import whisper_timestamped as whisper
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 torch.backends.cuda.matmul.allow_tf32 = True
 def clean_up_memory():
     gc.collect()
     torch.cuda.empty_cache()
 @spaces.GPU(duration=15)
 def recap_sentence(string):
     inputs = recap_tokenizer(["restore capitalization and punctuation: " + string], return_tensors="pt", padding=True).to(device)
@@ -25,30 +24,69 @@ def recap_sentence(string):
     recap_result = recap_tokenizer.decode(outputs, skip_special_tokens=True)
     return recap_result
 @spaces.GPU(duration=30)
 def return_prediction_whisper_file(file=None, device=device):
     if file is not None:
-        try:
-            waveform, sr = librosa.load(file.name, sr=16000)
-        except Exception as e:
-            return f"Error loading the audio file: {str(e)}"
-        waveform = waveform[:3600 * sr]
         whisper_result = whisper_classifier.classify_file_whisper_mkd_streaming(waveform, device)
     else:
-        return "You must provide an audio file."
     recap_result = ""
     prev_segment = ""
     prev_segment_len = 0
     for segment in whisper_result:
         if prev_segment == "":
             recap_segment = recap_sentence(segment[0])
         else:
             prev_segment_len = len(prev_segment.split())
             recap_segment = recap_sentence(prev_segment + " " + segment[0])
         recap_segment = recap_segment.split()
         recap_segment = recap_segment[prev_segment_len:]
         recap_segment = " ".join(recap_segment)
@@ -56,62 +94,89 @@ def return_prediction_whisper_file(file=None, device=device):
         recap_result += recap_segment + " "
         for i, letter in enumerate(recap_result):
-            if i > 1 and recap_result[i - 2] in [".", "!", "?"] and letter.islower():
-                recap_result = recap_result[:i] + letter.upper() + recap_result[i + 1:]
-    clean_up_memory()
-    return recap_result
-# Load the models
 whisper_classifier = foreign_class(source="Macedonian-ASR/whisper-large-v3-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
 whisper_classifier = whisper_classifier.to(device)
 whisper_classifier.eval()
 recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
 recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
 recap_model = T5ForConditionalGeneration.from_pretrained(recap_model_name, torch_dtype=torch.float16)
 recap_model.to(device)
 recap_model.eval()
-# Interfaces
 mic_transcribe_whisper = gr.Interface(
-    fn=return_prediction_whisper_file,
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs=gr.Textbox(),
     allow_flagging="never",
     live=False,
 )
-file_transcribe_whisper_upload = gr.Interface(
-    fn=return_prediction_whisper_file,
-    inputs=gr.File(label="Upload an Audio File", type="file"),
-    outputs=gr.Textbox(label="Transcription"),
     allow_flagging="never",
     live=True
 )
 project_description = '''
 <img src="https://i.ibb.co/SKDfwn9/bookie.png"
      alt="Bookie logo"
      style="float: right; width: 130px; height: 110px; margin-left: 10px;" />
-## Authors:
 1. **Дејан Порјазовски**
 2. **Илина Јакимовска**
 3. **Ордан Чукалиев**
 4. **Никола Стиков**
-This collaboration is part of the activities of the **Center for Advanced Interdisciplinary Research ([CeNIIs](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** at UKIM.
-## Contribute to improving Macedonian speech recognition models
-You can find instructions for donating Macedonian speech at the following [link](https://drive.google.com/file/d/1YdZJz9o1X8AMc6J4MNPnVZjASyIXnvoZ/view?usp=sharing).
 '''
 # Custom CSS
 css = """
 .gradio-container {
-    background-color: #f0f0f0;  /* Set your desired background color */
 }
 .custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
     font-size: 15px !important;
@@ -122,15 +187,22 @@ css = """
 }
 """
-transcriber_app = gr.Blocks(css=css)
 with transcriber_app:
     gr.Markdown(project_description, elem_classes="custom-markdown")
     gr.TabbedInterface(
-        [mic_transcribe_whisper, file_transcribe_whisper_upload],
-        ["Microphone Transcription", "Upload File for Transcription"],
     )
 if __name__ == "__main__":
     transcriber_app.queue()
-    transcriber_app.launch(share=True)

 from transformers import T5Tokenizer, T5ForConditionalGeneration
 import librosa
 import whisper_timestamped as whisper
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC, AutoProcessor
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 torch.backends.cuda.matmul.allow_tf32 = True
 def clean_up_memory():  # run Python garbage collection, then drop PyTorch's cached CUDA memory
     gc.collect()
     torch.cuda.empty_cache()
 @spaces.GPU(duration=15)
 def recap_sentence(string):
     inputs = recap_tokenizer(["restore capitalization and punctuation: " + string], return_tensors="pt", padding=True).to(device)
     recap_result = recap_tokenizer.decode(outputs, skip_special_tokens=True)
     return recap_result
+@spaces.GPU(duration=30)
+def return_prediction_w2v2(mic=None, file=None, device=device):  # transcribe with w2v2_classifier, then post-process the text with recap_sentence
+    if mic is not None:  # the mic recording wins when both mic and file are supplied
+        waveform, sr = librosa.load(mic, sr=16000)  # loaded at a 16 kHz sample rate
+        waveform = waveform[:60*sr]  # keep at most the first 60*sr samples (60 s at the load rate)
+        w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)
+    elif file is not None:  # same steps as the mic branch, applied to the file path
+        waveform, sr = librosa.load(file, sr=16000)
+        waveform = waveform[:60*sr]
+        w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)
+    else:
+        return "You must either provide a mic recording or a file"  # returned as the result text; clean_up_memory() is not reached on this path
+    recap_result = recap_sentence(w2v2_result[0])  # only element 0 of the classifier output is used
+    for i, letter in enumerate(recap_result):
+        if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():  # lowercase letter two positions after ., ! or ?
+            recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]  # replace that one letter with its upper-case form
+    clean_up_memory()
+    return recap_result
 @spaces.GPU(duration=30)
+def return_prediction_whisper_mic(mic=None, device=device):  # transcribe a mic recording with whisper_classifier, then post-process the text with recap_sentence
+    if mic is not None:
+        waveform, sr = librosa.load(mic, sr=16000)  # loaded at a 16 kHz sample rate
+        waveform = waveform[:30*sr]  # keep at most the first 30*sr samples (30 s at the load rate)
+        whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+    else:
+        return "You must provide a mic recording"  # returned as the result text; clean_up_memory() is not reached on this path
+    recap_result = recap_sentence(whisper_result[0])  # only element 0 of the classifier output is used
+    for i, letter in enumerate(recap_result):
+        if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():  # lowercase letter two positions after ., ! or ?
+            recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]  # replace that one letter with its upper-case form
+    clean_up_memory()
+    return recap_result
+@spaces.GPU(duration=60)
 def return_prediction_whisper_file(file=None, device=device):
+    whisper_result = []
     if file is not None:
+        waveform, sr = librosa.load(file, sr=16000)
+        waveform = waveform[:3600*sr]
         whisper_result = whisper_classifier.classify_file_whisper_mkd_streaming(waveform, device)
     else:
+        yield "You must provide a file"
     recap_result = ""
     prev_segment = ""
     prev_segment_len = 0
+    segment_counter = 0
     for segment in whisper_result:
+        segment_counter += 1
         if prev_segment == "":
             recap_segment = recap_sentence(segment[0])
         else:
             prev_segment_len = len(prev_segment.split())
             recap_segment = recap_sentence(prev_segment + " " + segment[0])
         recap_segment = recap_segment.split()
         recap_segment = recap_segment[prev_segment_len:]
         recap_segment = " ".join(recap_segment)
         recap_result += recap_segment + " "
         for i, letter in enumerate(recap_result):
+            if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
+                recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
+        yield recap_result
+return_prediction_whisper_mic_with_device = partial(return_prediction_whisper_mic, device=device)  # pins device; each wrapped function already declares device=device as its default
+return_prediction_whisper_file_with_device = partial(return_prediction_whisper_file, device=device)
+return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
+# Load the ASR models
 whisper_classifier = foreign_class(source="Macedonian-ASR/whisper-large-v3-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")  # class ASR taken from custom_interface_app.py of the named source
 whisper_classifier = whisper_classifier.to(device)
 whisper_classifier.eval()
+w2v2_classifier = foreign_class(source="Macedonian-ASR/wav2vec2-aed-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")  # same loading pattern, different source
+w2v2_classifier = w2v2_classifier.to(device)
+w2v2_classifier.eval()
+# Load the T5 tokenizer and model
 recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
 recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
 recap_model = T5ForConditionalGeneration.from_pretrained(recap_model_name, torch_dtype=torch.float16)  # weights requested in float16
 recap_model.to(device)
 recap_model.eval()
+# Interface definitions
 mic_transcribe_whisper = gr.Interface(
+    fn=return_prediction_whisper_mic_with_device,
     inputs=gr.Audio(sources="microphone", type="filepath"),  # type="filepath": the handler receives a path, which it passes to librosa.load
     outputs=gr.Textbox(),
     allow_flagging="never",
     live=False,
 )
+file_transcribe_whisper = gr.Interface(
+    fn=return_prediction_whisper_file_with_device,  # wraps a generator function (it uses yield)
+    inputs=gr.Audio(sources="upload", type="filepath"),
+    outputs=gr.Textbox(),
     allow_flagging="never",
     live=True  # the only interface in this group with live=True
 )
+mic_transcribe_w2v2 = gr.Interface(
+    fn=return_prediction_w2v2_with_device,  # single audio input, so it binds to the function's first parameter, mic
+    inputs=gr.Audio(sources="microphone", type="filepath"),
+    outputs=gr.Textbox(),
+    allow_flagging="never",
+    live=False,
+)
+file_transcribe_w2v2 = gr.Interface(
+    fn=return_prediction_w2v2_with_device,  # NOTE(review): the uploaded path also arrives positionally as mic, not file; both branches do the same work
+    inputs=gr.Audio(sources="upload", type="filepath"),
+    outputs=gr.Textbox(),
+    allow_flagging="never",
+    live=False
+)
 project_description = '''
 <img src="https://i.ibb.co/SKDfwn9/bookie.png"
      alt="Bookie logo"
      style="float: right; width: 130px; height: 110px; margin-left: 10px;" />
+## Автори:
 1. **Дејан Порјазовски**
 2. **Илина Јакимовска**
 3. **Ордан Чукалиев**
 4. **Никола Стиков**
+Оваа колаборација е дел од активностите на **Центарот за напредни интердисциплинарни истражувања ([ЦеНИИс](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** при УКИМ.
+## Во тренирањето на овој модел се употребени податоци од:
+1. Дигитален архив за етнолошки и антрополошки ресурси ([ДАЕАР](https://iea.pmf.ukim.edu.mk/tabs/view/61f236ed7d95176b747c20566ddbda1a)) при Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
+2. Аудио верзија на меѓународното списание [„ЕтноАнтропоЗум"](https://etno.pmf.ukim.mk/index.php/eaz/issue/archive) на Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
+3. Аудио подкастот [„Обични луѓе"](https://obicniluge.mk/episodes/) на Илина Јакимовска
+4. Научните видеа од серијалот [„Наука за деца"](http://naukazadeca.mk), фондација [КАНТАРОТ](https://qantarot.substack.com/)
+5. Македонска верзија на [Mozilla Common Voice](https://commonvoice.mozilla.org/en/datasets) (верзија 18.0)
+## Како да придонесете за подобрување на македонските модели за препознавање на говор?
+На  следниот [линк](https://drive.google.com/file/d/1YdZJz9o1X8AMc6J4MNPnVZjASyIXnvoZ/view?usp=sharing) ќе најдете инструкции за тоа како да донирате македонски говор преку платформата Mozilla Common Voice.
 '''
 # Custom CSS
 css = """
 .gradio-container {
+    background-color: #f0f0f0;
 }
 .custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
     font-size: 15px !important;
 }
 """
+transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))  # presumably (frequency, age) in seconds per the Gradio Blocks docs — TODO confirm
 with transcriber_app:
+    state = gr.State()  # NOTE(review): rebound a few lines below; no use of this first value is visible here
     gr.Markdown(project_description, elem_classes="custom-markdown")
     gr.TabbedInterface(
+        [mic_transcribe_whisper, file_transcribe_whisper, mic_transcribe_w2v2, file_transcribe_w2v2],
+        ["Буки-Whisper микрофон", "Буки-Whisper датотека", "Буки-Wav2vec2 микрофон", "Буки-Wav2vec2 датотека"],  # tab titles, listed in the same order as the interfaces above
     )
+    state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))  # the callback only prints a marker line
+    transcriber_app.unload(return_prediction_whisper_mic)  # NOTE(review): registers a prediction function as an unload handler; it would run with mic=None — confirm this is intended
+    transcriber_app.unload(return_prediction_whisper_file)  # NOTE(review): this one is a generator function — confirm
+    transcriber_app.unload(return_prediction_w2v2)
 if __name__ == "__main__":  # runs only when the file is executed as a script
     transcriber_app.queue()  # enable the app's request queue before launching
+    transcriber_app.launch(share=True)