Bookie-Whisper-Macedonian-ASR

Sleeping

App Files Files Community

Bookie-Whisper-Macedonian-ASR / app.py

vladocar

Update app.py

e2f3dc3 verified 10 months ago

raw

history blame contribute delete

9.13 kB

	import spaces
	import os
	import gc
	from functools import partial
	import gradio as gr
	import torch
	from speechbrain.inference.interfaces import Pretrained, foreign_class
	from transformers import T5Tokenizer, T5ForConditionalGeneration
	import librosa
	import whisper_timestamped as whisper
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC, AutoProcessor

	# Pick the compute device once at import time: CUDA when available, CPU otherwise.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")

	# Allow TF32 for CUDA matrix multiplications.
	torch.backends.cuda.matmul.allow_tf32 = True

	def clean_up_memory():
	    """Run a garbage-collection pass, then empty PyTorch's CUDA cache."""
	    # Collect first so tensors that just became unreachable are dropped
	    # before the CUDA cache is emptied.
	    gc.collect()
	    torch.cuda.empty_cache()

	@spaces.GPU(duration=15)
	def recap_sentence(string):
	    """Restore capitalization and punctuation of ``string``.

	    The text is prefixed with the task instruction, tokenized with
	    ``recap_tokenizer``, passed through ``recap_model.generate`` (beam search
	    with 5 beams, ``max_length=768``) and decoded back to a plain string with
	    special tokens removed.

	    Args:
	        string: Raw transcript text.

	    Returns:
	        The decoded model output as a string.
	    """
	    prompt = "restore capitalization and punctuation: " + string
	    encoded = recap_tokenizer([prompt], return_tensors="pt", padding=True)
	    encoded = encoded.to(device)

	    generated = recap_model.generate(
	        **encoded,
	        max_length=768,
	        num_beams=5,
	        early_stopping=True,
	    )
	    # A single prompt was submitted, so drop the batch dimension before decoding.
	    token_ids = generated.squeeze(0)
	    return recap_tokenizer.decode(token_ids, skip_special_tokens=True)

	@spaces.GPU(duration=30)
	def return_prediction_w2v2(mic=None, file=None, device=device):
	    """Transcribe a recording with the wav2vec2 model and restore its casing.

	    Args:
	        mic: Path to a microphone recording; takes precedence over ``file``.
	        file: Path to an uploaded audio file, used when ``mic`` is None.
	        device: Device handed to the classifier.

	    Returns:
	        The recapitalized transcript, or an explanatory message when neither
	        ``mic`` nor ``file`` is given.
	    """
	    if mic is not None:
	        audio_path = mic
	    elif file is not None:
	        audio_path = file
	    else:
	        return "You must either provide a mic recording or a file"

	    # Load at 16 kHz and keep at most the first 60 seconds.
	    samples, sample_rate = librosa.load(audio_path, sr=16000)
	    samples = samples[:60 * sample_rate]
	    transcript = w2v2_classifier.classify_file_w2v2(samples, device)

	    text = recap_sentence(transcript[0])

	    # Upper-case a lowercase letter that sits two characters after ".", "!" or "?"
	    # (i.e. punctuation, one separator character, then the letter).
	    for idx, ch in enumerate(text):
	        if idx < 2 or not ch.islower():
	            continue
	        if text[idx - 2] in (".", "!", "?"):
	            text = text[:idx] + ch.upper() + text[idx + 1:]

	    clean_up_memory()
	    return text

	@spaces.GPU(duration=30)
	def return_prediction_whisper_mic(mic=None, device=device):
	    """Transcribe a microphone recording with the Whisper model and restore its casing.

	    Args:
	        mic: Path to the microphone recording.
	        device: Device handed to the classifier.

	    Returns:
	        The recapitalized transcript, or an explanatory message when ``mic``
	        is None.
	    """
	    if mic is None:
	        return "You must provide a mic recording"

	    # Load at 16 kHz and keep at most the first 30 seconds.
	    samples, sample_rate = librosa.load(mic, sr=16000)
	    samples = samples[:30 * sample_rate]
	    transcript = whisper_classifier.classify_file_whisper_mkd(samples, device)

	    text = recap_sentence(transcript[0])

	    # Upper-case a lowercase letter that sits two characters after ".", "!" or "?"
	    # (i.e. punctuation, one separator character, then the letter).
	    for idx, ch in enumerate(text):
	        if idx < 2 or not ch.islower():
	            continue
	        if text[idx - 2] in (".", "!", "?"):
	            text = text[:idx] + ch.upper() + text[idx + 1:]

	    clean_up_memory()
	    return text

	@spaces.GPU(duration=60)
	def return_prediction_whisper_file(file=None, device=device):
	    """Stream a recapitalized Whisper transcript of an audio file.

	    The audio is loaded at 16 kHz and truncated to 3600 seconds. Each segment
	    produced by the streaming classifier is recapitalized together with the
	    previous raw segment as context; the words belonging to that context are
	    then dropped so only the new segment is appended. The accumulated
	    transcript is yielded after every segment.

	    Args:
	        file: Path to the uploaded audio file.
	        device: Device handed to the classifier.

	    Yields:
	        The transcript accumulated so far, or a single explanatory message
	        when ``file`` is None.
	    """
	    if file is None:
	        yield "You must provide a file"
	        # Stop explicitly instead of falling through an empty segment list.
	        return

	    waveform, sr = librosa.load(file, sr=16000)
	    waveform = waveform[:3600 * sr]
	    segments = whisper_classifier.classify_file_whisper_mkd_streaming(waveform, device)

	    recap_result = ""
	    prev_segment = ""

	    try:
	        for segment in segments:
	            if prev_segment == "":
	                # No context available: nothing to strip from the model output.
	                context_len = 0
	                recap_segment = recap_sentence(segment[0])
	            else:
	                context_len = len(prev_segment.split())
	                recap_segment = recap_sentence(prev_segment + " " + segment[0])

	            # Drop the leading words that correspond to the context segment.
	            # NOTE(review): assumes the recap model preserves the word count
	            # of the context — confirm against the model's behavior.
	            recap_segment = " ".join(recap_segment.split()[context_len:])
	            prev_segment = segment[0]
	            recap_result += recap_segment + " "

	            # Upper-case a lowercase letter that sits two characters after
	            # ".", "!" or "?" (punctuation, one separator, then the letter).
	            for i, letter in enumerate(recap_result):
	                if i > 1 and recap_result[i - 2] in [".", "!", "?"] and letter.islower():
	                    recap_result = recap_result[:i] + letter.upper() + recap_result[i + 1:]

	            yield recap_result
	    finally:
	        # Match the other prediction functions, which free memory when done;
	        # ``finally`` also covers a consumer that stops iterating early.
	        clean_up_memory()

	# Callables handed to the Gradio interfaces, with ``device`` bound explicitly
	# to the module-level device.
	return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
	return_prediction_whisper_mic_with_device = partial(return_prediction_whisper_mic, device=device)
	return_prediction_whisper_file_with_device = partial(return_prediction_whisper_file, device=device)

	# Load the ASR models, move them to the selected device and switch to eval mode.
	whisper_classifier = foreign_class(source="Macedonian-ASR/whisper-large-v3-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
	whisper_classifier = whisper_classifier.to(device)
	whisper_classifier.eval()

	w2v2_classifier = foreign_class(source="Macedonian-ASR/wav2vec2-aed-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
	w2v2_classifier = w2v2_classifier.to(device)
	w2v2_classifier.eval()

	# Load the T5 tokenizer and model used to restore capitalization/punctuation.
	recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
	recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
	# Half precision is only used on CUDA; when ``device`` fell back to CPU the
	# model is kept in float32, since float16 generation on CPU is poorly supported.
	recap_model = T5ForConditionalGeneration.from_pretrained(
	    recap_model_name,
	    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
	)
	recap_model.to(device)
	recap_model.eval()

	# Interface definitions
	def _transcription_interface(fn, source, live):
	    """Build an interface with one audio input (as a file path) and one text output.

	    Args:
	        fn: Prediction callable wired to the interface.
	        source: Audio source passed to ``gr.Audio`` ("microphone" or "upload").
	        live: Value forwarded to ``gr.Interface(live=...)``.
	    """
	    return gr.Interface(
	        fn=fn,
	        inputs=gr.Audio(sources=source, type="filepath"),
	        outputs=gr.Textbox(),
	        allow_flagging="never",
	        live=live,
	    )

	mic_transcribe_whisper = _transcription_interface(
	    return_prediction_whisper_mic_with_device, "microphone", False
	)

	# The file-based Whisper tab is the only one created with live=True.
	file_transcribe_whisper = _transcription_interface(
	    return_prediction_whisper_file_with_device, "upload", True
	)

	mic_transcribe_w2v2 = _transcription_interface(
	    return_prediction_w2v2_with_device, "microphone", False
	)

	file_transcribe_w2v2 = _transcription_interface(
	    return_prediction_w2v2_with_device, "upload", False
	)

	# Markdown/HTML blurb (in Macedonian) rendered at the top of the app via
	# gr.Markdown: project logo, authors, training-data sources and a link with
	# instructions for contributing speech. This is user-facing text — edit with care.
	project_description = '''
	<img src="https://i.ibb.co/SKDfwn9/bookie.png"
	alt="Bookie logo"
	style="float: right; width: 130px; height: 110px; margin-left: 10px;" />

	## Автори:
	1. Дејан Порјазовски
	2. Илина Јакимовска
	3. Ордан Чукалиев
	4. Никола Стиков
	Оваа колаборација е дел од активностите на Центарот за напредни интердисциплинарни истражувања ([ЦеНИИс](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis)) при УКИМ.
	## Во тренирањето на овој модел се употребени податоци од:
	1. Дигитален архив за етнолошки и антрополошки ресурси ([ДАЕАР](https://iea.pmf.ukim.edu.mk/tabs/view/61f236ed7d95176b747c20566ddbda1a)) при Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
	2. Аудио верзија на меѓународното списание [„ЕтноАнтропоЗум"](https://etno.pmf.ukim.mk/index.php/eaz/issue/archive) на Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
	3. Аудио подкастот [„Обични луѓе"](https://obicniluge.mk/episodes/) на Илина Јакимовска
	4. Научните видеа од серијалот [„Наука за деца"](http://naukazadeca.mk), фондација [КАНТАРОТ](https://qantarot.substack.com/)
	5. Македонска верзија на [Mozilla Common Voice](https://commonvoice.mozilla.org/en/datasets) (верзија 18.0)
	## Како да придонесете за подобрување на македонските модели за препознавање на говор?
	На следниот [линк](https://drive.google.com/file/d/1YdZJz9o1X8AMc6J4MNPnVZjASyIXnvoZ/view?usp=sharing) ќе најдете инструкции за тоа како да донирате македонски говор преку платформата Mozilla Common Voice.
	'''

	# Custom CSS passed to gr.Blocks: page background plus font size/family for
	# elements carrying the "custom-markdown" class.
	# NOTE(review): `.gradio-container` is declared twice; the later rule
	# (#f3f3f3, marked !important) overrides the first (#f0f0f0), which is redundant.
	css = """
	.gradio-container {
	background-color: #f0f0f0;
	}
	.custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
	font-size: 15px !important;
	font-family: Arial, sans-serif !important;
	}
	.gradio-container {
	background-color: #f3f3f3 !important;
	}
	"""

	# Top-level app: custom CSS plus Gradio's periodic cache cleanup
	# (delete_cache=(60, 120); see the gr.Blocks documentation for the units).
	transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))

	with transcriber_app:
	    gr.Markdown(project_description, elem_classes="custom-markdown")

	    # One tab per (model, input source) combination.
	    gr.TabbedInterface(
	        [mic_transcribe_whisper, file_transcribe_whisper, mic_transcribe_w2v2, file_transcribe_w2v2],
	        ["Буки-Whisper микрофон", "Буки-Whisper датотека", "Буки-Wav2vec2 микрофон", "Буки-Wav2vec2 датотека"],
	    )
	    state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))

	    # Run the memory cleanup when a user session ends. The prediction
	    # functions were previously registered here, which would invoke the
	    # GPU-decorated handlers with no input every time a tab was closed.
	    transcriber_app.unload(clean_up_memory)

	if __name__ == "__main__":
	    transcriber_app.queue()
	    transcriber_app.launch(share=True)