"""Gradio demo: Belarusian speech recognition with wav2vec2 and a language model.

The script downloads the language model file from the Hugging Face Hub,
builds the recognition pipeline once at import time, and serves a Gradio
interface that accepts either a microphone recording or an uploaded file.
"""
from pprint import pformat
from typing import Optional, Tuple

from huggingface_hub import hf_hub_download

import librosa

import gradio as gr

from pipeline import PreTrainedPipeline


HF_HUB_URL = 'ales/wav2vec2-cv-be'
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
MODEL_SAMPLING_RATE = 16_000  # 16kHz


# download Language Model from HF Hub
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)

# init pipeline (done once at module level so every request reuses it)
pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)


def main(
    recorded_audio_fp: Optional[str],
    uploaded_audio_fp: Optional[str],
) -> Tuple[str, str]:
    """Recognize speech from a recorded or an uploaded audio file.

    The recorded file takes priority: the uploaded file is used only when
    no recording is present.

    Args:
        recorded_audio_fp: Path to the audio recorded in the browser, or
            ``None`` when nothing was recorded.
        uploaded_audio_fp: Path to the uploaded audio file, or ``None``
            when nothing was uploaded.

    Returns:
        A pair of strings. On success: the recognized text and a
        pretty-printed dump of technical details about the run. When
        neither input is present: an error message in Belarusian and the
        same message in English.
    """
    if recorded_audio_fp is not None:
        audio_fp = recorded_audio_fp
        used_audiofile = 'recorded'
    elif uploaded_audio_fp is not None:
        audio_fp = uploaded_audio_fp
        used_audiofile = 'uploaded'
    else:
        return (
            'Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.',
            'Error! You have to either record or upload an audiofile.'
        )

    # read audio file; librosa.load returns (samples, sampling_rate) and only
    # the samples are needed because the rate is fixed by `sr`
    inputs = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)[0]

    # recognize speech
    pipeline_res = pipeline(inputs=inputs)
    text = pipeline_res['text'][0]  # unpack batch of size 1

    # add technical information to the output
    # NOTE: `tech_data` is the same object as `pipeline_res`, so the `del`
    # below removes 'text' from the pipeline result in place.
    tech_data = pipeline_res
    del tech_data['text']

    tech_data['used_audiofile'] = used_audiofile
    tech_data['recorded_file_present'] = recorded_audio_fp is not None
    tech_data['uploaded_file_present'] = uploaded_audio_fp is not None
    tech_data['audiofile_path'] = audio_fp
    tech_data['model_sampling_rate'] = MODEL_SAMPLING_RATE
    tech_data['inputs_shape'] = inputs.shape
    # .item() turns the array scalars into plain Python numbers for printing
    tech_data['inputs_max'] = inputs.max().item()
    tech_data['inputs_min'] = inputs.min().item()

    tech_data_str = pformat(tech_data)

    return text, tech_data_str


# Markdown shown below the interface
article = """
The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/ales/wav2vec2-cv-be)
![Page Visits](https://visitor-badge.glitch.me/badge?page_id=huggingface.co/spaces/ales/wav2vec2-cv-be-lm&left_color=darkgray&right_color=crimson&left_text=Page%20Visits)
"""

# Both audio inputs are optional; `main` reports an error when neither is given.
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.inputs.Audio(
            source='microphone',
            type='filepath',
            label='Запішыце аўдыяфайл, каб распазнаць маўленьне',
            optional=True,
        ),
        gr.inputs.Audio(
            source='upload',
            type='filepath',
            label='Альбо загрузіце ўжо запісаны аўдыяфайл сюды',
            optional=True
        ),
    ],
    outputs=[
        gr.outputs.Textbox(type='str', label='Распазнаны тэкст'),
        gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')
    ],
    title='wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model',
    description=('Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
                 'Акустычная мадэль + моўная мадэль.'
    ),
    article=article
)

iface.launch(enable_queue=True)