import os

import gradio as gr
import librosa
import numpy as np
import torch
from huggingface_hub import snapshot_download
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def parse_transcription_with_lm(logits):
    # Beam-search decoding with the bundled language model; batch_decode expects
    # the raw logits as a numpy array and returns an output object whose .text
    # attribute is a list of transcriptions.
    result = processor_with_LM.batch_decode(logits.cpu().numpy())
    return result.text[0]


def parse_transcription(logits):
    # Greedy CTC decoding: pick the most likely token at each time step.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)


# Download the private model snapshot; the TOKEN environment variable must hold
# a Hugging Face access token with read access to the repository.
file_path = snapshot_download(repo_id="shizukanabasho/North3", token=os.environ["TOKEN"])
processor = Wav2Vec2Processor.from_pretrained(file_path)
processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(file_path)
model = Wav2Vec2ForCTC.from_pretrained(file_path).to(device)


def process(audio, applyLM):
    try:
        if isinstance(audio, str):
            # Microphone input arrives as a file path; load it at 16 kHz mono.
            arr, _ = librosa.load(audio, sr=16000, mono=True)
        elif (
            isinstance(audio, tuple)
            and len(audio) == 2
            and isinstance(audio[0], int)
            and isinstance(audio[1], np.ndarray)
        ):
            # Uploaded audio arrives as a (sample_rate, samples) tuple; convert
            # the samples to float so librosa can resample them.
            arr = audio[1].astype(np.float32)
            if arr.ndim > 1:
                arr = np.mean(arr, axis=1)  # Down-mix stereo to mono.
            arr = librosa.resample(arr, orig_sr=audio[0], target_sr=16000)
        else:
            return "Invalid audio source"

        input_values = processor(arr, sampling_rate=16000, return_tensors="pt", padding=True)
        input_values = input_values.to(device)
        with torch.no_grad():
            logits = model(**input_values).logits

        if applyLM:
            return parse_transcription_with_lm(logits)
        return parse_transcription(logits)
    except Exception as e:
        return str(e)
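
# Optional smoke test of process() outside the Gradio UI. This is a minimal
# sketch, not part of the original app: the SMOKE_TEST environment variable and
# the example file path are assumptions for illustration. It exercises both
# input shapes the interfaces below produce: a file path (type="filepath") and
# a (sample_rate, samples) tuple (type="numpy").
if os.environ.get("SMOKE_TEST"):
    example = os.path.join(os.path.dirname(__file__), "examples/ไปตวย.mp3")
    print(process(example, applyLM=True))  # Microphone route: plain file path.
    samples, sr = librosa.load(example, sr=16000, mono=True)
    pcm16 = (samples * 32767).astype(np.int16)  # Mimic integer PCM from an upload.
    print(process((sr, pcm16), applyLM=False))  # Upload route: (rate, array) tuple.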

# Shared example rows; each row supplies values for both inputs (audio, Apply LM).
examples = [
    [os.path.join(os.path.dirname(__file__), "examples/บ่ต้องไปส่ง.mp3"), True],
    [os.path.join(os.path.dirname(__file__), "examples/ไปตวย.mp3"), True],
]

microphone_interface = gr.Interface(
    fn=process,
    inputs=[
        gr.Audio(label="Microphone", sources=["microphone"], type="filepath"),
        gr.Checkbox(label="Apply LM", value=True),
    ],
    outputs=gr.Textbox(label="Transcription:", lines=3),
    live=False,
    analytics_enabled=False,
    examples=examples,
    cache_examples=False,
)

upload_interface = gr.Interface(
    fn=process,
    inputs=[
        gr.Audio(label="Upload Speech", sources=["upload"], type="numpy"),
        gr.Checkbox(label="Apply LM", value=True),
    ],
    outputs=gr.Textbox(label="Transcription:", lines=3),
    analytics_enabled=False,
    examples=examples,
    cache_examples=False,
)

with gr.Blocks() as datasets:
    gr.Markdown("""
# Dataset Details
This research project develops a Northern-Central Thai parallel speech dataset composed of 80,000 audio files. The files are divided into two corpora, each containing 40,000 files.

- **Corpus 1**: Speech data from dialogues of daily life among northern Thai people.
- **Corpus 2**: Speech data from dialogues in the tourism domain between customers and service providers who are northern Thai people.

## Accessing the Dataset
If you are interested in accessing this dataset, please visit this link and contact us via email at ds.sci.cmu@gmail.com or phimphaka.t@cmu.ac.th for the password.

This research has received funding support from the NSRF via the Program Management Unit for Human Resources & Institutional Development, Research and Innovation [grant number B04G640073].
""")
    with gr.Row():
        gr.Image(
            value=os.path.join(os.path.dirname(__file__), "examples/y2_train_5row.png"),
            type="pil",
            label="Dataset Example",
        )
# Header shown above the tabs, rendered through gr.HTML; wrapped in HTML tags so
# the title and paragraphs do not collapse into a single line.
Instructions = """
<h1>Northern Thai Dialect Speech Recognition System</h1>
<p>Instructions: record speech in the Microphone tab or upload an audio file in
the Upload tab. Tick "Apply LM" to decode with the language model; the
transcription appears in the text box.</p>
<p>This research has received funding support from the NSRF via the Program
Management Unit for Human Resources &amp; Institutional Development, Research
and Innovation [grant number B04G640073].</p>
""" if __name__ == "__main__": with gr.Blocks(title="Northern Thai ASR") as demo: gr.HTML(Instructions) tabbed_interface = gr.TabbedInterface( [microphone_interface, upload_interface, datasets], tab_names=["Microphone", "Upload", "Dataset"], ) demo.launch()