import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
import gradio as gr
import numpy as np
import librosa
import os
from huggingface_hub import snapshot_download
# Check if a GPU is available, otherwise fall back to CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
def parse_transcription_with_lm(logits):
    # Beam-search decoding with the language model; batch_decode expects numpy logits
    result = processor_with_LM.batch_decode(logits.cpu().numpy())
    # result.text holds one decoded string per batch item
    transcription = result.text[0]
    return transcription
def parse_transcription(logits):
    # Greedy CTC decoding: take the most likely token at each frame
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
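# A note on the two decoders above: logits has shape (batch, time, vocab_size).
# parse_transcription greedily collapses repeated tokens and CTC blanks, while
# parse_transcription_with_lm runs a pyctcdecode beam search rescored by the
# n-gram language model bundled with the processor, which typically improves
# accuracy on ambiguous or noisy speech.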
# Download the private model snapshot; the access token is read from the TOKEN env var
file_path = snapshot_download(repo_id="shizukanabasho/North3", token=os.environ["TOKEN"])
processor = Wav2Vec2Processor.from_pretrained(file_path)
processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(file_path)
model = Wav2Vec2ForCTC.from_pretrained(file_path).to(device)
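# The snapshot is expected to contain both a standard processor (tokenizer +
# feature extractor) and a pyctcdecode decoder with an n-gram LM for
# Wav2Vec2ProcessorWithLM; all share the model's CTC vocabulary. Models loaded
# via from_pretrained are already in eval mode, so no explicit model.eval() is needed.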
def process(audio, applyLM):
    try:
        if isinstance(audio, str):
            # Process microphone audio (delivered as a file path)
            arr, _ = librosa.load(audio, sr=16000, mono=True)
        elif isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int) and isinstance(audio[1], np.ndarray):
            # Process uploaded audio (delivered as a (sample_rate, samples) tuple)
            arr = audio[1].astype(np.float64)  # Ensure the audio data is floating point
            print("Uploaded audio data - Shape:", arr.shape, "Data Type:", arr.dtype)
            # Convert to mono if needed, then resample to 16000 Hz
            if arr.ndim > 1:
                arr = np.mean(arr, axis=1)  # Downmix stereo to mono
            arr = librosa.resample(arr, orig_sr=audio[0], target_sr=16000)
            print("Processed audio data - Shape:", arr.shape, "Data Type:", arr.dtype)
        else:
            return "Invalid audio source"
        input_values = processor(arr, sampling_rate=16000, return_tensors="pt", padding=True)
        input_values = input_values.to(device)
        with torch.no_grad():
            logits = model(**input_values).logits
        if applyLM:
            transcription = parse_transcription_with_lm(logits)
        else:
            transcription = parse_transcription(logits)
        return transcription
    except Exception as e:
        return str(e)
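# Example calls, mirroring what the two Gradio interfaces pass in (the numpy
# input here is a hypothetical one-second silent clip for illustration):
#   print(process("examples/บ่ต้องไปส่ง.mp3", applyLM=True))                    # filepath branch
#   print(process((44100, np.zeros(44100, dtype=np.int16)), applyLM=False))    # numpy branch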
microphone_interface = gr.Interface(
    fn=process,
    inputs=[
        gr.Audio(label="Microphone", sources=["microphone"], type="filepath"),
        gr.Checkbox(label="Apply LM", value=True),
    ],
    outputs=gr.Textbox(label="Transcription", lines=3),
    live=False,
    analytics_enabled=False,
    examples=[
        [os.path.join(os.path.dirname(__file__), "examples/บ่ต้องไปส่ง.mp3"), True],
        [os.path.join(os.path.dirname(__file__), "examples/ไปตวย.mp3"), True],
    ],
    cache_examples=False,
)
upload_interface = gr.Interface(
    fn=process,
    inputs=[
        gr.Audio(label="Upload Speech", sources=["upload"], type="numpy"),
        gr.Checkbox(label="Apply LM", value=True),
    ],
    outputs=gr.Textbox(label="Transcription", lines=3),
    analytics_enabled=False,
    examples=[
        [os.path.join(os.path.dirname(__file__), "examples/บ่ต้องไปส่ง.mp3"), True],
        [os.path.join(os.path.dirname(__file__), "examples/ไปตวย.mp3"), True],
    ],
    cache_examples=False,
)
with gr.Blocks() as datasets:
    gr.Markdown("""
# Dataset Details
This research project develops a Northern-Central Thai parallel speech dataset composed of 80,000 audio files, divided into two corpora of 40,000 files each.
- **Corpus 1**: Speech data from dialogues about daily life among northern Thai people.
- **Corpus 2**: Speech data from dialogues in the tourism domain between customers and service providers who are northern Thai people.
## Accessing the Dataset
If you are interested in accessing this dataset, please visit this link and contact us by email at ds.sci.cmu@gmail.com or phimphaka.t@cmu.ac.th for the password.
This research has received funding support from the NSRF via the Program Management Unit for Human Resources & Institutional Development, Research and Innovation [grant number B04G640073].
""")
    with gr.Row():
        gr.Image(value=os.path.join(os.path.dirname(__file__), "examples/y2_train_5row.png"), type="pil", label="Dataset Example")
Instructions = """
Instructions:
This research has received funding support from the NSRF via the Program Management Unit for Human Resources & Institutional Development, Research and Innovation [grant number B04G640073].
""" if __name__ == "__main__": with gr.Blocks(title="Northern Thai ASR") as demo: gr.HTML(Instructions) tabbed_interface = gr.TabbedInterface( [microphone_interface, upload_interface, datasets], tab_names=["Microphone", "Upload", "Dataset"], ) demo.launch()