Wav2Vec2-Large-XLSR-53-khmer
Fine-tuned facebook/wav2vec2-large-xlsr-53 on Khmer using the Common Voice, and OpenSLR Kh.
When using this model, make sure that your speech input is sampled at 16kHz.
Usage
The model can be used directly (without a language model) as follows:
import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
!wget https://www.openslr.org/resources/42/km_kh_male.zip
!unzip km_kh_male.zip
!ls km_kh_male
colnames=['path','sentence']
df = pd.read_csv('/content/km_kh_male/line_index.tsv',sep='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\t',header=None,names = colnames)
df['path'] = '/content/km_kh_male/wavs/'+df['path'] +'.wav'
train, test = train_test_split(df, test_size=0.1)
test.to_csv('/content/km_kh_male/line_index_test.csv')
test_dataset = load_dataset('csv', data_files='/content/km_kh_male/line_index_test.csv',split = 'train')
processor = Wav2Vec2Processor.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
resampler = torchaudio.transforms.Resample(48_000, 16_000)
# Preprocessing the datasets.
# We need to read the aduio files as arrays
def speech_file_to_array_fn(batch):
\\\\\\\\\\\\\\\\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
\\\\\\\\\\\\\\\\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
\\\\\\\\\\\\\\\\treturn batch
test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
\\\\\\\\\\\\\\\\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
predicted_ids = torch.argmax(logits, dim=-1)
print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset["sentence"][:2])
Result
Prediction: ['पारानाको ब्राजिली राज्यमा रहेको राजधानी', 'देवराज जोशी त्रिभुवन विश्वविद्यालयबाट शिक्षाशास्त्रमा स्नातक हुनुहुन्छ']
Reference: ['पारानाको ब्राजिली राज्यमा रहेको राजधानी', 'देवराज जोशी त्रिभुवन विश्वविद्यालयबाट शिक्षाशास्त्रमा स्नातक हुनुहुन्छ']
Evaluation
The model can be evaluated as follows on the {language} test data of Common Voice. # TODO: replace #TODO: replace language with your {language}, e.g. French
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import load_dataset
!wget https://www.openslr.org/resources/42/km_kh_male.zip
!unzip km_kh_male.zip
!ls km_kh_male
colnames=['path','sentence']
df = pd.read_csv('/content/km_kh_male/line_index.tsv',sep='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\t',header=None,names = colnames)
df['path'] = '/content/km_kh_male/wavs/'+df['path'] +'.wav'
train, test = train_test_split(df, test_size=0.1)
test.to_csv('/content/km_kh_male/line_index_test.csv')
test_dataset = load_dataset('csv', data_files='/content/km_kh_male/line_index_test.csv',split = 'train')
wer = load_metric("wer")
cer = load_metric("cer")
processor = Wav2Vec2Processor.from_pretrained("gagan3012/wav2vec2-xlsr-khmer")
model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-khmer")
model.to("cuda")
chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)
# Preprocessing the datasets.
# We need to read the aduio files as arrays
def speech_file_to_array_fn(batch):
\\tbatch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
\\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
\\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
\\treturn batch
test_dataset = test_dataset.map(speech_file_to_array_fn)
# Preprocessing the datasets.
# We need to read the aduio files as arrays
def evaluate(batch):
\\tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
\\twith torch.no_grad():
\\t\\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
\\tpred_ids = torch.argmax(logits, dim=-1)
\\tbatch["pred_strings"] = processor.batch_decode(pred_ids)
\\treturn batch
cer = load_metric("cer")
result = test_dataset.map(evaluate, batched=True, batch_size=8)
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["text"])))
print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["text"])))
Test Result: 24.96 %
WER: 24.962519 CER: 6.950925
Training
The script used for training can be found here
- Downloads last month
- 125
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.