This model was fine-tuned from the 300M-parameter Wav2Vec 2.0 XLS-R model on the Waikhana train partition of the Americas NLP 2022 dataset. The challenge took place during NeurIPS 2022.

Example of usage

The model can be used directly (without a language model) as follows:

import soundfile as sf
import torch
import torch.nn.functional as F
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Load the fine-tuned model and its processor
processor = Wav2Vec2Processor.from_pretrained("ivangtorre/wav2vec2-xlsr-300m-waikhana")
model = Wav2Vec2ForCTC.from_pretrained("ivangtorre/wav2vec2-xlsr-300m-waikhana")

# Path to wav file
pathfile = "/path/to/wavfile"

# Load the file as float32 samples
wav, curr_sample_rate = sf.read(pathfile, dtype="float32")

# The model only gives meaningful output at the sampling rate the processor
# was configured for, so fail loudly instead of transcribing garbage.
expected_sample_rate = processor.feature_extractor.sampling_rate
if curr_sample_rate != expected_sample_rate:
    raise ValueError(
        f"Expected audio sampled at {expected_sample_rate} Hz, "
        f"got {curr_sample_rate} Hz; resample the file first."
    )

feats = torch.from_numpy(wav).float()

# Inference only: normalize and run the forward pass without autograd tracking
with torch.no_grad():
    feats = F.layer_norm(feats, feats.shape)
    feats = torch.unsqueeze(feats, 0)  # add the batch dimension
    logits = model(feats).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print("HF prediction: ", transcription)

This code snippet shows how to evaluate wav2vec2-xlsr-300m-waikhana on the Second Americas NLP 2022 Waikhana dev set:

from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
from jiwer import cer
import torch.nn.functional as F
from datasets import load_dataset
import soundfile as sf

# Dev split of the "waikhana" configuration, restricted to rows whose
# "subset" field is "waikhana"
americasnlp = load_dataset(
    "ivangtorre/second_americas_nlp_2022",
    "waikhana",
    split="dev",
)
waikhana = americasnlp.filter(lambda example: example["subset"] == "waikhana")

# Processor (decoding) and fine-tuned CTC model
processor = Wav2Vec2Processor.from_pretrained("ivangtorre/wav2vec2-xlsr-300m-waikhana")
model = Wav2Vec2ForCTC.from_pretrained("ivangtorre/wav2vec2-xlsr-300m-waikhana")

def map_to_pred(batch):
    """Transcribe every audio clip in ``batch`` and store the text under "transcription".

    Meant to be passed to ``Dataset.map(..., batched=True)``. Each clip is
    normalized and decoded on its own (no padding), so the function works for
    any ``batch_size`` and gives the same result as before for ``batch_size=1``.

    Args:
        batch: Mapping with an "audio" entry holding one dict per clip, each
            with the waveform under "array" (a NumPy array, as required by
            ``torch.from_numpy``).

    Returns:
        The same ``batch`` with a "transcription" list, one string per clip.
    """
    transcriptions = []
    # Evaluation only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        for audio in batch["audio"]:
            feats = torch.from_numpy(audio["array"]).float()
            feats = F.layer_norm(feats, feats.shape)  # Normalization performed during finetuning
            feats = torch.unsqueeze(feats, 0)  # add the batch dimension
            logits = model(feats).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcriptions.extend(processor.batch_decode(predicted_ids))
    batch["transcription"] = transcriptions
    return batch

# Transcribe the dev set one clip per call
result = waikhana.map(map_to_pred, batch_size=1, batched=True)

# Character error rate: "source_processed" is passed as the first (reference)
# argument, the model output as the second (hypothesis) argument
references = result["source_processed"]
hypotheses = result["transcription"]
print("CER:", cer(references, hypotheses))

Citation

@article{romero2024automatic,
  title={Automatic Speech Recognition Advancements for Indigenous Languages of the Americas},
  author={Romero, Monica and G{\'o}mez-Canaval, Sandra and Torre, Ivan G},
  journal={Applied Sciences},
  volume={14},
  number={15},
  pages={6497},
  year={2024},
  publisher={MDPI}
}
Downloads last month
12
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Dataset used to train ivangtorre/wav2vec2-xlsr-300m-waikhana

Evaluation results