wav2vec2-kazakh / utils.py
Bauyrjan's picture
Upload utils.py
ae63903
raw
history blame
1.29 kB
import datasets
import torchaudio
import re
def get_test_dataset(data_path='ISSAI_KSC_335RS_v1.1'):
    """Build the test split with transcripts, normalised text and audio.

    The split is described by ``{data_path}/Meta/test.csv`` (space-delimited).
    For every row, identified by its ``uttID`` column, three columns are added
    in successive ``map`` passes:

    * ``sentence`` - raw contents of ``{data_path}/Transcriptions/{uttID}.txt``
    * ``text`` - ``sentence`` after character filtering, lower-casing,
      appending a trailing space and character substitution
    * ``speech`` / ``sampling_rate`` - the two values returned by
      ``torchaudio.load`` for ``{data_path}/Audios_flac/{uttID}.flac``

    Args:
        data_path: Root directory of the corpus.

    Returns:
        The dataset produced by ``datasets.load_dataset`` with the extra
        columns described above.
    """
    chars_to_ignore = ["f", "m"]
    chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
    # Compile once; the pattern is applied to every row of the dataset.
    ignore_pattern = re.compile(chars_to_ignore_regex)

    def read_sentence(idx):
        """Return the transcript for utterance ``idx`` as a single string."""
        # The transcripts contain Cyrillic text, so decode explicitly as UTF-8
        # instead of relying on the platform's locale-dependent default.
        transcript_path = f"{data_path}/Transcriptions/{idx}.txt"
        with open(transcript_path, 'r', encoding='utf-8') as f:
            # Lines keep their own line endings; they are joined with a space.
            text = ' '.join(f.readlines())
        return text

    def read_text(batch):
        """Attach the raw transcript to the row under ``sentence``."""
        batch["sentence"] = read_sentence(batch['uttID'])
        return batch

    def process_text(batch):
        """Derive the normalised ``text`` column from ``sentence``."""
        # Filtering runs before lower-casing, so only the exact characters
        # listed in ``chars_to_ignore`` are removed.
        cleaned = ignore_pattern.sub("", batch["sentence"])
        batch["text"] = cleaned.lower() + " "
        # NOTE(review): these appear to map Latin look-alike letters onto
        # their Cyrillic counterparts - confirm against the model vocabulary.
        batch['text'] = (
            batch['text']
            .replace('a', 'а')
            .replace('ə', 'ә')
            .replace('ɵ', 'ө')
        )
        return batch

    def load_audio(batch):
        """Load the row's FLAC file into ``speech`` and ``sampling_rate``."""
        path = f"{data_path}/Audios_flac/{batch['uttID']}.flac"
        speech_array, sr = torchaudio.load(path)
        batch["speech"] = speech_array
        batch["sampling_rate"] = sr
        return batch

    # The 'csv' loader exposes a single data file under the 'train' split
    # name, which is why 'train' is requested for the test metadata.
    test_dataset = datasets.load_dataset(
        'csv',
        data_files=f"{data_path}/Meta/test.csv",
        delimiter=' ',
        split='train'
    )
    test_dataset = test_dataset.map(read_text)
    test_dataset = test_dataset.map(process_text)
    test_dataset = test_dataset.map(load_audio, num_proc=1)
    return test_dataset