In [6]:
import glob

def count_files_by_extension(path, extension):
    """
    Count the entries directly inside ``path`` that carry the given extension.

    path : directory to scan; sub-directories are not searched.
    extension : file extension, with or without the leading dot
        ("wav" and ".wav" are treated the same).

    Returns the number of matching entries as an int.
    """
    # A leading dot used to yield the pattern "*..wav", which matches nothing,
    # so normalise the argument before building the glob pattern.
    suffix = extension.lstrip(".")
    files = glob.glob(f"{path}/*.{suffix}")
    return len(files)


# Folder whose .wav and .txt files are counted in the following cells.
root_path = "./vin_data/vlsp2020_train_set_02/"


In [7]:
# Number of *.wav entries directly under root_path.
num_wav_files = count_files_by_extension(root_path, "wav")

In [8]:
# Number of *.txt entries directly under root_path.
num_txt_files = count_files_by_extension(root_path, "txt")

In [9]:
# Report both counts, one per line ("Số lượng file" = "number of files").
print(
    f"Số lượng file WAV: {num_wav_files}",
    f"Số lượng file text: {num_txt_files}",
    sep="\n",
)

Số lượng file WAV: 56427
Số lượng file text: 56427


In [10]:
import os
import random
import wave


def get_random_wav_file_info(folder_path):
    """
    Pick one .wav file at random from ``folder_path`` and read its header.

    Returns a ``(sample_rate, channels)`` tuple taken from the chosen file, or
    ``(None, None)`` when the folder contains no .wav files.
    """
    candidates = glob.glob(f"{folder_path}/*.wav")
    if len(candidates) == 0:
        return None, None

    chosen_path = random.choice(candidates)
    with wave.open(chosen_path, 'rb') as reader:
        frame_rate = reader.getframerate()
        channel_count = reader.getnchannels()
    return frame_rate, channel_count

path_to_wav_folder = "./vin_data/vlsp2020_train_set_02/"

# Inspect the audio format of one randomly selected file from the folder.
sample_rate, channels = get_random_wav_file_info(path_to_wav_folder)

if sample_rate is None or channels is None:
    print("Nothing.")
else:
    print(f"Tần số mẫu (sample rate): {sample_rate} Hz")
    print(f"Số kênh (channels): {channels}")


Tần số mẫu (sample rate): 16000 Hz
Số kênh (channels): 1


In [13]:
import os
import csv
from tqdm import tqdm

def create_csv_from_wav_folder(folder_path, output_csv_file):
    """
    Build a CSV manifest pairing every .wav file in ``folder_path`` with its transcript.

    folder_path : directory scanned (non-recursively) for ``*.wav`` files.
    output_csv_file : path of the CSV to write; an existing file is overwritten.

    One row is written per .wav file with the columns ``path``, ``name`` and
    ``sentence``. The sentence is the content of the ``.txt`` file that shares
    the .wav file's stem, with surrounding whitespace removed; when that file
    does not exist the placeholder "Not found." is stored instead.

    Returns None. If the folder has no .wav files a message is printed and no
    CSV is created.
    """
    wav_files = glob.glob(f"{folder_path}/*.wav")

    if not wav_files:
        print("Không có file WAV nào trong thư mục.")
        return

    # Open the output CSV; written as UTF-8 explicitly because the splitting
    # step reads this file back as UTF-8.
    with open(output_csv_file, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['path', 'name', 'sentence'])

        for wav_file_path in tqdm(wav_files):
            text_file_path = os.path.splitext(wav_file_path)[0] + ".txt"
            if os.path.exists(text_file_path):
                # NOTE(review): transcripts are assumed to be UTF-8 — confirm for this corpus.
                with open(text_file_path, 'r', encoding='utf-8') as txt_file:
                    # Strip so the trailing newline does not end up inside the CSV field.
                    text_content = txt_file.read().strip()
            else:
                text_content = "Not found."

            # Exactly one value per header column. The previous version also wrote
            # `sample_rate` and `channels` taken from notebook globals, which gave
            # every row two more fields than the header declares.
            csv_writer.writerow([wav_file_path, os.path.basename(wav_file_path), text_content])


In [14]:
# Write the manifest for every .wav file in the training folder to vin.csv.
output_csv_file = "vin.csv"
path_to_wav_folder = "./vin_data/vlsp2020_train_set_02/"
create_csv_from_wav_folder(path_to_wav_folder, output_csv_file)

100%|██████████| 56427/56427 [00:37<00:00, 1492.44it/s]


In [34]:
import pandas as pd 
# Preview the first five rows of the test split.
# NOTE(review): vin_test.csv is written by the split_csv_file cell that appears further
# down (execution count 30, this cell is 34) — on a fresh top-to-bottom run the file may
# not exist yet; confirm the intended cell order.
data = pd.read_csv('vin_test.csv')
data.head(5)

Unnamed: 0,path,name,sentence
0,./vin_data/vlsp2020_train_set_02/spkyut-201907...,spkyut-20190730-utt000000716.wav,cây cam canh là loại cây ăn quả dễ trồng dễ ch...
1,./vin_data/vlsp2020_train_set_02/database_sa3_...,database_sa3_1_150h_15Jan2020_cleaned_utt_0000...,những đặc sản vùng miền nổi tiếng như miến don...
2,./vin_data/vlsp2020_train_set_02/speaker_544-0...,speaker_544-069450-1.wav,trước thông tin này trương nam thành chia sẻ c...
3,./vin_data/vlsp2020_train_set_02/database_sa1_...,database_sa1_Jan08_Mar19_cleaned_utt_000005361...,giống như những nữ hoàng á
4,./vin_data/vlsp2020_train_set_02/database_sa2_...,database_sa2_Jan4_Feb29_cleaned_utt_0000154206...,thay vì phun toàn bộ cánh đồng bằng hóa chất c...


In [30]:
import csv
import random

def _write_csv_rows(path, header, rows):
    """Write ``header`` followed by ``rows`` to ``path`` as UTF-8 in the default CSV dialect."""
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(rows)


def split_csv_file(input_file, output_file1, output_file2, ratio, seed=None):
    """
    Shuffle the data rows of a CSV file and split them into two CSV files.

    input_file : path of the source CSV; its first row is treated as the header.
    output_file1 : receives the header plus the first ``int(n_rows * ratio)``
        shuffled rows.
    output_file2 : receives the header plus the remaining rows.
    ratio : fraction of the data rows that goes to ``output_file1``.
    seed : optional seed for a private random generator so the split can be
        reproduced. With the default ``None`` the module-level ``random`` state
        is used, exactly as before.

    Both outputs are written with the default ``csv`` dialect. The previous
    ``quotechar='|'`` did not match the dialect used to read the input, so any
    field containing a comma or a double quote could not be read back correctly.

    Raises StopIteration if ``input_file`` has no header row.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)
        data = list(csvreader)

    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)

    # int() truncates, so any fractional row is assigned to the second file.
    rows_output_file1 = int(len(data) * ratio)

    _write_csv_rows(output_file1, header, data[:rows_output_file1])
    _write_csv_rows(output_file2, header, data[rows_output_file1:])

# Split the manifest: `ratio` of the shuffled rows go to output_file1 (train),
# the remainder to output_file2 (test).
input_file = 'vin.csv'
output_file1 = 'vin_train.csv'
output_file2 = 'vin_test.csv'
ratio = 0.8  

split_csv_file(input_file, output_file1, output_file2, ratio)


In [None]:
from datasets import load_dataset, DatasetDict

# NOTE(review): this DatasetDict is created empty and is neither filled nor read in the
# visible cells — confirm whether it is still needed.
vivos = DatasetDict()

In [46]:
import os
import numpy as np

import torch
import torchaudio

import pandas as pd
import whisper
import torchaudio.transforms as at
from pathlib import Path

def load_wave(wave_path, sample_rate:int=16000) -> torch.Tensor:
    """
    Load an audio file with torchaudio and return its waveform at ``sample_rate``.

    wave_path : path of the audio file to load.
    sample_rate : desired sampling rate; when the file's own rate differs, the
        waveform is passed through ``torchaudio.transforms.Resample``.

    Returns the waveform as produced by ``torchaudio.load(..., normalize=True)``,
    resampled if necessary.
    """
    signal, native_rate = torchaudio.load(wave_path, normalize=True)
    if native_rate == sample_rate:
        return signal
    resampler = at.Resample(native_rate, sample_rate)
    return resampler(signal)



def get_list_files_vin100h(phase, dataset_path='./vin_data/vlsp2020_train_set_02/', text_max_length=10000, audio_max_sample_length=1000000, sample_rate=16000):
    """
    Collect (id, path, transcript) triples from the train or test manifest.

    phase : 'train' reads ``vin_train.csv``; any other value reads ``vin_test.csv``.
    dataset_path : currently unused; paths are taken from the manifest's
        ``path`` column as-is.
    text_max_length, audio_max_sample_length : currently unused; they belong to
        the length filter that is commented out below.
    sample_rate : sampling rate passed to ``load_wave``.

    Returns ``(audio, audio_transcript_pair_list)``:
      * ``audio`` is ``load_wave(...)[0]`` for the last manifest row whose file
        exists, or ``None`` when no listed file exists (this case used to raise
        UnboundLocalError).
      * ``audio_transcript_pair_list`` holds one ``(row_index, path_str, sentence)``
        tuple per manifest row whose file exists; rows with a missing file are
        skipped silently.
    """
    csv_file = 'vin_train.csv' if phase == 'train' else 'vin_test.csv'
    df = pd.read_csv(csv_file)

    audio = None
    audio_transcript_pair_list = []
    for index, row in df.iterrows():
        new_path = Path(row['path'])
        if not new_path.exists():
            continue

        text = row['sentence']
        # Every existing file is still decoded, as before; the result is what the
        # disabled length filter below would inspect.
        audio = load_wave(new_path, sample_rate=sample_rate)[0]
        # if len(text) > text_max_length or len(audio) > audio_max_sample_length:
        #     print('skip file:', new_path, 'with len text:', len(text), 'and len audio', len(audio))
        #     continue
        audio_transcript_pair_list.append((index, str(new_path), text))
        # The debug print of the whole list on every iteration was removed: it
        # produced output that grew quadratically with the number of rows.
    return audio, audio_transcript_pair_list


In [None]:
# Bind the result instead of leaving the call as the cell's last expression: the bare
# call discarded the return value and rendered the complete tuple (the audio plus one
# entry per manifest row) as cell output. Show only a short summary.
last_audio, train_pairs = get_list_files_vin100h(phase='train')
len(train_pairs), train_pairs[:3]