File size: 1,110 Bytes
f5b4ff2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json
import tqdm
import pandas as pd
import os
import random

# ---------------------------------------------------------------------------
# Sample up to MAX_SAMPLES utterances from the GigaSpeech training manifest
# and write them as a JSONL file with the record layout expected downstream
# (path / duration / sample_rate / amplitude / weight / info_path).
# ---------------------------------------------------------------------------

# Maximum number of records to keep in the output manifest.
MAX_SAMPLES = 800_000

# Load the JSON Lines training manifest (one JSON object per line).
jsondata = pd.read_json(
    path_or_buf=os.path.join('./Gigaspeech', 'trans', 'train.json'),
    lines=True,
)

data_length = len(jsondata)

# Keep every row when the manifest is small enough; otherwise draw
# MAX_SAMPLES unique row indices without replacement. Using <= avoids a
# pointless full shuffle when data_length is exactly MAX_SAMPLES.
if data_length <= MAX_SAMPLES:
    sample_indices = range(data_length)
else:
    sample_indices = random.sample(range(data_length), MAX_SAMPLES)

# Hoist the column Series out of the loop so each iteration does a cheap
# Series lookup instead of a DataFrame column lookup plus a Series lookup.
wav_col = jsondata['wav']
duration_col = jsondata['duration']

# Create the output directory if it does not exist.
os.makedirs('./egs/train', exist_ok=True)

# Define the output file path.
output_file = './egs/train/data.jsonl'

# Stream records straight to disk instead of materialising an 800k-element
# list first — keeps peak memory proportional to one record, not the corpus.
with open(output_file, 'w', encoding='utf-8') as file:
    for i in tqdm.tqdm(sample_indices):
        record = {
            "path": wav_col[i],
            "duration": duration_col[i],
            # NOTE(review): sample rate is hard-coded; GigaSpeech audio is
            # presumed to be 16 kHz — confirm against the source corpus.
            "sample_rate": 16000,
            "amplitude": None,
            "weight": None,
            "info_path": None,
        }
        file.write(json.dumps(record) + '\n')

print(f"Data written to {output_file}")