File size: 931 Bytes
8e2b754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import json
import os

import torch
from torchvision.io import ImageReadMode, read_image

# SUPPORTED_EXTENSIONS = {'PNG', 'JPG', 'png', 'JPEG', 'jpg', 'jpeg'}

# Filter each dataset split down to the examples whose image can actually be
# decoded by torchvision, and write the survivors to
# "<split>_dataset_filtered.json" next to the input file.
#
# Input and output are JSON Lines files: one JSON object per line, each with
# an "image_path" key pointing at the image to check.
dataset_dir = f"/home/{os.environ['USER']}/data/wit/prepared_dataset"

for split in ["train", "valid", "test"]:
    # Read explicitly as UTF-8 so non-ASCII text is decoded the same way
    # regardless of the machine's locale. Blank lines are skipped because
    # json.loads would otherwise raise on them and abort the whole split.
    with open(f"{dataset_dir}/{split}_dataset.json", encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    supported_examples = []
    for example in examples:
        image_path = example["image_path"]
        try:
            # Only decodability matters here; the decoded tensor is discarded.
            read_image(image_path, mode=ImageReadMode.RGB)
        except Exception as e:
            # Deliberately broad: any decode/read failure means the example
            # is unusable, so report it and move on to the next one.
            print(f"Excluding file: {image_path} due to error: {e}")
        else:
            supported_examples.append(json.dumps(example, ensure_ascii=False))

    print(f"Total {split} examples: {len(supported_examples)}")
    # ensure_ascii=False emits raw non-ASCII characters, so the output file
    # must be written as UTF-8 rather than the locale's default encoding.
    with open(f"{dataset_dir}/{split}_dataset_filtered.json", "w", encoding="utf-8") as f:
        f.write("\n".join(supported_examples))