File size: 5,489 Bytes
0b4516f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import tempfile
import unittest

from mmocr.utils import (check_integrity, get_md5, is_archive, list_files,
                         list_from_file, list_to_file)

# Plain-text fixtures: every inner list is the content of one file, one item
# per line. Items are deliberately not all strings so that the tests also
# cover values that have to be converted with ``str()``.
lists = [
    # empty file and whitespace-only items
    [],
    [' '],
    ['\t'],
    # a single item of each basic type
    ['a'],
    [1],
    [1.0],
    # several items, including mixed types
    ['a', 'b'],
    ['a', 1, 1.0],
    [1, 1.0, 'a'],
    # non-ASCII text
    ['啊', '啊啊'],
    ['選択', 'noël', 'Информацией', 'ÄÆä'],
]

# JSON-lines fixtures: every entry is the content of one file and holds a
# single record of the form ``{'text': [...]}``. The payloads cover empty,
# whitespace-only, single-item, mixed-type and non-ASCII cases.
dicts = [[{
    'text': texts
}] for texts in (
    [],
    [' '],
    ['\t'],
    ['a'],
    [1],
    [1.0],
    ['a', 'b'],
    ['a', 1, 1.0],
    [1, 1.0, 'a'],
    ['啊', '啊啊'],
    ['選択', 'noël', 'Информацией', 'ÄÆä'],
)]


def test_list_to_file():
    """Check that ``list_to_file`` writes one item per line.

    Every case in ``lists`` is written to a ``.txt`` file and every case in
    ``dicts`` is JSON-encoded and written to a ``.jsonl`` file. The files are
    then read back as UTF-8 and compared with what was passed in.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        # test txt
        for i, lines in enumerate(lists):
            filename = f'{tmpdirname}/{i}.txt'
            list_to_file(filename, lines)
            # Read back through a context manager so the handle is closed
            # before the temporary directory is cleaned up.
            with open(filename, encoding='utf-8') as f:
                written = [line.rstrip('\r\n') for line in f]
            # Non-string items are expected to show up in their str() form.
            # Comparing whole lists makes a failure show the actual diff.
            assert written == [str(line) for line in lines]
        # test jsonl
        for i, records in enumerate(dicts):
            filename = f'{tmpdirname}/{i}.jsonl'
            list_to_file(filename,
                         [json.dumps(record) for record in records])
            with open(filename, encoding='utf-8') as f:
                written = [
                    json.loads(line.rstrip('\r\n'))['text'] for line in f
                ]
            # Compare every written record, not only the first one.
            assert written == [record['text'] for record in records]


def test_list_from_file():
    """Check that ``list_from_file`` returns the lines of a UTF-8 file.

    The fixtures are written by the test itself, one item per line, and the
    result is expected to be those lines as strings without the trailing
    line terminator.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        # test txt file
        for i, lines in enumerate(lists):
            filename = f'{tmpdirname}/{i}.txt'
            expected = [str(line) for line in lines]
            with open(filename, 'w', encoding='utf-8') as f:
                f.writelines(f'{line}\n' for line in expected)
            loaded = list(list_from_file(filename, encoding='utf-8'))
            # Comparing whole lists makes a failure show the actual diff.
            assert loaded == expected
        # test jsonl file
        for i, records in enumerate(dicts):
            filename = f'{tmpdirname}/{i}.jsonl'
            # Serialize with json so the file really is JSON lines (writing
            # the dict directly would store its Python repr instead).
            # ensure_ascii=False keeps the non-ASCII cases as real UTF-8
            # text in the file.
            expected = [
                json.dumps(record, ensure_ascii=False) for record in records
            ]
            with open(filename, 'w', encoding='utf-8') as f:
                f.writelines(f'{line}\n' for line in expected)
            loaded = list(list_from_file(filename, encoding='utf-8'))
            assert loaded == expected


class TestIsArchive(unittest.TestCase):
    """Checks which path strings ``is_archive`` reports as archives."""

    def setUp(self) -> None:
        # Paths that are expected to be recognized as archives.
        self.zip, self.tar, self.targz = (
            'data/annotations_123.zip',
            'data/img.abc.tar',
            'data/img12345_.tar.gz',
        )
        # Paths that are expected to be rejected.
        self.rar, self.dir = '/m/abc/t.rar', '/a/b/c/'

    def test_is_archive(self):
        # zip, tar and tar.gz
        for archive_path in (self.zip, self.tar, self.targz):
            self.assertTrue(is_archive(archive_path))
        # rar and a plain directory
        for other_path in (self.rar, self.dir):
            self.assertFalse(is_archive(other_path))


class TestCheckIntegrity(unittest.TestCase):
    """Exercises ``check_integrity`` with matching, wrong and absent md5."""

    def setUp(self) -> None:
        # Do not use text files for tests, because the md5 value of text files
        # is different on different platforms (CR - CRLF)
        img_dir = 'tests/data/det_toy_dataset/imgs/test'
        # (path, md5) expected to match
        self.file1 = (f'{img_dir}/img_2.jpg',
                      '52b28b5dfc92d9027e70ec3ff95d8702')
        # (path, md5) expected not to match
        self.file2 = (f'{img_dir}/img_1.jpg', 'abc123')
        # (path, md5) for a path outside the fixture directory
        self.file3 = ('abc/abc.jpg', 'abc123')

    def test_check_integrity(self):
        good_path, good_md5 = self.file1
        bad_path, bad_md5 = self.file2
        other_path, other_md5 = self.file3

        # matching checksum
        self.assertTrue(check_integrity(good_path, good_md5))
        # wrong checksum for the same kind of fixture file
        self.assertFalse(check_integrity(bad_path, bad_md5))
        # with md5=None the check is expected to pass for that file
        self.assertTrue(check_integrity(bad_path, None))
        # path that is not part of the test data
        self.assertFalse(check_integrity(other_path, other_md5))


# NOTE(review): the class name looks like a typo for ``TestGetMD5``; it is
# kept as-is here so the public name does not change.
class TextGetMD5(unittest.TestCase):
    """Compares ``get_md5`` results against a known and a bogus digest."""

    def setUp(self) -> None:
        # Do not use text files for tests, because the md5 value of text files
        # is different on different platforms (CR - CRLF)
        img_dir = 'tests/data/det_toy_dataset/imgs/test'
        # (path, md5) where the digest is expected to match
        self.file1 = (f'{img_dir}/img_2.jpg',
                      '52b28b5dfc92d9027e70ec3ff95d8702')
        # (path, md5) where the digest is expected to differ
        self.file2 = (f'{img_dir}/img_1.jpg', 'abc123')

    def test_get_md5(self):
        known_path, known_md5 = self.file1
        other_path, bogus_md5 = self.file2

        self.assertEqual(get_md5(known_path), known_md5)
        self.assertNotEqual(get_md5(other_path), bogus_md5)


class TestListFiles(unittest.TestCase):
    """Checks that ``list_files`` reports the jpg images of a directory."""

    def setUp(self) -> None:
        # Relative path: it is resolved against the current working
        # directory when the test runs.
        self.path = 'tests/data/det_toy_dataset/imgs/test'

    def test_list_files(self):
        suffix = 'jpg'
        files = list_files(self.path, suffix)
        expected = [
            os.path.join(self.path, name) for name in os.listdir(self.path)
            if name.endswith(suffix)
        ]
        # Without this the loop below would pass vacuously if the fixture
        # directory had no jpg files; other tests in this file reference
        # img_1.jpg and img_2.jpg in this directory.
        self.assertTrue(expected)
        for path in expected:
            self.assertIn(path, files)