File size: 1,401 Bytes
6a1bcc1
 
 
 
 
 
 
 
 
 
 
 
 
6433081
6a1bcc1
6433081
6a1bcc1
6433081
 
 
6a1bcc1
 
 
6433081
6a1bcc1
 
 
 
6433081
6a1bcc1
6433081
 
 
 
 
 
 
 
 
6a1bcc1
 
6433081
6a1bcc1
6433081
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from .text_utils import split_words
from .stream import StreamInstanceOperator, InstanceOperatorWithGlobalAccess, Artifact

from datasets import Value, Features, Dataset, Sequence

from dataclasses import field
from typing import Dict, Any
from abc import ABC, abstractmethod

class Validator(ABC):
    """Common base class for validators; it defines no members of its own."""

class ValidateSchema(Validator, StreamInstanceOperator):
    """Stream instance operator that checks instances against a schema.

    ``process`` hands every instance back unchanged; the actual checks live
    in ``verify`` (is the operator configured with a schema?) and
    ``verify_first_instance`` (does an instance contain the expected fields?).
    """

    # Expected fields of an instance. Defaults to None and must be replaced
    # by a ``Features`` object for ``verify`` to pass.
    schema: Features = None

    def verify(self):
        """Assert that ``schema`` is set and is a ``Features`` instance.

        Raises:
            AssertionError: if ``schema`` is ``None`` or is not a ``Features``.
        """
        # Test for None first: ``isinstance(None, Features)`` is False, so with
        # the checks in the opposite order the "must be specified" message
        # could never be reported.
        assert self.schema is not None, 'Schema must be specified'
        assert isinstance(self.schema, Features), 'Schema must be an instance of Features'

    def verify_first_instance(self, instance):
        """Assert that ``instance`` contains every expected field.

        The expected field names are taken from a ``standart_fields``
        attribute when one exists (e.g. set by a subclass); otherwise they are
        the field names of ``schema``. With neither available nothing is
        checked.

        Args:
            instance: Container supporting ``in`` lookups by field name.

        Raises:
            AssertionError: if an expected field is missing from ``instance``.
        """
        # ``standart_fields`` is not defined in this class, so it is looked up
        # defensively and the schema's own field names are used as a fallback.
        required_fields = getattr(self, 'standart_fields', None)
        if required_fields is None:
            required_fields = self.schema if self.schema is not None else ()
        for field_name in required_fields:
            assert field_name in instance, f'Field "{field_name}" is missing in the first instance'

    def process(self, instance: Dict[str, Any], stream_name: str = None) -> Dict[str, Any]:
        """Return ``instance`` unchanged.

        Args:
            instance: The stream instance being processed.
            stream_name: Name of the stream the instance belongs to; unused.

        Returns:
            The same ``instance`` object that was passed in.
        """
        return instance

class StandardSchema(Features):
    """``Features`` preset describing the standard instance fields.

    ``source``, ``target`` and ``parser`` are string values; ``references``
    and ``metrics`` are sequences of string values.
    """

    def __init__(self):
        """Initialise the underlying ``Features`` with the standard fields."""
        # Built step by step in the field order source, target, references,
        # metrics, parser; each field gets its own feature object.
        columns = {name: Value('string') for name in ('source', 'target')}
        columns.update(
            {name: Sequence(Value('string')) for name in ('references', 'metrics')}
        )
        columns['parser'] = Value('string')
        # String fields 'group' and 'guidance' are currently left out.
        super().__init__(columns)

class ValidateStandartSchema(ValidateSchema):
    """``ValidateSchema`` variant whose schema defaults to ``StandardSchema``.

    Inheriting from ``ValidateSchema`` supplies the ``verify``,
    ``verify_first_instance`` and ``process`` behaviour; without a base class
    this was a plain class whose ``schema`` attribute held the raw
    ``dataclasses.Field`` placeholder instead of a schema.
    """

    # NOTE(review): assumes the operator base classes turn subclasses into
    # dataclasses so that ``default_factory`` is honoured and ``schema``
    # becomes a ``StandardSchema`` instance - confirm against the base class.
    schema: Features = field(default_factory=StandardSchema)