File size: 1,656 Bytes
e8425dc
2746bef
e8425dc
2746bef
f2336e3
2636a15
 
f2336e3
2636a15
 
 
 
c60c34e
 
 
 
 
 
2746bef
 
 
 
 
 
e8425dc
2746bef
 
e8425dc
 
2746bef
 
 
 
e8425dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2636a15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import json
import re
from typing import Any

from .operator import BaseFieldOperator


class ToString(BaseFieldOperator):
    """Field operator that converts a value to its string form."""

    def process(self, instance):
        """Return ``str(instance)``."""
        as_text = str(instance)
        return as_text


class ToListByComma(BaseFieldOperator):
    """Field operator that splits a comma-separated string into a list."""

    def process(self, instance):
        """Split ``instance`` on commas and strip whitespace from each piece.

        An empty string yields ``[""]``, following ``str.split`` semantics.
        """
        return [piece.strip() for piece in instance.split(",")]


class RegexParser(BaseFieldOperator):
    """
    A processor that extracts every match of ``regex`` from a string.

    If ``termination_regex`` is set and matches the *entire* input, the
    input is treated as "nothing to extract" and an empty list is returned
    without applying ``regex`` at all.
    """

    regex: str
    termination_regex: str = None

    def process(self, text):
        """Return ``re.findall(self.regex, text)``, or ``[]`` on termination."""
        stop_pattern = self.termination_regex
        if stop_pattern is not None:
            # Only a full match of the whole text short-circuits the parse.
            if re.fullmatch(stop_pattern, text):
                return []
        return re.findall(self.regex, text)


class LoadJson(BaseFieldOperator):
    """Field operator that parses a JSON document from text."""

    def process(self, text):
        """Return the decoded JSON value, or ``[]`` if ``text`` is not valid JSON."""
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            # Malformed JSON is tolerated and mapped to an empty result.
            return []
        return parsed


class ListToEmptyEntitiesTuples(BaseFieldOperator):
    """Field operator that pairs every item of an iterable with an empty string."""

    def process(self, lst):
        """Return ``[(str(item), "") for item in lst]``.

        Falls back to ``[]`` when ``lst`` cannot be iterated (for example
        ``None`` or a number).
        """
        try:
            return [(str(item), "") for item in lst]
        except TypeError:
            # The previous handler caught json.JSONDecodeError, which nothing
            # in this body raises; the realistic failure is a non-iterable
            # input, which surfaces as TypeError.
            return []


class DictOfListsToPairs(BaseFieldOperator):
    """Field operator that flattens a mapping of lists of strings into pairs.

    Each ``key -> [v1, v2, ...]`` entry contributes one pair per value, in
    iteration order. Any input that does not have this shape produces ``[]``.
    """

    # When True each pair is (key, value); when False it is (value, key).
    position_key_before_value: bool = True

    def process(self, obj):
        """Return the list of pairs built from ``obj``, or ``[]`` on bad input.

        ``[]`` is returned when any value is not a ``str`` or when ``obj``
        cannot be traversed as a mapping of iterables.
        """
        try:
            pairs = []
            for key, values in obj.items():
                for value in values:
                    # Explicit check instead of ``assert``: assertions are
                    # stripped under ``python -O``, which would let
                    # non-string values through.
                    if not isinstance(value, str):
                        return []
                    pair = (key, value) if self.position_key_before_value else (value, key)
                    pairs.append(pair)
            return pairs
        except Exception:
            # Deliberate best-effort: malformed input maps to an empty result.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return []


# add_to_catalog(ToString('prediction'), 'processors', 'to_string')