Elron committed on
Commit
a471d0a
·
verified ·
1 Parent(s): f0b2749

Upload processors.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. processors.py +53 -52
processors.py CHANGED
@@ -1,31 +1,32 @@
1
  import json
2
  import re
 
3
 
4
- from .operator import BaseFieldOperator
5
 
6
 
7
- class ToString(BaseFieldOperator):
8
- def process(self, instance):
9
- return str(instance)
10
 
11
 
12
- class ToStringStripped(BaseFieldOperator):
13
- def process(self, instance):
14
- return str(instance).strip()
15
 
16
 
17
- class ToListByComma(BaseFieldOperator):
18
- def process(self, instance):
19
- return [x.strip() for x in instance.split(",")]
20
 
21
 
22
- class RegexParser(BaseFieldOperator):
23
  """A processor that uses regex in order to parse a string."""
24
 
25
  regex: str
26
  termination_regex: str = None
27
 
28
- def process(self, text):
29
  if self.termination_regex is not None and re.fullmatch(
30
  self.termination_regex, text
31
  ):
@@ -33,26 +34,26 @@ class RegexParser(BaseFieldOperator):
33
  return re.findall(self.regex, text)
34
 
35
 
36
- class LoadJson(BaseFieldOperator):
37
- def process(self, text):
38
  try:
39
  return json.loads(text)
40
  except json.JSONDecodeError:
41
  return []
42
 
43
 
44
- class ListToEmptyEntitiesTuples(BaseFieldOperator):
45
- def process(self, lst):
46
  try:
47
  return [(str(item), "") for item in lst]
48
  except json.JSONDecodeError:
49
  return []
50
 
51
 
52
- class DictOfListsToPairs(BaseFieldOperator):
53
  position_key_before_value: bool = True
54
 
55
- def process(self, obj):
56
  try:
57
  result = []
58
  for key, values in obj.items():
@@ -67,17 +68,17 @@ class DictOfListsToPairs(BaseFieldOperator):
67
  return []
68
 
69
 
70
- class TakeFirstNonEmptyLine(BaseFieldOperator):
71
- def process(self, instance):
72
- splitted = str(instance).strip().split("\n")
73
  if len(splitted) == 0:
74
  return ""
75
  return splitted[0].strip()
76
 
77
 
78
- class ConvertToBoolean(BaseFieldOperator):
79
- def process(self, instance):
80
- clean_instance = str(instance).strip().lower()
81
  if any(w in clean_instance for w in ["no", "not", "wrong", "false"]):
82
  return "FALSE"
83
  if any(w in clean_instance for w in ["yes", "right", "correct", "true"]):
@@ -85,9 +86,9 @@ class ConvertToBoolean(BaseFieldOperator):
85
  return "OTHER"
86
 
87
 
88
- class LowerCaseTillPunc(BaseFieldOperator):
89
- def process(self, instance):
90
- non_empty_line = instance.lower()
91
  match = re.search(r"[.,!?;]", non_empty_line)
92
  if match:
93
  # Extract text up to the first punctuation
@@ -95,56 +96,56 @@ class LowerCaseTillPunc(BaseFieldOperator):
95
  return non_empty_line
96
 
97
 
98
- class LowerCase(BaseFieldOperator):
99
- def process(self, instance):
100
- return instance.lower()
101
 
102
 
103
- class FirstCharacter(BaseFieldOperator):
104
- def process(self, instance):
105
- match = re.search(r"\s*(\w)", instance)
106
  if match:
107
  return match.groups(0)[0]
108
  return ""
109
 
110
 
111
- class TakeFirstWord(BaseFieldOperator):
112
- def process(self, instance):
113
- match = re.search(r"[\w]+", instance)
114
  if match:
115
- return instance[match.start() : match.end()]
116
  return ""
117
 
118
 
119
- class YesNoToInt(BaseFieldOperator):
120
- def process(self, instance):
121
- if instance == "yes":
122
  return "1"
123
  return "0"
124
 
125
 
126
- class ToYesOrNone(BaseFieldOperator):
127
- def process(self, instance):
128
- if instance == "yes":
129
  return "yes"
130
  return "none"
131
 
132
 
133
- class StanceToProCon(BaseFieldOperator):
134
- def process(self, instance):
135
- if instance == "positive":
136
  return "PRO"
137
- if instance in ["negative", "suggestion"]:
138
  return "CON"
139
  return "none"
140
 
141
 
142
- class StringOrNotString(BaseFieldOperator):
143
  string: str
144
 
145
- def process(self, instance):
146
- if "not " + self.string.lower() in instance.lower():
147
  return "not " + self.string.lower()
148
- if self.string.lower() in instance.lower():
149
  return self.string.lower()
150
- return instance
 
1
  import json
2
  import re
3
+ from typing import Any
4
 
5
+ from .operators import FieldOperator
6
 
7
 
8
+ class ToString(FieldOperator):
9
+ def process_value(self, text: Any) -> Any:
10
+ return str(text)
11
 
12
 
13
+ class ToStringStripped(FieldOperator):
14
+ def process_value(self, text: Any) -> Any:
15
+ return str(text).strip()
16
 
17
 
18
+ class ToListByComma(FieldOperator):
19
+ def process_value(self, text: Any) -> Any:
20
+ return [x.strip() for x in text.split(",")]
21
 
22
 
23
+ class RegexParser(FieldOperator):
24
  """A processor that uses regex in order to parse a string."""
25
 
26
  regex: str
27
  termination_regex: str = None
28
 
29
+ def process_value(self, text: Any) -> Any:
30
  if self.termination_regex is not None and re.fullmatch(
31
  self.termination_regex, text
32
  ):
 
34
  return re.findall(self.regex, text)
35
 
36
 
37
+ class LoadJson(FieldOperator):
38
+ def process_value(self, text: Any) -> Any:
39
  try:
40
  return json.loads(text)
41
  except json.JSONDecodeError:
42
  return []
43
 
44
 
45
+ class ListToEmptyEntitiesTuples(FieldOperator):
46
+ def process_value(self, lst: Any) -> Any:
47
  try:
48
  return [(str(item), "") for item in lst]
49
  except json.JSONDecodeError:
50
  return []
51
 
52
 
53
+ class DictOfListsToPairs(FieldOperator):
54
  position_key_before_value: bool = True
55
 
56
+ def process_value(self, obj: Any) -> Any:
57
  try:
58
  result = []
59
  for key, values in obj.items():
 
68
  return []
69
 
70
 
71
+ class TakeFirstNonEmptyLine(FieldOperator):
72
+ def process_value(self, text: Any) -> Any:
73
+ splitted = str(text).strip().split("\n")
74
  if len(splitted) == 0:
75
  return ""
76
  return splitted[0].strip()
77
 
78
 
79
+ class ConvertToBoolean(FieldOperator):
80
+ def process_value(self, text: Any) -> Any:
81
+ clean_instance = str(text).strip().lower()
82
  if any(w in clean_instance for w in ["no", "not", "wrong", "false"]):
83
  return "FALSE"
84
  if any(w in clean_instance for w in ["yes", "right", "correct", "true"]):
 
86
  return "OTHER"
87
 
88
 
89
+ class LowerCaseTillPunc(FieldOperator):
90
+ def process_value(self, text: Any) -> Any:
91
+ non_empty_line = text.lower()
92
  match = re.search(r"[.,!?;]", non_empty_line)
93
  if match:
94
  # Extract text up to the first punctuation
 
96
  return non_empty_line
97
 
98
 
99
+ class LowerCase(FieldOperator):
100
+ def process_value(self, text: Any) -> Any:
101
+ return text.lower()
102
 
103
 
104
+ class FirstCharacter(FieldOperator):
105
+ def process_value(self, text: Any) -> Any:
106
+ match = re.search(r"\s*(\w)", text)
107
  if match:
108
  return match.groups(0)[0]
109
  return ""
110
 
111
 
112
+ class TakeFirstWord(FieldOperator):
113
+ def process_value(self, text: Any) -> Any:
114
+ match = re.search(r"[\w]+", text)
115
  if match:
116
+ return text[match.start() : match.end()]
117
  return ""
118
 
119
 
120
+ class YesNoToInt(FieldOperator):
121
+ def process_value(self, text: Any) -> Any:
122
+ if text == "yes":
123
  return "1"
124
  return "0"
125
 
126
 
127
+ class ToYesOrNone(FieldOperator):
128
+ def process_value(self, text: Any) -> Any:
129
+ if text == "yes":
130
  return "yes"
131
  return "none"
132
 
133
 
134
+ class StanceToProCon(FieldOperator):
135
+ def process_value(self, text: Any) -> Any:
136
+ if text == "positive":
137
  return "PRO"
138
+ if text in ["negative", "suggestion"]:
139
  return "CON"
140
  return "none"
141
 
142
 
143
+ class StringOrNotString(FieldOperator):
144
  string: str
145
 
146
+ def process_value(self, text: Any) -> Any:
147
+ if "not " + self.string.lower() in text.lower():
148
  return "not " + self.string.lower()
149
+ if self.string.lower() in text.lower():
150
  return self.string.lower()
151
+ return text