|
import re |
|
from typing import List |
|
|
|
from .operators import FieldOperator |
|
|
|
|
|
class Split(FieldOperator): |
|
by: str |
|
|
|
def process_value(self, value: str) -> List[str]: |
|
return value.split(self.by) |
|
|
|
|
|
class RegexSplit(FieldOperator): |
|
by: str |
|
|
|
def process_value(self, value: str) -> List[str]: |
|
return re.split(self.by, value) |
|
|
|
|
|
class TokensSplit(FieldOperator): |
|
model: str |
|
_requirements_list = ["transformers"] |
|
|
|
def prepare(self): |
|
super().prepare() |
|
from transformers import AutoTokenizer |
|
|
|
self.tokenizer = AutoTokenizer.from_pretrained(self.model) |
|
|
|
def process_value(self, value: str) -> List[str]: |
|
return self.tokenizer.tokenize(value) |
|
|
|
|
|
class Join(FieldOperator): |
|
by: str |
|
|
|
def process_value(self, value: List[str]) -> str: |
|
return self.by.join(value) |
|
|
|
|
|
class Strip(FieldOperator): |
|
def process_value(self, value: str) -> str: |
|
return value.strip() |
|
|
|
|
|
class Replace(FieldOperator): |
|
old: str |
|
new: str |
|
|
|
def process_value(self, value: str) -> str: |
|
return value.replace(self.old, self.new) |
|
|