import re
from typing import (
    Any,
    Dict,
    List,
    Optional,
)

from .operators import FieldOperator, InstanceOperator


class Split(FieldOperator):
    """Break a string value into pieces with ``str.split`` on ``by``."""

    by: str

    def process_value(self, value: str) -> List[str]:
        """Return the parts of ``value`` obtained by splitting on ``self.by``."""
        separator = self.by
        return value.split(separator)


class RegexSplit(FieldOperator):
    """Break a string value into pieces with ``re.split``, using ``by`` as the pattern."""

    by: str

    def process_value(self, value: str) -> List[str]:
        """Return the result of ``re.split`` applied to ``value`` with pattern ``self.by``."""
        pattern = self.by
        return re.split(pattern, value)


class TokensSplit(FieldOperator):
    """Tokenize a string value with a ``transformers`` tokenizer named by ``model``."""

    model: str
    # Presumably consumed by the base class to verify optional dependencies
    # before use -- confirm against the operators module.
    _requirements_list = ["transformers"]

    def prepare(self):
        """Run the parent preparation, then load the tokenizer for ``self.model``."""
        super().prepare()
        # Imported here rather than at module level, so importing this module
        # does not itself require ``transformers``.
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(self.model)
        self.tokenizer = tokenizer

    def process_value(self, value: str) -> List[str]:
        """Return the tokens the loaded tokenizer produces for ``value``."""
        tokens = self.tokenizer.tokenize(value)
        return tokens


class Join(FieldOperator):
    """Concatenate a list of strings, placing ``by`` between consecutive items."""

    by: str

    def process_value(self, value: List[str]) -> str:
        """Return the items of ``value`` joined with ``self.by``."""
        delimiter = self.by
        return delimiter.join(value)


class FormatText(InstanceOperator):
    """Render the ``text`` template against an instance and store it in ``to_field``."""

    to_field: str
    text: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Format ``self.text`` with the instance's entries and write the result back.

        The instance's keys are passed to ``str.format`` as keyword arguments.
        The instance is modified in place and the same object is returned.
        ``stream_name`` is accepted but not used here.
        """
        rendered = self.text.format(**instance)
        instance[self.to_field] = rendered
        return instance


class Strip(FieldOperator):
    """Remove leading and trailing whitespace from a string value."""

    def process_value(self, value: str) -> str:
        """Return ``value`` with surrounding whitespace stripped."""
        stripped = value.strip()
        return stripped


class Replace(FieldOperator):
    """Substitute every occurrence of ``old`` in a string value with ``new``."""

    old: str
    new: str

    def process_value(self, value: str) -> str:
        """Return ``value`` with ``self.old`` replaced by ``self.new``."""
        target, substitute = self.old, self.new
        return value.replace(target, substitute)