Commit
·
06e4a39
1
Parent(s):
89991d1
Fix handling of long text. (#3)
Browse files
- Fix exception handling in JumanppTokenizer and JumanppPreTokenizer (514804743e0d9464e02cb292293a7e98990293b8)
Co-authored-by: KENGO SHIMIZU <[email protected]>
tokenization_deberta_v2_jumanpp.py
CHANGED
|
@@ -28,9 +28,9 @@ class JumanppTokenizer:
|
|
| 28 |
self.jumanpp = rhoknp.Jumanpp()
|
| 29 |
|
| 30 |
def tokenize(self, text: str) -> str:
|
| 31 |
-
|
| 32 |
-
|
|
|
|
| 33 |
doc = self.rhoknp.Document.from_raw_text(text)
|
| 34 |
morphemes = self.jumanpp.apply_to_document(doc).morphemes
|
| 35 |
return " ".join([morpheme.surf for morpheme in morphemes])
|
| 36 |
-
|
|
|
|
| 28 |
self.jumanpp = rhoknp.Jumanpp()
|
| 29 |
|
| 30 |
def tokenize(self, text: str) -> str:
|
| 31 |
+
try:
|
| 32 |
+
morphemes = self.jumanpp.apply_to_sentence(text).morphemes
|
| 33 |
+
except RuntimeError:
|
| 34 |
doc = self.rhoknp.Document.from_raw_text(text)
|
| 35 |
morphemes = self.jumanpp.apply_to_document(doc).morphemes
|
| 36 |
return " ".join([morpheme.surf for morpheme in morphemes])
|
|
|
tokenization_deberta_v2_jumanpp_fast.py
CHANGED
|
@@ -62,8 +62,9 @@ class JumanppPreTokenizer:
|
|
| 62 |
pretok.split(self.jumanpp_split)
|
| 63 |
|
| 64 |
def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
|
| 65 |
-
|
| 66 |
-
|
|
|
|
| 67 |
doc = self.rhoknp.Document.from_raw_text(str(normalized_string))
|
| 68 |
offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
|
| 69 |
return [normalized_string[offset[0]:offset[1]] for offset in offsets]
|
|
|
|
| 62 |
pretok.split(self.jumanpp_split)
|
| 63 |
|
| 64 |
def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
|
| 65 |
+
try:
|
| 66 |
+
offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
|
| 67 |
+
except RuntimeError:
|
| 68 |
doc = self.rhoknp.Document.from_raw_text(str(normalized_string))
|
| 69 |
offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
|
| 70 |
return [normalized_string[offset[0]:offset[1]] for offset in offsets]
|