Fix jumanpp.apply_to_sentence returning an empty list for sentences longer than ~1700 characters
tokenization_deberta_v2_jumanpp_fast.py CHANGED
@@ -62,4 +62,7 @@ class JumanppPreTokenizer:
 
     def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
         offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
+        if not offsets:
+            doc = rhoknp.Document.from_raw_text(str(normalized_string))
+            offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
         return [normalized_string[offset[0]:offset[1]] for offset in offsets]
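For reference, a minimal standalone sketch of the same fallback outside the tokenizer, assuming the rhoknp package with its Jumanpp processor (rhoknp.Jumanpp), its apply_to_sentence/apply_to_document methods, and Document.from_raw_text, as used in the diff above; the helper name split_to_morpheme_spans is hypothetical and only for illustration.

# Minimal sketch of the fallback logic, mirroring the diff above.
# apply_to_sentence can return no morphemes for very long inputs
# (roughly 1700+ characters), so retry via the document-level API.
from typing import List, Tuple

import rhoknp

jumanpp = rhoknp.Jumanpp()

def split_to_morpheme_spans(text: str) -> List[Tuple[int, int]]:
    # Sentence-level analysis first, as the pre-tokenizer does.
    offsets = [m.span for m in jumanpp.apply_to_sentence(text).morphemes]
    if not offsets:
        # Fallback: wrap the raw text in a Document, which is sentence-split
        # before analysis, and collect the morpheme spans from there.
        doc = rhoknp.Document.from_raw_text(text)
        offsets = [m.span for m in jumanpp.apply_to_document(doc).morphemes]
    return offsets

Each (start, end) pair can then be used to slice the original string, just as jumanpp_split slices the NormalizedString in the patched method.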