Upload folder using huggingface_hub
modeling_sentence_tokenizer.py
CHANGED
```diff
@@ -11,12 +11,14 @@ class SentenceTokenizerConfig(PretrainedConfig):
         min_length=32,
         max_length=64,
         n_overlap=3,
+        roll=False,
         **kwargs
     ):
         super().__init__(**kwargs)
         self.min_length = min_length
         self.max_length = max_length
         self.n_overlap = n_overlap
+        self.roll = roll
 
 class SentenceTokenizer(PreTrainedModel):
     config_class = SentenceTokenizerConfig
@@ -27,6 +29,7 @@ class SentenceTokenizer(PreTrainedModel):
         self.min_length = config.min_length
         self.max_length = config.max_length
         self.n_overlap = config.n_overlap
+        self.roll = config.roll
 
     def split_text_into_sentences(self, text):
         split_text = re.split(r'([^가-힣] )', text)
@@ -117,10 +120,10 @@ class SentenceTokenizer(PreTrainedModel):
         filtered_text = [s + sp for s, sp in zip(split_text[::2], split_text[1::2] + [''])]
         return filtered_text
 
-    def overlap(self, chunks, roll=False):
+    def overlap(self, chunks):
         if not chunks:
             return []
-        if roll:
+        if self.roll:
             chunks = [chunks[-1]] + chunks + [chunks[0]]
         res = []
         total_idx = 0
@@ -139,7 +142,7 @@ class SentenceTokenizer(PreTrainedModel):
 
         return res
 
-    def decode_overlap(self, chunks, roll=False):
+    def decode_overlap(self, chunks):
         if not chunks:
             return ""
 
@@ -162,7 +165,7 @@ class SentenceTokenizer(PreTrainedModel):
             most_common_char, _ = Counter(index_char_map[i]).most_common(1)[0]
             reconstructed_text.append(most_common_char)
         res = "".join(reconstructed_text)
-        if roll:
+        if self.roll:
             res = res[len(chunks[0][2]):-len(chunks[-1][2])]
 
         return res
```
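Note on the change: `roll` moves from a per-call argument of `overlap()`/`decode_overlap()` to a config field, so the wrap-around behaviour is fixed at load time. A minimal standalone sketch of what the flag does; the chunk structure here is simplified, since in the real code `overlap()` returns richer tuples, with the overlap text apparently at `chunks[i][2]`:

```python
# Hypothetical standalone sketch of the roll behaviour; not the model's API.

def roll_pad(chunks):
    # Mirrors `if self.roll:` in overlap(): wrap the list circularly so the
    # first and last chunks also get neighbours on both sides.
    if not chunks:
        return []
    return [chunks[-1]] + chunks + [chunks[0]]


def roll_trim(text, chunks):
    # Mirrors the trim in decode_overlap(): cut the wrapped-around prefix and
    # suffix back off the reconstructed string. Here the "overlap text" is the
    # chunk itself; the real code stores it at chunks[i][2].
    return text[len(chunks[0]):-len(chunks[-1])]


padded = roll_pad(["abc", "def", "ghi"])
print(padded)                              # ['ghi', 'abc', 'def', 'ghi', 'abc']
print(roll_trim("".join(padded), padded))  # 'abcdefghi'
```

Making the flag part of the config keeps encode and decode consistent: a caller can no longer roll on `overlap()` and forget to roll on `decode_overlap()`.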
|
sentence_tokenizer/config.json
CHANGED
```diff
@@ -3,13 +3,20 @@
     "SentenceTokenizer"
   ],
   "auto_map": {
-    "AutoConfig": "modeling_sentence_tokenizer.SentenceTokenizerConfig",
-    "AutoModel": "modeling_sentence_tokenizer.SentenceTokenizer"
+    "AutoConfig": [
+      "modeling_sentence_tokenizer.SentenceTokenizerConfig",
+      null
+    ],
+    "AutoModel": [
+      "modeling_sentence_tokenizer.SentenceTokenizer",
+      null
+    ]
   },
   "max_length": 64,
   "min_length": 32,
   "model_type": "sentence_tokenizer",
   "n_overlap": 3,
+  "roll": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.…"
+  "transformers_version": "4.50.2"
 }
```
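With the `auto_map` lists filled in, the custom classes resolve through the Transformers Auto API. A usage sketch; `user/sentence-tokenizer` is a placeholder repo id, not the actual one:

```python
from transformers import AutoModel

# trust_remote_code=True is required: the classes live in
# modeling_sentence_tokenizer.py inside the repo, as declared in auto_map.
model = AutoModel.from_pretrained("user/sentence-tokenizer", trust_remote_code=True)

print(model.config.roll)  # False, the default added by this commit
```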
|
sentence_tokenizer/modeling_sentence_tokenizer.py
CHANGED
(Identical diff to the top-level modeling_sentence_tokenizer.py above; the file is duplicated inside sentence_tokenizer/.)
|