jwengr committed (verified)
Commit 5467ee5 · Parent(s): 7e25246

Upload folder using huggingface_hub

modeling_sentence_tokenizer.py CHANGED
@@ -11,12 +11,14 @@ class SentenceTokenizerConfig(PretrainedConfig):
         min_length=32,
         max_length=64,
         n_overlap=3,
+        roll=False,
         **kwargs
     ):
         super().__init__(**kwargs)
         self.min_length = min_length
         self.max_length = max_length
         self.n_overlap = n_overlap
+        self.roll = roll
 
 class SentenceTokenizer(PreTrainedModel):
     config_class = SentenceTokenizerConfig
@@ -27,6 +29,7 @@ class SentenceTokenizer(PreTrainedModel):
         self.min_length = config.min_length
         self.max_length = config.max_length
         self.n_overlap = config.n_overlap
+        self.roll = config.roll
 
     def split_text_into_sentences(self, text):
         split_text = re.split(r'([^가-힣] )', text)
@@ -117,10 +120,10 @@ class SentenceTokenizer(PreTrainedModel):
         filtered_text = [s + sp for s, sp in zip(split_text[::2], split_text[1::2] + [''])]
         return filtered_text
 
-    def overlap(self, chunks, roll=False):
+    def overlap(self, chunks):
         if not chunks:
             return []
-        if roll==True:
+        if self.roll:
             chunks = [chunks[-1]] + chunks + [chunks[0]]
         res = []
         total_idx = 0
@@ -139,7 +142,7 @@ class SentenceTokenizer(PreTrainedModel):
 
         return res
 
-    def decode_overlap(self, chunks, roll=False):
+    def decode_overlap(self, chunks):
         if not chunks:
             return ""
 
@@ -162,7 +165,7 @@ class SentenceTokenizer(PreTrainedModel):
             most_common_char, _ = Counter(index_char_map[i]).most_common(1)[0]
             reconstructed_text.append(most_common_char)
         res = "".join(reconstructed_text)
-        if roll==True:
+        if self.roll:
             res = res[len(chunks[0][2]):-len(chunks[-1][2])]
 
         return res
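
What this diff actually changes: the round-trip flag `roll` moves from a per-call keyword on `overlap` and `decode_overlap` to a field on `SentenceTokenizerConfig`, so the behavior is fixed once at construction instead of being passed on every call. A minimal before/after sketch under that reading; the list handed to `overlap` below is a stand-in, since the hunks don't show the full chunk format:

    # Before this commit: roll was chosen per call.
    #   out = model.overlap(chunks, roll=True)
    #   text = model.decode_overlap(out, roll=True)

    # After this commit: roll lives on the config.
    from modeling_sentence_tokenizer import SentenceTokenizer, SentenceTokenizerConfig

    config = SentenceTokenizerConfig(min_length=32, max_length=64, n_overlap=3, roll=True)
    model = SentenceTokenizer(config)

    chunks = ["...", "..."]          # stand-in: whatever chunk list overlap() normally receives
    out = model.overlap(chunks)      # reads self.roll; no roll kwarg anymore
    text = model.decode_overlap(out)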
sentence_tokenizer/config.json CHANGED
@@ -3,13 +3,20 @@
     "SentenceTokenizer"
   ],
   "auto_map": {
-    "AutoConfig": "modeling_sentence_tokenizer.SentenceTokenizerConfig",
-    "AutoModel": "modeling_sentence_tokenizer.SentenceTokenizer"
+    "AutoConfig": [
+      "modeling_sentence_tokenizer.SentenceTokenizerConfig",
+      null
+    ],
+    "AutoModel": [
+      "modeling_sentence_tokenizer.SentenceTokenizer",
+      null
+    ]
   },
   "max_length": 64,
   "min_length": 32,
   "model_type": "sentence_tokenizer",
   "n_overlap": 3,
+  "roll": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.48.0"
+  "transformers_version": "4.50.2"
 }
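
Two incidental changes land here alongside the new "roll" field: the `auto_map` entries switch from bare strings to the two-element list form that newer transformers versions serialize (class reference plus a null placeholder), and `transformers_version` is bumped to 4.50.2. Either form resolves the same custom classes at load time. A load sketch, with a hypothetical repo id for illustration:

    from transformers import AutoConfig, AutoModel

    repo_id = "jwengr/<repo-name>"   # hypothetical; substitute the actual Hub repo

    # trust_remote_code=True is required so transformers imports the classes named in auto_map.
    config = AutoConfig.from_pretrained(repo_id, subfolder="sentence_tokenizer", trust_remote_code=True)
    model = AutoModel.from_pretrained(repo_id, subfolder="sentence_tokenizer", trust_remote_code=True)

    print(config.roll)  # new field from this commit; defaults to false in config.json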
sentence_tokenizer/modeling_sentence_tokenizer.py CHANGED
@@ -11,12 +11,14 @@ class SentenceTokenizerConfig(PretrainedConfig):
         min_length=32,
         max_length=64,
         n_overlap=3,
+        roll=False,
         **kwargs
     ):
         super().__init__(**kwargs)
         self.min_length = min_length
         self.max_length = max_length
         self.n_overlap = n_overlap
+        self.roll = roll
 
 class SentenceTokenizer(PreTrainedModel):
     config_class = SentenceTokenizerConfig
@@ -27,6 +29,7 @@ class SentenceTokenizer(PreTrainedModel):
         self.min_length = config.min_length
         self.max_length = config.max_length
         self.n_overlap = config.n_overlap
+        self.roll = config.roll
 
     def split_text_into_sentences(self, text):
         split_text = re.split(r'([^가-힣] )', text)
@@ -117,10 +120,10 @@ class SentenceTokenizer(PreTrainedModel):
         filtered_text = [s + sp for s, sp in zip(split_text[::2], split_text[1::2] + [''])]
         return filtered_text
 
-    def overlap(self, chunks, roll=False):
+    def overlap(self, chunks):
         if not chunks:
             return []
-        if roll==True:
+        if self.roll:
             chunks = [chunks[-1]] + chunks + [chunks[0]]
         res = []
         total_idx = 0
@@ -139,7 +142,7 @@ class SentenceTokenizer(PreTrainedModel):
 
         return res
 
-    def decode_overlap(self, chunks, roll=False):
+    def decode_overlap(self, chunks):
         if not chunks:
             return ""
 
@@ -162,7 +165,7 @@ class SentenceTokenizer(PreTrainedModel):
             most_common_char, _ = Counter(index_char_map[i]).most_common(1)[0]
             reconstructed_text.append(most_common_char)
         res = "".join(reconstructed_text)
-        if roll==True:
+        if self.roll:
             res = res[len(chunks[0][2]):-len(chunks[-1][2])]
 
         return res
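
For intuition on the `roll` branch itself: when enabled, `overlap` pads the chunk list circularly with the last and first chunks, and `decode_overlap` trims the corresponding wrapped head and tail off the reconstructed string. A self-contained toy of just that wrap/trim pattern (plain strings instead of the model's chunk tuples, whose exact layout the hunks only hint at via `chunks[0][2]`):

    chunks = ["AAA", "BBB", "CCC"]

    # overlap() with roll: wrap circularly so the first and last chunks also get neighbors.
    rolled = [chunks[-1]] + chunks + [chunks[0]]    # ['CCC', 'AAA', 'BBB', 'CCC', 'AAA']

    # decode_overlap() with roll: reconstruct, then trim the wrapped head and tail.
    text = "".join(rolled)                          # 'CCCAAABBBCCCAAA'
    restored = text[len(rolled[0]):-len(rolled[-1])]
    assert restored == "".join(chunks)              # 'AAABBBCCC'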