Update TrueCaser
Browse files- TrueCaser.py +8 -1
TrueCaser.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import math
|
2 |
import pickle
|
|
|
3 |
import string
|
4 |
|
5 |
from nltk.tokenize import word_tokenize
|
@@ -79,10 +80,16 @@ class TrueCaser(object):
|
|
79 |
def first_token_case(raw):
|
80 |
return raw.capitalize()
|
81 |
|
|
|
|
|
|
|
|
|
82 |
def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
|
83 |
tokens = word_tokenize(sentence)
|
84 |
tokens_true_case = self.get_true_case_from_tokens(tokens, out_of_vocabulary_token_option)
|
85 |
-
|
|
|
|
|
86 |
|
87 |
def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="title"):
|
88 |
tokens_true_case = []
|
|
|
1 |
import math
|
2 |
import pickle
|
3 |
+
import re
|
4 |
import string
|
5 |
|
6 |
from nltk.tokenize import word_tokenize
|
|
|
80 |
def first_token_case(raw):
|
81 |
return raw.capitalize()
|
82 |
|
83 |
+
@staticmethod
|
84 |
+
def upper_replacement(match):
|
85 |
+
return '. ' + match.group(0)[-1].upper()
|
86 |
+
|
87 |
def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
|
88 |
tokens = word_tokenize(sentence)
|
89 |
tokens_true_case = self.get_true_case_from_tokens(tokens, out_of_vocabulary_token_option)
|
90 |
+
text = self.detknzr.detokenize(tokens_true_case)
|
91 |
+
text = re.sub(r' \. .', self.upper_replacement, text)
|
92 |
+
return text
|
93 |
|
94 |
def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="title"):
|
95 |
tokens_true_case = []
|