Update pipeline.py
Browse files- pipeline.py +10 -0
pipeline.py
CHANGED
@@ -654,15 +654,25 @@ class NormalisationPipeline(Pipeline):
|
|
654 |
for i in range(len(result)):
|
655 |
input_sent, pred_sent = input_sents[i].strip(), result[i][0]['text'].strip()
|
656 |
input_sent = input_sent.replace('ſ' , 's')
|
|
|
|
|
657 |
if not self.no_post_clean:
|
658 |
pred_sent = self.post_cleaning(pred_sent)
|
659 |
alignment, pred_sent_tok = self.align(input_sent, pred_sent)
|
660 |
|
|
|
661 |
if not self.no_postproc_lex:
|
662 |
alignment = self.postprocess_correct_sent(alignment)
|
|
|
|
|
663 |
pred_sent = self.get_pred_from_alignment(alignment)
|
|
|
|
|
664 |
if not self.no_post_clean:
|
665 |
pred_sent = self.post_cleaning(pred_sent)
|
|
|
|
|
|
|
666 |
char_spans = self.get_char_idx_align(input_sent, pred_sent, alignment)
|
667 |
output.append({'text': pred_sent, 'alignment': char_spans})
|
668 |
return output
|
|
|
654 |
for i in range(len(result)):
|
655 |
input_sent, pred_sent = input_sents[i].strip(), result[i][0]['text'].strip()
|
656 |
input_sent = input_sent.replace('ſ' , 's')
|
657 |
+
|
658 |
+
# apply cleaning and compute the alignment (needed for postprocessing with the lexicon)
|
659 |
if not self.no_post_clean:
|
660 |
pred_sent = self.post_cleaning(pred_sent)
|
661 |
alignment, pred_sent_tok = self.align(input_sent, pred_sent)
|
662 |
|
663 |
+
# apply postprocessing with the lexicon to the sentence (using the alignment)
|
664 |
if not self.no_postproc_lex:
|
665 |
alignment = self.postprocess_correct_sent(alignment)
|
666 |
+
|
667 |
+
# get the predicted sentence from the alignment
|
668 |
pred_sent = self.get_pred_from_alignment(alignment)
|
669 |
+
|
670 |
+
# run a second round of cleaning and recompute the alignment in case the sentence has changed
|
671 |
if not self.no_post_clean:
|
672 |
pred_sent = self.post_cleaning(pred_sent)
|
673 |
+
alignment, pred_sent_tok = self.align(input_sent, pred_sent)
|
674 |
+
|
675 |
+
# get aligned character spans
|
676 |
char_spans = self.get_char_idx_align(input_sent, pred_sent, alignment)
|
677 |
output.append({'text': pred_sent, 'alignment': char_spans})
|
678 |
return output
|