rbawden committed on
Commit
fedd01a
1 Parent(s): 8c4b8e7

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +10 -0
pipeline.py CHANGED
@@ -654,15 +654,25 @@ class NormalisationPipeline(Pipeline):
654
  for i in range(len(result)):
655
  input_sent, pred_sent = input_sents[i].strip(), result[i][0]['text'].strip()
656
  input_sent = input_sent.replace('ſ' , 's')
 
 
657
  if not self.no_post_clean:
658
  pred_sent = self.post_cleaning(pred_sent)
659
  alignment, pred_sent_tok = self.align(input_sent, pred_sent)
660
 
 
661
  if not self.no_postproc_lex:
662
  alignment = self.postprocess_correct_sent(alignment)
 
 
663
  pred_sent = self.get_pred_from_alignment(alignment)
 
 
664
  if not self.no_post_clean:
665
  pred_sent = self.post_cleaning(pred_sent)
 
 
 
666
  char_spans = self.get_char_idx_align(input_sent, pred_sent, alignment)
667
  output.append({'text': pred_sent, 'alignment': char_spans})
668
  return output
 
654
  for i in range(len(result)):
655
  input_sent, pred_sent = input_sents[i].strip(), result[i][0]['text'].strip()
656
  input_sent = input_sent.replace('ſ' , 's')
657
+
658
+ # apply cleaning and get alignment (necessary for postprocessing w/ the lexicon)
659
  if not self.no_post_clean:
660
  pred_sent = self.post_cleaning(pred_sent)
661
  alignment, pred_sent_tok = self.align(input_sent, pred_sent)
662
 
663
+ # apply postprocessing w/ the lexicon to the sentence (using the alignment)
664
  if not self.no_postproc_lex:
665
  alignment = self.postprocess_correct_sent(alignment)
666
+
667
+ # get the predicted sentence from the alignment
668
  pred_sent = self.get_pred_from_alignment(alignment)
669
+
670
+ # redo another round of cleaning and get the alignment again in case things have changed
671
  if not self.no_post_clean:
672
  pred_sent = self.post_cleaning(pred_sent)
673
+ alignment, pred_sent_tok = self.align(input_sent, pred_sent)
674
+
675
+ # get aligned character spans
676
  char_spans = self.get_char_idx_align(input_sent, pred_sent, alignment)
677
  output.append({'text': pred_sent, 'alignment': char_spans})
678
  return output