Spaces:

AshtonIsNotHere
/

xlmr-longformer_comparison

Runtime error

AshtonIsNotHere committed on Feb 3, 2023

Commit

b840e20

•

1 Parent(s): 2e85755

Fix to allow a masked token after the 512th token

Sequences longer than 510 tokens are now truncated to a window around the masked token for xlm-roberta-base, regardless of where the mask is located.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -31,8 +31,35 @@ xlmr_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', max_length=51
 xlmr_p = pipeline("fill-mask", model=model, tokenizer=tokenizer)
 def xlmr_base_fn(text):
- text = ' '.join(text.split()[:500])
- preds = xlmr_p(text)
  pred_dict = {}
  for pred in preds:
  pred_dict[pred['token_str']] = pred['score']

 xlmr_p = pipeline("fill-mask", model=model, tokenizer=tokenizer)
 def xlmr_base_fn(text):
+ # Find our masked token
+ tokens = xlmr_tokenizer.tokenize(text)
+ mask_token_idx = [i for i, x in enumerate(tokens) if xlmr_tokenizer.mask_token in x][0]
+ max_len = tokenizer.model_max_length
+ max_len = max_len-2 if max_len % 512 == 0 and max_len < 4096 else 510
+ # Smart truncation for long sequences
+ if not len(tokens) < max_len:
+ # Find left and right bounds for truncated sequences
+ lbound = max(0, mask_token_idx-(max_len//2))
+ rbound = min(len(tokens), mask_token_idx+(max_len//2))
+ # If we hit an edge, expand sequence in the other direction
+ if lbound == 0 and rbound != len(tokens)-1:
+ rbound = min(len(tokens), max_len)
+ elif rbound == len(tokens) and lbound != 0:
+ lbound = max(0, len(tokens)-max_len)
+ # Apply truncation and rejoin tokens to form new text
+ truncated_text = ''.join(tokens[lbound:rbound])
+ # Handle lowbar from xlmr tokenizer
+ truncated_text = ''.join([x if ord(x) != 9601 else ' ' for x in result])
+ else:
+ truncated_text = text
+ preds = xlmr_p(truncated_text)
  pred_dict = {}
  for pred in preds:
  pred_dict[pred['token_str']] = pred['score']