Spaces:
Running
Running
File size: 755 Bytes
751936e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
import difflib
import webbrowser
from transformers import AutoTokenizer
from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs
# Tokenizer under test, loaded once at import time and shared by test_oov().
# NOTE(review): "tokenizer" is presumably a local directory next to this
# script rather than a hub model id -- confirm.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")
def test_oov():
    """Report samples that do not survive a tokenizer encode/decode round trip.

    Every string in ``space_tokens + jd_vocab_tokens + docs`` is encoded with
    the module-level ``tokenizer`` and decoded again. Strings whose decoded
    form differs from the input are collected, rendered as a side-by-side
    HTML diff (original on the left, decoded on the right), written to
    ``diff.html`` in the current working directory, and opened through
    ``webbrowser``.
    """
    from pathlib import Path

    differ = difflib.HtmlDiff(wrapcolumn=50)
    raw_lines = []
    decode_lines = []
    for line in space_tokens + jd_vocab_tokens + docs:
        decode_line = tokenizer.decode(tokenizer.encode(line))
        # Keep only the samples the round trip altered.
        if line != decode_line:
            raw_lines.append(line)
            decode_lines.append(decode_line)

    report_path = Path("diff.html")
    with open(report_path, "w", encoding="utf-8") as f_new:
        f_new.write(differ.make_file(raw_lines, decode_lines))

    # webbrowser.open() expects a URL; a bare relative filename is documented
    # as unsupported and non-portable, so hand it an absolute file:// URI.
    webbrowser.open(report_path.resolve().as_uri())
# Script entry point: build the round-trip diff report and open it.
if __name__ == "__main__":
    test_oov()