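# NOTE(review): the text originally here ("Spaces: Running on CPU Upgrade",
# repeated twice) looks like a hosting-page status banner captured along with
# the file rather than part of the script -- confirm and drop if so.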
""" | |
Take a given list of sentences and turn it into a numpy array, where each | |
number corresponds to a word. Padding is used (number 0) to ensure fixed length | |
of sentences. | |
""" | |
from __future__ import print_function, unicode_literals | |
import example_helper | |
import json | |
from torchmoji.sentence_tokenizer import SentenceTokenizer | |
with open('../model/vocabulary.json', 'r') as f: | |
vocabulary = json.load(f) | |
st = SentenceTokenizer(vocabulary, 30) | |
test_sentences = [ | |
'\u2014 -- \u203c !!\U0001F602', | |
'Hello world!', | |
'This is a sample tweet #example', | |
] | |
tokens, infos, stats = st.tokenize_sentences(test_sentences) | |
print(tokens) | |
print(infos) | |
print(stats) | |