import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # suppress TensorFlow C++ log spam

import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
import keras
# Recreate the TextVectorization layer with the same configuration it was
# trained with, then restore its vocabulary from the saved assets
vectorizer = keras.layers.TextVectorization(
    max_tokens=2000,
    output_sequence_length=32,
)
vectorizer.load_assets('./vectorizer')
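# NOTE: './vectorizer' is assumed to contain assets written earlier on the
# training side; a hypothetical sketch of how they would have been produced:
#   vectorizer.adapt(train_texts)           # learn the vocabulary from the corpus
#   vectorizer.save_assets('./vectorizer')  # persist it to disk
# The layer configuration above must match the one used when the assets were saved.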
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases
nltk.download('wordnet')
nltk.download('stopwords')
# Get English stopwords
en_stopwords = set(stopwords.words('english'))
# Get the lemmatizer
lemmatizer = WordNetLemmatizer()
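# Note: WordNetLemmatizer defaults to the noun POS, so verbs such as
# 'running' pass through unchanged unless a POS tag is supplied, e.g.:
#   lemmatizer.lemmatize('cats')               # -> 'cat'
#   lemmatizer.lemmatize('running', pos='v')   # -> 'run'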
def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()
    # Drop the '#' from hashtags but keep the tag text
    text = text.replace('#', '')
    # Remove mentions (@username)
    text = re.sub(r'@\S+', '', text)
    # Remove hyperlinks (both http and https)
    text = re.sub(r'https?://\S+', '', text)
    # Remove leading and trailing whitespace
    text = text.strip()
    # Replace emojis with their textual names (e.g. a smiley becomes ':grinning_face:')
    text = emoji.demojize(text)
    # Tokenize so each word can be lemmatized individually
    tokens = nltk.word_tokenize(text)
    lemma_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Drop English stopwords
    lemma_tokens = [w for w in lemma_tokens if w not in en_stopwords]
    text = ' '.join(lemma_tokens)
    # Vectorize into a fixed-length integer sequence and add a batch dimension
    tokens = vectorizer(text)
    tokens = tf.expand_dims(tokens, axis=0)
    return tokens
if __name__ == "__main__":
    print(preprocess_text("I am running today"))
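# The printed value should be an integer tensor of shape (1, 32): the
# vectorized sequence padded/truncated to output_sequence_length=32, with a
# leading batch dimension. The exact token ids depend on the saved vocabulary.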