nlp_proj / preprocessing.py
Maslov-Artem
minor changes
afed7b5
raw
history blame
777 Bytes
import re
import string
import nltk
import pymorphy2
from nltk.tokenize import word_tokenize
# Fetch the NLTK "punkt" resource when this module is imported, so it is
# available before word_tokenize is called below.
# NOTE(review): recent NLTK releases appear to look up "punkt_tab" instead of
# "punkt" for word_tokenize -- confirm against the pinned NLTK version.
nltk.download("punkt")
def clean_text(text: str) -> str:
    """Normalize raw text before tokenization.

    The input is lower-cased, then three regex substitutions are applied in
    order, and finally ASCII punctuation is stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string. Whitespace is left as-is, so removed fragments
        may leave consecutive spaces behind.
    """
    # Order matters: each pattern runs on the output of the previous one, and
    # punctuation is stripped last so "[" and "]" are still present when the
    # bracket pattern runs.
    removal_patterns = (
        r"\w*(\w)\1{2,}\w*",  # words with a character repeated 3+ times in a row
        r"\d+\w*",  # digit runs plus any word characters glued to their tail
        r"\[.*?\]",  # shortest [...] spans on a single line
    )

    result = text.lower()
    for pattern in removal_patterns:
        result = re.sub(pattern, "", result)

    # string.punctuation covers ASCII punctuation only.
    punctuation_table = str.maketrans("", "", string.punctuation)
    return result.translate(punctuation_table)
def _get_morph_analyzer():
    """Return a shared pymorphy2 analyzer, creating it on first use."""
    analyzer = getattr(_get_morph_analyzer, "_instance", None)
    if analyzer is None:
        # Building a MorphAnalyzer loads its dictionaries, so do it once and
        # keep the instance on the function object for later calls.
        analyzer = pymorphy2.MorphAnalyzer()
        _get_morph_analyzer._instance = analyzer
    return analyzer


def lemmize_and_tokenize_text(text: str) -> list[str]:
    """Tokenize text and reduce every token to its normal form.

    Tokens come from NLTK's ``word_tokenize``; each one is parsed with
    pymorphy2 and the ``normal_form`` of the first parse is kept.

    Args:
        text: Text to tokenize (typically the output of ``clean_text``).

    Returns:
        A list of lemmas, one per token, in token order.
    """
    # Reuse one analyzer across calls instead of constructing it every time.
    morph = _get_morph_analyzer()
    return [morph.parse(token)[0].normal_form for token in word_tokenize(text)]
def data_preprocessing(text: str) -> list[str]:
    """Run the full preprocessing pipeline on one text.

    Passes the input through ``clean_text`` and feeds the result to
    ``lemmize_and_tokenize_text``.

    Args:
        text: Raw input string.

    Returns:
        The list of lemmas produced from the cleaned text.
    """
    return lemmize_and_tokenize_text(clean_text(text))