import re
import string

import nltk
import pymorphy2
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt tokenizer models (newer NLTK releases
# may additionally require the "punkt_tab" resource).
nltk.download("punkt")

# pymorphy2's MorphAnalyzer is expensive to construct, so build it once at
# module level rather than on every call to the lemmatizer.
morph = pymorphy2.MorphAnalyzer()


def clean_text(text: str) -> str:
    """Lowercase the text and strip noisy tokens before tokenization."""
    text = text.lower()
    # Drop words containing a character repeated three or more times in a
    # row (elongated words such as "соооо"), which are usually expressive noise.
    text = re.sub(r"\w*(\w)\1{2,}\w*", "", text)
    # Drop digit sequences together with any word characters attached to
    # them (e.g. "100500", "2021г").
    text = re.sub(r"\d+\w*", "", text)
    # Drop bracketed fragments such as inline markup or annotations.
    text = re.sub(r"\[.*?\]", "", text)
    # Remove all ASCII punctuation.
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text


def lemmize_and_tokenize_text(text: str) -> list[str]:
    """Tokenize the text and reduce each token to its dictionary (normal) form."""
    tokens = word_tokenize(text)
    # morph.parse() returns hypotheses ordered by likelihood; keep the best one.
    lemmas = [morph.parse(token)[0].normal_form for token in tokens]
    return lemmas


def data_preprocessing(text: str) -> list[str]:
    """Full pipeline: clean the raw text, then tokenize and lemmatize it."""
    cleaned_text = clean_text(text)
    lemmized_text = lemmize_and_tokenize_text(cleaned_text)
    return lemmized_text
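
# A minimal usage sketch. Assumption: the pipeline targets Russian text,
# since pymorphy2 is a Russian morphological analyzer; the sample sentence
# is illustrative only.
if __name__ == "__main__":
    sample = "Катя выложила 100 фото [вложение], это было тааак классно!"
    print(data_preprocessing(sample))
    # The digit token, bracketed fragment, elongated word, and punctuation
    # are stripped, and the rest comes back as lowercase lemmas, roughly:
    # ['катя', 'выложить', 'фото', 'это', 'быть', 'классно']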