File size: 777 Bytes
cb2adb5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
import re
import string
import nltk
import pymorphy2
from nltk.tokenize import word_tokenize
# Runs at import time: asks NLTK to download the "punkt" resource.
# Presumably this is the tokenizer data word_tokenize relies on — confirm
# the resource name against the installed NLTK version.
nltk.download("punkt")
def clean_text(text: str) -> str:
    """Lower-case *text* and strip noisy fragments and ASCII punctuation.

    The removals are applied in a fixed order, each replacing its matches
    with the empty string; surrounding whitespace is left untouched, so
    removed words may leave consecutive spaces behind.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    removal_patterns = (
        # Whole words containing one character repeated 3+ times in a row.
        r"\w*(\w)\1{2,}\w*",
        # Runs of digits together with any word characters that follow.
        r"\d+\w*",
        # Shortest "[...]" spans (``.`` does not cross line breaks here).
        r"\[.*?\]",
    )

    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)

    # Delete every character listed in string.punctuation in a single pass.
    punctuation_table = str.maketrans("", "", string.punctuation)
    return cleaned.translate(punctuation_table)
# Shared analyzer instance, created lazily on the first lemmatization call.
_morph_analyzer = None


def _get_morph_analyzer():
    """Return the module-wide MorphAnalyzer, constructing it on first use."""
    global _morph_analyzer
    if _morph_analyzer is None:
        _morph_analyzer = pymorphy2.MorphAnalyzer()
    return _morph_analyzer


def lemmize_and_tokenize_text(text: str) -> list[str]:
    """Tokenize *text* and return the normal form of every token.

    Tokens come from ``word_tokenize``; for each token the first parse
    returned by the analyzer is used and its ``normal_form`` is collected.

    The analyzer is reused across calls instead of being rebuilt each time,
    because constructing ``MorphAnalyzer`` is far more expensive than parsing
    a token.

    Args:
        text: Text to tokenize and lemmatize.

    Returns:
        One lemma per token, in token order.
    """
    morph = _get_morph_analyzer()
    return [morph.parse(token)[0].normal_form for token in word_tokenize(text)]
def data_preprocessing(text: str) -> list[str]:
    """Run the full preprocessing pipeline on *text*.

    The text is first passed through ``clean_text`` and the result is then
    handed to ``lemmize_and_tokenize_text``.

    Args:
        text: Raw input string.

    Returns:
        The list produced by ``lemmize_and_tokenize_text`` for the cleaned text.
    """
    return lemmize_and_tokenize_text(clean_text(text))
|