import re |
import pandas as pd |
import pymorphy2 |
from sklearn.metrics.pairwise import cosine_similarity |
import language_tool_python |
import torch |
from transformers import AutoTokenizer, AutoModel |
import nltk |
from nltk.corpus import stopwords |
from tqdm import tqdm |
import numpy as np |
import tempfile |
import streamlit as st |
# Russian stop words. Only hit the network when the corpus is actually
# missing, instead of calling nltk.download() on every script run.
try:
    stop_words = set(stopwords.words('russian'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('russian'))

bert_model_name = "sberbank-ai/sbert_large_nlu_ru"


@st.cache_resource
def _load_bert(name):
    """Load the encoder tokenizer and model for checkpoint *name*.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects avoids re-reading the model weights on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)


tokenizer, model = _load_bert(bert_model_name)
def preprocess_text(text):
    """Clean raw occupation text, then translate and spell-correct it.

    Steps: lower-case and trim; replace punctuation/symbols, Latin letters
    and ASCII digits with spaces; collapse whitespace; pass the result
    through ``generate`` and then ``correct_text``.

    Args:
        text: Raw input. Anything that is not a ``str`` is treated as empty.

    Returns:
        The processed string, or ``""`` when the input is not a string or
        nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower().strip()
    # Single pass: hyphens are already covered by [^\w\s] (they are neither
    # word characters nor whitespace), so no separate '-' substitution is needed.
    text = re.sub(r'[^\w\s]|[a-zA-Z0-9]', ' ', text)
    # The substitutions above leave runs of spaces behind; normalise them.
    text = ' '.join(text.split())

    # Nothing meaningful left: skip the translation model and the spell
    # checker rather than feeding them an empty string.
    if not text:
        return ""

    text = generate(text)
    text = correct_text(text)
    return text
def get_embedding(text):
    """Return a mean-pooled BERT embedding for *text*.

    Args:
        text: A string (or a list of strings) accepted by ``tokenizer``.

    Returns:
        A torch tensor: the attention-mask-weighted mean of the last hidden
        state over the token axis, with singleton dimensions squeezed out.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # padding=True can add pad tokens when several texts are encoded at once;
    # weight by the attention mask so pad positions do not dilute the mean.
    # For a single unpadded text the mask is all ones, i.e. a plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).squeeze()
def find_best_matches(profession, nkz_list, embeddings_nkz, top_n=10):
    """Find the top-N NKZ entries most similar to *profession*.

    Args:
        profession: Occupation title to look up.
        nkz_list: DataFrame with ``NAME_RU``, ``NAME_KZ`` and ``CODE``
            columns, row-aligned with ``embeddings_nkz``.
        embeddings_nkz: 2-D array of embeddings, one row per ``nkz_list`` row.
        top_n: Maximum number of matches to return.

    Returns:
        A DataFrame with columns ``profession``, ``nkz_ru``, ``nkz_kz``,
        ``nkz_code`` and ``similarity``, ordered from most to least similar.
        It is empty (but keeps these columns) when ``top_n`` is not positive
        or ``nkz_list`` has no rows.
    """
    columns = ['profession', 'nkz_ru', 'nkz_kz', 'nkz_code', 'similarity']

    # Guard first: a slice of [-0:] would return *every* row, a negative
    # top_n would slice from the wrong end, and an empty reference table
    # makes cosine_similarity raise.
    if top_n <= 0 or len(nkz_list) == 0:
        return pd.DataFrame(columns=columns)

    processed_profession = preprocess_text(profession)
    print(f"Предобработанная профессия: {processed_profession}")
    # Translation / spell correction may reintroduce hyphens.
    processed_profession = re.sub(r'-', ' ', processed_profession)

    profession_embedding = get_embedding(processed_profession).numpy().reshape(1, -1)
    scores = cosine_similarity(profession_embedding, embeddings_nkz)[0]
    # argsort is ascending: take the last top_n indices and reverse them.
    top_n_idx = scores.argsort()[-top_n:][::-1]

    top_matches = []
    for idx in top_n_idx:
        entry = nkz_list.iloc[idx]  # one positional lookup per match
        top_matches.append({
            'profession': profession,
            'nkz_ru': entry['NAME_RU'],
            'nkz_kz': entry['NAME_KZ'],
            'nkz_code': entry['CODE'],
            'similarity': scores[idx],
        })
    return pd.DataFrame(top_matches, columns=columns)
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
def generate(text, **kwargs):
    """Run the seq2seq model on *text* and return the decoded first hypothesis.

    Args:
        text: Input string passed to ``tr_tokenizer``.
        **kwargs: Extra keyword arguments forwarded to ``tr_model.generate``.
            ``num_beams`` defaults to 5 and may be overridden here.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Use a default instead of a hard-coded keyword: with the keyword fixed in
    # the call, a caller passing num_beams got a duplicate-keyword TypeError.
    kwargs.setdefault('num_beams', 5)

    inputs = tr_tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        hypotheses = tr_model.generate(**inputs, **kwargs)
    return tr_tokenizer.decode(hypotheses[0], skip_special_tokens=True)
@st.cache_resource
def _load_translator(name):
    """Load the seq2seq model and tokenizer for checkpoint *name*.

    Cached so that Streamlit reruns of this script reuse the loaded objects
    instead of re-reading the model weights on every widget interaction.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        AutoModelForSeq2SeqLM.from_pretrained(name),
        AutoTokenizer.from_pretrained(name),
    )


# NOTE(review): judging by the checkpoint name this is a Kazakh -> Russian
# translation model — confirm against the model card.
tr_model, tr_tokenizer = _load_translator('deepvk/kazRush-kk-ru')
import hunspell |
from functools import lru_cache


@lru_cache(maxsize=None)  # keyed by the two file paths, so the cache stays tiny
def _load_speller(dict_path, aff_path):
    """Build the HunSpell checker for the given dictionary/affix files once."""
    return hunspell.HunSpell(dict_path, aff_path)


def correct_text(text: str, dict_path: str = 'ru_RU.dic', aff_path: str = 'ru_RU.aff') -> str:
    """Replace misspelled words in *text* with HunSpell's first suggestion.

    Args:
        text: Text to check; it is split on whitespace into words.
        dict_path: Path to the HunSpell ``.dic`` file.
        aff_path: Path to the HunSpell ``.aff`` file.

    Returns:
        The words joined by single spaces. A word is kept unchanged when it
        is spelled correctly or when HunSpell offers no suggestion.
    """
    words = text.split()
    # Nothing to check: avoid loading the dictionary at all.
    if not words:
        return ''

    # Reuse one checker per (dict, aff) pair instead of re-reading the
    # dictionary files on every call.
    h = _load_speller(dict_path, aff_path)

    corrected_words = []
    for word in words:
        if h.spell(word):
            corrected_words.append(word)
            continue
        suggestions = h.suggest(word)
        corrected_words.append(suggestions[0] if suggestions else word)
    return ' '.join(corrected_words)
import numpy as np


@st.cache_resource
def _load_nkz(csv_path):
    """Read the NKZ table and parse its stored embeddings.

    Cached so that Streamlit reruns of this script do not re-read the CSV
    and re-parse every embedding string. The cached objects are shared
    between reruns, so callers should treat them as read-only.

    Returns:
        A ``(frame, matrix)`` tuple: the DataFrame with its ``EMB`` column
        converted to NumPy arrays, and those arrays stacked row-wise.
    """
    frame = pd.read_csv(csv_path)
    # Each EMB cell is expected to be a bracketed, ", "-separated list of
    # numbers: drop the first and last character, then parse the rest.
    frame['EMB'] = frame['EMB'].apply(lambda cell: np.fromstring(cell[1:-1], sep=', '))
    return frame, np.vstack(frame['EMB'])


nkz_list, embeddings_nkz_df = _load_nkz('nkz_list_emb_df.csv')
# --- Streamlit UI: ask for an occupation title and show its closest NKZ entries ---
text1 = st.text_input("Enter the occupation to compare:", "Оператор пульта управления")

if st.button("Find Similar Occupations"):
    try:
        results = find_best_matches(text1, nkz_list, embeddings_nkz_df, top_n=10)
        # find_best_matches returns a DataFrame, so a bare "is not None" check
        # is always true; test for emptiness so the warning can actually show.
        if results is not None and not results.empty:
            st.write("Similar Occupations:")
            st.dataframe(results)
        else:
            st.warning("No similar occupations found.")
    except Exception as e:
        # Top-level UI boundary: report the failure to the user.
        st.error(f"An error occurred: {e}")