import re

import hunspell
import nltk
import numpy as np
import pandas as pd
import streamlit as st
import torch
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer
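
# Streamlit app: match a free-text occupation name (entered in Russian or
# Kazakh) to the national occupation classifier (NKZ) by cosine similarity
# of SBERT embeddings.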

nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))
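
# Note: Streamlit re-executes this script on every interaction, so the model
# loads below run on each rerun; wrapping them in st.cache_resource would be
# the usual way to load them only once.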

# Russian sentence-BERT encoder used to embed occupation names.
bert_model_name = "sberbank-ai/sbert_large_nlu_ru"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
model = AutoModel.from_pretrained(bert_model_name)
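
# Preprocessing pipeline: normalization, Kazakh -> Russian translation, then
# spell correction. generate() and correct_text() are defined further down;
# they are only called at request time, after the whole module has loaded.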

def preprocess_text(text):
    """Preprocess an occupation name before embedding."""
    if not isinstance(text, str):
        return ""

    text = text.lower().strip()
    text = re.sub(r'[^\w\s]', ' ', text)      # replace punctuation with spaces
    text = re.sub(r'-', ' ', text)            # normalize hyphens to spaces
    text = re.sub(r'[a-zA-Z0-9]', ' ', text)  # drop Latin letters and digits
    text = generate(text)                     # Kazakh -> Russian translation
    text = correct_text(text)                 # hunspell spell correction
    return text


def get_embedding(text):
    """Return a mean-pooled BERT embedding for a text."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the last hidden layer over the token dimension.
    return outputs.last_hidden_state.mean(dim=1).squeeze()
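
# Retrieval: embed the preprocessed query and rank every NKZ entry by cosine
# similarity against the precomputed embedding matrix.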

def find_best_matches(profession, nkz_list, embeddings_nkz, top_n=10):
    """Find the top-N best NKZ matches for a profession."""
    processed_profession = preprocess_text(profession)
    print(f"Preprocessed profession: {processed_profession}")

    profession_embedding = get_embedding(processed_profession).numpy().reshape(1, -1)

    similarity = cosine_similarity(profession_embedding, embeddings_nkz)

    # Indices of the top-N most similar entries, best first.
    top_n_idx = similarity[0].argsort()[-top_n:][::-1]

    top_matches = []
    for idx in top_n_idx:
        top_matches.append({
            'profession': profession,
            'nkz_ru': nkz_list.iloc[idx]['NAME_RU'],
            'nkz_kz': nkz_list.iloc[idx]['NAME_KZ'],
            'nkz_code': nkz_list.iloc[idx]['CODE'],
            'similarity': similarity[0][idx]
        })

    return pd.DataFrame(top_matches)
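
# Kazakh -> Russian machine translation (deepvk/kazRush-kk-ru), used by
# preprocess_text so Kazakh job titles can be embedded with the Russian SBERT.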

tr_model = AutoModelForSeq2SeqLM.from_pretrained('deepvk/kazRush-kk-ru')
tr_tokenizer = AutoTokenizer.from_pretrained('deepvk/kazRush-kk-ru')


def generate(text, **kwargs):
    """Translate Kazakh text to Russian with beam search."""
    inputs = tr_tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        hypotheses = tr_model.generate(**inputs, num_beams=5, **kwargs)
    return tr_tokenizer.decode(hypotheses[0], skip_special_tokens=True)
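
# Spell correction with hunspell. The Russian dictionary files ru_RU.dic and
# ru_RU.aff are expected in the working directory (paths can be overridden).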

def correct_text(text: str, dict_path: str = 'ru_RU.dic', aff_path: str = 'ru_RU.aff') -> str:
    """Spell-correct each word with a Russian hunspell dictionary."""
    h = hunspell.HunSpell(dict_path, aff_path)

    corrected_words = []
    for word in text.split():
        if not h.spell(word):
            suggestions = h.suggest(word)
            # Take the first suggestion if any; otherwise keep the word as-is.
            corrected_words.append(suggestions[0] if suggestions else word)
        else:
            corrected_words.append(word)

    return ' '.join(corrected_words)
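
# Reference data: the NKZ table with precomputed SBERT embeddings. The CSV is
# expected to provide NAME_RU, NAME_KZ and CODE columns plus an EMB column
# holding each embedding serialized as a "[v1, v2, ...]" string.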

nkz_list = pd.read_csv('nkz_list_emb_df.csv')

# Parse the stringified vectors back into numpy arrays.
nkz_list['EMB'] = nkz_list['EMB'].apply(lambda x: np.fromstring(x[1:-1], sep=', '))

# Stack the per-row vectors into a single (n_entries, embedding_dim) matrix.
embeddings_nkz_df = np.vstack(nkz_list['EMB'])
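
# Streamlit UI: read an occupation name and show the 10 closest NKZ entries.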

text1 = st.text_input("Enter the occupation to compare:", "Оператор пульта управления")

if st.button("Find Similar Occupations"):
    try:
        results = find_best_matches(text1, nkz_list, embeddings_nkz_df, top_n=10)

        # find_best_matches always returns a DataFrame, so also check for emptiness.
        if results is not None and not results.empty:
            st.write("Similar Occupations:")
            st.dataframe(results)
        else:
            st.warning("No similar occupations found.")
    except Exception as e:
        st.error(f"An error occurred: {e}")