|
import re |
|
import pandas as pd |
|
import pymorphy2 |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from sklearn.metrics import accuracy_score |
|
from translate import Translator |
|
import streamlit as st |
|
|
|
|
|
|
|
# Morphological analyzer shared by the whole module; preprocess_text uses it
# to reduce each word to its normal form.
morph = pymorphy2.MorphAnalyzer()

# NKZ reference table; later code reads its NAME_RU2 and CODE columns.
nkz_list = pd.read_csv('filtered_nkz.csv')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from translate import Translator |
|
|
|
def translate_text(text, from_lang='kk', to_lang='ru'):
    """
    Translate text from one language to another, best-effort.

    :param text: Source text to translate.
    :param from_lang: Language of the source text (Kazakh, 'kk', by default).
    :param to_lang: Target language (Russian, 'ru', by default).
    :return: The translated text, or ``text`` unchanged if translation raised.
    """
    try:
        return Translator(from_lang=from_lang, to_lang=to_lang).translate(text)
    except Exception as err:
        # Translation is optional: report the problem and fall through to
        # returning the untranslated input instead of failing the caller.
        print(f"Ошибка перевода: {err}")
    return text
|
|
|
|
|
# Letters that exist in the Kazakh Cyrillic alphabet but not in the Russian
# one; preprocess_text treats the presence of any of them as a sign that the
# text is Kazakh. The trailing \u04bb is "һ" (written as an escape because it
# is easily confused with Latin "h"); it is needed for words whose only
# distinctive letter is һ, e.g. "жиһаз".
kazakh_letters = set("әғқңөұүі\u04bb")
|
|
|
|
|
|
|
def preprocess_text(text):
    """Normalize a title: translate if Kazakh, clean, lowercase, lemmatize.

    :param text: Raw text; any non-string value is treated as missing.
    :return: Space-separated normal forms of the remaining words, or ``""``
        for non-string input or when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ""

    # Detection is case-insensitive, but the original text is what gets
    # passed to translate_text.
    if any(char in kazakh_letters for char in text.lower()):
        text = translate_text(text)

    text = text.lower().strip()
    # Replace punctuation with a space instead of deleting it, so hyphenated
    # or slash-separated titles ("инженер-программист") remain separate words
    # rather than fusing into a single token.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Drop Latin letters and ASCII digits.
    text = re.sub(r'[a-zA-Z0-9]', '', text)

    # Use the first parse returned by the analyzer for each word.
    return ' '.join(morph.parse(word)[0].normal_form for word in text.split())
|
|
|
|
|
|
|
|
|
def find_best_matches(profession, nkz_list, vectorizer, tfidf_nkz, top_n=10):
    """Return the ``top_n`` NKZ entries most similar to one occupation title.

    :param profession: Raw occupation title; it is normalized with
        ``preprocess_text`` before vectorization.
    :param nkz_list: DataFrame providing the ``NAME_RU2`` and ``CODE`` columns.
        Its rows are assumed to be in the same order as the rows of
        ``tfidf_nkz`` (positions are looked up with ``iloc``).
    :param vectorizer: Object whose ``transform`` turns a list of strings into
        vectors comparable with ``tfidf_nkz``.
    :param tfidf_nkz: Matrix of vectorized NKZ titles.
    :param top_n: Maximum number of matches to return (10 by default).
    :return: DataFrame with columns ``profession``, ``nkz_match``,
        ``nkz_code`` and ``similarity``, ordered by descending similarity.
        Empty (but with the same columns) when ``top_n`` is not positive.
    """
    columns = ['profession', 'nkz_match', 'nkz_code', 'similarity']

    # scores[-0:] would select every row and a negative top_n would select an
    # arbitrary tail, so a non-positive request yields no rows at all.
    if top_n <= 0:
        return pd.DataFrame(columns=columns)

    processed_profession = preprocess_text(profession)
    # Echo the normalized query to stdout.
    print(processed_profession)

    tfidf_profession = vectorizer.transform([processed_profession])

    # Single query row -> take row 0 of the similarity matrix.
    scores = cosine_similarity(tfidf_profession, tfidf_nkz)[0]

    # argsort is ascending: keep the last top_n positions, best first.
    top_n_idx = scores.argsort()[-top_n:][::-1]

    top_matches = [
        {
            'profession': processed_profession,
            'nkz_match': nkz_list.iloc[idx]['NAME_RU2'],
            'nkz_code': nkz_list.iloc[idx]['CODE'],
            'similarity': scores[idx],
        }
        for idx in top_n_idx
    ]
    # Explicit columns keep the schema stable even when there are no matches.
    return pd.DataFrame(top_matches, columns=columns)
|
|
|
|
|
|
|
|
|
# Lemmatize every NKZ title and fit the TF-IDF model on the result.
# NOTE(review): Streamlit re-executes the script on each interaction, so this
# preprocessing and fitting is presumably repeated on every rerun — consider
# caching it (st.cache_data / st.cache_resource) if the installed Streamlit
# version provides those; confirm before changing.
nkz_list['cleaned'] = nkz_list['NAME_RU2'].apply(preprocess_text)

vectorizer = TfidfVectorizer()
tfidf_nkz = vectorizer.fit_transform(nkz_list['cleaned'])

st.title("Occupation Similarity Finder")

text1 = st.text_input("Enter the occupation to compare:", "Оператор пульта управления")

if st.button("Find Similar Occupations"):
    try:
        results = find_best_matches(text1, nkz_list, vectorizer, tfidf_nkz, top_n=10)

        # find_best_matches always returns a DataFrame, so "nothing found" has
        # to be detected from its content: rows with zero cosine similarity
        # share no terms with the query and are not real matches.
        if results is not None and not results.empty:
            results = results[results['similarity'] > 0]

        if results is None or results.empty:
            st.warning("No similar occupations found.")
        else:
            st.write("Similar Occupations:")
            st.dataframe(results)
    except Exception as e:
        # UI boundary: show the failure on the page instead of a traceback.
        st.error(f"An error occurred: {e}")
|
|