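# Streamlit app setup: imports, model loaders, and language data (stop words,
# CEFR lexical minimums, multiword phrases) used to pick target words and
# distractors by level.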
import json
import spacy
import gensim
import pymorphy2
import streamlit as st
from transformers import pipeline
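# Loaders for the heavyweight NLP resources. Wrapping them in
# @st.cache_resource (an assumption here, but the idiomatic Streamlit pattern)
# keeps each model in memory across reruns instead of reloading it on every
# user interaction.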
@st.cache_resource  # assumed: cache the analyzer across Streamlit reruns
def load_morph():
    _morph = pymorphy2.MorphAnalyzer(lang='ru')
    return _morph
@st.cache_resource
def load_w2v(model_path):
    _w2v_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
    return _w2v_model
@st.cache_resource
def load_spacy():
    _nlp = spacy.load('ru_core_news_lg')
    return _nlp
@st.cache_resource
def load_bert():
    return pipeline("fill-mask", model="a-v-white/ruBert-base-finetuned-russian-moshkov-child-corpus-pro")
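# Example (assumed usage): a fill-mask pipeline returns candidate fillers for
# the mask token as a list of dicts with 'token_str' and 'score' keys, e.g.
#   load_bert()("Мама мыла [MASK].")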
nlp = load_spacy()
morph = load_morph()
w2v_model1_path = r'model1.gz'
w2v_model2_path = r'model2.gz'
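# The word2vec models themselves are loaded on demand via load_w2v, e.g.
# (assumed usage): load_w2v(w2v_model1_path).most_similar('дом', topn=10)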
# Load the stop list
stop_list = set()
with open(r'language_data/stop_words.txt', 'r', encoding='utf-8') as read_file:
    for line in read_file:
        stop_list.add(line.strip())
# Load the CEFR lexical minimums (one word per line per level). Note: all six
# paths are read, so the C1/C2 sets used below are actually populated.
a1_path, a1_target_set = r'language_data/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'language_data/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'language_data/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'language_data/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'language_data/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'language_data/C2_MINIMUM.txt', set()
minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
for path, target_set in zip(minimums_paths, minimums_sets):
    with open(path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            target_set.add(line.strip())
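# Distractor pools: each level's pool is its own minimum merged with the
# minimum of the adjacent lower level, so distractors stay near the learner's
# vocabulary range.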
a1_distractor_set = set(a1_target_set)  # copy, so later mutations don't touch the minimum
a2_distractor_set = a2_target_set.union(a1_target_set)
b1_distractor_set = b1_target_set.union(a2_target_set)
b2_distractor_set = b2_target_set.union(b1_target_set)
c1_distractor_set = c1_target_set.union(b2_target_set)
c2_distractor_set = c2_target_set.union(c1_target_set)
with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])
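# PHRASES presumably holds multiword expressions; storing them in a set gives
# O(1) membership checks, e.g. (assumed data): 'до сих пор' in PHRASES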
# Per-level similarity thresholds for the w2v and BERT pipelines
# ('Без уровня' = "no level"); all default to 1.0.
SIMILARITY_VALUES_w2v = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0, 'C1': 1.0, 'C2': 1.0, 'Без уровня': 1.0}
SIMILARITY_VALUES_bert = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0, 'C1': 1.0, 'C2': 1.0, 'Без уровня': 1.0}
# Presumably accumulates user-supplied target words that could not be processed.
BAD_USER_TARGET_WORDS = []