Spaces:

a-v-bely
/

russian-task-generator

Running

russian-task-generator / utilities_language_general /rus_constants.py

togokah

Update code and models

1c4aea6 about 2 years ago

2.34 kB

	import json
	import spacy
	import gensim
	import pymorphy2
	import streamlit as st
	from transformers import pipeline


	@st.cache_resource
	def load_morph():
	    """Build the pymorphy2 morphological analyzer for Russian.

	    Decorated with ``st.cache_resource`` so that repeated calls can reuse
	    one analyzer instead of constructing a new one each time.

	    Returns:
	        The ``pymorphy2.MorphAnalyzer`` instance created with ``lang='ru'``.
	    """
	    return pymorphy2.MorphAnalyzer(lang='ru')


	@st.cache_resource
	def load_w2v(model_path):
	    """Load word vectors stored in the binary word2vec format.

	    Decorated with ``st.cache_resource``, so the loaded vectors are cached
	    by Streamlit rather than re-read on every call.

	    Args:
	        model_path: Location of the word2vec file handed to gensim.

	    Returns:
	        The ``gensim`` ``KeyedVectors`` object read from ``model_path``.
	    """
	    read_word2vec = gensim.models.KeyedVectors.load_word2vec_format
	    return read_word2vec(model_path, binary=True)


	@st.cache_resource
	def load_spacy():
	    """Load the spaCy ``ru_core_news_lg`` pipeline.

	    Decorated with ``st.cache_resource`` so the pipeline can be shared
	    between calls instead of being loaded again.

	    Returns:
	        The object returned by ``spacy.load('ru_core_news_lg')``.
	    """
	    return spacy.load('ru_core_news_lg')


	@st.cache_resource
	def load_bert():
	    """Create the ``fill-mask`` transformers pipeline used by this module.

	    Decorated with ``st.cache_resource`` so the pipeline object can be
	    reused between calls.

	    Returns:
	        The ``transformers`` pipeline built for the ``fill-mask`` task with
	        the fine-tuned ruBert checkpoint named below.
	    """
	    model_name = "a-v-white/ruBert-base-finetuned-russian-moshkov-child-corpus-pro"
	    fill_mask_pipeline = pipeline("fill-mask", model=model_name)
	    return fill_mask_pipeline


	# Shared NLP resources, obtained at import time through the cached loaders.
	nlp = load_spacy()
	morph = load_morph()

	# Word2vec model files: only the path strings are defined here, nothing is
	# loaded by these assignments.
	w2v_model1_path, w2v_model2_path = r'model1.gz', r'model2.gz'

	# Load the stop list: one entry per line, surrounding whitespace stripped.
	with open(r'language_data/stop_words.txt', 'r', encoding='utf-8') as stop_words_file:
	    stop_list = {entry.strip() for entry in stop_words_file}

	# Lexical minimums: one word-list file and one target set per level.
	a1_path = r'language_data/A1_MINIMUM.txt'
	a2_path = r'language_data/A2_MINIMUM.txt'
	b1_path = r'language_data/B1_MINIMUM.txt'
	b2_path = r'language_data/B2_MINIMUM.txt'
	c1_path = r'language_data/C1_MINIMUM.txt'
	c2_path = r'language_data/C2_MINIMUM.txt'

	a1_target_set = set()
	a2_target_set = set()
	b1_target_set = set()
	b2_target_set = set()
	c1_target_set = set()
	c2_target_set = set()

	# NOTE(review): only the A1-B2 paths are listed, so c1_target_set and
	# c2_target_set are never filled and stay empty (c1_path / c2_path are
	# unused here). Confirm this is intentional, e.g. the C-level files may
	# not be shipped.
	minimums_paths = (a1_path, a2_path, b1_path, b2_path)
	minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)

	# zip stops at the shorter tuple, i.e. after the four listed paths. Each
	# set is filled in place so the names above keep pointing at the same data.
	for minimum_path, target_set in zip(minimums_paths, minimums_sets):
	    with open(minimum_path, 'r', encoding='utf-8') as minimum_file:
	        target_set.update(entry.strip() for entry in minimum_file)

	# Distractor pools: each level combines its own minimum with the minimum of
	# the level directly below it. A1 has no lower level and simply reuses its
	# target set (same object, not a copy).
	a1_distractor_set = a1_target_set
	a2_distractor_set = a2_target_set | a1_target_set
	b1_distractor_set = b1_target_set | a2_target_set
	b2_distractor_set = b2_target_set | b1_target_set
	c1_distractor_set = c1_target_set | b2_target_set
	c2_distractor_set = c2_target_set | c1_target_set

	# Read the phrase list stored under the 'PHRASES' key and keep it as a set.
	with open('language_data/phrases.json', 'r', encoding='utf-8') as phrases_file:
	    _phrases_payload = json.load(phrases_file)
	PHRASES = set(_phrases_payload['PHRASES'])

	# Keys shared by both similarity tables: the six level labels plus
	# 'Без уровня' (Russian for "No level").
	_SIMILARITY_LEVEL_KEYS = ('A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'Без уровня')

	# Per-level similarity values for the word2vec and BERT models. Every level
	# is currently set to 1.0; the two tables are separate dict objects.
	SIMILARITY_VALUES_w2v = dict.fromkeys(_SIMILARITY_LEVEL_KEYS, 1.0)
	SIMILARITY_VALUES_bert = dict.fromkeys(_SIMILARITY_LEVEL_KEYS, 1.0)

	# Starts out empty at import time.
	BAD_USER_TARGET_WORDS = []