Spaces:
Sleeping
Sleeping
Update app.py
Browse files — change to Chinese only
app.py
CHANGED
@@ -11,41 +11,19 @@ if USE_GPU and torch.cuda.is_available():
|
|
11 |
else:
|
12 |
device = torch.device('cpu')
|
13 |
|
14 |
-
MODEL_NAME_ENGLISH = "facebook/xlm-v-base"
|
15 |
-
#SENTENCE_MODEL_NAME_ENGLISH = 'sentence-transformers/all-MiniLM-L6-v2'
|
16 |
-
#WORD_MODEL_NAME_ENGLISH = 'vocab-transformers/distilbert-word2vec_256k-MLM_best'
|
17 |
-
|
18 |
-
# chinese models
|
19 |
MODEL_NAME_CHINESE = "IDEA-CCNL/Erlangshen-DeBERTa-v2-186M-Chinese-SentencePiece"
|
20 |
|
21 |
WORD_PROBABILITY_THRESHOLD = 0.02
|
22 |
-
#WORD_PROBABILITY_THRESHOLD_ENGLISH = 0.02
|
23 |
-
#WORD_PROBABILITY_THRESHOLD_CHINESE = 0.02
|
24 |
TOP_K_WORDS = 10
|
25 |
|
26 |
-
ENGLISH_LANG = "English"
|
27 |
-
CHINESE_LANG = "Chinese"
|
28 |
-
|
29 |
CHINESE_WORDLIST = ['一定','一样','不得了','主观','从此','便于','俗话','倒霉','候选','充沛','分别','反倒','只好','同情','吹捧','咳嗽','围绕','如意','实行','将近','就职','应该','归还','当面','忘记','急忙','恢复','悲哀','感冒','成长','截至','打架','把握','报告','抱怨','担保','拒绝','拜访','拥护','拳头','拼搏','损坏','接待','握手','揭发','攀登','显示','普遍','未免','欣赏','正式','比如','流浪','涂抹','深刻','演绎','留念','瞻仰','确保','稍微','立刻','精心','结算','罕见','访问','请示','责怪','起初','转达','辅导','过瘾','运动','连忙','适合','遭受','重叠','镇静']
|
30 |
|
31 |
@st.cache_resource
|
32 |
def get_model_chinese():
|
33 |
return pipeline("fill-mask", MODEL_NAME_CHINESE, device = device)
|
34 |
|
35 |
-
@st.cache_resource
|
36 |
-
def get_model_english():
|
37 |
-
return pipeline("fill-mask", MODEL_NAME_ENGLISH, device = device)
|
38 |
-
|
39 |
-
@st.cache_data
|
40 |
-
def get_wordlist_chinese():
|
41 |
-
return pd.read_csv('wordlist_chinese.csv')
|
42 |
-
|
43 |
-
@st.cache_data
|
44 |
-
def get_wordlist_english():
|
45 |
-
return pd.read_csv('wordlist_english.csv')
|
46 |
-
|
47 |
def assess_chinese(word, sentence):
|
48 |
-
print("Assessing
|
49 |
if sentence.lower().find(word.lower()) == -1:
|
50 |
print('Sentence does not contain the word!')
|
51 |
return
|
@@ -65,57 +43,19 @@ def assess_chinese(word, sentence):
|
|
65 |
|
66 |
return top_k_prediction, score
|
67 |
|
68 |
-
def
|
69 |
-
|
70 |
-
raise Exception("Sentence does not contain the target word")
|
71 |
-
|
72 |
-
text = sentence.replace(word.lower(), "<mask>")
|
73 |
-
|
74 |
-
top_k_prediction = mask_filler_english(text, top_k=TOP_K_WORDS)
|
75 |
-
target_word_prediction = mask_filler_english(text, targets = chr(9601)+word)
|
76 |
-
|
77 |
-
score = target_word_prediction[0]['score']
|
78 |
-
|
79 |
-
# append the original word if its not found in the results
|
80 |
-
top_k_prediction_filtered = [output for output in top_k_prediction if \
|
81 |
-
output['token_str'] == word]
|
82 |
-
if len(top_k_prediction_filtered) == 0:
|
83 |
-
top_k_prediction.extend(target_word_prediction)
|
84 |
-
|
85 |
-
return top_k_prediction, score
|
86 |
-
|
87 |
-
def assess_sentence(language, word, sentence):
|
88 |
-
if (language == ENGLISH_LANG):
|
89 |
-
return assess_english(word, sentence)
|
90 |
-
elif (language == CHINESE_LANG):
|
91 |
-
return assess_chinese(word, sentence)
|
92 |
|
93 |
def get_chinese_word():
|
94 |
-
|
95 |
-
|
96 |
-
word = possible_words.sample(1).iloc[0].Chinese
|
97 |
-
test_words = CHINESE_WORDLIST
|
98 |
-
word = np.random.choice(test_words)
|
99 |
-
return word
|
100 |
-
|
101 |
-
def get_english_word():
|
102 |
-
include = (wordlist_english.assess == True)
|
103 |
-
possible_words = wordlist_english[include]
|
104 |
-
word = possible_words.sample(1).iloc[0].word
|
105 |
-
test_words = ["independent","satisfied","excited"]
|
106 |
-
word = np.random.choice(test_words)
|
107 |
return word
|
108 |
|
109 |
-
def get_word(
|
110 |
-
|
111 |
-
return get_english_word()
|
112 |
-
elif (language == CHINESE_LANG):
|
113 |
-
return get_chinese_word()
|
114 |
|
115 |
mask_filler_chinese = get_model_chinese()
|
116 |
-
mask_filler_english = get_model_english()
|
117 |
wordlist_chinese = get_wordlist_chinese()
|
118 |
-
wordlist_english = get_wordlist_english()
|
119 |
|
120 |
def highlight_given_word(row):
|
121 |
color = '#ACE5EE' if row.Words == target_word else 'white'
|
@@ -141,23 +81,21 @@ def get_top_5_results(top_k_prediction):
|
|
141 |
|
142 |
#### Streamlit Page
|
143 |
st.title("造句 Auto-marking Demo")
|
144 |
-
language = st.radio("Select your language", (ENGLISH_LANG, CHINESE_LANG))
|
145 |
-
#st.info("You are practising on " + language)
|
146 |
|
147 |
if 'target_word' not in st.session_state:
|
148 |
-
st.session_state['target_word'] = get_word(
|
149 |
target_word = st.session_state['target_word']
|
150 |
|
151 |
st.write("Target word: ", target_word)
|
152 |
if st.button("Get new word"):
|
153 |
-
st.session_state['target_word'] = get_word(
|
154 |
st.experimental_rerun()
|
155 |
|
156 |
st.subheader("Form your sentence and input below!")
|
157 |
sentence = st.text_input('Enter your sentence here', placeholder="Enter your sentence here!")
|
158 |
|
159 |
if st.button("Grade"):
|
160 |
-
top_k_prediction, score = assess_sentence(
|
161 |
with open('./result01.json', 'w') as outfile:
|
162 |
outfile.write(str(top_k_prediction))
|
163 |
|
|
|
11 |
else:
|
12 |
device = torch.device('cpu')
|
13 |
|
|
|
|
|
|
|
|
|
|
|
14 |
# Model identifier passed to the fill-mask pipeline in get_model_chinese().
MODEL_NAME_CHINESE = "IDEA-CCNL/Erlangshen-DeBERTa-v2-186M-Chinese-SentencePiece"

# NOTE(review): the consumer of this threshold is outside the visible code;
# presumably the minimum prediction score treated as acceptable -- confirm.
WORD_PROBABILITY_THRESHOLD = 0.02

# NOTE(review): presumably the number of fill-mask candidates requested per
# call (top_k) -- confirm against assess_chinese().
TOP_K_WORDS = 10
|
18 |
|
|
|
|
|
|
|
19 |
CHINESE_WORDLIST = ['一定','一样','不得了','主观','从此','便于','俗话','倒霉','候选','充沛','分别','反倒','只好','同情','吹捧','咳嗽','围绕','如意','实行','将近','就职','应该','归还','当面','忘记','急忙','恢复','悲哀','感冒','成长','截至','打架','把握','报告','抱怨','担保','拒绝','拜访','拥护','拳头','拼搏','损坏','接待','握手','揭发','攀登','显示','普遍','未免','欣赏','正式','比如','流浪','涂抹','深刻','演绎','留念','瞻仰','确保','稍微','立刻','精心','结算','罕见','访问','请示','责怪','起初','转达','辅导','过瘾','运动','连忙','适合','遭受','重叠','镇静']
|
20 |
|
21 |
@st.cache_resource
def get_model_chinese():
    """Build the Chinese fill-mask pipeline.

    The pipeline is created for ``MODEL_NAME_CHINESE`` on the module-level
    ``device``. The function is wrapped in ``st.cache_resource``.
    """
    fill_mask = pipeline("fill-mask", MODEL_NAME_CHINESE, device=device)
    return fill_mask
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
def assess_chinese(word, sentence):
|
26 |
+
print("Assessing Chinese")
|
27 |
if sentence.lower().find(word.lower()) == -1:
|
28 |
print('Sentence does not contain the word!')
|
29 |
return
|
|
|
43 |
|
44 |
return top_k_prediction, score
|
45 |
|
46 |
+
def assess_sentence(word, sentence):
    """Grade ``sentence`` for the target ``word``.

    Only Chinese is handled, so this delegates to ``assess_chinese`` and
    returns its result unchanged.
    """
    result = assess_chinese(word, sentence)
    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
def get_chinese_word():
    """Return one entry of ``CHINESE_WORDLIST`` chosen by ``np.random.choice``."""
    return np.random.choice(CHINESE_WORDLIST)
|
53 |
|
54 |
+
def get_word():
    """Return a new target word by deferring to ``get_chinese_word``."""
    new_word = get_chinese_word()
    return new_word
|
|
|
|
|
|
|
56 |
|
57 |
# Build the fill-mask pipeline at import time; get_model_chinese() is wrapped
# in st.cache_resource.
mask_filler_chinese = get_model_chinese()

# get_wordlist_chinese() was deleted together with the English code paths, so
# calling it here raised NameError as soon as the script ran. Target words are
# now drawn from the in-file CHINESE_WORDLIST; the name stays bound to that
# list in case code outside this view still refers to it.
wordlist_chinese = CHINESE_WORDLIST
|
|
|
59 |
|
60 |
def highlight_given_word(row):
|
61 |
color = '#ACE5EE' if row.Words == target_word else 'white'
|
|
|
81 |
|
82 |
#### Streamlit Page
|
83 |
st.title("造句 Auto-marking Demo")
|
|
|
|
|
84 |
|
85 |
if 'target_word' not in st.session_state:
|
86 |
+
st.session_state['target_word'] = get_word()
|
87 |
target_word = st.session_state['target_word']
|
88 |
|
89 |
st.write("Target word: ", target_word)
|
90 |
if st.button("Get new word"):
|
91 |
+
st.session_state['target_word'] = get_word()
|
92 |
st.experimental_rerun()
|
93 |
|
94 |
st.subheader("Form your sentence and input below!")
|
95 |
sentence = st.text_input('Enter your sentence here', placeholder="Enter your sentence here!")
|
96 |
|
97 |
if st.button("Grade"):
|
98 |
+
top_k_prediction, score = assess_sentence(target_word, sentence)
|
99 |
with open('./result01.json', 'w') as outfile:
|
100 |
outfile.write(str(top_k_prediction))
|
101 |
|