Spaces:

cn91
/

zaoju-demo

Sleeping

App Files Files Community

cn91 committed on Aug 16, 2023

Commit

0a6e4e2

•

1 Parent(s): a6be0b1

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -6

app.py CHANGED Viewed

@@ -11,12 +11,12 @@ if USE_GPU and torch.cuda.is_available():
 else:
     device = torch.device('cpu')
-#MODEL_NAME_CHINESE = "IDEA-CCNL/Erlangshen-DeBERTa-v2-186M-Chinese-SentencePiece"
-MODEL_NAME_CHINESE = "IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-CWS-Chinese"
 WORD_PROBABILITY_THRESHOLD = 0.02
-TOP_K_WORDS = 10
 CHINESE_WORDLIST = ['一定','一样','不得了','主观','从此','便于','俗话','倒霉','候选','充沛','分别','反倒','只好','同情','吹捧','咳嗽','围绕','如意','实行','将近','就职','应该','归还','当面','忘记','急忙','恢复','悲哀','感冒','成长','截至','打架','把握','报告','抱怨','担保','拒绝','拜访','拥护','拳头','拼搏','损坏','接待','握手','揭发','攀登','显示','普遍','未免','欣赏','正式','比如','流浪','涂抹','深刻','演绎','留念','瞻仰','确保','稍微','立刻','精心','结算','罕见','访问','请示','责怪','起初','转达','辅导','过瘾','运动','连忙','适合','遭受','重叠','镇静']
@@ -24,8 +24,15 @@ CHINESE_WORDLIST = ['一定','一样','不得了','主观','从此','便于','
 def get_model_chinese():
     return pipeline("fill-mask", MODEL_NAME_CHINESE, device = device)
 def assess_chinese(word, sentence):
     print("Assessing Chinese")
     if sentence.lower().find(word.lower()) == -1:
         print('Sentence does not contain the word!')
         return
@@ -35,15 +42,27 @@ def assess_chinese(word, sentence):
     top_k_prediction = mask_filler_chinese(text, top_k=TOP_K_WORDS)
     target_word_prediction = mask_filler_chinese(text, targets = word)
     score = target_word_prediction[0]['score']
     # append the original word if its not found in the results
-    top_k_prediction_filtered = [output for output in top_k_prediction if \
                                  output['token_str'] == word]
     if len(top_k_prediction_filtered) == 0:
-        top_k_prediction.extend(target_word_prediction)
-    return top_k_prediction, score
 def assess_sentence(word, sentence):
     return assess_chinese(word, sentence)

 else:
     device = torch.device('cpu')
+MODEL_NAME_CHINESE = "IDEA-CCNL/Erlangshen-DeBERTa-v2-186M-Chinese-SentencePiece"
+#MODEL_NAME_CHINESE = "IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-CWS-Chinese"
 WORD_PROBABILITY_THRESHOLD = 0.02
+TOP_K_WORDS = 200
 CHINESE_WORDLIST = ['一定','一样','不得了','主观','从此','便于','俗话','倒霉','候选','充沛','分别','反倒','只好','同情','吹捧','咳嗽','围绕','如意','实行','将近','就职','应该','归还','当面','忘记','急忙','恢复','悲哀','感冒','成长','截至','打架','把握','报告','抱怨','担保','拒绝','拜访','拥护','拳头','拼搏','损坏','接待','握手','揭发','攀登','显示','普遍','未免','欣赏','正式','比如','流浪','涂抹','深刻','演绎','留念','瞻仰','确保','稍微','立刻','精心','结算','罕见','访问','请示','责怪','起初','转达','辅导','过瘾','运动','连忙','适合','遭受','重叠','镇静']
 def get_model_chinese():
     return pipeline("fill-mask", MODEL_NAME_CHINESE, device = device)
+@st.cache_resource
+def get_allowed_tokens():
+    df = pd.read_csv('allowed_token_ids.csv')
+    return set(list(df['token']))
 def assess_chinese(word, sentence):
     print("Assessing Chinese")
+    allowed_token_ids = get_allowed_tokens()
     if sentence.lower().find(word.lower()) == -1:
         print('Sentence does not contain the word!')
         return
     top_k_prediction = mask_filler_chinese(text, top_k=TOP_K_WORDS)
     target_word_prediction = mask_filler_chinese(text, targets = word)
+    norm_factor = 0
+    for output in top_k_prediction:
+        if output['token'] not in allowed_token_ids:
+            norm_factor += output['score']
+    top_k_prediction_new = []
+    for output in top_k_prediction:
+        if output['token'] in allowed_token_ids:
+            output['score'] = output['score']/(1-min(0.5,norm_factor))
+            top_k_prediction_new.append(output)
+    target_word_prediction[0]['score'] = target_word_prediction[0]['score'] / (1-min(0.5,norm_factor))
     score = target_word_prediction[0]['score']
     # append the original word if its not found in the results
+    top_k_prediction_filtered = [output for output in top_k_prediction_new if \
                                  output['token_str'] == word]
     if len(top_k_prediction_filtered) == 0:
+        top_k_prediction_new.extend(target_word_prediction)
+    return top_k_prediction_new, score
 def assess_sentence(word, sentence):
     return assess_chinese(word, sentence)