import json
from collections import defaultdict

# Count, for every Chinese token, how many of the analyzed vocabularies contain it.
all_zh_words = defaultdict(int)

for model_name in [
    "gpt2",
    "gpt2_chinese",
    "chinese_llama_lora_7b",
    "bert_chinese",
    "moss",
    "bloom",
    "bloomz_6b4_zh",
    "gpt_nexo_20b",
    "gpt_neox_chinese_v1",
    "glm_chinese",
    "chatglm",
]:
    # Collect the distinct Chinese tokens found in this model's vocabulary.
    zh_word_set = set()
    with open(model_name + "_vocab.zh.jsonl", "r", encoding="utf-8") as f_in:
        for line in f_in:
            item = json.loads(line)
            token = item["token"]
            # "中文单字" = single Chinese character, "中文多字" = multi-character Chinese word
            if item["type"] in ["中文单字", "中文多字"]:
                zh_word_set.add(token.strip())
    # Each vocabulary contributes at most one count per token.
    for word in zh_word_set:
        all_zh_words[word] += 1

# Sort tokens by how many vocabularies they appear in, most common first.
sorted_keywords = sorted(all_zh_words.items(), key=lambda kv: kv[1], reverse=True)

with open("vocab.freq.zh.txt", "w", encoding="utf-8") as f_out:
    for word, count in sorted_keywords:
        f_out.write("%s\t%d\n" % (word, count))
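# --- Illustrative follow-up, not part of the original script ----------------
# A minimal sketch of how the aggregated counts can be consumed, assuming the
# tab-separated "word<TAB>count" layout written above. Each input line in
# <model>_vocab.zh.jsonl is assumed to look like:
#   {"token": "中国", "type": "中文多字"}
# The threshold below (tokens counted in all 11 vocabularies) is only an
# example of one possible query over the output file.
num_models = 11  # number of vocabularies aggregated above
shared_words = []
with open("vocab.freq.zh.txt", "r", encoding="utf-8") as f_in:
    for line in f_in:
        word, count = line.rstrip("\n").split("\t")
        if int(count) == num_models:
            shared_words.append(word)
print("Chinese tokens present in all %d vocabularies: %d" % (num_models, len(shared_words)))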