KoichiYasuoka
committed on
Commit
•
2adf0cf
1
Parent(s):
9c7558d
model improved
Browse files
- README.md +1 -1
- config.json +1 -1
- maker.py +3 -3
- pytorch_model.bin +2 -2
- tokenizer_config.json +0 -1
- vocab.txt +469 -0
README.md
CHANGED
@@ -16,7 +16,7 @@ pipeline_tag: "token-classification"
|
|
16 |
|
17 |
## Model Description
|
18 |
|
19 |
-
This is a RoBERTa model pre-trained on Chinese texts (both simplified and traditional) for POS-tagging and dependency-parsing (using `goeswith` for subwords), derived from [
|
20 |
|
21 |
## How to Use
|
22 |
|
|
|
16 |
|
17 |
## Model Description
|
18 |
|
19 |
+
This is a RoBERTa model pre-trained on Chinese texts (both simplified and traditional) for POS-tagging and dependency-parsing (using `goeswith` for subwords), derived from [roberta-base-chinese-upos](https://huggingface.co/KoichiYasuoka/roberta-base-chinese-upos).
|
20 |
|
21 |
## How to Use
|
22 |
|
config.json
CHANGED
@@ -644,5 +644,5 @@
|
|
644 |
"transformers_version": "4.22.1",
|
645 |
"type_vocab_size": 2,
|
646 |
"use_cache": true,
|
647 |
-
"vocab_size":
|
648 |
}
|
|
|
644 |
"transformers_version": "4.22.1",
|
645 |
"type_vocab_size": 2,
|
646 |
"use_cache": true,
|
647 |
+
"vocab_size": 21597
|
648 |
}
|
maker.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
#! /usr/bin/python3
|
2 |
-
src="
|
3 |
tgt="KoichiYasuoka/roberta-base-chinese-ud-goeswith"
|
4 |
import os
|
5 |
for d in ["UD_Chinese-GSD","UD_Chinese-GSDSimp"]:
|
@@ -45,9 +45,9 @@ trainDS=UDgoeswithDataset("train.conllu",tkz)
|
|
45 |
devDS=UDgoeswithDataset("dev.conllu",tkz)
|
46 |
testDS=UDgoeswithDataset("test.conllu",tkz)
|
47 |
lid=trainDS(devDS,testDS)
|
48 |
-
cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
|
49 |
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,evaluation_strategy="epoch",learning_rate=5e-05,warmup_ratio=0.1)
|
50 |
-
trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),train_dataset=trainDS,eval_dataset=devDS)
|
51 |
trn.train()
|
52 |
trn.save_model(tgt)
|
53 |
tkz.save_pretrained(tgt)
|
|
|
1 |
#! /usr/bin/python3
|
2 |
+
src="KoichiYasuoka/roberta-base-chinese-upos"
|
3 |
tgt="KoichiYasuoka/roberta-base-chinese-ud-goeswith"
|
4 |
import os
|
5 |
for d in ["UD_Chinese-GSD","UD_Chinese-GSDSimp"]:
|
|
|
45 |
devDS=UDgoeswithDataset("dev.conllu",tkz)
|
46 |
testDS=UDgoeswithDataset("test.conllu",tkz)
|
47 |
lid=trainDS(devDS,testDS)
|
48 |
+
cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
|
49 |
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,evaluation_strategy="epoch",learning_rate=5e-05,warmup_ratio=0.1)
|
50 |
+
trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True),train_dataset=trainDS,eval_dataset=devDS)
|
51 |
trn.train()
|
52 |
trn.save_model(tgt)
|
53 |
tkz.save_pretrained(tgt)
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d46ab28f34d65e8cd28c870bc9ca6e8d10d012e9a900df272620a292b67f5825
|
3 |
+
size 409151985
|
tokenizer_config.json
CHANGED
@@ -7,7 +7,6 @@
|
|
7 |
"never_split": null,
|
8 |
"pad_token": "[PAD]",
|
9 |
"sep_token": "[SEP]",
|
10 |
-
"special_tokens_map_file": null,
|
11 |
"strip_accents": null,
|
12 |
"tokenize_chinese_chars": true,
|
13 |
"tokenizer_class": "BertTokenizerFast",
|
|
|
7 |
"never_split": null,
|
8 |
"pad_token": "[PAD]",
|
9 |
"sep_token": "[SEP]",
|
|
|
10 |
"strip_accents": null,
|
11 |
"tokenize_chinese_chars": true,
|
12 |
"tokenizer_class": "BertTokenizerFast",
|
vocab.txt
CHANGED
@@ -21126,3 +21126,472 @@ fishbase
|
|
21126 |
##🔥
|
21127 |
##😂
|
21128 |
##😎
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21126 |
##🔥
|
21127 |
##😂
|
21128 |
##😎
|
21129 |
+
𫍟
|
21130 |
+
𧦧
|
21131 |
+
𰾵
|
21132 |
+
𨬟
|
21133 |
+
𫮃
|
21134 |
+
墠
|
21135 |
+
𪪼
|
21136 |
+
彃
|
21137 |
+
𢭏
|
21138 |
+
擣
|
21139 |
+
𥐟
|
21140 |
+
礒
|
21141 |
+
𬘝
|
21142 |
+
紾
|
21143 |
+
𫄨
|
21144 |
+
絺
|
21145 |
+
𮉪
|
21146 |
+
緅
|
21147 |
+
𰬸
|
21148 |
+
繐
|
21149 |
+
𫄸
|
21150 |
+
纁
|
21151 |
+
𮉡
|
21152 |
+
纑
|
21153 |
+
𫄥
|
21154 |
+
纚
|
21155 |
+
𦰏
|
21156 |
+
蓧
|
21157 |
+
𫉁
|
21158 |
+
薆
|
21159 |
+
訑
|
21160 |
+
𫍥
|
21161 |
+
誂
|
21162 |
+
𬤊
|
21163 |
+
諟
|
21164 |
+
𬤣
|
21165 |
+
譈
|
21166 |
+
𫐄
|
21167 |
+
軏
|
21168 |
+
𫐐
|
21169 |
+
輗
|
21170 |
+
𬨎
|
21171 |
+
輶
|
21172 |
+
𫓧
|
21173 |
+
鈇
|
21174 |
+
𨱂
|
21175 |
+
鈋
|
21176 |
+
𬱙
|
21177 |
+
頖
|
21178 |
+
𫖹
|
21179 |
+
顣
|
21180 |
+
𫗞
|
21181 |
+
飦
|
21182 |
+
𫗦
|
21183 |
+
餔
|
21184 |
+
𮩝
|
21185 |
+
餲
|
21186 |
+
𮩞
|
21187 |
+
饐
|
21188 |
+
𫗴
|
21189 |
+
饘
|
21190 |
+
𬶍
|
21191 |
+
鮀
|
21192 |
+
𫚈
|
21193 |
+
鱮
|
21194 |
+
𫛞
|
21195 |
+
鴃
|
21196 |
+
𫜁
|
21197 |
+
鷩
|
21198 |
+
㑹
|
21199 |
+
㧛
|
21200 |
+
㧞
|
21201 |
+
㨗
|
21202 |
+
㫖
|
21203 |
+
㱩
|
21204 |
+
殰
|
21205 |
+
䃅
|
21206 |
+
磾
|
21207 |
+
䌶
|
21208 |
+
䊷
|
21209 |
+
䌷
|
21210 |
+
紬
|
21211 |
+
䌹
|
21212 |
+
絅
|
21213 |
+
䘮
|
21214 |
+
䜣
|
21215 |
+
訢
|
21216 |
+
䟽
|
21217 |
+
䧟
|
21218 |
+
䯄
|
21219 |
+
騧
|
21220 |
+
䶮
|
21221 |
+
龑
|
21222 |
+
竝
|
21223 |
+
伥
|
21224 |
+
倀
|
21225 |
+
倂
|
21226 |
+
侮
|
21227 |
+
倹
|
21228 |
+
偸
|
21229 |
+
僂
|
21230 |
+
偾
|
21231 |
+
僨
|
21232 |
+
儺
|
21233 |
+
僞
|
21234 |
+
僧
|
21235 |
+
免
|
21236 |
+
兎
|
21237 |
+
勉
|
21238 |
+
勤
|
21239 |
+
勲
|
21240 |
+
卑
|
21241 |
+
卽
|
21242 |
+
厠
|
21243 |
+
廄
|
21244 |
+
嘆
|
21245 |
+
呉
|
21246 |
+
吿
|
21247 |
+
呑
|
21248 |
+
呙
|
21249 |
+
咼
|
21250 |
+
呪
|
21251 |
+
哙
|
21252 |
+
噲
|
21253 |
+
喩
|
21254 |
+
器
|
21255 |
+
嚢
|
21256 |
+
圹
|
21257 |
+
壙
|
21258 |
+
埀
|
21259 |
+
塡
|
21260 |
+
墨
|
21261 |
+
壌
|
21262 |
+
壱
|
21263 |
+
奨
|
21264 |
+
嫗
|
21265 |
+
妬
|
21266 |
+
娯
|
21267 |
+
嬋
|
21268 |
+
媭
|
21269 |
+
嬃
|
21270 |
+
賓
|
21271 |
+
尙
|
21272 |
+
尭
|
21273 |
+
屦
|
21274 |
+
屨
|
21275 |
+
巌
|
21276 |
+
巣
|
21277 |
+
帏
|
21278 |
+
幃
|
21279 |
+
帱
|
21280 |
+
幬
|
21281 |
+
幷
|
21282 |
+
廐
|
21283 |
+
廪
|
21284 |
+
廩
|
21285 |
+
廵
|
21286 |
+
忾
|
21287 |
+
愾
|
21288 |
+
怃
|
21289 |
+
憮
|
21290 |
+
愴
|
21291 |
+
懟
|
21292 |
+
恠
|
21293 |
+
惻
|
21294 |
+
悔
|
21295 |
+
懲
|
21296 |
+
愠
|
21297 |
+
慍
|
21298 |
+
愼
|
21299 |
+
憎
|
21300 |
+
懐
|
21301 |
+
戯
|
21302 |
+
扡
|
21303 |
+
拝
|
21304 |
+
挿
|
21305 |
+
揔
|
21306 |
+
摠
|
21307 |
+
摂
|
21308 |
+
摅
|
21309 |
+
攄
|
21310 |
+
擯
|
21311 |
+
撄
|
21312 |
+
攖
|
21313 |
+
擥
|
21314 |
+
擧
|
21315 |
+
攷
|
21316 |
+
敏
|
21317 |
+
斉
|
21318 |
+
旣
|
21319 |
+
暁
|
21320 |
+
暦
|
21321 |
+
朞
|
21322 |
+
殺
|
21323 |
+
枨
|
21324 |
+
棖
|
21325 |
+
柰
|
21326 |
+
橈
|
21327 |
+
梅
|
21328 |
+
梼
|
21329 |
+
檮
|
21330 |
+
棁
|
21331 |
+
梲
|
21332 |
+
椟
|
21333 |
+
櫝
|
21334 |
+
榅
|
21335 |
+
榲
|
21336 |
+
楡
|
21337 |
+
槚
|
21338 |
+
檟
|
21339 |
+
槨
|
21340 |
+
槪
|
21341 |
+
櫓
|
21342 |
+
欤
|
21343 |
+
歟
|
21344 |
+
歿
|
21345 |
+
漢
|
21346 |
+
汚
|
21347 |
+
浍
|
21348 |
+
澮
|
21349 |
+
海
|
21350 |
+
涜
|
21351 |
+
淸
|
21352 |
+
渇
|
21353 |
+
渓
|
21354 |
+
漑
|
21355 |
+
潅
|
21356 |
+
竈
|
21357 |
+
焭
|
21358 |
+
煢
|
21359 |
+
煕
|
21360 |
+
犠
|
21361 |
+
貍
|
21362 |
+
猟
|
21363 |
+
玆
|
21364 |
+
疴
|
21365 |
+
痾
|
21366 |
+
癰
|
21367 |
+
益
|
21368 |
+
著
|
21369 |
+
硁
|
21370 |
+
硜
|
21371 |
+
硗
|
21372 |
+
磽
|
21373 |
+
社
|
21374 |
+
祈
|
21375 |
+
祐
|
21376 |
+
祖
|
21377 |
+
祝
|
21378 |
+
神
|
21379 |
+
祥
|
21380 |
+
禎
|
21381 |
+
禍
|
21382 |
+
福
|
21383 |
+
秊
|
21384 |
+
穀
|
21385 |
+
穑
|
21386 |
+
穡
|
21387 |
+
穣
|
21388 |
+
穰
|
21389 |
+
突
|
21390 |
+
笾
|
21391 |
+
籩
|
21392 |
+
筚
|
21393 |
+
篳
|
21394 |
+
箪
|
21395 |
+
簞
|
21396 |
+
節
|
21397 |
+
簣
|
21398 |
+
簒
|
21399 |
+
籴
|
21400 |
+
糴
|
21401 |
+
類
|
21402 |
+
粛
|
21403 |
+
緖
|
21404 |
+
緜
|
21405 |
+
緼
|
21406 |
+
縕
|
21407 |
+
縦
|
21408 |
+
繁
|
21409 |
+
繊
|
21410 |
+
繍
|
21411 |
+
绐
|
21412 |
+
紿
|
21413 |
+
绖
|
21414 |
+
絰
|
21415 |
+
绤
|
21416 |
+
綌
|
21417 |
+
绹
|
21418 |
+
綯
|
21419 |
+
缁
|
21420 |
+
緇
|
21421 |
+
缊
|
21422 |
+
缌
|
21423 |
+
緦
|
21424 |
+
缗
|
21425 |
+
緡
|
21426 |
+
缧
|
21427 |
+
縲
|
21428 |
+
缫
|
21429 |
+
繅
|
21430 |
+
缵
|
21431 |
+
纘
|
21432 |
+
者
|
21433 |
+
聡
|
21434 |
+
臭
|
21435 |
+
茕
|
21436 |
+
荅
|
21437 |
+
荛
|
21438 |
+
蕘
|
21439 |
+
蒉
|
21440 |
+
蕢
|
21441 |
+
蔂
|
21442 |
+
虆
|
21443 |
+
薫
|
21444 |
+
虜
|
21445 |
+
虯
|
21446 |
+
蛍
|
21447 |
+
鼃
|
21448 |
+
螻
|
21449 |
+
蝿
|
21450 |
+
褐
|
21451 |
+
視
|
21452 |
+
觌
|
21453 |
+
覿
|
21454 |
+
覲
|
21455 |
+
觴
|
21456 |
+
訚
|
21457 |
+
誾
|
21458 |
+
諸
|
21459 |
+
謁
|
21460 |
+
謹
|
21461 |
+
讐
|
21462 |
+
讎
|
21463 |
+
讦
|
21464 |
+
訐
|
21465 |
+
讱
|
21466 |
+
訒
|
21467 |
+
謳
|
21468 |
+
诎
|
21469 |
+
詘
|
21470 |
+
诐
|
21471 |
+
詖
|
21472 |
+
诒
|
21473 |
+
詒
|
21474 |
+
诔
|
21475 |
+
誄
|
21476 |
+
诼
|
21477 |
+
諑
|
21478 |
+
諛
|
21479 |
+
諂
|
21480 |
+
谇
|
21481 |
+
誶
|
21482 |
+
讒
|
21483 |
+
謚
|
21484 |
+
謫
|
21485 |
+
谮
|
21486 |
+
譖
|
21487 |
+
賛
|
21488 |
+
贶
|
21489 |
+
貺
|
21490 |
+
賅
|
21491 |
+
赆
|
21492 |
+
贐
|
21493 |
+
赉
|
21494 |
+
賚
|
21495 |
+
踯
|
21496 |
+
躑
|
21497 |
+
轫
|
21498 |
+
軔
|
21499 |
+
軻
|
21500 |
+
辂
|
21501 |
+
輅
|
21502 |
+
辔
|
21503 |
+
轡
|
21504 |
+
逓
|
21505 |
+
逸
|
21506 |
+
郞
|
21507 |
+
都
|
21508 |
+
鄕
|
21509 |
+
酔
|
21510 |
+
醤
|
21511 |
+
釈
|
21512 |
+
鋭
|
21513 |
+
鋳
|
21514 |
+
錬
|
21515 |
+
鎭
|
21516 |
+
鑚
|
21517 |
+
鉞
|
21518 |
+
鑠
|
21519 |
+
锜
|
21520 |
+
錡
|
21521 |
+
锸
|
21522 |
+
鍤
|
21523 |
+
镃
|
21524 |
+
鎡
|
21525 |
+
镒
|
21526 |
+
鎰
|
21527 |
+
镦
|
21528 |
+
鐓
|
21529 |
+
鬭
|
21530 |
+
闋
|
21531 |
+
陥
|
21532 |
+
隷
|
21533 |
+
難
|
21534 |
+
雠
|
21535 |
+
靁
|
21536 |
+
靑
|
21537 |
+
韫
|
21538 |
+
韞
|
21539 |
+
頻
|
21540 |
+
顕
|
21541 |
+
顚
|
21542 |
+
顙
|
21543 |
+
飮
|
21544 |
+
餍
|
21545 |
+
饜
|
21546 |
+
饩
|
21547 |
+
餼
|
21548 |
+
飭
|
21549 |
+
馑
|
21550 |
+
饉
|
21551 |
+
騒
|
21552 |
+
駟
|
21553 |
+
駢
|
21554 |
+
驪
|
21555 |
+
骍
|
21556 |
+
騂
|
21557 |
+
騫
|
21558 |
+
鬪
|
21559 |
+
鲂
|
21560 |
+
魴
|
21561 |
+
鲐
|
21562 |
+
鮐
|
21563 |
+
鲧
|
21564 |
+
鯀
|
21565 |
+
鳏
|
21566 |
+
鰥
|
21567 |
+
鴈
|
21568 |
+
鶏
|
21569 |
+
鷄
|
21570 |
+
鸱
|
21571 |
+
鴟
|
21572 |
+
鸷
|
21573 |
+
鷙
|
21574 |
+
鵠
|
21575 |
+
鶻
|
21576 |
+
鹥
|
21577 |
+
鷖
|
21578 |
+
鹯
|
21579 |
+
鸇
|
21580 |
+
麪
|
21581 |
+
麹
|
21582 |
+
黙
|
21583 |
+
鼇
|
21584 |
+
鼈
|
21585 |
+
黿
|
21586 |
+
鼌
|
21587 |
+
鼂
|
21588 |
+
鼍
|
21589 |
+
鼉
|
21590 |
+
龁
|
21591 |
+
齕
|
21592 |
+
隆
|
21593 |
+
精
|
21594 |
+
羽
|
21595 |
+
飯
|
21596 |
+
館
|
21597 |
+
既
|