KoichiYasuoka committed
Commit 5f1d37e
Parent(s): f643899

model changed

Files changed:
- README.md +3 -3
- config.json +2 -5
- maker.py +5 -5
- pytorch_model.bin +2 -2
- tokenizer_config.json +1 -0
README.md CHANGED
@@ -16,7 +16,7 @@ pipeline_tag: "token-classification"
 
 ## Model Description
 
-This is a RoBERTa model pre-trained on Chinese Wikipedia texts (both simplified and traditional) for POS-tagging and dependency-parsing (using `goeswith` for subwords), derived from [
+This is a RoBERTa model pre-trained on Chinese Wikipedia texts (both simplified and traditional) for POS-tagging and dependency-parsing (using `goeswith` for subwords), derived from [roberta_chinese_base](https://huggingface.co/clue/roberta_chinese_base).
 
 ## How to Use
 
@@ -60,7 +60,7 @@ class UDgoeswith(object):
     return u+"\n"
 
 nlp=UDgoeswith("KoichiYasuoka/roberta-base-chinese-ud-goeswith")
-print(nlp("
+print(nlp("我把这本书看完了"))
 ```
 
 with [ufal.chu-liu-edmonds](https://pypi.org/project/ufal.chu-liu-edmonds/).
@@ -69,6 +69,6 @@ Or without ufal.chu-liu-edmonds:
 ```
 from transformers import pipeline
 nlp=pipeline("universal-dependencies","KoichiYasuoka/roberta-base-chinese-ud-goeswith",trust_remote_code=True,aggregation_strategy="simple")
-print(nlp("
+print(nlp("我把这本书看完了"))
 ```
 
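For reference, the pipeline-based usage added to the README above can be exercised end to end roughly as follows. This is a minimal sketch, not part of the commit; it assumes the model repository's custom `universal-dependencies` pipeline is available through `trust_remote_code=True`, as the README states.

```
# Sketch of the updated README example (assumes the repo's custom
# "universal-dependencies" pipeline registered via trust_remote_code).
from transformers import pipeline
nlp=pipeline("universal-dependencies","KoichiYasuoka/roberta-base-chinese-ud-goeswith",trust_remote_code=True,aggregation_strategy="simple")
# The example sentence introduced in this commit ("I finished reading this book").
print(nlp("我把这本书看完了"))
```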
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "
+    "RobertaForTokenClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
@@ -12,8 +12,6 @@
   },
   "directionality": "bidi",
   "eos_token_id": 2,
-  "finetuning_task": "ner",
-  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
@@ -631,10 +629,9 @@
   },
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
-  "model_type": "
+  "model_type": "roberta",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
-  "output_past": true,
   "pad_token_id": 1,
   "pooler_fc_size": 768,
   "pooler_num_attention_heads": 12,
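As a quick sanity check on the config edits above (architecture switched to `RobertaForTokenClassification`, `model_type` set to `roberta`, and the leftover `finetuning_task`, `gradient_checkpointing`, and `output_past` keys dropped), the updated file can be loaded through the standard `transformers` config API. A minimal sketch, not part of the commit:

```
# Load the revised config.json and inspect the fields changed in this commit.
from transformers import AutoConfig
cfg=AutoConfig.from_pretrained("KoichiYasuoka/roberta-base-chinese-ud-goeswith")
print(cfg.model_type)     # expected "roberta"
print(cfg.architectures)  # expected ["RobertaForTokenClassification"]
```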
maker.py CHANGED
@@ -1,5 +1,5 @@
 #! /usr/bin/python3
-src="
+src="clue/roberta_chinese_base"
 tgt="KoichiYasuoka/roberta-base-chinese-ud-goeswith"
 import os
 for d in ["UD_Chinese-GSD","UD_Chinese-GSDSimp"]:
@@ -39,15 +39,15 @@ class UDgoeswithDataset(object):
     return lid
   __len__=lambda self:len(self.ids)
   __getitem__=lambda self,i:{"input_ids":self.ids[i],"labels":[self.label2id[t] for t in self.tags[i]]}
-from transformers import
-tkz=
+from transformers import BertTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
+tkz=BertTokenizer.from_pretrained(src,model_max_length=512)
 trainDS=UDgoeswithDataset("train.conllu",tkz)
 devDS=UDgoeswithDataset("dev.conllu",tkz)
 testDS=UDgoeswithDataset("test.conllu",tkz)
 lid=trainDS(devDS,testDS)
-cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()}
+cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
 arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,evaluation_strategy="epoch",learning_rate=5e-05,warmup_ratio=0.1)
-trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg
+trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),train_dataset=trainDS,eval_dataset=devDS)
 trn.train()
 trn.save_model(tgt)
 tkz.save_pretrained(tgt)
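maker.py fine-tunes the `clue/roberta_chinese_base` checkpoint on the UD_Chinese-GSD and UD_Chinese-GSDSimp treebanks and saves the result to `tgt`. A hypothetical follow-up check, not part of the commit, is to reload the saved model and tokenizer and confirm the label inventory:

```
# Reload the artifacts written by trn.save_model(tgt) and tkz.save_pretrained(tgt).
from transformers import AutoTokenizer,AutoModelForTokenClassification
tgt="KoichiYasuoka/roberta-base-chinese-ud-goeswith"  # output directory used in maker.py
tkz=AutoTokenizer.from_pretrained(tgt)
mdl=AutoModelForTokenClassification.from_pretrained(tgt)
print(mdl.config.num_labels,len(mdl.config.id2label))  # UD POS+deprel label count
```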
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:dfb7b516490a2a111a771d186112e9a33f9aa9c39c54514a0db029382159fd45
+size 407711217
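The git-LFS pointer above records the SHA-256 and byte size of the new pytorch_model.bin. A downloaded copy can be verified against those values; a minimal sketch, not part of the commit (the local file path is an assumption):

```
# Verify a downloaded pytorch_model.bin against the LFS pointer in this commit.
import hashlib
expected_oid="dfb7b516490a2a111a771d186112e9a33f9aa9c39c54514a0db029382159fd45"
expected_size=407711217
with open("pytorch_model.bin","rb") as f:  # assumed local path of the downloaded weights
  data=f.read()
print(len(data)==expected_size)
print(hashlib.sha256(data).hexdigest()==expected_oid)
```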
tokenizer_config.json CHANGED
@@ -7,6 +7,7 @@
   "never_split": null,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "BertTokenizerFast",
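Finally, the added `special_tokens_map_file` entry does not change the tokenizer class, which remains `BertTokenizerFast`. A quick check, not part of the commit:

```
# Confirm the tokenizer still loads as BertTokenizerFast after this commit.
from transformers import AutoTokenizer
tkz=AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-base-chinese-ud-goeswith")
print(type(tkz).__name__)  # expected "BertTokenizerFast"
print(tkz.tokenize("我把这本书看完了"))  # character-level pieces for the README example
```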