Commit
·
0ee48d1
1
Parent(s):
c300498
model improved for transformers 4.42
Browse files
- config.json +2 -13
- maker.sh +1 -37
- pytorch_model-00001-of-00003.bin +1 -1
- pytorch_model-00002-of-00003.bin +1 -1
- pytorch_model-00003-of-00003.bin +1 -1
- tokenizer_config.json +1 -0
- upos.py +2 -41
config.json
CHANGED
|
@@ -3,22 +3,11 @@
|
|
| 3 |
"Qwen2ForTokenClassification"
|
| 4 |
],
|
| 5 |
"attention_dropout": 0.0,
|
| 6 |
-
"auto_map": {
|
| 7 |
-
"AutoModelForTokenClassification": "upos.Qwen2ForTokenClassification"
|
| 8 |
-
},
|
| 9 |
"bos_token_id": 151643,
|
| 10 |
"custom_pipelines": {
|
| 11 |
"upos": {
|
| 12 |
"impl": "upos.BellmanFordTokenClassificationPipeline",
|
| 13 |
"pt": "AutoModelForTokenClassification"
|
| 14 |
-
},
|
| 15 |
-
"token-classification":{
|
| 16 |
-
"impl": "upos.RawTokenClassificationPipeline",
|
| 17 |
-
"pt": "AutoModelForTokenClassification"
|
| 18 |
-
},
|
| 19 |
-
"ner":{
|
| 20 |
-
"impl": "upos.RawTokenClassificationPipeline",
|
| 21 |
-
"pt": "AutoModelForTokenClassification"
|
| 22 |
}
|
| 23 |
},
|
| 24 |
"eos_token_id": 151643,
|
|
@@ -376,9 +365,9 @@
|
|
| 376 |
"rope_theta": 5000000.0,
|
| 377 |
"sliding_window": 32768,
|
| 378 |
"tie_word_embeddings": false,
|
| 379 |
-
"tokenizer_class": "Qwen2Tokenizer",
|
| 380 |
"torch_dtype": "float32",
|
| 381 |
-
"
|
|
|
|
| 382 |
"use_cache": false,
|
| 383 |
"use_sliding_window": false,
|
| 384 |
"vocab_size": 151936
|
|
|
|
| 3 |
"Qwen2ForTokenClassification"
|
| 4 |
],
|
| 5 |
"attention_dropout": 0.0,
|
|
|
|
|
|
|
|
|
|
| 6 |
"bos_token_id": 151643,
|
| 7 |
"custom_pipelines": {
|
| 8 |
"upos": {
|
| 9 |
"impl": "upos.BellmanFordTokenClassificationPipeline",
|
| 10 |
"pt": "AutoModelForTokenClassification"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
}
|
| 12 |
},
|
| 13 |
"eos_token_id": 151643,
|
|
|
|
| 365 |
"rope_theta": 5000000.0,
|
| 366 |
"sliding_window": 32768,
|
| 367 |
"tie_word_embeddings": false,
|
|
|
|
| 368 |
"torch_dtype": "float32",
|
| 369 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 370 |
+
"transformers_version": "4.42.4",
|
| 371 |
"use_cache": false,
|
| 372 |
"use_sliding_window": false,
|
| 373 |
"vocab_size": 151936
|
maker.sh
CHANGED
|
@@ -13,43 +13,7 @@ TMP=./maker$$.py
|
|
| 13 |
echo 'tgt="KoichiYasuoka/'$S'-upos"'
|
| 14 |
) > $TMP
|
| 15 |
cat << 'EOF' >> $TMP
|
| 16 |
-
from transformers import AutoTokenizer,
|
| 17 |
-
from transformers.modeling_outputs import TokenClassifierOutput
|
| 18 |
-
|
| 19 |
-
class Qwen2ForTokenClassification(Qwen2PreTrainedModel):
|
| 20 |
-
def __init__(self,config):
|
| 21 |
-
from torch import nn
|
| 22 |
-
super().__init__(config)
|
| 23 |
-
self.num_labels=config.num_labels
|
| 24 |
-
self.model=Qwen2Model(config)
|
| 25 |
-
if getattr(config,"classifier_dropout",None) is not None:
|
| 26 |
-
classifier_dropout=config.classifier_dropout
|
| 27 |
-
elif getattr(config,"hidden_dropout",None) is not None:
|
| 28 |
-
classifier_dropout=config.hidden_dropout
|
| 29 |
-
else:
|
| 30 |
-
classifier_dropout=0.1
|
| 31 |
-
self.dropout=nn.Dropout(classifier_dropout)
|
| 32 |
-
self.score=nn.Linear(config.hidden_size,config.num_labels)
|
| 33 |
-
self.post_init()
|
| 34 |
-
def get_input_embeddings(self):
|
| 35 |
-
return self.model.embed_tokens
|
| 36 |
-
def set_input_embeddings(self,value):
|
| 37 |
-
self.model.embed_tokens=value
|
| 38 |
-
def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
|
| 39 |
-
return_dict=return_dict if return_dict is not None else self.config.use_return_dict
|
| 40 |
-
outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
|
| 41 |
-
sequence_output=outputs[0]
|
| 42 |
-
sequence_output=self.dropout(sequence_output)
|
| 43 |
-
logits=self.score(sequence_output)
|
| 44 |
-
loss=None
|
| 45 |
-
if labels is not None:
|
| 46 |
-
from torch import nn
|
| 47 |
-
loss_fct=nn.CrossEntropyLoss()
|
| 48 |
-
loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
|
| 49 |
-
if not return_dict:
|
| 50 |
-
output=(logits,)+outputs[2:]
|
| 51 |
-
return ((loss,)+output) if loss is not None else output
|
| 52 |
-
return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=outputs.hidden_states,attentions=outputs.attentions)
|
| 53 |
|
| 54 |
class UPOSFileDataset(object):
|
| 55 |
def __init__(self,conllu,tokenizer):
|
|
|
|
| 13 |
echo 'tgt="KoichiYasuoka/'$S'-upos"'
|
| 14 |
) > $TMP
|
| 15 |
cat << 'EOF' >> $TMP
|
| 16 |
+
from transformers import AutoTokenizer,Qwen2ForTokenClassification,AutoConfig,DataCollatorForTokenClassification,TrainingArguments,Trainer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
class UPOSFileDataset(object):
|
| 19 |
def __init__(self,conllu,tokenizer):
|
pytorch_model-00001-of-00003.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4974769352
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47333ed441637dc876fad1a283080dbf29c00970c0570d14f4293ac5d9382723
|
| 3 |
size 4974769352
|
pytorch_model-00002-of-00003.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4934433952
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db2ebda5f396ded9ba823c3b86e46fd751d8e208822c9b94da333c5c3ababd89
|
| 3 |
size 4934433952
|
pytorch_model-00003-of-00003.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4338334558
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d64c6d1dc4366f38ac560fa2413115f565dc7bff0ad64eb9ae55e7b0476ca23
|
| 3 |
size 4338334558
|
tokenizer_config.json
CHANGED
|
@@ -31,6 +31,7 @@
|
|
| 31 |
"<|im_end|>"
|
| 32 |
],
|
| 33 |
"bos_token": null,
|
|
|
|
| 34 |
"clean_up_tokenization_spaces": false,
|
| 35 |
"eos_token": "<|endoftext|>",
|
| 36 |
"errors": "replace",
|
|
|
|
| 31 |
"<|im_end|>"
|
| 32 |
],
|
| 33 |
"bos_token": null,
|
| 34 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '<|endoftext|>' }}{% endif %}{% endfor %}",
|
| 35 |
"clean_up_tokenization_spaces": false,
|
| 36 |
"eos_token": "<|endoftext|>",
|
| 37 |
"errors": "replace",
|
upos.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
from transformers import TokenClassificationPipeline
|
| 2 |
-
from transformers.modeling_outputs import TokenClassifierOutput
|
| 3 |
|
| 4 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
| 5 |
def __init__(self,**kwargs):
|
|
@@ -17,6 +16,7 @@ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
|
| 17 |
import numpy
|
| 18 |
if "logits" not in model_outputs:
|
| 19 |
return self.postprocess(model_outputs[0],**kwargs)
|
|
|
|
| 20 |
m=model_outputs["logits"][0].numpy()
|
| 21 |
e=numpy.exp(m-numpy.max(m,axis=-1,keepdims=True))
|
| 22 |
z=e/e.sum(axis=-1,keepdims=True)
|
|
@@ -40,42 +40,3 @@ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
|
| 40 |
t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
|
| 41 |
return w
|
| 42 |
|
| 43 |
-
class RawTokenClassificationPipeline(TokenClassificationPipeline):
|
| 44 |
-
def check_model_type(self,supported_models):
|
| 45 |
-
pass
|
| 46 |
-
|
| 47 |
-
class Qwen2ForTokenClassification(Qwen2PreTrainedModel):
|
| 48 |
-
def __init__(self,config):
|
| 49 |
-
from torch import nn
|
| 50 |
-
super().__init__(config)
|
| 51 |
-
self.num_labels=config.num_labels
|
| 52 |
-
self.model=Qwen2Model(config)
|
| 53 |
-
if getattr(config,"classifier_dropout",None) is not None:
|
| 54 |
-
classifier_dropout=config.classifier_dropout
|
| 55 |
-
elif getattr(config,"hidden_dropout",None) is not None:
|
| 56 |
-
classifier_dropout=config.hidden_dropout
|
| 57 |
-
else:
|
| 58 |
-
classifier_dropout=0.1
|
| 59 |
-
self.dropout=nn.Dropout(classifier_dropout)
|
| 60 |
-
self.score=nn.Linear(config.hidden_size,config.num_labels)
|
| 61 |
-
self.post_init()
|
| 62 |
-
def get_input_embeddings(self):
|
| 63 |
-
return self.model.embed_tokens
|
| 64 |
-
def set_input_embeddings(self,value):
|
| 65 |
-
self.model.embed_tokens=value
|
| 66 |
-
def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
|
| 67 |
-
return_dict=return_dict if return_dict is not None else self.config.use_return_dict
|
| 68 |
-
outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
|
| 69 |
-
sequence_output=outputs[0]
|
| 70 |
-
sequence_output=self.dropout(sequence_output)
|
| 71 |
-
logits=self.score(sequence_output)
|
| 72 |
-
loss=None
|
| 73 |
-
if labels is not None:
|
| 74 |
-
from torch import nn
|
| 75 |
-
loss_fct=nn.CrossEntropyLoss()
|
| 76 |
-
loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
|
| 77 |
-
if not return_dict:
|
| 78 |
-
output=(logits,)+outputs[2:]
|
| 79 |
-
return ((loss,)+output) if loss is not None else output
|
| 80 |
-
return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=outputs.hidden_states,attentions=outputs.attentions)
|
| 81 |
-
|
|
|
|
| 1 |
+
from transformers import TokenClassificationPipeline
|
|
|
|
| 2 |
|
| 3 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
| 4 |
def __init__(self,**kwargs):
|
|
|
|
| 16 |
import numpy
|
| 17 |
if "logits" not in model_outputs:
|
| 18 |
return self.postprocess(model_outputs[0],**kwargs)
|
| 19 |
+
print(model_outputs["logits"].size())
|
| 20 |
m=model_outputs["logits"][0].numpy()
|
| 21 |
e=numpy.exp(m-numpy.max(m,axis=-1,keepdims=True))
|
| 22 |
z=e/e.sum(axis=-1,keepdims=True)
|
|
|
|
| 40 |
t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
|
| 41 |
return w
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|