KoichiYasuoka committed
Commit 65faa3d
1 Parent(s): 7ec0086

model improved

Files changed (4)
  1. maker.py +16 -14
  2. pytorch_model.bin +1 -1
  3. tokenizer_config.json +0 -1
  4. ud.py +8 -2
maker.py CHANGED
@@ -8,11 +8,11 @@ from transformers import AutoTokenizer,AutoConfig,Qwen2ForTokenClassification,De
 d=os.path.basename(url)
 os.system("test -d "+d+" || git clone --depth=1 "+url)
 os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")
-tkz=AutoTokenizer.from_pretrained(src,unk_token="<|im_start|>",sep_token="<|im_end|>")
-tkz.save_pretrained("tmpdir")
+otk=AutoTokenizer.from_pretrained(src,unk_token="<|im_start|>",sep_token="<|im_end|>")
+otk.save_pretrained("tmpdir")
 os.rename("tmpdir/tokenizer.json","tmpdir/tokenizer.json.old")
 os.rename("tmpdir/merges.txt","tmpdir/oldmerges.txt")
-d=json.loads(tkz.backend_tokenizer.to_str())
+d=json.loads(otk.backend_tokenizer.to_str())
 form=set()
 with open("train.conllu","r",encoding="utf-8") as r:
   for s in r:
@@ -20,21 +20,22 @@ with open("train.conllu","r",encoding="utf-8") as r:
     if len(w)==10 and w[0].isdecimal():
       form.add(w[1])
 m=[t for t in d["model"]["merges"] if len(t)<5 and unicodedata.category(t[0])[0]!="P"]
-for i in range(len(tkz)):
-  w=tkz.decode(i)
+for i in range(len(otk)):
+  w=otk.decode(i)
   if len(w)==2 and w in form and not unicodedata.name(w[0]).startswith("HIRAGANA"):
-    k=tkz([w[0],w[1]],add_special_tokens=False)["input_ids"]
+    k=otk([w[0],w[1]],add_special_tokens=False)["input_ids"]
     if len(k[0])==1 and len(k[1])==1:
-      m.append(" ".join(tkz.convert_ids_to_tokens([k[0][0],k[1][0]])))
+      m.append(" ".join(otk.convert_ids_to_tokens([k[0][0],k[1][0]])))
 with open("tmpdir/merges.txt","w",encoding="utf-8") as w:
   print("#version: 0.2",file=w)
   print("\n".join(m),file=w)
-tkz=AutoTokenizer.from_pretrained("tmpdir")
+ntk=AutoTokenizer.from_pretrained("tmpdir")
 
 class UDCausalDataset(object):
-  def __init__(self,conllu,tokenizer,embeddings=None):
+  def __init__(self,conllu,tokenizer,oldtokenizer=None,embeddings=None):
     self.conllu=open(conllu,"r",encoding="utf-8")
     self.tokenizer=tokenizer
+    self.oldtokenizer=oldtokenizer if oldtokenizer else tokenizer
     self.embeddings=embeddings
     self.max_tokens=3
     self.seeks=[(0,0)]
@@ -79,8 +80,8 @@ class UDCausalDataset(object):
         if w[0].isdecimal():
           upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
           deps.append((int(w[6]),w[7]))
-    v=self.tokenizer(form,add_special_tokens=False)
     if t==0:
+      v=self.tokenizer(form,add_special_tokens=False)
       i,u=[],[]
       for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
         if x!=[]:
@@ -90,6 +91,7 @@ class UDCausalDataset(object):
       pad=self.tokenizer.pad_token_id
     else:
       import torch
+      v=self.oldtokenizer(form,add_special_tokens=False)
      m=[]
      for x in v["input_ids"]:
        if x==[]:
@@ -117,9 +119,9 @@ class UDCausalDataset(object):
     upos=u[0:self.max_tokens]
     return {"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}
 
-trainDS=UDCausalDataset("train.conllu",tkz)
-devDS=UDCausalDataset("dev.conllu",tkz)
-testDS=UDCausalDataset("test.conllu",tkz)
+trainDS=UDCausalDataset("train.conllu",ntk,otk)
+devDS=UDCausalDataset("dev.conllu",ntk,otk)
+testDS=UDCausalDataset("test.conllu",ntk,otk)
 lid=trainDS(devDS,testDS)
 cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
 mdl=Qwen2ForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True)
@@ -129,4 +131,4 @@ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,dataload
 trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
 trn.train()
 trn.save_model(tgt)
-tkz.save_pretrained(tgt)
+ntk.save_pretrained(tgt)
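In maker.py the single tkz is thus split into otk (the original Qwen2 tokenizer, whose merge table is filtered into tmpdir/merges.txt) and ntk (the tokenizer rebuilt from tmpdir); UDCausalDataset now receives both, labelling with ntk while falling back to otk when building inputs_embeds. A minimal sketch of how the two tokenizers differ, assuming a tmpdir produced by the script above; the model id below is a placeholder, not necessarily the src used in this commit:

from transformers import AutoTokenizer

src="Qwen/Qwen2-0.5B"                        # placeholder model id; the commit's actual src is set elsewhere in maker.py
otk=AutoTokenizer.from_pretrained(src)       # original tokenizer with the full merge table
ntk=AutoTokenizer.from_pretrained("tmpdir")  # rebuilt tokenizer with the filtered merges.txt

s="国境の長いトンネルを抜けると雪国であった"
print(otk(s,add_special_tokens=False)["input_ids"])  # may merge multi-character pieces
print(ntk(s,add_special_tokens=False)["input_ids"])  # tends to split closer to one token per character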
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dab62fe8682839188aaa62be1ef62d06d3e724852661e54cfabe34a6e6693870
+oid sha256:aea606a7a9a7b46f6f045932dc3960fb0deefe348b28aa3fbf419d4b189795db
 size 1856725466
tokenizer_config.json CHANGED
@@ -32,7 +32,6 @@
     "<|im_end|>"
   ],
   "bos_token": null,
-  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "errors": "replace",
ud.py CHANGED
@@ -1,5 +1,10 @@
 import numpy
-from transformers import TokenClassificationPipeline
+from transformers import TokenClassificationPipeline,AutoTokenizer
+try:
+  from transformers.utils import cached_file
+except:
+  from transformers.file_utils import cached_path,hf_bucket_url
+  cached_file=lambda x,y:os.path.join(x,y) if os.path.isdir(x) else cached_path(hf_bucket_url(x,y))
 
 class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
   def __init__(self,**kwargs):
@@ -42,6 +47,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
   def __init__(self,**kwargs):
     kwargs["aggregation_strategy"]="simple"
     super().__init__(**kwargs)
+    self.oldtokenizer=AutoTokenizer.from_pretrained(self.tokenizer.name_or_path,merges_file=cached_file(self.tokenizer.name_or_path,"oldmerges.txt"))
     x=self.model.config.label2id
     self.root=numpy.full((len(x)),numpy.nan)
     self.left_arc=numpy.full((len(x)),numpy.nan)
@@ -87,7 +93,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
       if d[i].strip()=="":
        d.pop(i)
        w.pop(i)
-    v=self.tokenizer(d,add_special_tokens=False)
+    v=self.oldtokenizer(d,add_special_tokens=False)
     e=self.model.get_input_embeddings().weight
     m=[]
     for x in v["input_ids"]:
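The try/except added at the top of ud.py keeps the pipeline working across transformers releases: cached_file lives in transformers.utils on newer versions, and on older ones it is emulated with cached_path and hf_bucket_url. The resolved oldmerges.txt (the pre-filtering merge table saved by maker.py) is then used to rebuild the original tokenizer for the embedding lookup. A minimal sketch of the same resolution pattern, with a placeholder repository id rather than this model's actual id:

import os
try:
  from transformers.utils import cached_file                     # newer transformers releases
except ImportError:
  from transformers.file_utils import cached_path,hf_bucket_url  # older releases
  cached_file=lambda x,y:os.path.join(x,y) if os.path.isdir(x) else cached_path(hf_bucket_url(x,y))
from transformers import AutoTokenizer

repo="someone/qwen2-ud-model"             # placeholder repo id (assumption, not from this commit)
merges=cached_file(repo,"oldmerges.txt")  # local path; downloaded from the Hub unless repo is a local directory
oldtokenizer=AutoTokenizer.from_pretrained(repo,merges_file=merges)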