KoichiYasuoka committed
Commit f2856b0 · 1 parent: 531973e

variants augmentation

Files changed (4)
  1. README.md +1 -1
  2. maker.py +15 -10
  3. pytorch_model.bin +1 -1
  4. tokenizer.json +0 -0
README.md CHANGED
@@ -18,7 +18,7 @@ widget:
 
 ## Model Description
 
-This is a ModernBERT model pre-trained on 青空文庫 texts. You can fine-tune `modernbert-base-japanese-aozora` for downstream tasks, such as POS-tagging, dependency-parsing, and so on.
+This is a ModernBERT model pre-trained on 青空文庫 texts. NVIDIA A100-SXM4-40GB×8 took 5 hours 55 minutes for training. You can fine-tune `modernbert-base-japanese-aozora` for downstream tasks, such as POS-tagging, dependency-parsing, and so on.
 
 ## How to Use
 
 
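For context beyond the diff: the README's "How to Use" section is untouched by this commit, but a minimal sketch of loading the published model for masked-token prediction may help. Everything below is an assumption, not part of the commit: the hub id `KoichiYasuoka/modernbert-base-japanese-aozora` is inferred from the model name and the committer, and `trust_remote_code=True` is inferred from maker.py writing an `auto_map` into config.json.

# Hedged sketch, not taken from this commit: fill-mask with the model.
# Assumed: hub id "KoichiYasuoka/modernbert-base-japanese-aozora";
# trust_remote_code=True because maker.py registers an "auto_map".
from transformers import AutoTokenizer, AutoModelForMaskedLM, FillMaskPipeline

mdl = "KoichiYasuoka/modernbert-base-japanese-aozora"
tokenizer = AutoTokenizer.from_pretrained(mdl)
model = AutoModelForMaskedLM.from_pretrained(mdl, trust_remote_code=True)
fill = FillMaskPipeline(model=model, tokenizer=tokenizer)
print(fill("日本に着いたら[MASK]を訪ねなさい。"))  # ranked guesses for the masked word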
maker.py CHANGED
@@ -2,12 +2,10 @@
 #pip3 install transformers accelerate deepspeed triton datasets fugashi unidic-lite
 import os,json
 os.system("""
 if test -d transformers
 then :
 else git clone --depth=1 https://github.com/huggingface/transformers transformers-all
 ln -s transformers-all/src/transformers transformers
-sed 's/-> \\(.*\\) | \\(.*\\):/-> Union[\\1, \\2]:/' transformers/models/modernbert/modeling_modernbert.py > modeling_modernbert.py
-cp modeling_modernbert.py transformers/models/modernbert
 fi
 test -d ModernBERT-base || git clone --depth=1 https://huggingface.co/answerdotai/ModernBERT-base
 test -f ModernBERT-base/configuration_modernbert.py || sed 's/^from \\.\\.\\./from transformers./' transformers/models/modernbert/configuration_modernbert.py > ModernBERT-base/configuration_modernbert.py
@@ -27,17 +25,24 @@ if not "auto_map" in d:
     json.dump(d,w,indent=2)
 if not os.path.isfile("train.txt"):
   import datasets
+  aug=lambda x:(x.replace("侠","俠").replace("倶","俱").replace("洗","冼").replace("剥","剝").replace("即","卽").replace("呑","吞").replace("呉","吳").replace("填","塡").replace("巣","巢").replace("徴","徵").replace("徳","德").replace("掲","揭").replace("撃","擊").replace("教","敎").replace("晩","晚").replace("横","橫").replace("歩","步").replace("歴","歷").replace("毎","每").replace("冷","泠").replace("渉","涉").replace("涙","淚").replace("清","淸").replace("渇","渴").replace("温","溫").replace("状","狀").replace("産","產").replace("痩","瘦").replace("禰","祢").replace("箪","簞").replace("緑","綠").replace("緒","緖").replace("縁","緣").replace("繋","繫").replace("莱","萊").replace("薫","薰").replace("虚","虛").replace("蝉","蟬").replace("説","說").replace("躯","軀").replace("郎","郞").replace("醤","醬").replace("録","錄").replace("錬","鍊").replace("間","閒").replace("頬","頰").replace("顛","顚").replace("鴎","鷗").replace("麺","麵").replace("黄","黃").replace("黒","黑").replace("叱","𠮟"))
   with open("train.txt","w",encoding="utf-8") as w:
-    d,i=datasets.load_dataset("globis-university/aozorabunko-clean"),0
+    d,u,v=datasets.load_dataset("globis-university/aozorabunko-clean"),"",""
     for t in d["train"]:
       for s in t["text"].replace("。","。\n").replace("\u3000"," ").split("\n"):
-        if i+len(s)<10000:
-          print(s,end="",file=w)
-          i+=len(s)
+        r=aug(s)
+        if r!=s:
+          if len(r)+len(v)<10000:
+            v+=r
+          else:
+            print(v,file=w)
+            v=r
+        if len(s)+len(u)<10000:
+          u+=s
         else:
-          print("\n"+s,end="",file=w)
-          i=len(s)
-    print("",file=w)
+          print(u,file=w)
+          u=s
+    print(u,v,file=w)
 os.system("test -s token.txt || fugashi -Owakati < train.txt > token.txt")
 
 from transformers import DebertaV2TokenizerFast
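Two changes are visible in maker.py. First, the deleted sed/cp pair had rewritten `-> X | Y:` return annotations in modeling_modernbert.py to `-> Union[X, Y]:`, which looks like a compatibility shim for Python versions below 3.10 (PEP 604 union syntax) that is no longer needed. Second, the added `aug` lambda and the reworked writer loop are the "variants augmentation" of the commit message: some fifty shinjitai characters are mapped to older variant glyphs (侠→俠, 黒→黑, 叱→𠮟, ...), and every sentence that changes under this mapping is written to train.txt a second time in the variant spelling, with original and augmented text packed into separate lines of just under 10,000 characters. Below is a standalone sketch of that buffering logic, with the variant table abridged to five entries; it mirrors the diff but is not a verbatim copy of the script.

# Sketch of the augmentation loop added in maker.py; the commit's table
# maps ~50 characters, abridged here. Not a verbatim copy of the script.
VARIANTS = {"侠": "俠", "黒": "黑", "麺": "麵", "説": "說", "間": "閒"}

def aug(s):
    # Swap each listed shinjitai character for its older variant glyph.
    for new, old in VARIANTS.items():
        s = s.replace(new, old)
    return s

def write_chunks(sentences, w, limit=10000):
    u = ""  # buffer of original sentences
    v = ""  # buffer of augmented sentences that differ from the original
    for s in sentences:
        r = aug(s)
        if r != s:                 # keep an augmented copy only if it changed
            if len(r) + len(v) < limit:
                v += r
            else:
                print(v, file=w)   # flush the augmented buffer as one line
                v = r
        if len(s) + len(u) < limit:
            u += s
        else:
            print(u, file=w)       # flush the original buffer as one line
            u = s
    print(u, v, file=w)            # final flush of both buffers, as in the commit

import sys
write_chunks(["侠客と麺。", "ひらがなだけ。"], sys.stdout, limit=20)

The net effect on train.txt: sentences containing none of the listed characters appear once, while all others appear twice, once per orthography, which is what exposes the tokenizer and the masked-LM pre-training to both spellings.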
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e33012f766bf187fcbc322e69c2b82c44d5610cbbf437b8c6975f8a2117f648c
+oid sha256:71a8a58812cf68dee53784bc77332caf4e40b0211db98eef49fd3827aada2735
 size 643674094
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff