KoichiYasuoka committed
Commit f2856b0
1 Parent(s): 531973e
variants augmentation
Files changed:
- README.md +1 -1
- maker.py +15 -10
- pytorch_model.bin +1 -1
- tokenizer.json +0 -0
README.md CHANGED
@@ -18,7 +18,7 @@ widget:
 
 ## Model Description
 
-This is a ModernBERT model pre-trained on 青空文庫 texts. You can fine-tune `modernbert-base-japanese-aozora` for downstream tasks, such as POS-tagging, dependency-parsing, and so on.
+This is a ModernBERT model pre-trained on 青空文庫 texts. Pre-training took 5 hours 55 minutes with NVIDIA A100-SXM4-40GB×8. You can fine-tune `modernbert-base-japanese-aozora` for downstream tasks, such as POS-tagging, dependency-parsing, and so on.
 
 ## How to Use
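The body of the "How to Use" section is not part of this diff. As a minimal sketch of loading the checkpoint for fill-mask inference, assuming the hub id `KoichiYasuoka/modernbert-base-japanese-aozora` (inferred from the author and model name) and `trust_remote_code=True` (maker.py below registers custom configuration/modeling files via `auto_map`):

```python
# Hedged sketch: fill-mask inference with the checkpoint.
# The hub id and trust_remote_code=True are assumptions, not shown in this diff.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

mdl = "KoichiYasuoka/modernbert-base-japanese-aozora"
tokenizer = AutoTokenizer.from_pretrained(mdl, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(mdl, trust_remote_code=True)

text = "夜の底が" + tokenizer.mask_token + "なつた。"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
# locate the mask position and decode the top-1 prediction for it
pos = (inputs.input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
print(tokenizer.decode(logits[0, pos].argmax(dim=-1)))
```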
maker.py CHANGED
@@ -2,12 +2,10 @@
 #pip3 install transformers accelerate deepspeed triton datasets fugashi unidic-lite
 import os,json
 os.system("""
 if test -d transformers
 then :
 else git clone --depth=1 https://github.com/huggingface/transformers transformers-all
 ln -s transformers-all/src/transformers transformers
-sed 's/-> \\(.*\\) | \\(.*\\):/-> Union[\\1, \\2]:/' transformers/models/modernbert/modeling_modernbert.py > modeling_modernbert.py
-cp modeling_modernbert.py transformers/models/modernbert
 fi
 test -d ModernBERT-base || git clone --depth=1 https://huggingface.co/answerdotai/ModernBERT-base
 test -f ModernBERT-base/configuration_modernbert.py || sed 's/^from \\.\\.\\./from transformers./' transformers/models/modernbert/configuration_modernbert.py > ModernBERT-base/configuration_modernbert.py
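The two deleted lines were a compatibility patch: modeling_modernbert.py uses PEP 604 return annotations such as `-> Tensor | None:`, which Python below 3.10 cannot parse, so the script textually rewrote them to `typing.Union[...]` before use; this commit drops that step. A minimal sketch of the same rewrite in Python (the regex mirrors the sed pattern; the sample signature is illustrative only):

```python
# What the removed sed command did: rewrite PEP 604 return annotations
# ("-> A | B:") into typing.Union form ("-> Union[A, B]:") so the file
# also parses on Python < 3.10.
import re

def patch_pep604_returns(source: str) -> str:
    # Mirrors: sed 's/-> \(.*\) | \(.*\):/-> Union[\1, \2]:/'
    return re.sub(r"-> (.*) \| (.*):", r"-> Union[\1, \2]:", source)

print(patch_pep604_returns("def forward(self) -> Tensor | None:"))
# def forward(self) -> Union[Tensor, None]:
```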
@@ -27,17 +25,24 @@ if not "auto_map" in d:
   json.dump(d,w,indent=2)
 if not os.path.isfile("train.txt"):
   import datasets
+  aug=lambda x:(x.replace("侠","俠").replace("倶","俱").replace("洗","冼").replace("剥","剝").replace("即","卽").replace("呑","吞").replace("呉","吳").replace("填","塡").replace("巣","巢").replace("徴","徵").replace("徳","德").replace("掲","揭").replace("撃","擊").replace("教","敎").replace("晩","晚").replace("横","橫").replace("歩","步").replace("歴","歷").replace("毎","每").replace("冷","泠").replace("渉","涉").replace("涙","淚").replace("清","淸").replace("渇","渴").replace("温","溫").replace("状","狀").replace("産","產").replace("痩","瘦").replace("禰","祢").replace("箪","簞").replace("緑","綠").replace("緒","緖").replace("縁","緣").replace("繋","繫").replace("莱","萊").replace("薫","薰").replace("虚","虛").replace("蝉","蟬").replace("説","說").replace("躯","軀").replace("郎","郞").replace("醤","醬").replace("録","錄").replace("錬","鍊").replace("間","閒").replace("頬","頰").replace("顛","顚").replace("鴎","鷗").replace("麺","麵").replace("黄","黃").replace("黒","黑").replace("叱","𠮟"))
   with open("train.txt","w",encoding="utf-8") as w:
-    d,u=datasets.load_dataset("globis-university/aozorabunko-clean"),""
+    d,u,v=datasets.load_dataset("globis-university/aozorabunko-clean"),"",""
     for t in d["train"]:
       for s in t["text"].replace("。","。\n").replace("\u3000"," ").split("\n"):
+        r=aug(s)
+        if r!=s:
+          if len(r)+len(v)<10000:
+            v+=r
+          else:
+            print(v,file=w)
+            v=r
         if len(s)+len(u)<10000:
           u+=s
         else:
           print(u,file=w)
           u=s
-    print(u,file=w)
+    print(u,v,file=w)
 os.system("test -s token.txt || fugashi -Owakati < train.txt > token.txt")
 
 from transformers import DebertaV2TokenizerFast
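The new `aug` lambda rewrites about fifty shinjitai (post-reform) kanji into the kyūjitai/traditional variants that occur in Aozora Bunko printings, and the rewritten loop emits a variant copy of every sentence the mapping changes, packing the original stream (`u`) and the variant stream (`v`) into lines of up to 10000 characters. A minimal sketch of the same idea using `str.translate` with a translation table (only five of the pairs are shown; `augment_lines` is an illustrative helper, not code from the commit):

```python
# Sketch of maker.py's variant augmentation via str.translate.
# VARIANTS holds 5 of the ~50 shinjitai -> kyujitai pairs in aug.
VARIANTS = str.maketrans({"侠": "俠", "横": "橫", "歩": "步", "黒": "黑", "叱": "𠮟"})

def augment_lines(sentences, limit=10000):
    u = v = ""                  # u: original buffer, v: variant buffer
    for s in sentences:
        r = s.translate(VARIANTS)
        if r != s:              # buffer a variant copy only when it differs
            if len(r) + len(v) < limit:
                v += r
            else:
                yield v
                v = r
        if len(s) + len(u) < limit:
            u += s
        else:
            yield u
            u = s
    yield u + " " + v           # flush; maker.py prints u and v space-separated

for line in augment_lines(["俠客が横町を歩く。", "黒い犬を叱る。"], limit=20):
    print(line)
```

The design gives the tokenizer and the masked-LM pre-training exposure to both glyph repertoires, without duplicating sentences the mapping leaves unchanged.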
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:71a8a58812cf68dee53784bc77332caf4e40b0211db98eef49fd3827aada2735
 size 643674094
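Only the sha256 oid of the Git LFS pointer changed; the size is identical, so the commit swapped in retrained weights of exactly the same byte length. As a small sketch, a downloaded pytorch_model.bin can be checked against the pointer like this (standard LFS pointer semantics, not something the commit itself does):

```python
# Verify a downloaded pytorch_model.bin against its Git LFS pointer:
# the file's sha256 digest must equal the oid and its length the size.
import hashlib, os

def matches_lfs_pointer(path, oid, size):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return os.path.getsize(path) == size and h.hexdigest() == oid

print(matches_lfs_pointer("pytorch_model.bin",
                          "71a8a58812cf68dee53784bc77332caf4e40b0211db98eef49fd3827aada2735",
                          643674094))
```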
tokenizer.json CHANGED
The diff for this file is too large to render; see the raw diff.