# -*- coding: utf-8 -*-
"""korscideberta.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1vJNUG_F5El5LY8xmmwRVXo66bYBfXtdz
"""

#!git clone https://huggingface.co/kisti/korscideberta; cd korscideberta

# Commented out IPython magic to ensure Python compatibility.
#!pwd
#%cd ..
#!pip install konlpy
# %cd korscideberta

# Commented out IPython magic to ensure Python compatibility.
'''
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# %cd Mecab-ko-for-Google-Colab/
! bash install_mecab-ko_on_colab_light_220429.sh
# %cd ..
!pip install datasets transformers[sentencepiece]
'''

# Commented out IPython magic to ensure Python compatibility.
'''
!pip install -U accelerate; pip install -U transformers; pip install pydantic==1.8
'''

!pwd
# %cd /content/korscideberta

'''
#[Required] Download this code and the tokenizer from a Linux terminal
#git clone https://huggingface.co/kisti/korscideberta
#cd korscideberta

#[ํ•„์ˆ˜]๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์„ค์น˜(Mecab ๋“ฑ ์ž์„ธํ•œ ์„ค์น˜ ๋ฐฉ๋ฒ•์€ KorSciDeBERTaํ™˜๊ฒฝ์„ค์น˜+ํŒŒ์ธํŠœ๋‹.pdf ์ฐธ์กฐ)
!apt install git-lfs

'''

from datasets import load_dataset
import datasets
from huggingface_hub import notebook_login

notebook_login() #Log in to Hugging Face
#Example token: hf_jRjLZcSBibYHwUaTjiNUEeoJlFxhFkGM

model_repository = "kisti/korscideberta" #Set the Hugging Face model name
#model_repository = "./"
from transformers import AutoTokenizer
from tokenization_korscideberta import DebertaV2Tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained(model_repository)
out = tokenizer.tokenize("<cls> ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ <s> ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ ๊ณต์œ ํ•ฉ๋‹ˆ๋‹ค. <s>")
print(str(out))
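
# Optional sanity check (not in the original notebook), assuming the custom
# tokenizer follows the standard PreTrainedTokenizer API: round-trip the
# tokens through the vocabulary to confirm that <cls> and <s> map to single
# entries rather than being split into pieces.
ids = tokenizer.convert_tokens_to_ids(out)
print("Token IDs:", ids)
print("Decoded:", tokenizer.decode(ids))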

#๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ
#data_files = {"train": "๋ฌธ์žฅ์˜๋ฏธ-๊ท ๋“ฑ์ €๋„/test.json", "test": "๋ฌธ์žฅ์˜๋ฏธ-๊ท ๋“ฑ์ €๋„/train.json", 'dev':'๋ฌธ์žฅ์˜๋ฏธ-๊ท ๋“ฑ์ €๋„/dev.json'}
#dataset = load_dataset('json', data_files=data_files)
dataset = load_dataset('csv', data_files='data/Abstract_Annotation_Data_tagsentence.csv', split='train')
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.1)
print("dataset:", str(dataset))

#๋ฐ์ดํ„ฐ์…‹์„ ํ† ํฌ๋‚˜์ด์ง• ํ›„ ์ €์žฅ
from datasets import ClassLabel
labels = sorted(set(dataset['train']['tag']))
num_labels = len(labels)
print('Labels: '+str(labels)[:200])
ClassLabels = ClassLabel(num_classes=len(labels), names=labels)
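
# Optional illustration: ClassLabel gives a stable string<->integer mapping,
# which is what lets the tag column be used directly as classification labels.
example_tag = labels[0]
tag_id = ClassLabels.str2int(example_tag)
print(example_tag, "->", tag_id, "->", ClassLabels.int2str(tag_id))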

def preprocess_function(example):
    output_dict = tokenizer('<cls>'+example["sentence"]+'<s>', max_length=512, truncation=True)
    output_dict['labels'] = ClassLabels.str2int(example['tag'])
    return output_dict
#tokenized_datasets = dataset.map(preprocess_function, batched=False, remove_columns=dataset["train"].column_names)
tokenized_datasets = dataset.map(preprocess_function, batched=False)
tokenized_datasets = tokenized_datasets.cast_column("labels", ClassLabel(names=labels))

#๋ฐ์ดํ„ฐ์…‹ ํ† ํฌ๋‚˜์ด์ง• ํ™•์ธ
random_id = 1
print("Input IDS:", tokenized_datasets["train"][random_id]["input_ids"])
print("Labels:", tokenized_datasets["train"][random_id]["labels"])
tokenized_datasets.save_to_disk('data/tok')
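
# The tokenized dataset saved above can be reloaded in a later session without
# re-running tokenization:
#from datasets import load_from_disk
#tokenized_datasets = load_from_disk('data/tok')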

#Load the KorSciDeBERTa model
from transformers import AutoModelForSequenceClassification

num_labels = len(labels)
def model_init():
    #return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=7)
    #return AutoModelForSequenceClassification.from_pretrained(model_repository, num_labels=num_labels, hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.25)
    return AutoModelForSequenceClassification.from_pretrained(model_repository, num_labels=num_labels, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
model = model_init()
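
# Note: wrapping model construction in model_init() also allows passing it to
# Trainer(model_init=model_init) so the model can be re-initialized
# reproducibly, e.g. for hyperparameter search; here it is simply called once.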

#Check the DataCollator
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
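
# Minimal sketch (not in the original notebook) of what the collator does: it
# pads a list of variable-length feature dicts to the longest sequence in the
# batch and returns tensors. The feature keys below are the ones the tokenizer
# is assumed to produce; extra string columns must be filtered out first.
sample_features = [
    {k: tokenized_datasets["train"][i][k] for k in ("input_ids", "attention_mask", "labels")}
    for i in range(2)
]
batch = data_collator(sample_features)
print({k: v.shape for k, v in batch.items()})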
from collections import Counter
print("Test:", Counter(tokenized_datasets["test"]["labels"]))

#์ •ํ™•๋„ ์ฒ™๋„
from datasets import load_metric
accuracy = load_metric("accuracy")
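
# Note: load_metric has been removed from recent datasets releases; the
# equivalent in newer environments is the separate evaluate package:
#import evaluate
#accuracy = evaluate.load("accuracy")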

import numpy as np
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_classes = np.argmax(pred_logits, axis=-1)
    labels = np.asarray(pred.label_ids)
    acc = accuracy.compute(predictions=pred_classes, references=labels)
    return {"accuracy": acc["accuracy"]}

#Configure training_args
#If the following error occurs, change output_dir and retry:
#MlflowException: Changing param values is not allowed. Param with key=

import gc
gc.collect()
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="deberta_sent4455",
    num_train_epochs=4,
    #learning_rate=5e-5,
    learning_rate=1.5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    fp16=True,                       # Use mixed precision
    fp16_opt_level="O1",             # Apex AMP opt level (letter O, not zero); only applies with the apex backend
    warmup_steps=500,
    logging_steps=200,
    save_steps=2000,
    eval_steps=500,
    push_to_hub=True,
    evaluation_strategy="steps",
)
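
# Note: on recent transformers versions the evaluation_strategy argument has
# been renamed to eval_strategy; the name above matches the transformers 4.x
# line this notebook was written against.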

#Set up the Trainer and start training
import gc
gc.collect()

from transformers import Trainer
trainer = Trainer(
    args=training_args,
    compute_metrics=compute_metrics,
    model=model,
    #tokenizer=tokenizer, #Triggers an error: TypeError: save_vocabulary() got an unexpected keyword argument 'filename_prefix'
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)
train_metrics = trainer.train().metrics
trainer.save_metrics("train", train_metrics)
trainer.push_to_hub()
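
# Optional: final evaluation on the held-out split, saved alongside the
# training metrics.
eval_metrics = trainer.evaluate()
trainer.save_metrics("eval", eval_metrics)
print(eval_metrics)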
#### ํŒŒ์ธํŠœ๋‹ ๋ฐ ๋ชจ๋ธ ์—…๋กœ๋“œ ์™„๋ฃŒ

# Commented out IPython magic to ensure Python compatibility.
# %cd mecab
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh); cd mecab-0.996-ko-0.9.2;
!chmod 775 ./configure; ./configure; make; chmod 775 tests/*.sh; make check; make install

# Commented out IPython magic to ensure Python compatibility.
!pwd
# %cd mecab
!cd mecab-ko-dic-2.1.1-20180720; chmod 775 ./autogen.sh; ./autogen.sh; ./configure; make

#!mecab -d /usr/local/lib/mecab/dic/mecab-ko-dic

# Commented out IPython magic to ensure Python compatibility.
!pwd
!ls
# %cd korscideberta

! unzip korscideberta.zip -d korscideberta; cd korscideberta

# Commented out IPython magic to ensure Python compatibility.
!pwd
# %cd korscideberta

! pip3 install -r requirements.txt; pip install --upgrade nltk;
!pip uninstall -y torch torchtext torch-tensorrt
!pip install --upgrade pip
!pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html --default-timeout=100
!pip install setuptools_scm six mlflow
!pip install "numpy<1.24.0"
!pip install .