# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: refer https://github.com/ThilinaRajapakse/simpletransformers
"""
import json
import os
import sys
from dataclasses import asdict, dataclass, field
from multiprocessing import cpu_count
from typing import Optional

from loguru import logger
from torch.utils.data import Dataset

def get_default_process_count():
    """Return a sensible default worker count (leave two cores free, cap at 61 on Windows)."""
    process_count = cpu_count() - 2 if cpu_count() > 2 else 1
    if sys.platform == "win32":
        process_count = min(process_count, 61)
    return process_count


def get_special_tokens():
    return ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

@dataclass
class ModelArgs:
    adafactor_beta1: float = None
    adafactor_clip_threshold: float = 1.0
    adafactor_decay_rate: float = -0.8
    adafactor_eps: tuple = field(default_factory=lambda: (1e-30, 1e-3))
    adafactor_relative_step: bool = True
    adafactor_scale_parameter: bool = True
    adafactor_warmup_init: bool = True
    adam_epsilon: float = 1e-8
    best_model_dir: str = "outputs/best_model"
    cache_dir: str = "cache_dir/"
    config: dict = field(default_factory=dict)
    cosine_schedule_num_cycles: float = 0.5
    custom_layer_parameters: list = field(default_factory=list)
    custom_parameter_groups: list = field(default_factory=list)
    dataloader_num_workers: int = 0
    do_lower_case: bool = False
    dynamic_quantize: bool = False
    early_stopping_consider_epochs: bool = False
    early_stopping_delta: float = 0
    early_stopping_metric: str = "eval_loss"
    early_stopping_metric_minimize: bool = True
    early_stopping_patience: int = 3
    encoding: str = "utf-8"
    eval_batch_size: int = 8
    evaluate_during_training: bool = False
    evaluate_during_training_silent: bool = True
    evaluate_during_training_steps: int = 6000
    evaluate_during_training_verbose: bool = False
    evaluate_each_epoch: bool = True
    fp16: bool = False
    gradient_accumulation_steps: int = 1
    learning_rate: float = 2e-5
    local_rank: int = -1
    logging_steps: int = 50
    manual_seed: int = None
    max_grad_norm: float = 1.0
    max_seq_length: int = 128  # max length of input sequence
    model_name: str = None
    model_type: str = None
    multiprocessing_chunksize: int = -1
    n_gpu: int = 2
    no_cache: bool = False
    no_save: bool = False
    not_saved_args: list = field(default_factory=list)
    num_train_epochs: int = 1
    optimizer: str = "AdamW"
    output_dir: str = "outputs/"
    overwrite_output_dir: bool = True
    polynomial_decay_schedule_lr_end: float = 1e-7
    polynomial_decay_schedule_power: float = 1.0
    process_count: int = field(default_factory=get_default_process_count)
    quantized_model: bool = False
    reprocess_input_data: bool = False
    save_best_model: bool = True
    save_eval_checkpoints: bool = True
    save_model_every_epoch: bool = False
    save_optimizer_and_scheduler: bool = True
    save_steps: int = 10000
    scheduler: str = "linear_schedule_with_warmup"
    silent: bool = False
    skip_special_tokens: bool = True
    tensorboard_dir: str = None
    thread_count: int = None
    tokenizer_name: str = None
    tokenizer_type: str = None
    train_batch_size: int = 8
    train_custom_parameters_only: bool = False
    use_cached_eval_features: bool = False
    use_early_stopping: bool = False
    use_hf_datasets: bool = False
    use_multiprocessing: bool = True
    use_multiprocessing_for_evaluation: bool = True
    wandb_kwargs: dict = field(default_factory=dict)
    wandb_project: str = None
    warmup_ratio: float = 0.06
    warmup_steps: int = 0
    weight_decay: float = 0.0

    def update_from_dict(self, new_values):
        if isinstance(new_values, dict):
            for key, value in new_values.items():
                setattr(self, key, value)
        else:
            raise TypeError(f"{new_values} is not a Python dict.")

    def get_args_for_saving(self):
        args_for_saving = {key: value for key, value in asdict(self).items() if key not in self.not_saved_args}
        return args_for_saving

    def save(self, output_dir):
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "model_args.json"), "w", encoding="utf-8") as f:
            args_dict = self.get_args_for_saving()
            # Use .get() so the base class, which has no dataset_class field, can also be saved.
            if args_dict.get("dataset_class") is not None and not isinstance(args_dict["dataset_class"], str):
                args_dict["dataset_class"] = type(args_dict["dataset_class"]).__name__
            if args_dict.get("tokenizer_type") is not None and not isinstance(args_dict["tokenizer_type"], str):
                args_dict["tokenizer_type"] = type(args_dict["tokenizer_type"]).__name__
            json.dump(args_dict, f)

    def load(self, input_dir):
        if input_dir:
            model_args_file = os.path.join(input_dir, "model_args.json")
            if os.path.isfile(model_args_file):
                with open(model_args_file, "r", encoding="utf-8") as f:
                    model_args = json.load(f)
                if model_args.get("dataset_class"):
                    logger.warning(
                        "This model was trained using a custom dataset_class. "
                        "This cannot be loaded automatically and must be specified in the model args "
                        "when loading the model."
                    )
                self.update_from_dict(model_args)

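# Usage sketch (illustrative only; the path and values below are hypothetical):
# the args objects are plain dataclasses, so overriding defaults, persisting
# them, and restoring them goes through update_from_dict() / save() / load().
#
#   args = ModelArgs()
#   args.update_from_dict({"learning_rate": 3e-5, "num_train_epochs": 3})
#   args.save("outputs/")        # writes outputs/model_args.json
#
#   restored = ModelArgs()
#   restored.load("outputs/")    # re-reads model_args.json into the instance
#   assert restored.learning_rate == 3e-5
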
@dataclass
class T5Args(ModelArgs):
    """
    Model args for a T5Model
    """
    model_class: str = "T5Model"
    dataset_class: Dataset = None
    do_sample: bool = False
    early_stopping: bool = True
    evaluate_generated_text: bool = False
    length_penalty: float = 2.0
    max_length: int = 180  # max length of the sequence to be generated
    max_steps: int = -1
    num_beams: int = 1
    num_return_sequences: int = 1
    preprocess_inputs: bool = True
    repetition_penalty: float = 1.0
    scheduler: str = "constant_schedule_with_warmup"
    adafactor_relative_step: bool = False
    adafactor_scale_parameter: bool = False
    adafactor_warmup_init: bool = False
    learning_rate: float = 5e-4
    optimizer: str = "AdamW"
    special_tokens_list: list = field(default_factory=list)
    top_k: float = None
    top_p: float = None
    use_multiprocessed_decoding: bool = False

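# Illustrative override (hypothetical values): the defaults above use greedy/beam
# decoding; switching a T5Args instance to nucleus sampling only needs these fields.
#
#   t5_args = T5Args(do_sample=True, top_p=0.95, top_k=50, num_return_sequences=3)
#   t5_args.max_length = 64  # cap generated length for short-answer tasks
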
@dataclass
class CopyT5Args(ModelArgs):
    """
    Model args for a CopyT5Model
    """
    model_class: str = "CopyT5Model"
    dataset_class: Dataset = None
    do_sample: bool = False
    early_stopping: bool = True
    evaluate_generated_text: bool = False
    length_penalty: float = 2.0
    max_length: int = 128  # max length of the sequence to be generated
    max_steps: int = -1
    num_beams: int = 3
    num_return_sequences: int = 1
    preprocess_inputs: bool = True
    repetition_penalty: float = 1.0
    scheduler: str = "linear_schedule_with_warmup"
    adafactor_relative_step: bool = False
    adafactor_scale_parameter: bool = False
    adafactor_warmup_init: bool = False
    learning_rate: float = 1e-3
    optimizer: str = "AdamW"
    special_tokens_list: list = field(default_factory=list)
    top_k: float = None
    top_p: float = None
    use_multiprocessed_decoding: bool = False

@dataclass
class LanguageModelingArgs(ModelArgs):
    """
    Model args for a LanguageModelingModel
    """
    model_class: str = "LanguageModelingModel"
    block_size: int = -1
    config_name: str = None
    dataset_class: Dataset = None
    dataset_type: str = "None"
    discriminator_config: dict = field(default_factory=dict)
    discriminator_loss_weight: float = 50.0
    generator_config: dict = field(default_factory=dict)
    max_steps: int = -1
    min_frequency: int = 2
    mlm: bool = True
    mlm_probability: float = 0.15
    sliding_window: bool = False
    special_tokens: list = field(default_factory=get_special_tokens)
    stride: float = 0.8
    tie_generator_and_discriminator_embeddings: bool = True
    tokenizer_name: str = None
    vocab_size: int = None
    clean_text: bool = True
    handle_chinese_chars: bool = True
    special_tokens_list: list = field(default_factory=list)
    strip_accents: bool = True
    local_rank: int = -1

@dataclass
class Seq2SeqArgs(ModelArgs):
    """
    Model args for a Seq2SeqModel
    """
    model_class: str = "Seq2SeqModel"
    base_marian_model_name: str = None
    dataset_class: Dataset = None
    do_sample: bool = False
    early_stopping: bool = True
    evaluate_generated_text: bool = False
    faiss_d: int = 768
    faiss_m: int = 128
    length_penalty: float = 2.0
    max_length: int = 128  # max length of the sequence to be generated
    max_steps: int = -1
    num_beams: int = 1
    num_return_sequences: int = 1
    rag_embed_batch_size: int = 16
    repetition_penalty: float = 1.0
    top_k: float = None
    top_p: float = None
    use_multiprocessed_decoding: bool = False
    save_knowledge_dataset: bool = True
    save_knowledge_dataset_with_checkpoints: bool = False
    split_text_character: str = " "
    split_text_n: int = 100
    src_lang: str = "en_XX"
    tgt_lang: str = "ro_RO"

@dataclass
class LanguageGenerationArgs(ModelArgs):
    """
    Model args for a LanguageGenerationModel
    """
    model_class: str = "LanguageGenerationModel"
    do_sample: bool = True
    early_stopping: bool = True
    evaluate_generated_text: bool = False
    length_penalty: float = 2.0
    max_length: int = 128  # max length of the sequence to be generated
    max_steps: int = -1
    num_beams: int = 1
    num_return_sequences: int = 1
    repetition_penalty: float = 1.0
    top_k: float = 50
    top_p: float = 0.95
    prompt: str = ""
    stop_token: str = None
    temperature: float = 1.0
    padding_text: str = ""
    xlm_language: str = ""
    config_name: str = None
    tokenizer_name: str = None
    special_tokens_list: list = field(default_factory=list)

@dataclass
class SongNetArgs(LanguageModelingArgs):
    """
    Model args for a SongNetModel
    """
    model_class: str = "SongNetModel"
    dataset_class: Dataset = None
    do_sample: bool = False
    early_stopping: bool = True
    evaluate_generated_text: bool = False
    length_penalty: float = 2.0
    max_length: int = 128
    min_length: int = 10
    max_steps: int = -1
    num_beams: int = 3
    num_return_sequences: int = 1
    repetition_penalty: float = 1.0
    scheduler: str = None
    adafactor_relative_step: bool = False
    adafactor_scale_parameter: bool = False
    adafactor_warmup_init: bool = False
    learning_rate: float = 1e-3
    early_stopping_metric: str = "eval_ppl"
    special_tokens_list: list = field(default_factory=list)
    save_eval_checkpoints: bool = False
    skip_special_tokens: bool = False
    k: int = 16
    use_multiprocessed_decoding: bool = False
    embed_dim: int = 768
    ff_embed_dim: int = 3072
    num_heads: int = 12
    num_layers: int = 12
    dropout: float = 0.2
    warmup_ratio: float = 0.05
    weight_decay: float = 0.0
    smoothing_factor: float = 0.1

@dataclass
class ChatGlmArgs(ModelArgs):
    """
    Model args for a ChatGLMModel
    """
    model_class: str = "ChatGlmArgs"
    dataset_class: Dataset = None
    learning_rate: float = 2e-5
    fp16: bool = True
    bf16: bool = False
    int8: bool = False
    int4: bool = False
    debug: bool = False
    max_seq_length: int = 256  # max length of input sequence
    max_length: int = 384  # max length of the sequence to be generated
    do_sample: bool = True
    early_stopping: bool = True
    is_train_on_prompt: bool = False  # whether to also compute loss on the prompt tokens
    evaluate_generated_text: bool = True
    report_to: str = "tensorboard"
    optimizer: str = "adamw_torch"
    save_strategy: str = "steps"
    evaluation_strategy: str = "no"
    eval_steps: int = 50
    save_steps: int = 400
    max_eval_samples: int = 20
    length_penalty: float = 2.0
    num_beams: int = 4
    num_return_sequences: int = 1
    repetition_penalty: float = 1.0
    temperature: float = 0.1
    special_tokens_list: list = field(default_factory=list)
    top_k: float = 40
    top_p: float = 0.75
    model_name_or_path: Optional[str] = field(default="THUDM/chatglm-6b")
    use_peft: bool = True
    peft_type: str = "LORA"
    peft_bin_name: str = "adapter_model.bin"
    lora_r: int = 8
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    lora_target_modules: list = field(default_factory=lambda: ["all"])  # ["all"] or ["query_key_value"]
    lora_bias: str = "none"
    adalora_init_r: int = 12
    adalora_tinit: int = 200
    adalora_tfinal: int = 1000
    adalora_delta_t: int = 10
    lora_beta: float = 0.85
    num_virtual_tokens: int = 20
    prompt_encoder_hidden_size: int = 128
    num_train_epochs: int = 1
    max_steps: int = -1
    per_device_train_batch_size: int = 2
    eval_batch_size: int = 4
    gradient_accumulation_steps: int = 1
    gradient_checkpointing: bool = True
    torch_compile: bool = False
    save_total_limit: int = 10
    remove_unused_columns: bool = False
    logging_steps: int = 50
    resume_from_checkpoint: str = None
    qlora: bool = False

@dataclass
class GptArgs(ModelArgs):
    """
    Model args for a GptModel
    """
    model_class: str = "GptArgs"
    dataset_class: Dataset = None
    learning_rate: float = 2e-5
    fp16: bool = True
    bf16: bool = False
    int8: bool = False
    int4: bool = False
    debug: bool = False
    max_seq_length: int = 256  # max length of input sequence
    max_length: int = 256  # max length of the sequence to be generated
    do_sample: bool = True
    early_stopping: bool = True
    evaluate_generated_text: bool = True
    is_train_on_prompt: bool = False  # whether to also compute loss on the prompt tokens
    warmup_steps: int = 50
    report_to: str = "tensorboard"
    optimizer: str = "adamw_torch"
    save_strategy: str = "steps"
    eval_steps: int = 200
    save_steps: int = 400
    pad_to_multiple_of: int = 8
    max_eval_samples: int = 20
    length_penalty: float = 2.0
    num_beams: int = 1
    num_return_sequences: int = 1
    repetition_penalty: float = 1.3
    temperature: float = 0.4
    special_tokens_list: list = field(default_factory=list)
    top_k: float = 40
    top_p: float = 0.9
    model_name_or_path: Optional[str] = field(default="shibing624/chinese-alpaca-plus-7b-hf")
    use_peft: bool = True
    peft_type: str = "LORA"
    peft_bin_name: str = "adapter_model.bin"
    lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: list = field(default_factory=lambda: ["all"])  # ["all"] or ["k_proj"]
    lora_bias: str = "none"
    adalora_init_r: int = 12
    adalora_tinit: int = 200
    adalora_tfinal: int = 1000
    adalora_delta_t: int = 10
    lora_beta: float = 0.85
    num_virtual_tokens: int = 20
    prompt_encoder_hidden_size: int = 128
    num_train_epochs: int = 3
    max_steps: int = -1
    per_device_train_batch_size: int = 2
    eval_batch_size: int = 4
    gradient_accumulation_steps: int = 1
    save_total_limit: int = 10
    remove_unused_columns: bool = False
    logging_steps: int = 50
    resume_from_checkpoint: str = None
    gradient_checkpointing: bool = True
    torch_compile: bool = False
    trust_remote_code: bool = True
    qlora: bool = False

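if __name__ == "__main__":
    # Illustrative smoke test (hypothetical usage, values chosen only for the demo):
    # build an args object, override a few defaults, and print the values that
    # save() would serialize to model_args.json.
    demo_args = GptArgs()
    demo_args.update_from_dict({"learning_rate": 1e-4, "num_train_epochs": 1})
    print(json.dumps(demo_args.get_args_for_saving(), indent=2, default=str))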