import torch
from torch.optim import AdamW  # transformers.AdamW was removed in recent versions; torch's AdamW is the drop-in replacement
from torch.utils.data import Dataset, DataLoader

from model import load_model_lazy, unload_model
from database import fetch_all_inputs, clear_database  # database management
from datasets import load_dataset

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",  # pad the tokens up to max_length
            max_length=self.max_length,
            return_tensors="pt"
        )
        attention_mask = encodings.attention_mask.squeeze(0)
        return encodings.input_ids.squeeze(0), attention_mask

def train_model_with_text(selected_model, custom_text, epochs, batch_size):
    """
    Train the model on a custom text.
    """
    model, tokenizer = load_model_lazy(selected_model)
    dataset = TextDataset([custom_text], tokenizer)
    dataloader = DataLoader(dataset, batch_size=min(batch_size, len(dataset)), shuffle=True)
    _train_model(model, tokenizer, dataloader, epochs, selected_model, "custom_text")
    unload_model(selected_model)

def train_model_with_database(selected_model, epochs, batch_size):
    """
    Train the model on the data stored in the database.
    """
    model, tokenizer = load_model_lazy(selected_model)
    inputs_data = fetch_all_inputs()
    texts = [input_text for input_text, model_name in inputs_data if model_name == selected_model]
    if not texts:
        print("Error: No data found in the database for the selected model.")
        unload_model(selected_model)  # release the model on the early exit as well
        return
    dataset = TextDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    _train_model(model, tokenizer, dataloader, epochs, selected_model, "database")
    clear_database()
    unload_model(selected_model)

def train_model_with_dataset(selected_model, epochs, batch_size, dataset_path):
    """
    Train the model on an uploaded dataset file (one sample per line).
    """
    model, tokenizer = load_model_lazy(selected_model)
    # Read the dataset, dropping blank lines and trailing newlines.
    with open(dataset_path, "r", encoding="utf-8") as f:
        texts = [line.strip() for line in f if line.strip()]
    if not texts:
        print("Error: Dataset is empty.")
        unload_model(selected_model)
        return
    dataset = TextDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    _train_model(model, tokenizer, dataloader, epochs, selected_model, "dataset")
    unload_model(selected_model)

def _train_model(model, tokenizer, dataloader, epochs, model_name, method):
    """
    Shared training logic for all entry points.
    """
    optimizer = AdamW(model.parameters(), lr=5e-5)
    # Move the model to the GPU if one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for step, (input_ids, attention_mask) in enumerate(dataloader):
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            # Forward pass; for causal LM training the inputs double as the labels.
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}")
    # Save the fine-tuned model and tokenizer.
    save_path = f"trained_{model_name}_{method}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model {model_name} trained with {method} and saved to {save_path}.")

def train_model_with_hf_dataset(selected_model, epochs, batch_size, dataset_name, split="train"):
    """
    Train the model using a Hugging Face dataset.

    Args:
        selected_model (str): Name of the model to train.
        epochs (int): Number of epochs.
        batch_size (int): Batch size.
        dataset_name (str): Name of the dataset on Hugging Face.
        split (str): Dataset split to load (train, test, validation).
    """
    model, tokenizer = load_model_lazy(selected_model)
    # Load the data from Hugging Face. The second positional argument of
    # load_dataset is the config name, so the split must be passed by keyword.
    hf_dataset = load_dataset(dataset_name, split=split)
    # Assumes the dataset exposes its samples in a "text" column, as most
    # plain-text datasets do; adjust the column name for other schemas.
    texts = hf_dataset["text"] if "text" in hf_dataset.column_names else None
    if not texts:
        print(f"Error: Dataset {dataset_name} ({split} split) is empty or has no 'text' column.")
        unload_model(selected_model)
        return
    dataset = TextDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    _train_model(model, tokenizer, dataloader, epochs, selected_model, f"huggingface_{dataset_name}")
    unload_model(selected_model)
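
# A minimal usage sketch, assuming load_model_lazy accepts a Hugging Face
# model name such as "gpt2" and that the "imdb" dataset (which exposes a
# "text" column) is reachable; both names are illustrative, not part of
# this module's contract.
if __name__ == "__main__":
    # Fine-tune on a single custom string.
    train_model_with_text("gpt2", "A tiny sample sentence for a smoke test.", epochs=1, batch_size=1)
    # Fine-tune on one split of a Hugging Face dataset.
    train_model_with_hf_dataset("gpt2", epochs=1, batch_size=4, dataset_name="imdb", split="train")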