|
import torch
|
|
from torch.utils.data import Dataset, DataLoader
|
|
from transformers import AdamW
|
|
from model import load_model_lazy, unload_model
|
|
from database import fetch_all_inputs, clear_database
|
|
from datasets import load_dataset
|
|
|
|
class TextDataset(Dataset):
|
|
def __init__(self, texts, tokenizer, max_length=512):
|
|
self.texts = texts
|
|
self.tokenizer = tokenizer
|
|
self.max_length = max_length
|
|
|
|
def __len__(self):
|
|
return len(self.texts)
|
|
|
|
def __getitem__(self, idx):
|
|
text = self.texts[idx]
|
|
encodings = self.tokenizer(
|
|
text,
|
|
truncation=True,
|
|
padding="max_length",
|
|
max_length=self.max_length,
|
|
return_tensors="pt"
|
|
)
|
|
attention_mask = encodings.attention_mask.squeeze(0)
|
|
return encodings.input_ids.squeeze(0), attention_mask
|
|
|
|
def train_model_with_text(selected_model, custom_text, epochs, batch_size):
|
|
"""
|
|
آموزش مدل با متن سفارشی.
|
|
"""
|
|
model, tokenizer = load_model_lazy(selected_model)
|
|
dataset = TextDataset([custom_text], tokenizer)
|
|
dataloader = DataLoader(dataset, batch_size=min(batch_size, len(dataset)), shuffle=True)
|
|
|
|
_train_model(model, tokenizer, dataloader, epochs, selected_model, "custom_text")
|
|
unload_model(selected_model)
|
|
|
|
def train_model_with_database(selected_model, epochs, batch_size):
|
|
"""
|
|
آموزش مدل با دادههای موجود در دیتابیس.
|
|
"""
|
|
model, tokenizer = load_model_lazy(selected_model)
|
|
inputs_data = fetch_all_inputs()
|
|
texts = [input_text for input_text, model_name in inputs_data if model_name == selected_model]
|
|
|
|
if not texts:
|
|
print("Error: No data found in the database for the selected model.")
|
|
return
|
|
|
|
dataset = TextDataset(texts, tokenizer)
|
|
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
|
|
|
_train_model(model, tokenizer, dataloader, epochs, selected_model, "database")
|
|
clear_database()
|
|
unload_model(selected_model)
|
|
|
|
def train_model_with_dataset(selected_model, epochs, batch_size, dataset_path):
|
|
"""
|
|
آموزش مدل با فایل دیتاست آپلودشده.
|
|
"""
|
|
model, tokenizer = load_model_lazy(selected_model)
|
|
|
|
|
|
with open(dataset_path, "r", encoding="utf-8") as f:
|
|
texts = f.readlines()
|
|
|
|
if not texts:
|
|
print("Error: Dataset is empty.")
|
|
return
|
|
|
|
dataset = TextDataset(texts, tokenizer)
|
|
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
|
|
|
_train_model(model, tokenizer, dataloader, epochs, selected_model, "dataset")
|
|
unload_model(selected_model)
|
|
|
|
def _train_model(model, tokenizer, dataloader, epochs, model_name, method):
|
|
"""
|
|
منطق مشترک آموزش مدل.
|
|
"""
|
|
optimizer = AdamW(model.parameters(), lr=5e-5)
|
|
|
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
model.to(device)
|
|
|
|
model.train()
|
|
for epoch in range(epochs):
|
|
total_loss = 0
|
|
for step, (input_ids, attention_mask) in enumerate(dataloader):
|
|
optimizer.zero_grad()
|
|
input_ids = input_ids.to(device)
|
|
attention_mask = attention_mask.to(device)
|
|
|
|
|
|
outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
|
|
loss = outputs.loss
|
|
loss.backward()
|
|
optimizer.step()
|
|
total_loss += loss.item()
|
|
|
|
print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}")
|
|
|
|
|
|
save_path = f"trained_{model_name}_{method}"
|
|
model.save_pretrained(save_path)
|
|
tokenizer.save_pretrained(save_path)
|
|
print(f"Model {model_name} trained with {method} and saved to {save_path}.")
|
|
|
|
def train_model_with_hf_dataset(selected_model, epochs, batch_size, dataset_name, split="train"):
|
|
"""
|
|
آموزش مدل با استفاده از دیتاستهای Hugging Face.
|
|
|
|
Args:
|
|
selected_model (str): نام مدل برای آموزش.
|
|
epochs (int): تعداد epochs.
|
|
batch_size (int): اندازه batch.
|
|
dataset_name (str): نام دیتاست در Hugging Face.
|
|
split (str): بخش دیتاست برای بارگذاری (train, test, validation).
|
|
"""
|
|
model, tokenizer = load_model_lazy(selected_model)
|
|
|
|
|
|
texts = load_dataset(dataset_name, split)
|
|
|
|
if not texts:
|
|
print(f"Error: Dataset {dataset_name} ({split} split) is empty or invalid.")
|
|
return
|
|
|
|
dataset = TextDataset(texts, tokenizer)
|
|
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
|
|
|
_train_model(model, tokenizer, dataloader, epochs, selected_model, f"huggingface_{dataset_name}")
|
|
unload_model(selected_model)
|
|
|