from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CSV file and rename its columns to generic names
df = pd.read_csv("ruttoniaitrain1.csv")
df = df.rename(columns={"Quest": "text", "Answer": "target"})

# Split into train/validation sets and convert each to a Hugging Face Dataset
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
print("CSV processed and loaded!")

# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
print("Model loaded!")

# Tokenize inputs and targets. Padding is deliberately left to the data
# collator, which pads each batch dynamically and masks label padding.
def preprocess_function(examples):
    inputs = tokenizer(examples["text"], truncation=True, max_length=512)
    targets = tokenizer(examples["target"], truncation=True, max_length=32)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Map over BOTH splits (the original only tokenized the training set, so
# evaluation would fail) and drop the raw string columns so the collator
# only sees tensorizable fields.
train_dataset = train_dataset.map(
    preprocess_function, batched=True, remove_columns=train_dataset.column_names
)
val_dataset = val_dataset.map(
    preprocess_function, batched=True, remove_columns=val_dataset.column_names
)

training_args = TrainingArguments(
    output_dir="./Ruttoni_AI",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    overwrite_output_dir=True,
    warmup_steps=500,
    learning_rate=1e-4,
    report_to="none",
)
print("Arguments and functions initialized!")

# DataCollatorForSeq2Seq pads inputs and labels to the longest sequence in
# each batch and replaces label padding with -100 so it is ignored by the
# loss. DataCollatorWithPadding, used originally, leaves labels unpadded
# and crashes on batches with uneven target lengths.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,  # so save_model() also writes the tokenizer files
)
print("Training...")
trainer.train()
print("Saving...")
trainer.save_model("./Ruttoni_AI")
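
# --- Quick inference check (a minimal sketch, not part of the original
# script). It assumes the run above finished and that the tokenizer was
# saved alongside the model in ./Ruttoni_AI (which passing
# tokenizer=tokenizer to the Trainer ensures). The question string is a
# made-up placeholder.
from transformers import T5Tokenizer, T5ForConditionalGeneration

loaded_tokenizer = T5Tokenizer.from_pretrained("./Ruttoni_AI")
loaded_model = T5ForConditionalGeneration.from_pretrained("./Ruttoni_AI")

question = "What is Ruttoni?"  # hypothetical example input
input_ids = loaded_tokenizer(question, return_tensors="pt").input_ids
output_ids = loaded_model.generate(input_ids, max_new_tokens=32)
print(loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True))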