Prositron committed on
Commit e56b819 · verified · 1 parent: 1bde306

Upload train_model.py

Files changed (1)
  1. train_model.py +103 -0
train_model.py ADDED
@@ -0,0 +1,103 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
from tensor_network import FourDimensionalTransformer  # Adjust based on your model's location

# List of dataset identifiers
dataset_ids = [
    "prithivMLmods/Deepthink-Reasoning",
    "ewok-core/ewok-core-1.0",
    "MuskumPillerum/General-Knowledge",
    "fblgit/tree-of-knowledge",
    "CohereForAI/aya_dataset",
    "AtlasUnified/Atlas-Reasoning",
    "livebench/reasoning",
    "SkunkworksAI/reasoning-0.01",
    "KingNish/reasoning-base-20k",
    "RLHFlow/HH-RLHF-Helpful-standard",
    "MBZUAI/ArabicMMLU",
]

# Load datasets
datasets = [load_dataset(dataset_id) for dataset_id in dataset_ids]

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # Replace with your model's tokenizer


# Tokenize datasets (assumes each dataset exposes a 'text' column)
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)


tokenized_datasets = [dataset.map(tokenize_function, batched=True) for dataset in datasets]


# Prepare DataLoader
def prepare_dataloader(dataset, batch_size=32):
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)


train_dataloaders = [prepare_dataloader(dataset['train']) for dataset in tokenized_datasets]
val_dataloaders = [prepare_dataloader(dataset['validation']) for dataset in tokenized_datasets]


# Model setup
model = FourDimensionalTransformer(
    num_layers=16,
    embed_dim=7,
    num_heads=1,
    num_extra_tokens=16,
    num_classes=10  # Adjust based on your specific task
)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)  # Adam optimizer with a learning rate of 1e-4


# Training loop
def train(model, train_dataloaders, val_dataloaders, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for dataloader in train_dataloaders:
            for batch in dataloader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['label']

                optimizer.zero_grad()
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                num_batches += 1

        # Average the loss over all batches from all training dataloaders
        avg_loss = total_loss / num_batches
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

        # Validation
        model.eval()
        total_correct = 0
        total_examples = 0
        with torch.no_grad():
            for dataloader in val_dataloaders:
                for batch in dataloader:
                    input_ids = batch['input_ids']
                    attention_mask = batch['attention_mask']
                    labels = batch['label']

                    outputs = model(input_ids, attention_mask)
                    _, predicted = torch.max(outputs, 1)
                    total_correct += (predicted == labels).sum().item()
                    total_examples += labels.size(0)

        # Accuracy over all validation examples across every validation dataloader
        accuracy = total_correct / total_examples
        print(f'Validation Accuracy: {accuracy:.4f}')

    # Save the trained model
    torch.save(model.state_dict(), 'trained_model.pth')


# Train the model
train(model, train_dataloaders, val_dataloaders)
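
Note: the script as written assumes every dataset above exposes a 'text' column, an integer 'label' column, and both 'train' and 'validation' splits; not all of the listed Hub datasets follow that schema. A minimal per-dataset normalization sketch (the 'question' and 'answer_id' column names in the example are hypothetical, not taken from any of the datasets above) could be run before tokenization:

    # Sketch: rename each dataset's columns to the 'text'/'label' names the script expects.
    def normalize_columns(dataset, text_column, label_column):
        if text_column != 'text':
            dataset = dataset.rename_column(text_column, 'text')
        if label_column != 'label':
            dataset = dataset.rename_column(label_column, 'label')
        return dataset

    # Hypothetical usage for one dataset whose fields are named 'question' and 'answer_id':
    # datasets[0] = normalize_columns(datasets[0], text_column='question', label_column='answer_id')

Datasets that ship without a 'validation' split would additionally need one created (for example via Dataset.train_test_split) before building val_dataloaders.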