TeacherPuffy committed · verified
Commit 150c211 · 1 Parent(s): 11558b2

Update train_mlp_batches.py

Files changed (1):
  1. train_mlp_batches.py +73 -70
train_mlp_batches.py CHANGED
@@ -1,3 +1,4 @@
+from modelscope.hub.api import HubApi
 import argparse
 import os
 import torch
@@ -7,9 +8,13 @@ from datasets import load_dataset
 from PIL import Image
 import numpy as np
 from torch.utils.data import DataLoader, Dataset
+from mmengine.model import BaseModel
+from mmengine.runner import Runner, EpochBasedTrainLoop, ValLoop
+from mmengine.hooks import CheckpointHook, LoggerHook
+from mmengine.optim import OptimWrapper
 
 # Define the MLP model
-class MLP(nn.Module):
+class MLP(BaseModel):
     def __init__(self, input_size, hidden_sizes, output_size):
         super(MLP, self).__init__()
         layers = []
@@ -19,9 +24,21 @@ class MLP(nn.Module):
             if i < len(sizes) - 2:
                 layers.append(nn.ReLU())
         self.model = nn.Sequential(*layers)
-
-    def forward(self, x):
-        return self.model(x)
+        self.criterion = nn.CrossEntropyLoss()
+
+    def forward(self, inputs, labels, mode='train'):
+        outputs = self.model(inputs)
+        if mode == 'train':
+            loss = self.criterion(outputs, labels)
+            return dict(loss=loss)
+        elif mode == 'val':
+            loss = self.criterion(outputs, labels)
+            _, predicted = torch.max(outputs.data, 1)
+            correct = (predicted == labels).sum().item()
+            total = labels.size(0)
+            return dict(loss=loss, correct=correct, total=total)
+        else:
+            return outputs
 
 # Custom Dataset class to handle image preprocessing
 class TinyImageNetDataset(Dataset):
@@ -40,67 +57,22 @@ class TinyImageNetDataset(Dataset):
         label = torch.tensor(example['label'])
         return img, label
 
-# Train the model
-def train_model(model, train_loader, val_loader, epochs=10, lr=0.001, save_loss_path=None, save_model_dir=None):
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-
-    criterion = nn.CrossEntropyLoss()
-    optimizer = optim.Adam(model.parameters(), lr=lr)
-
-    train_losses = []
-    val_losses = []
-
-    for epoch in range(epochs):
-        model.train()
-        running_loss = 0.0
-        for batch_idx, (inputs, labels) in enumerate(train_loader):
-            inputs, labels = inputs.to(device), labels.to(device)
-
-            optimizer.zero_grad()
-            outputs = model(inputs)
-            loss = criterion(outputs, labels)
-            loss.backward()
-            optimizer.step()
-
-            running_loss += loss.item()
-
-        avg_train_loss = running_loss / len(train_loader)
-        train_losses.append(avg_train_loss)
-        print(f'Epoch {epoch+1}, Loss: {avg_train_loss}')
-
-        # Validation
-        model.eval()
-        val_loss = 0.0
-        correct = 0
-        total = 0
-        with torch.no_grad():
-            for inputs, labels in val_loader:
-                inputs, labels = inputs.to(device), labels.to(device)
-
-                outputs = model(inputs)
-                loss = criterion(outputs, labels)
-                val_loss += loss.item()
-
-                _, predicted = torch.max(outputs.data, 1)
-                total += labels.size(0)
-                correct += (predicted == labels).sum().item()
-
-        avg_val_loss = val_loss / len(val_loader)
-        val_losses.append(avg_val_loss)
-        print(f'Validation Loss: {avg_val_loss}, Accuracy: {100 * correct / total}%')
-
-        # Save the model after each epoch
-        if save_model_dir:
-            model_path = os.path.join(save_model_dir, f'model_epoch_{epoch+1}.pth')
-            torch.save(model.state_dict(), model_path)
-
-    if save_loss_path:
-        with open(save_loss_path, 'w') as f:
-            for epoch, (train_loss, val_loss) in enumerate(zip(train_losses, val_losses)):
-                f.write(f'Epoch {epoch+1}, Train Loss: {train_loss}, Validation Loss: {val_loss}\n')
-
-    return avg_val_loss
+# Define the training loop
+class MLPTrainLoop(EpochBasedTrainLoop):
+    def run_iter(self, data_batch: dict, train_mode: bool = True) -> None:
+        data_batch = self.data_preprocessor(data_batch, training=True)
+        outputs = self.model(**data_batch, mode='train')
+        parsed_outputs = self.model.parse_losses(outputs)
+        self.optim_wrapper.update_params(parsed_outputs)
+
+# Define the validation loop
+class MLPValLoop(ValLoop):
+    def run_iter(self, data_batch: dict, train_mode: bool = False) -> None:
+        data_batch = self.data_preprocessor(data_batch, training=False)
+        outputs = self.model(**data_batch, mode='val')
+        self.outputs['loss'].append(outputs['loss'].item())
+        self.outputs['correct'].append(outputs['correct'])
+        self.outputs['total'].append(outputs['total'])
 
 # Main function
 def main():
@@ -109,6 +81,7 @@ def main():
     parser.add_argument('--width', type=int, default=512, help='Number of neurons per hidden layer (default: 512)')
     parser.add_argument('--batch_size', type=int, default=8, help='Batch size for training (default: 8)')
     parser.add_argument('--save_model_dir', type=str, default='saved_models', help='Directory to save model checkpoints (default: saved_models)')
+    parser.add_argument('--access_token', type=str, required=True, help='ModelScope SDK access token')
    args = parser.parse_args()
 
     # Load the zh-plus/tiny-imagenet dataset
@@ -138,9 +111,27 @@ def main():
     train_loader = DataLoader(TinyImageNetDataset(train_dataset), batch_size=args.batch_size, shuffle=True)
     val_loader = DataLoader(TinyImageNetDataset(val_dataset), batch_size=args.batch_size, shuffle=False)
 
-    # Train the model and get the final loss
-    save_loss_path = 'losses.txt'
-    final_loss = train_model(model, train_loader, val_loader, save_loss_path=save_loss_path, save_model_dir=args.save_model_dir)
+    # Define the optimizer
+    optimizer = optim.Adam(model.parameters(), lr=0.001)
+
+    # Define the runner
+    runner = Runner(
+        model=model,
+        work_dir=args.save_model_dir,
+        train_dataloader=train_loader,
+        val_dataloader=val_loader,
+        optim_wrapper=dict(optimizer=optimizer),
+        train_loop=MLPTrainLoop,
+        val_loop=MLPValLoop,
+        val_interval=1,
+        default_hooks=dict(
+            checkpoint=dict(type=CheckpointHook, interval=1, save_best='auto'),
+            logger=dict(type=LoggerHook, interval=10)
+        )
+    )
+
+    # Start training
+    runner.train()
 
     # Calculate the number of parameters
     param_count = sum(p.numel() for p in model.parameters())
@@ -156,14 +147,26 @@ def main():
     # Write the results to a text file in the model folder
     result_path = os.path.join(model_folder, 'results.txt')
     with open(result_path, 'w') as f:
-        f.write(f'Layer Count: {args.layer_count}, Width: {args.width}, Parameter Count: {param_count}, Final Loss: {final_loss}\n')
+        f.write(f'Layer Count: {args.layer_count}, Width: {args.width}, Parameter Count: {param_count}\n')
 
     # Save a duplicate of the results in the 'results' folder
     results_folder = 'results'
     os.makedirs(results_folder, exist_ok=True)
     duplicate_result_path = os.path.join(results_folder, f'results_l{args.layer_count}w{args.width}.txt')
     with open(duplicate_result_path, 'w') as f:
-        f.write(f'Layer Count: {args.layer_count}, Width: {args.width}, Parameter Count: {param_count}, Final Loss: {final_loss}\n')
+        f.write(f'Layer Count: {args.layer_count}, Width: {args.width}, Parameter Count: {param_count}\n')
+
+    # Upload the model to ModelScope
+    api = HubApi()
+    api.login(args.access_token)
+    api.push_model(
+        model_id="puffy310/MLPScaling",
+        model_dir=model_folder  # Local model directory, the directory must contain configuration.json
+    )
+
+    # Delete the local model directory
+    import shutil
+    shutil.rmtree(model_folder)
 
 if __name__ == '__main__':
     main()
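
For orientation, a minimal usage sketch of the forward() contract this commit introduces. This is a sketch under assumptions, not part of the commit: the shapes (Tiny-ImageNet images are 64x64 RGB, i.e. 12288 flattened features, with 200 classes) and the hidden_sizes value are illustrative, and it assumes the updated module imports cleanly as train_mlp_batches.

# Hypothetical usage sketch; shapes and hyperparameters are illustrative.
import torch
from train_mlp_batches import MLP  # the class defined in this commit

model = MLP(input_size=64 * 64 * 3, hidden_sizes=[512] * 4, output_size=200)
x = torch.randn(8, 64 * 64 * 3)      # batch of 8 flattened 64x64 RGB images
y = torch.randint(0, 200, (8,))      # integer class labels

print(model(x, y, mode='train'))     # dict with a single 'loss' entry
print(model(x, y, mode='val'))       # dict with 'loss', 'correct', 'total'
logits = model(x, y, mode='tensor')  # any other mode returns the raw logits

The script itself would be launched along the lines of python train_mlp_batches.py --width 512 --batch_size 8 --access_token <token> (flag names taken from the argparse block above), with the token now required because the run ends by pushing model_folder to the puffy310/MLPScaling repo on ModelScope and deleting the local copy.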