import pandas as pd
from tqdm import tqdm
import cv2
-# %matplotlib inline
-import matplotlib.pyplot as plt

import torch
import torch.nn as nn

# import necessary libraries for defining the optimizers
import torch.optim as optim
-from torch.optim import lr_scheduler

from trainer import fit
import config
@@ -26,11 +23,15 @@ def q(text = ''): # an easy way to exit the script; useful while debugging
    print('> ', text)
    sys.exit()

+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+print(f'\ndevice: {device}')
+
parser = argparse.ArgumentParser(description = 'Following are the arguments that can be passed from the terminal itself! Cool, huh? :D')
parser.add_argument('--data_path', type = str, default = 'NIH Chest X-rays', help = 'This is the path of the training data')
# parser.add_argument('--test_path', type = str, default = os.path.join('hack-data-new','Scoring2/'), help = 'This is the path of the testing data')
-parser.add_argument('--bs', type = int, default = 256, help = 'batch size')
+parser.add_argument('--bs', type = int, default = 128, help = 'batch size')
parser.add_argument('--lr', type = float, default = 1e-5, help = 'Learning Rate for the optimizer')
+parser.add_argument('--stage', type = int, default = 1, help = 'Stage: decides which layers of the neural net to train')
parser.add_argument('--loss_func', type = str, default = 'FocalLoss', choices = {'BCE', 'FocalLoss'}, help = 'loss function')
parser.add_argument('-r', '--resume', action = 'store_true') # args.resume will be True if -r or --resume is used in the terminal
parser.add_argument('--ckpt', type = str, help = 'Path of the checkpoint that you want to load')
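# A typical invocation consistent with the arguments defined above; the script
# name 'train.py' and the checkpoint filename are assumptions, not shown in this diff:
#
#   python train.py --bs 128 --lr 1e-5 --stage 2 --loss_func FocalLoss --resume --ckpt stage1.pth
#
# --resume loads the checkpoint named by --ckpt from config.models_dir and continues
# training at the requested stage; --stage is reset to 1 when training from scratch.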
@@ -40,6 +41,11 @@ def q(text = ''): # an easy way to exit the script; useful while debugging
if args.resume and args.test: # what if --test is not defined at all? that is one test case
    q('The flow of this code has been designed either to train the model or to test it.\nPlease choose either --resume or --test')

+stage = args.stage
+if not args.resume:
+    print(f'\nOverwriting stage to 1, as the model is being trained from scratch')
+    stage = 1
+
if args.test:
    print('TESTING THE MODEL')
else:
@@ -72,7 +78,7 @@ def count_parameters(model):
print('-------------------------------------')

# make the dataloaders
-batch_size = args.bs # 256 by default
+batch_size = args.bs # 128 by default
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size = batch_size, shuffle = not True)
test_loader = torch.utils.data.DataLoader(XRayTest_dataset, batch_size = batch_size, shuffle = not True)
@@ -97,9 +103,9 @@ def count_parameters(model):
# define the loss function
if args.loss_func == 'FocalLoss': # by default
    from losses import FocalLoss
-    loss_fn = FocalLoss(gamma = 2.)
+    loss_fn = FocalLoss(device = device, gamma = 2.).to(device)
elif args.loss_func == 'BCE':
-    loss_fn = nn.BCEWithLogitsLoss()
+    loss_fn = nn.BCEWithLogitsLoss().to(device)

# define the learning rate
lr = args.lr
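# The real FocalLoss lives in the repo's losses.py, which is not part of this diff.
# Below is only a minimal sketch of a multi-label focal loss that would be consistent
# with the call FocalLoss(device = device, gamma = 2.) on raw logits; the class body,
# the 'reduction' parameter and the pt trick are assumptions, not the repo's code.
# It relies on the torch / torch.nn imports at the top of this script.

class FocalLoss(nn.Module):
    def __init__(self, device = None, gamma = 2., reduction = 'mean'):
        super().__init__()
        self.gamma = gamma
        self.reduction = reduction
        self.device = device

    def forward(self, logits, targets):
        # targets are multi-hot float tensors; start from per-element BCE on logits
        bce = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction = 'none')
        pt = torch.exp(-bce)                     # probability assigned to the true label
        focal = ((1. - pt) ** self.gamma) * bce  # down-weight easy, well-classified examples
        return focal.mean() if self.reduction == 'mean' else focal.sum()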
@@ -114,7 +120,8 @@ def count_parameters(model):
# change the last linear layer
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(XRayTrain_dataset.all_classes)) # 15 output classes
-
+model.to(device)
+
print('----- STAGE 1 -----') # only training 'layer2', 'layer3', 'layer4' and 'fc'
for name, param in model.named_parameters(): # all parameters have requires_grad = True by default
    # print('{}: {}'.format(name, param.requires_grad)) # this shows True for all the parameters
@@ -131,81 +138,85 @@ def count_parameters(model):

else:
    if args.ckpt == None:
-        q('ERROR: Please select a checkpoint to resume from')
+        q('ERROR: Please select a valid checkpoint to resume from')

    print('\nckpt loaded: {}'.format(args.ckpt))
    ckpt = torch.load(os.path.join(config.models_dir, args.ckpt))

    # since we are resuming the training of the model
    epochs_till_now = ckpt['epochs']
    model = ckpt['model']
-
+    model.to(device)
+
    # loading previous loss lists to collect future losses
    losses_dict = ckpt['losses_dict']

    # printing some hyperparameters
    print('\n> loss_fn: {}'.format(loss_fn))
    print('> epochs_till_now: {}'.format(epochs_till_now))
    print('> batch_size: {}'.format(batch_size))
+    print('> stage: {}'.format(stage))
    print('> lr: {}'.format(lr))

else: # testing
    if args.ckpt == None:
        q('ERROR: Please select a checkpoint to load the testing model from')

-    print('\nckpt loaded: {}'.format(args.ckpt))
+    print('\ncheckpoint loaded: {}'.format(args.ckpt))
    ckpt = torch.load(os.path.join(config.models_dir, args.ckpt))

    # load the saved model and the number of epochs it has already been trained for
    epochs_till_now = ckpt['epochs']
    model = ckpt['model']
-
+
    # loading previous loss lists to collect future losses
    losses_dict = ckpt['losses_dict']

# make changes (freezing/unfreezing the model's layers) in the following, for training the model in different stages
-if not args.test:
-    if args.resume:
-        '''
+if (not args.test) and (args.resume):
+
+    if stage == 1:
+
        print('\n----- STAGE 1 -----') # only training 'layer2', 'layer3', 'layer4' and 'fc'
        for name, param in model.named_parameters(): # all parameters have requires_grad = True by default
            # print('{}: {}'.format(name, param.requires_grad)) # this shows True for all the parameters
            if ('layer2' in name) or ('layer3' in name) or ('layer4' in name) or ('fc' in name):
                param.requires_grad = True
            else:
                param.requires_grad = False
-        '''

-        '''
+    elif stage == 2:
+
        print('\n----- STAGE 2 -----') # only training 'layer3', 'layer4' and 'fc'
        for name, param in model.named_parameters():
            # print('{}: {}'.format(name, param.requires_grad)) # this shows True for all the parameters
            if ('layer3' in name) or ('layer4' in name) or ('fc' in name):
                param.requires_grad = True
            else:
                param.requires_grad = False
-        '''

-        '''
+    elif stage == 3:
+
        print('\n----- STAGE 3 -----') # only training 'layer4' and 'fc'
        for name, param in model.named_parameters():
            # print('{}: {}'.format(name, param.requires_grad)) # this shows True for all the parameters
            if ('layer4' in name) or ('fc' in name):
                param.requires_grad = True
            else:
                param.requires_grad = False
-        '''

-        # '''
+    elif stage == 4:
+
        print('\n----- STAGE 4 -----') # only training 'fc'
        for name, param in model.named_parameters():
            # print('{}: {}'.format(name, param.requires_grad)) # this shows True for all the parameters
            if ('fc' in name):
                param.requires_grad = True
            else:
                param.requires_grad = False
-        # '''

+
+if not args.test:
    # checking the layers which are going to be trained (irrespective of args.resume)
    trainable_layers = []
    for name, param in model.named_parameters():
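# The four per-stage loops above differ only in which parameter-name substrings stay
# trainable. A compact alternative sketch; the helper name and the dict are illustrative,
# not part of the repo, and assume a torchvision-ResNet-style naming ('layer2'..'fc'):

STAGE_TRAINABLE = {1: ('layer2', 'layer3', 'layer4', 'fc'),
                   2: ('layer3', 'layer4', 'fc'),
                   3: ('layer4', 'fc'),
                   4: ('fc',)}

def set_trainable_layers(model, stage):
    # a parameter stays trainable only if its name mentions one of the kept blocks
    keep = STAGE_TRAINABLE[stage]
    for name, param in model.named_parameters():
        param.requires_grad = any(k in name for k in keep)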
@@ -219,20 +230,14 @@ def count_parameters(model):
print('\nwe have {} Million trainable parameters here in the {} model'.format(count_parameters(model), model.__class__.__name__))

optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr = lr)
-step_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size = 2, gamma = 0.8)
-
-if args.resume:
-    # the step_size and gamma defined here will be overwritten by the ones saved in the state_dict of the previous step_lr_scheduler
-    step_lr_scheduler.load_state_dict(ckpt['lr_scheduler_state_dict']) # this will use the state_dict of the saved lr_scheduler
-    print('\nstep_lr_scheduler.state_dict(): ', step_lr_scheduler.state_dict())

# make changes in the parameters of the following 'fit' function
-fit(XRayTrain_dataset, train_loader, val_loader,
+fit(device, XRayTrain_dataset, train_loader, val_loader,
    test_loader, model, loss_fn,
-    optimizer, step_lr_scheduler, losses_dict,
+    optimizer, losses_dict,
    epochs_till_now = epochs_till_now, epochs = 3,
-    log_interval = 5, save_interval = 1,
-    lr = lr, bs = batch_size, stage_num = 4,
+    log_interval = 25, save_interval = 1,
+    lr = lr, bs = batch_size, stage = stage,
    test_only = args.test)

script_time = time.time() - script_start_time
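# trainer.fit itself is not part of this diff. The call above implies a signature roughly
# like the skeleton below; the positional parameter names and the placeholder body are
# assumptions inferred from this script, not the repo's implementation.

def fit(device, train_dataset, train_loader, val_loader,
        test_loader, model, loss_fn,
        optimizer, losses_dict,
        epochs_till_now = 0, epochs = 3,
        log_interval = 25, save_interval = 1,
        lr = 1e-5, bs = 128, stage = 1,
        test_only = False):
    # expected behaviour, inferred from this script: move each batch to `device`,
    # train/validate for `epochs` more epochs (or only evaluate when test_only is True),
    # extend losses_dict, and every save_interval epochs torch.save a checkpoint dict
    # carrying the keys this script reads back: 'epochs', 'model' and 'losses_dict'.
    ...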
@@ -247,11 +252,11 @@ def count_parameters(model):
# epochs = 2
# ##### STAGE 2 ##### FocalLoss lr = 3e-4
# training layers = layer3, layer4, fc
-# epochs = 1
-# ##### STAGE 3 ##### FocalLoss lr = 1e-3
+# epochs = 5
+# ##### STAGE 3 ##### FocalLoss lr = 7e-4
# training layers = layer4, fc
-# epochs = 3
+# epochs = 4
# ##### STAGE 4 ##### FocalLoss lr = 1e-3
# training layers = fc
-# epochs = 2
+# epochs = 3
# '''