@@ -370,6 +370,8 @@
                    help='worker seed mode (default: all)')
 group.add_argument('--log-interval', type=int, default=50, metavar='N',
                    help='how many batches to wait before logging training status')
+group.add_argument('--val-interval', type=int, default=1, metavar='N',
+                   help='how many epochs between validation and checkpointing')
 group.add_argument('--recovery-interval', type=int, default=0, metavar='N',
                    help='how many batches to wait before writing recovery checkpoint')
 group.add_argument('--checkpoint-hist', type=int, default=10, metavar='N',
@@ -1013,6 +1015,16 @@ def main():
                     _logger.info("Distributing BatchNorm running means and vars")
                 utils.distribute_bn(model, args.world_size, args.dist_bn == 'reduce')

+            if (epoch + 1) % args.val_interval != 0:
+                if utils.is_primary(args):
+                    _logger.info("Skipping eval and checkpointing")
+                if lr_scheduler is not None:
+                    # still step the LR for the next epoch; be careful with
+                    # metric-dependent schedulers, which get no metric here
+                    lr_scheduler.step(epoch + 1, metric=None)
+                # skip validation, checkpointing, and metric logic
+                continue
+
             if loader_eval is not None:
                 eval_metrics = validate(
                     model,
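
The hunk above skips eval and checkpointing except on every `--val-interval`-th epoch, while still stepping the LR scheduler so the schedule stays on track. A minimal, self-contained sketch of that pattern — `train_one_epoch`, `run_validation`, and `VAL_INTERVAL` are illustrative stand-ins, not the script's actual helpers:

```python
# Sketch of the periodic-validation pattern (hypothetical names throughout).
VAL_INTERVAL = 2   # stand-in for args.val_interval
NUM_EPOCHS = 6

def train_one_epoch(epoch):
    print(f"train epoch {epoch}")

def run_validation(epoch):
    # placeholder for validate() plus checkpoint saving
    print(f"validate + checkpoint at epoch {epoch}")

for epoch in range(NUM_EPOCHS):
    train_one_epoch(epoch)
    if (epoch + 1) % VAL_INTERVAL != 0:
        # a real loop would still call lr_scheduler.step(epoch + 1) here;
        # metric-dependent schedulers would receive no metric on these epochs
        continue
    run_validation(epoch)
```

With `VAL_INTERVAL = 2`, validation runs after epochs 1, 3, and 5 and is skipped after 0, 2, and 4, matching the `(epoch + 1) %` arithmetic in the hunk.
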
@@ -1252,7 +1264,7 @@ def _backward(_loss):
         update_time_m.update(time.time() - update_start_time)
         update_start_time = time_now

-        if update_idx % args.log_interval == 0:
+        if update_idx % args.log_interval == 0 or last_batch:
             lrl = [param_group['lr'] for param_group in optimizer.param_groups]
             lr = sum(lrl) / len(lrl)

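
This last change also emits a log line on the final batch of every epoch, so runs whose epoch length is shorter than `--log-interval` still log at least once. A standalone sketch of the guard, with made-up names (`LOG_INTERVAL`, `loader`):

```python
# Sketch of interval-or-last-batch logging (hypothetical names).
LOG_INTERVAL = 50
loader = range(120)  # stand-in for the training data loader
num_batches = len(loader)

for update_idx, _batch in enumerate(loader):
    last_batch = update_idx == num_batches - 1
    if update_idx % LOG_INTERVAL == 0 or last_batch:
        print(f"log at batch {update_idx}")  # fires at 0, 50, 100, and 119
```
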
|
|