# coding=utf-8
"""
Main script for training and testing Mask R-CNN on the MSCOCO/DIVA/MEVA
datasets. Multi-GPU version.
"""
import argparse
import cv2
import math
import json
import random
import operator
import time
import os
import pickle
import sys
import threading
# so here won"t have poll allocator info
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# work around a while-loop bug: when you import the graph in
# multi-gpu mode, the prefix is not added to while-loop ops [tf 1.14]
# https://github.com/tensorflow/tensorflow/issues/26526
os.environ["TF_ENABLE_CONTROL_FLOW_V2"] = "1"
# remove all the annoying warnings from tf v1.10 to v1.13
import logging
logging.getLogger("tensorflow").disabled = True
import matplotlib
# avoid the warning "gdk_cursor_new_for_display:
# assertion 'GDK_IS_DISPLAY (display)' failed" with Python 3
matplotlib.use('Agg')
import tensorflow as tf
import numpy as np
import pycocotools.mask as cocomask
from pycocotools.coco import COCO
from tqdm import tqdm
from glob import glob
from models import get_model
from models import pack
from models import initialize
from trainer import Trainer
from tester import Tester
from nn import resizeImage
from nn import fill_full_mask
from utils import evalcoco
from utils import match_detection
from utils import computeAP
from utils import computeAR_2
from utils import grouper
from utils import gather_dt
from utils import gather_gt
from utils import match_dt_gt
from utils import gather_act_singles
from utils import aggregate_eval
from utils import weighted_average
from utils import parse_nvidia_smi
from utils import sec2time
from utils import Dataset
from utils import Summary
from utils import nms_wrapper
from utils import FIFO_ME
# for using a COCO model to finetune on DIVA data.
from class_ids import targetClass2id
from class_ids import targetAct2id
from class_ids import targetSingleAct2id
from class_ids import targetClass2id_mergeProp
from class_ids import targetClass2id_new
from class_ids import targetClass2id_new_nopo
from class_ids import targetAct2id_bupt
from class_ids import bupt_act_mapping
from class_ids import targetAct2id_meva
from class_ids import meva_act_mapping
from class_ids import coco_obj_class_to_id
from class_ids import coco_obj_id_to_class
from class_ids import coco_obj_to_actev_obj
targetid2class = {targetClass2id[one]:one for one in targetClass2id}
targetactid2class = {targetAct2id[one]:one for one in targetAct2id}
targetsingleactid2class = {
targetSingleAct2id[one]:one for one in targetSingleAct2id}
# coco class to DIVA class
eval_target = {
"Vehicle": ["car", "motorcycle", "bus", "truck", "vehicle"],
"Person": "person",
}
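# e.g. "car"/"bus"/"truck" detections from a COCO model are all counted
# toward the DIVA "Vehicle" category during evaluation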
eval_best = "Person" # not used anymore, we use average as the best metric
def get_args():
global targetClass2id, targetid2class
parser = argparse.ArgumentParser()
parser.add_argument("datajson")
parser.add_argument("imgpath")
parser.add_argument("--log_time_and_gpu", action="store_true")
parser.add_argument("--outbasepath", type=str, default=None,
help="full path will be outbasepath/modelname/runId")
parser.add_argument("--actoutbasepath", type=str, default=None,
help="for activity box forward only")
parser.add_argument("--train_skip", type=int, default=1,
help="when load diva train set, skip how many.")
parser.add_argument("--train_skip_offset", type=int, default=0,
help="when load diva train set, offset before skip")
parser.add_argument("--val_skip", type=int, default=1,
help="when load diva val set, skip how many.")
parser.add_argument("--val_skip_offset", type=int, default=0,
help="when load diva train set, offset before skip")
parser.add_argument("--exit_after_val", action="store_true")
parser.add_argument("--forward_skip", type=int, default=1,
help="forward, skip how many.")
parser.add_argument("--use_two_level_outpath", action="store_true")
parser.add_argument("--start_from", type=int, default=0,
help="forward, start from which batch")
parser.add_argument("--modelname", type=str, default=None)
parser.add_argument("--num_class", type=int, default=81,
help="num catagory + 1 background")
# ---- for training, show losses" moving average
parser.add_argument("--show_loss_period", type=int, default=1000)
parser.add_argument("--loss_me_step", type=int, default=100,
help="moving average queue size")
# ------ extract fpn feature of the whole image
parser.add_argument("--extract_feat", action="store_true")
parser.add_argument("--feat_path", default=None)
parser.add_argument("--just_feat", action="store_true",
help="only extract full image feature no bounding box")
# ------ do object detection and extract the fpn feature for each *final*boxes
parser.add_argument("--get_box_feat", action="store_true")
parser.add_argument("--box_feat_path", default=None)
# ---different from above, only feat no object detection
parser.add_argument("--videolst", default=None)
parser.add_argument("--skip", action="store_true", help="skip existing npy")
parser.add_argument("--tococo", action="store_true",
help="for training in diva using coco model, map diva"
" class1to1 to coco")
parser.add_argument("--diva_class", action="store_true",
help="the last layer is 16 (full) class output as "
"the diva object classes")
parser.add_argument("--diva_class2", action="store_true",
help="the last layer is new classes with person_object"
" boxes")
parser.add_argument("--diva_class3", action="store_true",
help="the last layer is new classes without person_object"
" boxes")
parser.add_argument("--is_coco_model", action="store_true")
parser.add_argument("--person_only", action="store_true")
parser.add_argument("--merge_prop", action="store_true",
help="use annotation that merged prop and "
"Push_Pulled_Object and train")
parser.add_argument("--use_bg_score", action="store_true")
# ------------activity detection
parser.add_argument("--act_as_obj", action="store_true",
help="activity box as obj box")
parser.add_argument("--add_act", action="store_true",
help="add activitiy model")
# 07/2019
parser.add_argument("--bupt_exp", action="store_true",
help="bupt activity box exp")
parser.add_argument("--meva_exp", action="store_true",
help="meva activity box exp")
parser.add_argument("--check_img_exist", action="store_true",
help="check image exists when load data")
parser.add_argument("--fix_obj_model", action="store_true",
help="fix the object detection part including rpn")
# v1:
parser.add_argument("--num_act_class", type=int, default=36,
help="num catagory + 1 background")
parser.add_argument("--fastrcnn_act_fg_ratio", default=0.25, type=float)
parser.add_argument("--act_relation_nn", action="store_true",
help="add relation link in activity fastrnn head")
parser.add_argument("--act_loss_weight", default=1.0, type=float)
# ----- activity detection version 2
parser.add_argument("--act_v2", action="store_true")
parser.add_argument("--act_single_topk", type=int, default=5,
help="each box topk classes are output")
parser.add_argument("--num_act_single_class", default=36, type=int)
parser.add_argument("--num_act_pair_class", default=21, type=int)
# ---------------------------------------------
parser.add_argument("--debug", action="store_true",
help="load fewer image for debug in training")
parser.add_argument("--runId", type=int, default=1)
  # forward mode: imgpath is the list of images
  # will output results to outbasepath
  # forward still needs a coco validation json to get the category names
  parser.add_argument("--mode", type=str, default="forward",
                      help="train | test | forward | boxfeat | givenbox"
                      " | videofeat")
parser.add_argument("--avg_feat", action="store_true",
help="for boxfeat mode, output 7x7x2048 or just "
"2048 for each box")
parser.add_argument("--boxjsonpath", default=None,
help="json contain a dict for all the boxes, imageId"
" -> boxes")
parser.add_argument("--boxfeatpath", default=None,
help="where to save the box feat path, will be a npy"
" for each image")
parser.add_argument("--boxclass", action="store_true",
help="do box classification as well")
parser.add_argument("--resnet152", action="store_true", help="")
parser.add_argument("--resnet50", action="store_true", help="")
parser.add_argument("--resnet34", action="store_true", help="")
parser.add_argument("--resnet18", action="store_true", help="")
parser.add_argument("--use_se", action="store_true",
help="use squeeze and excitation in backbone")
parser.add_argument("--use_resnext", action="store_true")
parser.add_argument("--is_fpn", action="store_true")
parser.add_argument("--use_gn", action="store_true",
help="whether to use group normalization")
parser.add_argument("--ignore_gn_vars", action="store_true",
help="add gn to previous model, will ignore loading "
"the gn var first")
parser.add_argument("--use_conv_frcnn_head", action="store_true",
help="use conv in fastrcnn head")
parser.add_argument("--use_att_frcnn_head", action="store_true",
help="use attention to sum [K, 7, 7, C] feature "
"into [K, C]")
parser.add_argument("--use_frcnn_class_agnostic", action="store_true",
help="use class agnostic fc head")
parser.add_argument("--conv_frcnn_head_dim", default=256, type=int)
parser.add_argument("--get_rpn_out", action="store_true")
parser.add_argument("--rpn_out_path", default=None)
parser.add_argument("--use_cpu_nms", action="store_true")
parser.add_argument("--no_nms", action="store_true",
help="not using nms in the end, "
"save all pre_nms_topk boxes;")
parser.add_argument("--save_all_box", action="store_true",
help="for DCR experiment, save all boxes "
"and scores in npz file")
parser.add_argument("--use_small_object_head", action="store_true")
parser.add_argument("--use_so_score_thres", action="store_true",
help="use score threshold before final nms")
parser.add_argument("--oversample_so_img", action="store_true")
parser.add_argument("--oversample_x", type=int, default=1, help="x + 1 times")
parser.add_argument("--skip_no_so_img", action="store_true")
parser.add_argument("--skip_no_object", default=None,
help="Bike, single object annotation filter")
parser.add_argument("--so_outpath", default=None)
parser.add_argument("--use_so_association", action="store_true")
parser.add_argument("--so_person_topk", type=int, default=10)
parser.add_argument("--freeze_rpn", action="store_true")
parser.add_argument("--freeze_fastrcnn", action="store_true")
parser.add_argument("--use_dilations", action="store_true",
help="use dilations=2 in res5")
parser.add_argument("--use_deformable", action="store_true",
help="use dilations=2 in res5")
parser.add_argument("--fpn_frcnn_fc_head_dim", type=int, default=1024)
parser.add_argument("--fpn_num_channel", type=int, default=256)
parser.add_argument("--freeze", type=int, default=0,
help="freeze backbone resnet until group 0|2")
parser.add_argument("--finer_resolution", action="store_true",
help="fpn use finer resolution conv")
parser.add_argument("--add_relation_nn", action="store_true",
help="add relation network feature")
parser.add_argument("--focal_loss", action="store_true",
help="use focal loss for RPN and FasterRCNN loss, "
"instead of cross entropy")
  # for test mode on the MSCOCO dataset; if this is not set,
  # our own evaluation script is used
parser.add_argument("--use_coco_eval", action="store_true")
parser.add_argument("--coco2014_to_2017", action="store_true",
help="if use the cocoval 2014 json and use val2017"
" filepath, need this option to get the correct"
" file path")
parser.add_argument("--trainlst", type=str, default=None,
help="training frame name list,")
parser.add_argument("--valframepath", type=str, default=None,
help="path to top frame path")
parser.add_argument("--annopath", type=str, default=None,
help="path to annotation, each frame.npz")
parser.add_argument("--valannopath", type=str, default=None,
help="path to annotation, each frame.npz")
parser.add_argument("--one_level_framepath", action="store_true")
parser.add_argument("--flip_image", action="store_true",
help="for training, whether to random horizontal "
"flipping for input image, maybe not for "
"surveillance video")
parser.add_argument("--add_mask", action="store_true")
parser.add_argument("--vallst", type=str, default=None,
help="validation for training")
parser.add_argument("--load", action="store_true")
parser.add_argument("--load_best", action="store_true")
parser.add_argument("--skip_first_eval", action="store_true")
parser.add_argument("--best_first", type=float, default=None)
parser.add_argument("--force_first_eval", action="store_true")
parser.add_argument("--no_skip_error", action="store_true")
parser.add_argument("--show_stat", action="store_true",
help="show data distribution only")
# use for pre-trained model
parser.add_argument("--load_from", type=str, default=None)
parser.add_argument("--ignore_vars", type=str, default=None,
help="variables to ignore, multiple seperate by : "
"like: logits/W:logits/b, this var only need to "
"be var name's sub string to ignore")
parser.add_argument("--print_params", action="store_true",
help="print params and then exit")
parser.add_argument("--show_restore", action="store_true",
help="load from existing model (npz), show the"
" weight that is restored")
# -------------------- save model for deployment
parser.add_argument("--is_pack_model", action="store_true", default=False,
help="with is_test, this will pack the model to a path"
" instead of testing")
parser.add_argument("--pack_model_path", type=str, default=None,
help="path to save model, a .pb file")
parser.add_argument("--note", type=str, default=None,
help="leave a note for this packed model for"
" future reference")
parser.add_argument("--pack_modelconfig_path", type=str, default=None,
help="json file to save the config and note")
  # forward with frozen graph
parser.add_argument("--is_load_from_pb", action="store_true")
# for efficientdet
parser.add_argument("--is_efficientdet", action="store_true")
parser.add_argument("--efficientdet_modelname", default="efficientdet-d0")
parser.add_argument("--efficientdet_max_detection_topk", type=int,
default=5000, help="#topk boxes before NMS")
parser.add_argument("--efficientdet_min_level", type=int, default=3)
parser.add_argument("--efficientdet_max_level", type=int, default=7)
# ------------------------------------ model specifics
# ----------------------------------training detail
parser.add_argument("--use_all_mem", action="store_true")
parser.add_argument("--im_batch_size", type=int, default=1)
parser.add_argument("--rpn_batch_size", type=int, default=256,
help="num roi per image for RPN training")
parser.add_argument("--frcnn_batch_size", type=int, default=512,
help="num roi per image for fastRCNN training")
parser.add_argument("--rpn_test_post_nms_topk", type=int, default=1000,
help="test post nms, input to fast rcnn")
  # fastrcnn output NMS suppresses iou >= this thres
parser.add_argument("--fastrcnn_nms_iou_thres", type=float, default=0.5)
parser.add_argument("--max_size", type=int, default=1333,
help="num roi per image for RPN and fastRCNN training")
parser.add_argument("--short_edge_size", type=int, default=800,
help="num roi per image for RPN and fastRCNN training")
parser.add_argument("--scale_jitter", action="store_true",
help="if set this, will random get int from min to max"
" to resize image;original param will still be used"
" in testing")
parser.add_argument("--short_edge_size_min", type=int, default=640,
help="num roi per image for RPN and fastRCNN training")
parser.add_argument("--short_edge_size_max", type=int, default=800,
help="num roi per image for RPN and fastRCNN training")
# ------------------------------mixup training
parser.add_argument("--use_mixup", action="store_true")
parser.add_argument("--use_constant_mixup_weight", action="store_true")
parser.add_argument("--mixup_constant_weight", type=float, default=0.5)
parser.add_argument("--mixup_chance", type=float, default=0.5,
help="the possibility of using mixup")
parser.add_argument("--max_mixup_per_frame", type=int, default=15)
# not used for fpn
parser.add_argument("--small_anchor_exp", action="store_true")
parser.add_argument("--positive_anchor_thres", default=0.7, type=float)
parser.add_argument("--negative_anchor_thres", default=0.3, type=float)
parser.add_argument("--fastrcnn_fg_ratio", default=0.25, type=float)
parser.add_argument("--gpu", default=1, type=int, help="number of gpu")
parser.add_argument("--gpuid_start", default=0, type=int,
help="start of gpu id")
parser.add_argument("--model_per_gpu", default=1, type=int,
help="it will be set as a /task:k in device")
parser.add_argument("--controller", default="/cpu:0",
help="controller for multigpu training")
#parser.add_argument("--num_step",type=int,default=360000)
parser.add_argument("--num_epochs", type=int, default=12)
parser.add_argument("--save_period", type=int, default=5000,
help="num steps to save model and eval")
# drop out rate
parser.add_argument("--keep_prob", default=1.0, type=float,
help="1.0 - drop out rate;remember to set it to 1.0 "
"in eval")
# l2 weight decay
parser.add_argument("--wd", default=None, type=float) # 0.0001
parser.add_argument("--init_lr", default=0.1, type=float,
help=("start learning rate"))
parser.add_argument("--use_lr_decay", action="store_true")
parser.add_argument("--learning_rate_decay", default=0.94, type=float,
help=("learning rate decay"))
parser.add_argument("--num_epoch_per_decay", default=2.0, type=float,
help=("how epoch after which lr decay"))
parser.add_argument("--use_cosine_schedule", action="store_true")
parser.add_argument("--use_exp_schedule", action="store_true")
parser.add_argument("--warm_up_steps", default=3000, type=int,
help=("warm up steps not epochs"))
parser.add_argument("--same_lr_steps", default=0, type=int,
help=("after warm up, keep the init_lr for k steps"))
parser.add_argument("--optimizer", default="adam", type=str,
help="optimizer: adam/adadelta")
parser.add_argument("--momentum", default=0.9, type=float)
parser.add_argument("--result_score_thres", default=0.0001, type=float)
parser.add_argument("--result_per_im", default=100, type=int)
# clipping, suggest 100.0
parser.add_argument("--clip_gradient_norm", default=None, type=float,
help=("norm to clip gradient to"))
# for debug
parser.add_argument("--vis_pre", action="store_true",
help="visualize preprocess images")
parser.add_argument("--vis_path", default=None)
# for efficient use of COCO model classes
parser.add_argument("--use_partial_classes", action="store_true")
parser.add_argument("--is_multi", action="store_true",
help="use multi-img batch model")
args = parser.parse_args()
if args.use_cosine_schedule:
args.use_lr_decay = True
if args.use_exp_schedule:
args.use_lr_decay = True
args.use_cosine_schedule = False
if args.save_all_box:
args.no_nms = True
if args.no_nms:
    args.use_cpu_nms = True  # so as to avoid using TF nms in the graph
#assert args.model_per_gpu == 1, "not work yet!"
#assert args.gpu*args.model_per_gpu == args.im_batch_size # one gpu one image
#args.controller = "/cpu:0" # parameter server
args.small_objects = ["Prop", "Push_Pulled_Object",
"Prop_plus_Push_Pulled_Object", "Bike"]
if args.use_small_object_head:
assert args.merge_prop
args.so_eval_target = {c:1 for c in args.small_objects}
args.small_objects_targetClass2id = {
c: i for i, c in enumerate(["BG"] + args.small_objects)}
args.small_objects_targetid2class = {
args.small_objects_targetClass2id[one]: one
for one in args.small_objects_targetClass2id}
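    # e.g. the resulting mapping is {"BG": 0, "Prop": 1,
    # "Push_Pulled_Object": 2, "Prop_plus_Push_Pulled_Object": 3, "Bike": 4}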
if args.merge_prop:
targetClass2id = targetClass2id_mergeProp
targetid2class = {targetClass2id_mergeProp[one]:one
for one in targetClass2id_mergeProp}
if args.diva_class2:
targetClass2id = targetClass2id_new
targetid2class = {targetClass2id_new[one]:one for one in targetClass2id_new}
if args.diva_class3:
targetClass2id = targetClass2id_new_nopo
targetid2class = {targetClass2id_new_nopo[one]:one
for one in targetClass2id_new_nopo}
args.classname2id = targetClass2id
args.classid2name = targetid2class
if args.act_as_obj:
    # replace the obj classes with activity classes
targetClass2id = targetAct2id
targetid2class = {targetAct2id[one]:one for one in targetAct2id}
if args.bupt_exp:
args.diva_class = True
args.act_as_obj = True
targetClass2id = targetAct2id_bupt
targetid2class = {targetAct2id_bupt[one]:one for one in targetAct2id_bupt}
if args.meva_exp:
args.diva_class = True
args.act_as_obj = True
targetClass2id = targetAct2id_meva
targetid2class = {targetAct2id_meva[one]:one for one in targetAct2id_meva}
if args.is_coco_model:
#assert args.mode == "forward" or args.mode == "pack"
args.diva_class = False
targetClass2id = coco_obj_class_to_id
targetid2class = coco_obj_id_to_class
if args.person_only:
targetid2class = {0: "BG", 1: "person"}
targetClass2id = {"BG": 0, "person": 1}
if args.use_partial_classes:
assert args.is_coco_model
args.partial_classes = [classname for classname in coco_obj_to_actev_obj]
args.classname2id = targetClass2id
args.classid2name = targetid2class
if not args.tococo:
assert len(targetid2class) == args.num_class
if not args.tococo and ((args.mode == "train") or (args.mode == "test")):
assert args.num_class == len(targetid2class.keys())
args.class_names = targetClass2id.keys()
if args.vis_pre:
assert args.vis_path is not None
if not os.path.exists(args.vis_path):
os.makedirs(args.vis_path)
if args.add_act and (args.mode == "forward"):
assert args.actoutbasepath is not None
mkdir(args.actoutbasepath)
if args.outbasepath is not None:
mkdir(args.outbasepath)
if args.skip_first_eval:
assert args.best_first is not None
if (args.outbasepath is not None) and (args.modelname is not None):
args.outpath = os.path.join(args.outbasepath,
args.modelname,
str(args.runId).zfill(2))
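    # e.g. outbasepath="output", modelname="maskrcnn", runId=1
    # -> args.outpath == "output/maskrcnn/01"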
args.save_dir = os.path.join(args.outpath, "save")
args.save_dir_best = os.path.join(args.outpath, "save-best")
args.write_self_sum = True
args.self_summary_path = os.path.join(args.outpath, "train_sum.txt")
    # path to save each validation step's performance and loss
args.stats_path = os.path.join(args.outpath, "stats.json")
args.mrcnn_head_dim = 256
args.no_obj_detect = False
if args.mode == "videofeat":
args.no_obj_detect = True
args.anchor_stride = 16 # has to be 16 to match the image feature total stride
args.anchor_sizes = (32, 64, 128, 256, 512)
if args.small_anchor_exp:
args.anchor_sizes = (16, 32, 64, 96, 128, 256) # not used for fpn
if args.is_fpn:
args.anchor_strides = (4, 8, 16, 32, 64)
    # we will pad H,W to be a multiple of 32
    # anchor_strides[3] is 32, since there is a total stride of 2x2x2x2x2
args.fpn_resolution_requirement = float(args.anchor_strides[3])
if args.is_efficientdet:
args.fpn_resolution_requirement = 128.0 # 2 ** max_level
args.short_edge_size = np.ceil(
args.short_edge_size / args.fpn_resolution_requirement) * \
args.fpn_resolution_requirement
args.max_size = np.ceil(args.max_size / args.fpn_resolution_requirement) \
* args.fpn_resolution_requirement
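    # e.g. with fpn_resolution_requirement=32.0, short_edge_size=800 stays
    # 800, while 900 would be rounded up to ceil(900 / 32) * 32 = 928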
#args.fpn_num_channel = 256
#args.fpn_frcnn_fc_head_dim = 1024
if args.load_best:
args.load = True
if args.load_from is not None:
args.load = True
if args.mode == "train":
assert args.outbasepath is not None
assert args.modelname is not None
args.is_train = True
mkdir(args.save_dir)
mkdir(args.save_dir_best)
else:
args.is_train = False
args.num_epochs = 1
if args.get_rpn_out:
if not os.path.exists(args.rpn_out_path):
os.makedirs(args.rpn_out_path)
# ---- all the mask rcnn config
args.resnet_num_block = [3, 4, 23, 3] # resnet 101
args.use_basic_block = False # for resnet-34 and resnet-18
if args.resnet152:
args.resnet_num_block = [3, 8, 36, 3]
if args.resnet50:
args.resnet_num_block = [3, 4, 6, 3]
if args.resnet34:
args.resnet_num_block = [3, 4, 6, 3]
args.use_basic_block = True
if args.resnet18:
args.resnet_num_block = [2, 2, 2, 2]
args.use_basic_block = True
#args.short_edge_size = 800
#args.max_size = 1333
args.anchor_ratios = (0.5, 1, 2)
args.num_anchors = len(args.anchor_sizes) * len(args.anchor_ratios)
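  # e.g. 5 anchor sizes x 3 ratios = 15 anchors per location for the
  # single-level RPN (FPN spreads the sizes across its pyramid levels)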
# iou thres to determine anchor label
#args.positive_anchor_thres = 0.7
#args.negative_anchor_thres = 0.3
# when getting region proposal, avoid getting too large boxes
args.bbox_decode_clip = np.log(args.max_size / 16.0)
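  # e.g. log(1333 / 16.0) ~= 4.42, so a decoded box side can grow at most
  # exp(4.42) ~= 83x over its anchor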
# RPN training
args.rpn_fg_ratio = 0.5
args.rpn_batch_per_im = args.rpn_batch_size
args.rpn_min_size = 0 # 8?
args.rpn_proposal_nms_thres = 0.7
args.rpn_train_pre_nms_topk = 12000 # not used in fpn
  args.rpn_train_post_nms_topk = 2000  # this is used for fpn_nms_pre
# fastrcnn
args.fastrcnn_batch_per_im = args.frcnn_batch_size
args.fastrcnn_bbox_reg_weights = np.array([10, 10, 5, 5], dtype="float32")
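  # scaling for the (dx, dy, dw, dh) regression targets; (10, 10, 5, 5) is
  # the common Detectron-style setting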
#args.fastrcnn_bbox_reg_weights = np.array([20, 20, 10, 10], dtype="float32")
args.fastrcnn_fg_thres = 0.5 # iou thres
#args.fastrcnn_fg_ratio = 0.25 # 1:3 -> pos:neg
# testing
args.rpn_test_pre_nms_topk = 6000
#args.rpn_test_post_nms_topk = 700 #1300 # 700 takes 40 hours, # OOM at 1722,28,28,1024 # 800 OOM for gpu4
#args.fastrcnn_nms_thres = 0.5
#args.fastrcnn_nms_iou_thres = 0.5 # 0.3 is worse
#args.result_score_thres = 0.0001
#args.result_per_im = 100 # 400 # 100
if args.focal_loss and args.clip_gradient_norm is None:
print("Focal loss needs gradient clipping or will have NaN loss")
sys.exit()
return args
def add_coco(config, datajson):
  coco = COCO(datajson)
  cat_ids = coco.getCatIds()  # [80], each is 1-90
  cat_names = [c["name"] for c in coco.loadCats(cat_ids)]  # [80]
config.classId_to_cocoId = {(i+1): v for i, v in enumerate(cat_ids)}
config.class_names = ["BG"] + cat_names
# 0-80
config.class_to_classId = {c:i for i, c in enumerate(config.class_names)}
config.classId_to_class = {i:c for i, c in enumerate(config.class_names)}
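  # e.g. with the standard 80-class COCO json, classId_to_cocoId[1] == 1
  # ("person") and classId_to_cocoId[80] == 90 ("toothbrush")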
# load all ground truth into memory
def read_data_diva(config, idlst, framepath, annopath, tococo=False,
randp=None, is_train=False, one_level_framepath=False):
assert idlst is not None
assert framepath is not None
assert annopath is not None
assert len(targetid2class.keys()) == config.num_class
  # load the coco class name to classId mapping so we can convert label
  # names to label classIds
if tococo:
add_coco(config, config.datajson)
imgs = [os.path.splitext(os.path.basename(line.strip()))[0]
for line in open(idlst, "r").readlines()]
if randp is not None:
imgs = random.sample(imgs, int(len(imgs)*randp))
data = {"imgs":[], "gt":[]}
if config.use_mixup and is_train:
data["mixup_weights"] = []
print("loading data..")
if config.print_params:
imgs = imgs[:100]
  # in the diva dataset, some classes may be ignored
ignored_classes = {}
targetClass2exist = {classname:0 for classname in targetClass2id}
num_empty_actboxes = 0
targetAct2exist = {classname:0 for classname in targetAct2id}
ignored_act_classes = {}
num_empty_single_actboxes = 0
ignored_single_act_classes = {}
targetAct2exist_single = {classname:0 for classname in targetSingleAct2id}
act_single_fgratio = []
if config.debug:
imgs = imgs[:1000]
if (config.train_skip > 1) and is_train:
imgs.sort()
ori_num = len(imgs)
imgs = imgs[config.train_skip_offset::config.train_skip]
print("skipping [%s::%s], got %s/%s" % (
config.train_skip_offset, config.train_skip, len(imgs), ori_num))
if (config.val_skip > 1) and not is_train:
imgs.sort()
ori_num = len(imgs)
imgs = imgs[config.val_skip_offset::config.val_skip]
print("skipping [%s::%s], got %s/%s" % (
config.val_skip_offset, config.val_skip, len(imgs), ori_num))
  # get stats for each img, e.g. the label distribution
  # class -> [] num_box in each image
label_dist = {classname:[] for classname in targetClass2id}
label_dist_all = []
for img in tqdm(imgs, ascii=True, smoothing=0.5):
anno = os.path.join(annopath, "%s.npz"%img)
videoname = img.strip().split("_F_")[0]
if not os.path.exists(anno):
continue
if config.check_img_exist:
if not os.path.exists(os.path.join(framepath, videoname, "%s.jpg"%img)):
continue
anno = dict(np.load(anno, allow_pickle=True)) # "boxes" -> [K,4]
# boxes are x1,y1,x2,y2
original_box_num = len(anno["boxes"])
# feed act box as object boxes
if config.act_as_obj:
anno["labels"] = anno["actlabels"]
anno["boxes"] = anno["actboxes"]
# labels are one word, diva classname
labels = []
boxes = []
no_so_box = True
no_object = True
for i, classname in enumerate(list(anno["labels"])):
if classname in targetClass2id or (
config.bupt_exp and classname in bupt_act_mapping) or (
config.meva_exp and classname in meva_act_mapping):
if config.bupt_exp and classname in bupt_act_mapping:
classname = bupt_act_mapping[classname]
if config.meva_exp and classname in meva_act_mapping:
classname = meva_act_mapping[classname]
targetClass2exist[classname] = 1
labels.append(targetClass2id[classname])
boxes.append(anno["boxes"][i])
else:
ignored_classes[classname] = 1
if classname in config.small_objects:
no_so_box = False
if config.skip_no_object is not None:
if classname == config.skip_no_object:
no_object = False
if config.use_mixup and is_train:
mixup_boxes = []
mixup_labels = []
for i, classname in enumerate(
list(anno["mixup_labels"])[:config.max_mixup_per_frame]):
if classname in targetClass2id:
          # not adding them now; they may be added at run time
#labels.append(targetClass2id[classname])
#boxes.append(anno["mixup_boxes"][i])
mixup_boxes.append(anno["mixup_boxes"][i])
mixup_labels.append(targetClass2id[classname])
anno["mixup_boxes"] = np.array(mixup_boxes, dtype="float32")
anno["mixup_labels"] = mixup_labels
anno["boxes"] = np.array(boxes, dtype="float32")
anno["labels"] = labels
#assert len(anno["boxes"]) > 0
if len(anno["boxes"]) == 0:
continue
if config.skip_no_so_img and is_train:
if no_so_box:
continue
if config.skip_no_object and is_train:
if no_object:
continue
assert len(anno["labels"]) == len(anno["boxes"]), (
anno["labels"], anno["boxes"])
assert anno["boxes"].dtype == np.float32
if config.oversample_so_img and is_train and not no_so_box:
for i in range(config.oversample_x):
data["imgs"].append(os.path.join(framepath, videoname, "%s.jpg"%img))
data["gt"].append(anno)
    # statistics
if config.show_stat:
for classname in label_dist:
num_box_this_img = len(
[l for l in labels if l == targetClass2id[classname]])
label_dist[classname].append(num_box_this_img)
label_dist_all.append(len(labels))
if config.add_act:
# for activity anno, we couldn"t remove any of the boxes
assert len(anno["boxes"]) == original_box_num
if config.act_v2:
# make multi class labels
# BG class is at index 0
K = len(anno["boxes"])
actSingleLabels = np.zeros((K, config.num_act_single_class),
dtype="uint8")
# use this to mark BG
hasClass = np.zeros((K), dtype="bool")
for i, classname in enumerate(list(anno["actSingleLabels"])):
if classname in targetSingleAct2id:
targetAct2exist_single[classname] = 1
act_id = targetSingleAct2id[classname]
box_id = anno["actSingleIdxs"][i]
assert box_id >= 0 and box_id < K
actSingleLabels[box_id, act_id] = 1
hasClass[box_id] = True
else:
ignored_single_act_classes[classname] = 1
        # mark BG for boxes that have no activity annotation
actSingleLabels[np.logical_not(hasClass), 0] = 1
anno["actSingleLabels_npy"] = actSingleLabels
# compute the BG vs FG ratio for the activity boxes
act_single_fgratio.append(sum(hasClass)/float(K))
if sum(hasClass) == 0:
num_empty_single_actboxes += 1
continue
else:
act_labels = []
act_good_ids = []
for i, classname in enumerate(list(anno["actlabels"])):
if classname in targetAct2id:
targetAct2exist[classname] = 1
act_labels.append(targetAct2id[classname])
act_good_ids.append(i)
else:
ignored_act_classes[classname] = 1
#print anno["actboxes"].shape
if anno["actboxes"].shape[0] == 0:# ignore this image
num_empty_actboxes += 1
continue
anno["actboxes"] = anno["actboxes"][act_good_ids]
        # it is an npy array of python lists, so no ":" slicing
anno["actboxidxs"] = anno["actboxidxs"][act_good_ids]
anno["actlabels"] = act_labels
assert len(anno["actboxes"]) == len(anno["actlabels"])
if config.use_mixup and is_train:
      # the training lst and annotations are framename_M_framename.npz files
framename1, framename2 = img.strip().split("_M_")
videoname1 = framename1.strip().split("_F_")[0]
videoname2 = framename2.strip().split("_F_")[0]
data["imgs"].append(
(os.path.join(framepath, videoname1, "%s.jpg"%framename1),
os.path.join(framepath, videoname2, "%s.jpg"%framename2)))
data["gt"].append(anno)
weight = np.random.beta(1.5, 1.5)
if config.use_constant_mixup_weight:
weight = config.mixup_constant_weight
data["mixup_weights"].append(weight)
else:
if one_level_framepath:
data["imgs"].append(os.path.join(framepath, "%s.jpg"%img))
else:
data["imgs"].append(os.path.join(framepath, videoname, "%s.jpg"%img))
data["gt"].append(anno)
print("loaded %s/%s data" % (len(data["imgs"]), len(imgs)))
if config.show_stat:
for classname in label_dist:
d = label_dist[classname]
ratios = [a/float(b) for a, b in zip(d, label_dist_all)]
print("%s, [%s - %s], median %s per img, ratio:[%.3f - %.3f], "
"median %.3f, no label %s/%s [%.3f]" % (
classname, min(d), max(d), np.median(d), min(ratios),
max(ratios),
np.median(ratios), len([i for i in d if i == 0]), len(d),
len([i for i in d if i == 0])/float(len(d))))
print("each img has boxes: [%s - %s], median %s" % (
min(label_dist_all), max(label_dist_all), np.median(label_dist_all)))
if ignored_classes:
print("ignored %s " % (ignored_classes.keys()))
noDataClasses = [classname for classname in targetClass2exist
if targetClass2exist[classname] == 0]
if noDataClasses:
print("warning: class data not exists: %s, AR will be 1.0 for these" % (
noDataClasses))
if config.add_act:
if config.act_v2:
print(" each frame positive act box percentage min %.4f, max %.4f, "
"mean %.4f" % (
min(act_single_fgratio), max(act_single_fgratio),
np.mean(act_single_fgratio)))
if ignored_single_act_classes:
print("ignored activity %s" % (ignored_single_act_classes.keys()))
print("%s/%s has no single activity boxes" % (
num_empty_single_actboxes, len(data["imgs"])))
noDataClasses = [classname for classname in targetAct2exist_single
if targetAct2exist_single[classname] == 0]