# Copyright (c) Shanghai AI Lab. All rights reserved.
# Inherit model, dataset, runtime and 40k-iteration schedule defaults.
_base_ = [
    '../_base_/models/mask2former_beit_cocostuff.py',
    '../_base_/datasets/coco-stuff10k.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_40k.py'
]
# Crop resolution used for training and sliding-window inference (h, w).
crop_size = (512, 512)
# Original download location of the BEiT-large ImageNet-22k checkpoint:
# pretrained = 'https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth'
pretrained = 'pretrained/beit_large_patch16_224_pt22k_ft22k.pth'
# BEiT-large ViT-Adapter backbone feeding a Mask2Former-style decode head.
model = dict(
    pretrained=pretrained,
    backbone=dict(
        type='BEiTAdapter',
        # Plain ViT settings (BEiT-large: 24 blocks, 1024-dim, 16 heads).
        img_size=512,
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        use_abs_pos_emb=False,
        use_rel_pos_bias=True,
        init_values=1e-6,
        drop_path_rate=0.3,
        # Adapter-specific settings (spatial prior + deformable interaction).
        conv_inplane=64,
        n_points=4,
        deform_num_heads=16,
        deform_ratio=0.5,
        cffn_ratio=0.25,
        # The 24 transformer blocks are grouped into four interaction stages.
        interaction_indexes=[[0, 5], [6, 11], [12, 17], [18, 23]],
    ),
    decode_head=dict(
        # Four multi-scale feature maps from the adapter, all 1024-channel.
        in_channels=[1024, 1024, 1024, 1024],
        feat_channels=1024,
        out_channels=1024,
        num_queries=100,
        pixel_decoder=dict(
            type='MSDeformAttnPixelDecoder',
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=1024,
                        num_heads=32,
                        num_levels=3,
                        num_points=4,
                        im2col_step=64,
                        dropout=0.0,
                        batch_first=False,
                        norm_cfg=None,
                        init_cfg=None),
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=1024,
                        feedforward_channels=4096,
                        num_fcs=2,
                        ffn_drop=0.0,
                        act_cfg=dict(type='ReLU', inplace=True)),
                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
                init_cfg=None),
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=512, normalize=True),
            init_cfg=None),
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=512, normalize=True),
        transformer_decoder=dict(
            type='DetrTransformerDecoder',
            return_intermediate=True,
            num_layers=9,
            transformerlayers=dict(
                type='DetrTransformerDecoderLayer',
                attn_cfgs=dict(
                    type='MultiheadAttention',
                    embed_dims=1024,
                    num_heads=32,
                    attn_drop=0.0,
                    proj_drop=0.0,
                    dropout_layer=None,
                    batch_first=False),
                ffn_cfgs=dict(
                    embed_dims=1024,
                    feedforward_channels=4096,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.0,
                    dropout_layer=None,
                    add_identity=True),
                feedforward_channels=4096,
                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
                                 'ffn', 'norm')),
            init_cfg=None)
    ),
    # Sliding-window inference with 1/3 overlap between adjacent windows.
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341))
)
# dataset settings
# ImageNet mean/std; to_rgb=True converts the loaded BGR image to RGB first.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# Training pipeline: scale jitter, random crop/flip, photometric
# augmentation, then per-category mask generation for the mask-based head.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='ToMask'),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg', 'gt_masks', 'gt_labels'])
]
# Test pipeline: single-scale evaluation; multi-scale/flip TTA is disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='ResizeToMultiple', size_divisor=32),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
# AdamW with layer-wise learning-rate decay over the 24 backbone blocks.
# _delete_=True discards the optimizer inherited from the base schedule.
optimizer = dict(
    _delete_=True,
    type='AdamW',
    lr=2e-5,
    betas=(0.9, 0.999),
    weight_decay=0.05,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(num_layers=24, layer_decay_rate=0.90))
# Per-iteration polynomial LR decay preceded by a linear warmup;
# _delete_=True replaces the schedule inherited from the base config.
lr_config = dict(
    _delete_=True,
    policy='poly',
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=1e-6,
    power=1.0,
    min_lr=0.0,
    by_epoch=False)
# Batch of 2 images per GPU; wire the pipelines above into each split.
data = dict(
    samples_per_gpu=2,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
# Iteration-based training loop (max_iters comes from the 40k base schedule).
runner = dict(type='IterBasedRunner')
# Save a checkpoint every 1000 iterations, keeping only the most recent one.
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
# Evaluate mIoU every 4000 iterations and keep the best-scoring checkpoint.
evaluation = dict(interval=4000, metric='mIoU', save_best='mIoU')