From 96d20d38c91b3609bbb7ed4d99a94285c0a48c5b Mon Sep 17 00:00:00 2001 From: makecent <42603768+makecent@users.noreply.github.com> Date: Tue, 15 Aug 2023 15:58:06 +0800 Subject: [PATCH] [Enhance] Support 2D&3D Optical Flow Training (#2631) --- ..._r50_8xb16-16x4x1-256e_kinetics400-flow.py | 146 ++++++++++++++++++ ...d-r50_8xb32_5x1x3-110e_kinetics400-flow.py | 141 +++++++++++++++++ mmaction/datasets/transforms/formatting.py | 40 ++--- mmaction/datasets/transforms/loading.py | 8 +- tests/datasets/transforms/test_formating.py | 16 +- tests/datasets/transforms/test_loading.py | 8 +- 6 files changed, 314 insertions(+), 45 deletions(-) create mode 100644 configs/recognition/slowonly/slowonly_r50_8xb16-16x4x1-256e_kinetics400-flow.py create mode 100644 configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32_5x1x3-110e_kinetics400-flow.py diff --git a/configs/recognition/slowonly/slowonly_r50_8xb16-16x4x1-256e_kinetics400-flow.py b/configs/recognition/slowonly/slowonly_r50_8xb16-16x4x1-256e_kinetics400-flow.py new file mode 100644 index 0000000000..92221d9e97 --- /dev/null +++ b/configs/recognition/slowonly/slowonly_r50_8xb16-16x4x1-256e_kinetics400-flow.py @@ -0,0 +1,146 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained=None, + lateral=False, + in_channels=2, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + cls_head=dict( + type='I3DHead', + in_channels=2048, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[128, 128], + std=[128, 128], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_flow.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_flow.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_flow.txt' +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=2, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=10, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=256, val_begin=1, val_interval=8) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=34), + dict( + type='CosineAnnealingLR', + T_max=222, + eta_min=0, + by_epoch=True, + begin=34, + end=256) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +# runtime settings +default_hooks = dict(checkpoint=dict(interval=8, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32_5x1x3-110e_kinetics400-flow.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32_5x1x3-110e_kinetics400-flow.py new file mode 100644 index 0000000000..a25eb31334 --- /dev/null +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32_5x1x3-110e_kinetics400-flow.py @@ -0,0 +1,141 @@ +_base_ = '../../_base_/default_runtime.py' + +clip_len = 5 + +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNet', + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + depth=50, + in_channels=2 * clip_len, # ``in_channels`` should be 2 * clip_len + norm_eval=False), + cls_head=dict( + type='TSNHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.4, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[128, 128] * clip_len, # ``in_channels`` should be 2 * clip_len + std=[128, 128] * clip_len, # ``in_channels`` should be 2 * clip_len + format_shape='NCHW')) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_flow.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_flow.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_flow.txt' +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict( + type='SampleFrames', clip_len=clip_len, frame_interval=1, num_clips=3), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=clip_len, + frame_interval=1, + num_clips=3, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=clip_len, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=110, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=110, + by_epoch=True, + milestones=[70, 100], + gamma=0.1) +] + +default_hooks = dict(checkpoint=dict(interval=5, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) diff --git a/mmaction/datasets/transforms/formatting.py b/mmaction/datasets/transforms/formatting.py index 9b9cb375a9..168509be30 100644 --- a/mmaction/datasets/transforms/formatting.py +++ b/mmaction/datasets/transforms/formatting.py @@ -204,16 +204,20 @@ class FormatShape(BaseTransform): """Format final imgs shape to the given input_format. Required keys: + - imgs (optional) - heatmap_imgs (optional) + - modality (optional) - num_clips - clip_len Modified Keys: - - imgs (optional) - - input_shape (optional) + + - imgs Added Keys: + + - input_shape - heatmap_input_shape (optional) Args: @@ -227,7 +231,7 @@ def __init__(self, input_format: str, collapse: bool = False) -> None: self.input_format = input_format self.collapse = collapse if self.input_format not in [ - 'NCTHW', 'NCHW', 'NCHW_Flow', 'NCTHW_Heatmap', 'NPTCHW' + 'NCTHW', 'NCHW', 'NCTHW_Heatmap', 'NPTCHW' ]: raise ValueError( f'The input format {self.input_format} is invalid.') @@ -300,36 +304,14 @@ def transform(self, results: Dict) -> Dict: elif self.input_format == 'NCHW': imgs = results['imgs'] imgs = np.transpose(imgs, (0, 3, 1, 2)) + if 'modality' in results and results['modality'] == 'Flow': + clip_len = results['clip_len'] + imgs = imgs.reshape((-1, clip_len * imgs.shape[1]) + + imgs.shape[2:]) # M x C x H x W results['imgs'] = imgs results['input_shape'] = imgs.shape - elif self.input_format == 'NCHW_Flow': - num_imgs = len(results['imgs']) - assert num_imgs % 2 == 0 - n = num_imgs // 2 - h, w = results['imgs'][0].shape - x_flow = np.empty((n, h, w), dtype=np.float32) - y_flow = np.empty((n, h, w), dtype=np.float32) - for i in range(n): - x_flow[i] = results['imgs'][2 * i] - y_flow[i] = results['imgs'][2 * i + 1] - imgs = np.stack([x_flow, y_flow], axis=-1) - - num_clips = results['num_clips'] - clip_len = results['clip_len'] - imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) - # N_crops x N_clips x T x H x W x C - imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4)) - # N_crops x N_clips x T x C x H x W - imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) + - imgs.shape[4:]) - # M' x C' x H x W - # M' = N_crops x N_clips - # C' = T x C - results['imgs'] = imgs - results['input_shape'] = imgs.shape - elif self.input_format == 'NPTCHW': num_proposals = results['num_proposals'] num_clips = results['num_clips'] diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index 22070371a1..8d789ab4c3 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -1418,11 +1418,7 @@ def transform(self, results: dict) -> dict: for i, frame_idx in enumerate(results['frame_inds']): # Avoid loading duplicated frames if frame_idx in cache: - if modality == 'RGB': - imgs.append(cp.deepcopy(imgs[cache[frame_idx]])) - else: - imgs.append(cp.deepcopy(imgs[2 * cache[frame_idx]])) - imgs.append(cp.deepcopy(imgs[2 * cache[frame_idx] + 1])) + imgs.append(cp.deepcopy(imgs[cache[frame_idx]])) continue else: cache[frame_idx] = i @@ -1443,7 +1439,7 @@ def transform(self, results: dict) -> dict: x_frame = mmcv.imfrombytes(x_img_bytes, flag='grayscale') y_img_bytes = self.file_client.get(y_filepath) y_frame = mmcv.imfrombytes(y_img_bytes, flag='grayscale') - imgs.extend([x_frame, y_frame]) + imgs.append(np.stack([x_frame, y_frame], axis=-1)) else: raise NotImplementedError diff --git a/tests/datasets/transforms/test_formating.py b/tests/datasets/transforms/test_formating.py index 4668732746..e12a1a95d7 100644 --- a/tests/datasets/transforms/test_formating.py +++ b/tests/datasets/transforms/test_formating.py @@ -191,12 +191,21 @@ def test_format_shape(): # invalid input format FormatShape('NHWC') - # 'NCHW' input format + # 'NCHW' input format (RGB Modality) results = dict( imgs=np.random.randn(3, 224, 224, 3), num_clips=1, clip_len=3) format_shape = FormatShape('NCHW') assert format_shape(results)['input_shape'] == (3, 3, 224, 224) + # `NCHW` input format (Flow Modality) + results = dict( + imgs=np.random.randn(3, 224, 224, 2), + num_clips=1, + clip_len=3, + modality='Flow') + format_shape = FormatShape('NCHW') + assert format_shape(results)['input_shape'] == (1, 6, 224, 224) + # `NCTHW` input format with num_clips=1, clip_len=3 results = dict( imgs=np.random.randn(3, 224, 224, 3), num_clips=1, clip_len=3) @@ -229,11 +238,6 @@ def test_format_shape(): format_shape = FormatShape('NCTHW_Heatmap') assert format_shape(results)['input_shape'] == (2, 17, 6, 56, 56) - # `NCHW_Flow` input format - results = dict(imgs=np.random.randn(6, 224, 224), num_clips=1, clip_len=3) - format_shape = FormatShape('NCHW_Flow') - assert format_shape(results)['input_shape'] == (1, 6, 224, 224) - # `NPTCHW` input format results = dict( imgs=np.random.randn(72, 224, 224, 3), diff --git a/tests/datasets/transforms/test_loading.py b/tests/datasets/transforms/test_loading.py index ee2cc64717..888c993fd5 100644 --- a/tests/datasets/transforms/test_loading.py +++ b/tests/datasets/transforms/test_loading.py @@ -486,8 +486,8 @@ def test_rawframe_decode(self): frame_selector = RawFrameDecode(io_backend='disk') results = frame_selector(inputs) assert assert_dict_has_keys(results, target_keys) - assert np.shape(results['imgs']) == (len(inputs['frame_inds']) * 2, - 240, 320) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 2) assert results['original_shape'] == (240, 320) # test frame selector with 1 dim input for flow images @@ -496,8 +496,8 @@ def test_rawframe_decode(self): frame_selector = RawFrameDecode(io_backend='disk') results = frame_selector(inputs) assert assert_dict_has_keys(results, target_keys) - assert np.shape(results['imgs']) == (len(inputs['frame_inds']) * 2, - 240, 320) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 2) assert results['original_shape'] == (240, 320) return