Skip to content

求助:在训练 ATSS 检测模型时,按照官方方法 dump 成 .mge 出错,且 .tm 模型无法可视化 #126

Open
@corleonechensiyu

Description

@corleonechensiyu

环境

1.系统环境:Ubuntu 22.04.2
2.MegEngine版本:1.13.0
3.python版本:3.10.12
4.模型名称:atss_res18_coco_3x_800size

复现步骤

1.训练

python  official/vision/detection/tools/train.py    -f official/vision/detection/configs/atss_res18_coco_3x_800size.py -n 1 -d data/coco/

2.convert

python convert.py -f official/vision/detection/configs/atss_res18_coco_3x_800size.py -w log-of-atss_res18_coco_3x_800size/epoch_9.pkl -i official/assets/cat.jpg

请提供关键的代码片段便于追查问题

## 转换代码
import numpy as np
import megengine.functional as F
import megengine.hub
from megengine import jit, tensor
import megengine as mge
import megengine.distributed as dist
from megengine.autodiff import GradManager
from megengine.data import DataLoader, Infinite, RandomSampler
from megengine.data import transform as T
from megengine.optimizer import SGD

from official.vision.detection.tools.data_mapper import data_mapper
from official.vision.detection.tools.utils import DetEvaluator, import_from_file
import megengine.traced_module as tm
import argparse
import bisect
import copy
import os
import time
import cv2

def make_parser():
    """Build the command-line parser for this conversion script.

    Returns:
        argparse.ArgumentParser: a parser that accepts
            ``-f/--file``        net description file (default ``"net.py"``),
            ``-w/--weight_file`` weights file (default ``None``),
            ``-i/--image``       a string option with no default or help text.
    """
    # Each entry is (flag names, keyword options) forwarded to add_argument.
    option_specs = (
        (
            ("-f", "--file"),
            {"default": "net.py", "type": str, "help": "net description file"},
        ),
        (
            ("-w", "--weight_file"),
            {"default": None, "type": str, "help": "weights file"},
        ),
        (("-i", "--image"), {"type": str}),
    )

    cli = argparse.ArgumentParser()
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli


# Script entry point: build the network from the description file, load the
# trained weights, trace it with one example image, save the traced module
# to "test.tm", then trace an inference function and dump it to a .mge file.
if __name__ == "__main__":
    parser = make_parser()
    args = parser.parse_args()
    
    # import_from_file is a project helper; presumably it loads the module at
    # args.file so that its Cfg / Net definitions can be used -- TODO confirm.
    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()
    # NOTE(review): presumably disables loading pretrained backbone weights,
    # since the full checkpoint is loaded below -- confirm against the config.
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    
    # NOTE(review): cv2.imread returns None when the image cannot be read, in
    # which case ori_img.copy() below raises AttributeError.
    ori_img = cv2.imread(args.image)
    # Build the example inputs (image, im_info) from a copy of the image and
    # the test-size settings stored on model.cfg.
    image, im_info = DetEvaluator.process_inputs(
        ori_img.copy(), model.cfg.test_image_short_size, model.cfg.test_image_max_size,
    )
    # NOTE(review): --weight_file defaults to None, so it must be given on the
    # command line for this load to have a path to read.
    state_dict = mge.load(args.weight_file)
    # The loaded object may wrap the weights under a "state_dict" key; unwrap
    # it when that key is present.
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)
    model.eval()
    
    # Trace the model with the example inputs. The variable is named
    # traced_resnet, but it holds whatever Net the description file defines.
    traced_resnet = tm.trace_module(model, mge.tensor(image),im_info=mge.tensor(im_info))
    # Graph surgery based on trace_module, as well as model conversion, can be done here
    traced_resnet.eval()
    # Save the traced module itself.
    mge.save(traced_resnet,"test.tm")
    # Wrap the traced module in a jit.trace function so it can be dumped.
    @jit.trace(symbolic=True, capture_as_const=True)
    def infer_func(data, im_info, model):
        pred = model(data,im_info)
        return pred

    # Run the traced function once with the example inputs before dumping;
    # im_info and model are passed by keyword, only data is positional.
    output = infer_func(mge.tensor(image),im_info=mge.tensor(im_info), model=traced_resnet)
    # This dump call is the statement shown at the bottom of the reported
    # traceback ("bad input shape for polyadic operator").
    # NOTE(review): arg_names names only "data"; confirm how im_info is
    # expected to be named/handled in the dumped graph.
    infer_func.dump("log-of-atss_res18_coco_3x_800size/test.mge", arg_names=["data"])

请提供完整的日志及报错信息

25 17:37:50[mgb] WRN [dnn]
            Cudnn8 will jit ptx code with cache. You can set
            CUDA_CACHE_MAXSIZE and CUDA_CACHE_PATH environment var to avoid repeat jit(very slow).
            For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`
25 17:37:53[mgb] ERR error while applying optimization pass PassConvertToCompatible: bad input shape for polyadic operator: {256}, {1,256,136,100}

backtrace:
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb13MegBrainErrorC1ERKSs+0x4a) [0x7fdfdad5590a]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2db7867) [0x7fdfdadb7867]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN6megdnn12ErrorHandler15on_megdnn_errorERKSs+0x14) [0x7fdfde9d86a4]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a078d8) [0x7fdfdea078d8]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a07a91) [0x7fdfdea07a91]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb3opr8Elemwise20get_output_var_shapeEN6megdnn5param8Elemwise4ModeERKNS2_11SmallVectorINS2_11TensorShapeELj4EEE+0x37) [0x7fdfdaf150e7]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZNK3mgb3opr8Elemwise20get_output_var_shapeERKN6megdnn11SmallVectorINS2_11TensorShapeELj4EEERS5_+0x29) [0x7fdfdaf1e0c9]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg5mixin24OutshapePureByInshapeOpr10infer_descEmRN6megdnn11TensorShapeERKNS0_12static_infer6InpValE+0x19d) [0x7fdfdade9e0d]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg12static_infer22StaticInferManagerImpl13TagShapeTrait8do_inferERKNS1_6InpValE+0x57) [0x7fdfdae0b437]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2e0b292) [0x7fdfdae0b292]

Traceback (most recent call last):
  File "/home/csy/megvii/Models/convert.py", line 63, in <module>
    infer_func.dump("log-of-atss_res18_coco_3x_800size/test.mge", arg_names=["data"])
  File "/home/csy/.local/lib/python3.10/site-packages/megengine/jit/tracing.py", line 1183, in dump
    dump_content, dump_info = G.dump_graph(
  File "/home/csy/.local/lib/python3.10/site-packages/megengine/core/tensor/megbrain_graph.py", line 456, in dump_graph
    dump_content = _imperative_rt.dump_graph(
RuntimeError: bad input shape for polyadic operator: {256}, {1,256,136,100}

backtrace:
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb13MegBrainErrorC1ERKSs+0x4a) [0x7fdfdad5590a]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2db7867) [0x7fdfdadb7867]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN6megdnn12ErrorHandler15on_megdnn_errorERKSs+0x14) [0x7fdfde9d86a4]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a078d8) [0x7fdfdea078d8]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a07a91) [0x7fdfdea07a91]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb3opr8Elemwise20get_output_var_shapeEN6megdnn5param8Elemwise4ModeERKNS2_11SmallVectorINS2_11TensorShapeELj4EEE+0x37) [0x7fdfdaf150e7]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZNK3mgb3opr8Elemwise20get_output_var_shapeERKN6megdnn11SmallVectorINS2_11TensorShapeELj4EEERS5_+0x29) [0x7fdfdaf1e0c9]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg5mixin24OutshapePureByInshapeOpr10infer_descEmRN6megdnn11TensorShapeERKNS0_12static_infer6InpValE+0x19d) [0x7fdfdade9e0d]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg12static_infer22StaticInferManagerImpl13TagShapeTrait8do_inferERKNS1_6InpValE+0x57) [0x7fdfdae0b437]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2e0b292) [0x7fdfdae0b292]

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions