Skip to content

diffusion_mnist is broken: train() throws CUDNNError: CUDNN_STATUS_BAD_PARAM (code 3) #367

Open
@mashu

Description

@mashu
mateusz@debian:~/model-zoo/vision/diffusion_mnist$ julia
               _
   _       _ _(_)_     |  Documentation: https://docs.julialang.org
  (_)     | (_) (_)    |
   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 1.8.2 (2022-09-29)
 _/ |\__'_|_|_|\__'_|  |  Official https://julialang.org/ release
|__/                   |

(@v1.8) pkg> activate .
  Activating project at `~/model-zoo/vision/diffusion_mnist`

julia> include("diffusion_mnist.jl")

julia> train()
[ Info: Training on GPU
[ Info: Start Training, total 50 epochs
[ Info: Epoch 1
ERROR: CUDNNError: CUDNN_STATUS_BAD_PARAM (code 3)
Stacktrace:
  [1] throw_api_error(res::CUDA.CUDNN.cudnnStatus_t)
    @ CUDA.CUDNN ~/.julia/packages/CUDA/DfvRa/lib/cudnn/error.jl:22
  [2] macro expansion
    @ ~/.julia/packages/CUDA/DfvRa/lib/cudnn/error.jl:35 [inlined]
  [3] cudnnSetConvolutionNdDescriptor(convDesc::Ptr{Nothing}, arrayLength::Int32, padA::Vector{Int32}, filterStrideA::Vector{Int32}, dilationA::Vector{Int32}, mode::CUDA.CUDNN.cudnnConvolutionMode_t, computeType::CUDA.CUDNN.cudnnDataType_t)
    @ CUDA.CUDNN ~/.julia/packages/CUDA/DfvRa/lib/utils/call.jl:26
  [4] cudnnSetConvolutionDescriptor(ptr::Ptr{Nothing}, padding::Vector{Int32}, stride::Vector{Int32}, dilation::Vector{Int32}, mode::CUDA.CUDNN.cudnnConvolutionMode_t, dataType::CUDA.CUDNN.cudnnDataType_t, mathType::CUDA.CUDNN.cudnnMathType_t, reorderType::CUDA.CUDNN.cudnnReorderType_t, groupCount::Int32)
    @ CUDA.CUDNN ~/.julia/packages/CUDA/DfvRa/lib/cudnn/convolution.jl:135
  [5] CUDA.CUDNN.cudnnConvolutionDescriptor(::Vector{Int32}, ::Vararg{Any})
    @ CUDA.CUDNN ~/.julia/packages/CUDA/DfvRa/lib/cudnn/descriptors.jl:39
  [6] CUDA.CUDNN.cudnnConvolutionDescriptor(cdims::DenseConvDims{2, 2, 2, 4, 2}, x::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, pad::Tuple{Int64, Int64})
    @ NNlibCUDA ~/.julia/packages/NNlibCUDA/kCpTE/src/cudnn/conv.jl:48
  [7] cudnnConvolutionDescriptorAndPaddedInput
    @ ~/.julia/packages/NNlibCUDA/kCpTE/src/cudnn/conv.jl:43 [inlined]
  [8] ∇conv_data!(dx::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, dy::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, w::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, cdims::DenseConvDims{2, 2, 2, 4, 2}; alpha::Int64, beta::Int64, algo::Int64)
    @ NNlibCUDA ~/.julia/packages/NNlibCUDA/kCpTE/src/cudnn/conv.jl:98
  [9] ∇conv_data!
    @ ~/.julia/packages/NNlibCUDA/kCpTE/src/cudnn/conv.jl:89 [inlined]
 [10] #∇conv_data#198
    @ ~/.julia/packages/NNlib/0QnJJ/src/conv.jl:99 [inlined]
 [11] ∇conv_data
    @ ~/.julia/packages/NNlib/0QnJJ/src/conv.jl:95 [inlined]
 [12] #rrule#318
    @ ~/.julia/packages/NNlib/0QnJJ/src/conv.jl:326 [inlined]
 [13] rrule
    @ ~/.julia/packages/NNlib/0QnJJ/src/conv.jl:316 [inlined]
 [14] rrule
    @ ~/.julia/packages/ChainRulesCore/C73ay/src/rules.jl:134 [inlined]
 [15] chain_rrule
    @ ~/.julia/packages/Zygote/dABKa/src/compiler/chainrules.jl:218 [inlined]
 [16] macro expansion
    @ ~/.julia/packages/Zygote/dABKa/src/compiler/interface2.jl:0 [inlined]
 [17] _pullback
    @ ~/.julia/packages/Zygote/dABKa/src/compiler/interface2.jl:9 [inlined]
 [18] _pullback
    @ ~/.julia/packages/Flux/4k0Ls/src/layers/conv.jl:333 [inlined]
 [19] _pullback(ctx::Zygote.Context{true}, f::ConvTranspose{2, 4, typeof(identity), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Bool}, args::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/dABKa/src/compiler/interface2.jl:0
 [20] _pullback
    @ ~/model-zoo/vision/diffusion_mnist/diffusion_mnist.jl:140 [inlined]
 [21] _pullback(::Zygote.Context{true}, ::UNet, ::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/dABKa/src/compiler/interface2.jl:0
 [22] _pullback
    @ ~/model-zoo/vision/diffusion_mnist/diffusion_mnist.jl:185 [inlined]
 [23] _pullback(::Zygote.Context{true}, ::typeof(model_loss), ::UNet, ::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, ::Float32)
    @ Zygote ~/.julia/packages/Zygote/dABKa/src/compiler/interface2.jl:0
 [24] _pullback
    @ ~/model-zoo/vision/diffusion_mnist/diffusion_mnist.jl:176 [inlined]
 [25] _pullback(::Zygote.Context{true}, ::typeof(model_loss), ::UNet, ::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/dABKa/src/compiler/interface2.jl:0
 [26] _pullback
    @ ~/model-zoo/vision/diffusion_mnist/diffusion_mnist.jl:265 [inlined]
 [27] _pullback(::Zygote.Context{true}, ::var"#12#14"{UNet})
    @ Zygote ~/.julia/packages/Zygote/dABKa/src/compiler/interface2.jl:0
 [28] pullback(f::Function, ps::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}})
    @ Zygote ~/.julia/packages/Zygote/dABKa/src/compiler/interface.jl:373
 [29] withgradient(f::Function, args::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}})
    @ Zygote ~/.julia/packages/Zygote/dABKa/src/compiler/interface.jl:123
 [30] train(; kws::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Main ~/model-zoo/vision/diffusion_mnist/diffusion_mnist.jl:264
 [31] train()
    @ Main ~/model-zoo/vision/diffusion_mnist/diffusion_mnist.jl:222
 [32] top-level scope
    @ REPL[3]:1
 [33] top-level scope
    @ ~/.julia/packages/CUDA/DfvRa/src/initialization.jl:52

julia> CUDA.device()
CuDevice(0): NVIDIA GeForce RTX 3080

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions