Skip to content

improve performance/ergonomics of reading compound datatypes #592

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 2, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ HDF5_jll = "0234f1f7-429e-5d53-9886-15a909be8d59"
Blosc = "a74b3585-a348-5f62-a45c-50e91977d574"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[compat]
julia = "1.3"
Expand Down
193 changes: 110 additions & 83 deletions src/HDF5.jl
Original file line number Diff line number Diff line change
Expand Up @@ -465,13 +465,6 @@ end
==(a::HDF5ReferenceObj, b::HDF5ReferenceObj) = a.r == b.r
hash(x::HDF5ReferenceObj, h::UInt) = hash(x.r, h)

# Compound types
struct HDF5Compound{N}
data::NTuple{N,Any}
membername::NTuple{N,String}
membertype::NTuple{N,Type}
end

# Opaque types
struct HDF5Opaque
data
Expand All @@ -482,9 +475,22 @@ end
struct EmptyArray{T} end

# Stub types to encode fixed-size arrays for H5T_ARRAY
struct FixedArray{T,D} end
size(::Type{FixedArray{T,D}}) where {T,D} = D
eltype(::Type{FixedArray{T,D}}) where {T,D} = T
struct FixedArray{T,D,L}
data::NTuple{L, T}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how / where's this field being used ?

Copy link
Contributor Author

@kleinhenz kleinhenz Mar 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is used so that the julia type you construct for an H5T_ARRAY is memory compatible and can be directly written into which is necessary for compound datatypes with H5T_ARRAY members.

end
size(::Type{FixedArray{T,D,L}}) where {T,D,L} = D
eltype(::Type{FixedArray{T,D,L}}) where {T,D,L} = T

struct FixedString{N}
data::NTuple{N, Cchar}
end
length(::Type{FixedString{N}}) where N = N

struct VariableArray{T}
len::Csize_t
p::Ptr{Cvoid}
end
eltype(::Type{VariableArray{T}}) where T = T

# VLEN objects
struct HDF5Vlen{T}
Expand Down Expand Up @@ -1459,73 +1465,70 @@ function getindex(parent::Union{HDF5File, HDF5Group, HDF5Dataset}, r::HDF5Refere
h5object(obj_id, parent)
end

# Helper for reading compound types
function read_row(io::IO, membertype, membersize)
row = Any[]
for (dtype, dsize) in zip(membertype, membersize)
if dtype === String
push!(row, unpad(read!(io, Vector{UInt8}(undef,dsize)), H5T_STR_NULLPAD))
elseif dtype<:HDF5.FixedArray && eltype(dtype)<:HDF5BitsKind
val = read!(io, Vector{eltype(dtype)}(undef,prod(size(dtype))))
push!(row, reshape(val, size(dtype)))
elseif dtype<:HDF5BitsKind
push!(row, read(io, dtype))
else
# for other types, just store the raw bytes and let the user
# decide what to do
push!(row, read!(io, Vector{UInt8}(undef,dsize)))
end
# convert Cstring/FixedString to String
function normalize_types(x::NamedTuple{T}) where T
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where was this normalization obtained from?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was taken from the current handling of strings/vlen types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

except that for vlen arrays I use a non-owning unsafe wrap and then make a copy. I think this is the right thing to do since hdf5 wants to manage its own memory.


vals = map(values(x)) do xi
Ti = typeof(xi)
if Ti == Cstring
unsafe_string(xi)
elseif Ti <: FixedString
join(Char.(xi.data))
elseif Ti <: FixedArray
reshape(collect(xi.data), size(Ti)...)
elseif Ti <: VariableArray
copy(unsafe_wrap(Array, convert(Ptr{eltype(xi)}, xi.p), xi.len, own=false))
elseif Ti <: NamedTuple
normalize_types(xi)
else
xi
end
return (row...,)
end

NamedTuple{T}(vals)
end

# Read compound type
function read(obj::HDF5Dataset, T::Union{Type{Array{HDF5Compound{N}}},Type{HDF5Compound{N}}}) where {N}
t = datatype(obj)
local sz = 0; local n;
local membername; local membertype;
local memberoffset; local memberfiletype; local membersize;
try
memberfiletype = Vector{HDF5Datatype}(undef,N)
membertype = Vector{Type}(undef,N)
membername = Vector{String}(undef,N)
memberoffset = Vector{UInt64}(undef,N)
membersize = Vector{UInt32}(undef,N)
for i = 1:N
filetype = HDF5Datatype(h5t_get_member_type(t.id, i-1))
memberfiletype[i] = filetype
membertype[i] = hdf5_to_julia_eltype(filetype)
memberoffset[i] = sz
membersize[i] = sizeof(filetype)
sz += sizeof(filetype)
membername[i] = h5t_get_member_name(t.id, i-1)
end
finally
close(t)
end
# Build the "memory type"
memtype_id = h5t_create(H5T_COMPOUND, sz)
for i = 1:N
h5t_insert(memtype_id, membername[i], memberoffset[i], memberfiletype[i].id) # FIXME strings
end
# Read the raw data
buf = Vector{UInt8}(undef,length(obj)*sz)
h5d_read(obj.id, memtype_id, H5S_ALL, H5S_ALL, obj.xfer, buf)

# Convert to the appropriate data format using iobuffer
iobuff = IOBuffer(buf)
data = Any[]
while !eof(iobuff)
push!(data, read_row(iobuff, membertype, membersize))
end
# convert HDF5Compound type parameters to tuples
membername = (membername...,)
membertype = (membertype...,)
if T === HDF5Compound{N}
return HDF5Compound(data[1], membername, membertype)
# get a vector of all the leaf types in a (possibly nested) named tuple
function get_all_types(::Type{NamedTuple{T, U}}) where T where U
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guessing there's a better way to do this that I can't currently think of, but seems fine for now.

Copy link
Member

@musm musm Mar 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems faster / better

function get_all_types1(::Type{NamedTuple})
    types = DataType[]
    for Ui in U.types
        if Ui <: NamedTuple
          append!(types, get_all_types(Ui))
        else
          push!(types, Ui)
        end
    end
    return types
end

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe this would be better/more julian?

do_normalize(::Type{T}) where T = false
do_normalize(::Type{NamedTuple{T, U}}) where T where U = any(i -> do_normalize(fieldtype(U,i)), 1:fieldcount(U))
do_normalize(::Type{T}) where T <: Union{Cstring, FixedString, FixedArray, VariableArray} = true

do_reclaim(::Type{T}) where T = false
do_reclaim(::Type{NamedTuple{T, U}}) where T where U =  any(i -> do_reclaim(fieldtype(U,i)), 1:fieldcount(U))
do_reclaim(::Type{T}) where T <: Union{Cstring, VariableArray} = true

the only reason I collect all the types is to see if I need to call reclaim or do an extra normalization step to turn the data into a native julia type.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah that might be simpler. It's certainly in our best favor to make the code run as fast as possible and be as simple as possible to read for maintenance. You can also benchmark performance.

types = []
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should probably be

Suggested change
types = []
types = DataType[]

for i in 1:fieldcount(U)
Ui = fieldtype(U, i)
if Ui <: NamedTuple
append!(types, get_all_types(Ui))
else
return [HDF5Compound(elem, membername, membertype) for elem in data]
push!(types, Ui)
end
end
types
end

function read(dset::HDF5Dataset, T::Union{Type{Array{U}}, Type{U}}) where U <: NamedTuple
filetype = HDF5.datatype(dset)
memtype_id = HDF5.h5t_get_native_type(filetype.id) # padded layout in memory
@assert sizeof(U) == HDF5.h5t_get_size(memtype_id) "Type sizes mismatch!"

buf = Array{U}(undef, size(dset))

HDF5.h5d_read(dset.id, memtype_id, HDF5.H5S_ALL, HDF5.H5S_ALL, HDF5.H5P_DEFAULT, buf)

types = get_all_types(U)
normalize = any(t -> t <: Union{Cstring, FixedString, FixedArray, VariableArray}, types)
out = normalize ? normalize_types.(buf) : buf

reclaim = any(t -> t <: Union{Cstring, VariableArray}, types)
if reclaim
dspace = dataspace(dset)
# NOTE I have seen this call fail but I cannot reproduce
h5d_vlen_reclaim(memtype_id, dspace.id, H5P_DEFAULT, buf)
end

HDF5.h5t_close(memtype_id)

if T <: NamedTuple
return out[1]
else
return out
end
end

# Read OPAQUE datasets and attributes
Expand Down Expand Up @@ -1575,6 +1578,7 @@ function read(obj::DatasetOrAttribute, ::Type{HDF5Vlen{T}}) where {T<:Union{HDF5
for i = 1:len
h = structbuf[i]
data[i] = p2a(convert(Ptr{T}, h.p), Int(h.len))

end
data
end
Expand Down Expand Up @@ -2007,18 +2011,41 @@ function hdf5_to_julia_eltype(objtype)
T = HDF5Vlen{hdf5_to_julia_eltype(HDF5Datatype(super_id))}
elseif class_id == H5T_COMPOUND
N = Int(h5t_get_nmembers(objtype.id))
# check if should be interpreted as complex
if COMPLEX_SUPPORT[] && N == 2
membernames = ntuple(N) do i
h5t_get_member_name(objtype.id, i-1)
end
membertypes = ntuple(N) do i
hdf5_to_julia_eltype(HDF5Datatype(h5t_get_member_type(objtype.id, i-1)))

membernames = ntuple(N) do i
h5t_get_member_name(objtype.id, i-1)
end

membertypes = ntuple(N) do i
dtype = HDF5Datatype(h5t_get_member_type(objtype.id, i-1))
ci = h5t_get_class(dtype.id)

if ci == H5T_STRING
if h5t_is_variable_str(dtype.id)
return Cstring
else
n = h5t_get_size(dtype.id)
return FixedString{Int(n)}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again I'm guessing it's fine not do Int here as well. Unless you foresee negative consequences.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is actually necessary since NTuple requires the first type parameter to be Int64.

end
elseif ci == H5T_VLEN
superid = h5t_get_super(dtype.id)
T = VariableArray{hdf5_to_julia_eltype(HDF5Datatype(superid))}
else
return hdf5_to_julia_eltype(dtype)
end
iscomplex = (membernames == COMPLEX_FIELD_NAMES[]) && (membertypes[1] == membertypes[2]) && (membertypes[1] <: HDF5.HDF5Scalar)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unrelated changes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this part was refactored because now membernames/membertypes are used outside of of the COMPLEX_SUPPORT path.

T = iscomplex ? Complex{membertypes[1]} : HDF5Compound{N}
end

# check if should be interpreted as complex
iscomplex = COMPLEX_SUPPORT[] &&
N == 2 &&
(membernames == COMPLEX_FIELD_NAMES[]) &&
(membertypes[1] == membertypes[2]) &&
(membertypes[1] <: HDF5.HDF5Scalar)

if iscomplex
T = Complex{membertypes[1]}
else
T = HDF5Compound{N}
T = NamedTuple{Symbol.(membernames), Tuple{membertypes...}}
end
elseif class_id == H5T_ARRAY
T = hdf5array(objtype)
Expand Down Expand Up @@ -2423,7 +2450,7 @@ function hdf5array(objtype)
eltyp = HDF5Datatype(h5t_get_super(objtype.id))
T = hdf5_to_julia_eltype(eltyp)
dimsizes = ntuple(i -> Int(dims[nd-i+1]), nd) # reverse order
FixedArray{T, dimsizes}
FixedArray{T, dimsizes, prod(dimsizes)}
end

### Property manipulation ###
Expand Down
79 changes: 79 additions & 0 deletions test/compound.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
using Random, Test, HDF5

import HDF5.datatype
import Base.unsafe_convert

struct foo
a::Float64
b::String
c::String
d::Array{ComplexF64,2}
e::Array{Int64,1}
end

struct foo_hdf5
a::Float64
b::Cstring
c::NTuple{10, Cchar}
d::NTuple{9, ComplexF64}
e::HDF5.Hvl_t
end

function unsafe_convert(::Type{foo_hdf5}, x::foo)
foo_hdf5(x.a,
Base.unsafe_convert(Cstring, x.b),
ntuple(i -> x.c[i], length(x.c)),
ntuple(i -> x.d[i], length(x.d)),
HDF5.Hvl_t(convert(Csize_t, length(x.e)), convert(Ptr{Cvoid}, pointer(x.e)))
)
end

function datatype(::Type{foo_hdf5})
dtype = HDF5.h5t_create(HDF5.H5T_COMPOUND, sizeof(foo_hdf5))
HDF5.h5t_insert(dtype, "a", fieldoffset(foo_hdf5, 1), datatype(Float64))

vlenstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1)
HDF5.h5t_set_size(vlenstr_dtype, HDF5.H5T_VARIABLE)
HDF5.h5t_set_cset(vlenstr_dtype, HDF5.H5T_CSET_UTF8)
HDF5.h5t_insert(dtype, "b", fieldoffset(foo_hdf5, 2), vlenstr_dtype)

fixedstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1)
HDF5.h5t_set_size(fixedstr_dtype, 10 * sizeof(Cchar))
HDF5.h5t_set_cset(fixedstr_dtype, HDF5.H5T_CSET_UTF8)
HDF5.h5t_insert(dtype, "c", fieldoffset(foo_hdf5, 3), fixedstr_dtype)

hsz = HDF5.Hsize[3,3]
array_dtype = HDF5.h5t_array_create(datatype(ComplexF64).id, 2, hsz)
HDF5.h5t_insert(dtype, "d", fieldoffset(foo_hdf5, 4), array_dtype)

vlen_dtype = HDF5.h5t_vlen_create(datatype(Int64))
HDF5.h5t_insert(dtype, "e", fieldoffset(foo_hdf5, 5), vlen_dtype)

HDF5Datatype(dtype)
end

@testset "compound" begin
N = 10
v = [foo(rand(),
randstring(rand(10:100)),
randstring(10),
rand(ComplexF64, 3,3),
rand(1:10, rand(10:100))
)
for _ in 1:N]
v_write = unsafe_convert.(foo_hdf5, v)

fn = tempname()
h5open(fn, "w") do h5f
dtype = datatype(foo_hdf5)
space = dataspace(v_write)
dset = HDF5.h5d_create(h5f.id, "data", dtype.id, space.id)
HDF5.h5d_write(dset, dtype.id, v_write)
end

v_read = h5read(fn, "data")
for field in (:a, :b, :c, :d, :e)
f = x -> getfield(x, field)
@test f.(v) == f.(v_read)
end
end
13 changes: 4 additions & 9 deletions test/plain.jl
Original file line number Diff line number Diff line change
Expand Up @@ -320,12 +320,7 @@ rm(tmpdir, recursive=true)
test_files = joinpath(@__DIR__, "test_files")

d = h5read(joinpath(test_files, "compound.h5"), "/data")
@test typeof(d[1]) === HDF5.HDF5Compound{4}
@test length(d) == 2
dtypes = [typeof(x) for x in d[1].data]
@test dtypes == [Float64, Vector{Float64}, Vector{Float64}, Float64]
@test length(d[1].data[2]) == 3
@test d[1].membername == ("wgt", "xyz", "uvw", "E")
@test typeof(d[1]) == NamedTuple{(:wgt, :xyz, :uvw, :E), Tuple{Float64, Array{Float64, 1}, Array{Float64, 1}, Float64}}

# get-datasets
fn = tempname()
Expand Down Expand Up @@ -450,12 +445,12 @@ end # testset plain

HDF5.disable_complex_support()
z = read(fr, "ComplexF64")
@test isa(z, HDF5.HDF5Compound{2})
@test isa(z, NamedTuple{(:r, :i), Tuple{Float64, Float64}})

Acmplx32 = read(fr, "Acmplx32")
@test eltype(Acmplx32) == HDF5.HDF5Compound{2}
@test eltype(Acmplx32) == NamedTuple{(:r, :i), Tuple{Float32, Float32}}
Acmplx64 = read(fr, "Acmplx64")
@test eltype(Acmplx64) == HDF5.HDF5Compound{2}
@test eltype(Acmplx64) == NamedTuple{(:r, :i), Tuple{Float64, Float64}}

close(fr)

Expand Down
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ using Pkg
println("HDF5 version ", HDF5.h5_get_libversion())

include("plain.jl")
include("compound.jl")
include("readremote.jl")
include("extend_test.jl")
include("gc.jl")
Expand Down