Skip to content

Commit 755a2ce

Browse files
kleinhenz and musm authored
improve performance and ergonomics of reading compound datatypes (#592)
* read compound datasets as named tuples
* properly support variable/fixed length strings in compound dataset
* add more compound dataset tests
* add note
* support vlen arrays in compound datasets
* don't use fieldtypes for julia 1.0 compatibility
* Update test/compound.jl
  remove unnecessary convert calls
  Co-Authored-By: Mustafa M. <[email protected]>
* Update src/HDF5.jl
  whitespace
  Co-Authored-By: Mustafa M. <[email protected]>
* cleanup
* use dispatch instead of manually collecting all leaf types for type normalization
* remove unnecessary Int conversion

Co-authored-by: Mustafa M. <[email protected]>
1 parent 9163402 commit 755a2ce

File tree

5 files changed

+192
-97
lines changed

5 files changed

+192
-97
lines changed

Project.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ HDF5_jll = "0234f1f7-429e-5d53-9886-15a909be8d59"
77
Blosc = "a74b3585-a348-5f62-a45c-50e91977d574"
88
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
99
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
10+
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1011

1112
[compat]
1213
julia = "1.3"

src/HDF5.jl

Lines changed: 91 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -465,13 +465,6 @@ end
465465
==(a::HDF5ReferenceObj, b::HDF5ReferenceObj) = a.r == b.r
466466
hash(x::HDF5ReferenceObj, h::UInt) = hash(x.r, h)
467467

468-
# Compound types
469-
struct HDF5Compound{N}
470-
data::NTuple{N,Any}
471-
membername::NTuple{N,String}
472-
membertype::NTuple{N,Type}
473-
end
474-
475468
# Opaque types
476469
struct HDF5Opaque
477470
data
@@ -482,9 +475,24 @@ end
482475
struct EmptyArray{T} end
483476

484477
# Stub types to encode fixed-size arrays for H5T_ARRAY
485-
struct FixedArray{T,D} end
486-
size(::Type{FixedArray{T,D}}) where {T,D} = D
487-
eltype(::Type{FixedArray{T,D}}) where {T,D} = T
478+
struct FixedArray{T,D,L}
479+
data::NTuple{L, T}
480+
end
481+
size(::Type{FixedArray{T,D,L}}) where {T,D,L} = D
482+
size(x::T) where T <: FixedArray = size(T)
483+
eltype(::Type{FixedArray{T,D,L}}) where {T,D,L} = T
484+
eltype(x::T) where T <: FixedArray = eltype(T)
485+
486+
struct FixedString{N}
487+
data::NTuple{N, Cchar}
488+
end
489+
length(::Type{FixedString{N}}) where N = N
490+
491+
struct VariableArray{T}
492+
len::Csize_t
493+
p::Ptr{Cvoid}
494+
end
495+
eltype(::Type{VariableArray{T}}) where T = T
488496

489497
# VLEN objects
490498
struct HDF5Vlen{T}
@@ -1459,73 +1467,45 @@ function getindex(parent::Union{HDF5File, HDF5Group, HDF5Dataset}, r::HDF5Refere
14591467
h5object(obj_id, parent)
14601468
end
14611469

1462-
# Helper for reading compound types
1463-
function read_row(io::IO, membertype, membersize)
1464-
row = Any[]
1465-
for (dtype, dsize) in zip(membertype, membersize)
1466-
if dtype === String
1467-
push!(row, unpad(read!(io, Vector{UInt8}(undef,dsize)), H5T_STR_NULLPAD))
1468-
elseif dtype<:HDF5.FixedArray && eltype(dtype)<:HDF5BitsKind
1469-
val = read!(io, Vector{eltype(dtype)}(undef,prod(size(dtype))))
1470-
push!(row, reshape(val, size(dtype)))
1471-
elseif dtype<:HDF5BitsKind
1472-
push!(row, read(io, dtype))
1473-
else
1474-
# for other types, just store the raw bytes and let the user
1475-
# decide what to do
1476-
push!(row, read!(io, Vector{UInt8}(undef,dsize)))
1477-
end
1478-
end
1479-
return (row...,)
1480-
end
1470+
# convert special types to native julia types
1471+
normalize_types(x) = x
1472+
normalize_types(x::NamedTuple{T}) where T = NamedTuple{T}(map(normalize_types, values(x)))
1473+
normalize_types(x::Cstring) = unsafe_string(x)
1474+
normalize_types(x::FixedString) = join(Char.(x.data))
1475+
normalize_types(x::FixedArray) = reshape(collect(x.data), size(x)...)
1476+
normalize_types(x::VariableArray) = copy(unsafe_wrap(Array, convert(Ptr{eltype(x)}, x.p), x.len, own=false))
14811477

1482-
# Read compound type
1483-
function read(obj::HDF5Dataset, T::Union{Type{Array{HDF5Compound{N}}},Type{HDF5Compound{N}}}) where {N}
1484-
t = datatype(obj)
1485-
local sz = 0; local n;
1486-
local membername; local membertype;
1487-
local memberoffset; local memberfiletype; local membersize;
1488-
try
1489-
memberfiletype = Vector{HDF5Datatype}(undef,N)
1490-
membertype = Vector{Type}(undef,N)
1491-
membername = Vector{String}(undef,N)
1492-
memberoffset = Vector{UInt64}(undef,N)
1493-
membersize = Vector{UInt32}(undef,N)
1494-
for i = 1:N
1495-
filetype = HDF5Datatype(h5t_get_member_type(t.id, i-1))
1496-
memberfiletype[i] = filetype
1497-
membertype[i] = hdf5_to_julia_eltype(filetype)
1498-
memberoffset[i] = sz
1499-
membersize[i] = sizeof(filetype)
1500-
sz += sizeof(filetype)
1501-
membername[i] = h5t_get_member_name(t.id, i-1)
1502-
end
1503-
finally
1504-
close(t)
1505-
end
1506-
# Build the "memory type"
1507-
memtype_id = h5t_create(H5T_COMPOUND, sz)
1508-
for i = 1:N
1509-
h5t_insert(memtype_id, membername[i], memberoffset[i], memberfiletype[i].id) # FIXME strings
1510-
end
1511-
# Read the raw data
1512-
buf = Vector{UInt8}(undef,length(obj)*sz)
1513-
h5d_read(obj.id, memtype_id, H5S_ALL, H5S_ALL, obj.xfer, buf)
1514-
1515-
# Convert to the appropriate data format using iobuffer
1516-
iobuff = IOBuffer(buf)
1517-
data = Any[]
1518-
while !eof(iobuff)
1519-
push!(data, read_row(iobuff, membertype, membersize))
1520-
end
1521-
# convert HDF5Compound type parameters to tuples
1522-
membername = (membername...,)
1523-
membertype = (membertype...,)
1524-
if T === HDF5Compound{N}
1525-
return HDF5Compound(data[1], membername, membertype)
1526-
else
1527-
return [HDF5Compound(elem, membername, membertype) for elem in data]
1528-
end
1478+
do_normalize(::Type{T}) where T = false
1479+
do_normalize(::Type{NamedTuple{T, U}}) where T where U = any(i -> do_normalize(fieldtype(U,i)), 1:fieldcount(U))
1480+
do_normalize(::Type{T}) where T <: Union{Cstring, FixedString, FixedArray, VariableArray} = true
1481+
1482+
do_reclaim(::Type{T}) where T = false
1483+
do_reclaim(::Type{NamedTuple{T, U}}) where T where U = any(i -> do_reclaim(fieldtype(U,i)), 1:fieldcount(U))
1484+
do_reclaim(::Type{T}) where T <: Union{Cstring, VariableArray} = true
1485+
1486+
function read(dset::HDF5Dataset, T::Union{Type{Array{U}}, Type{U}}) where U <: NamedTuple
1487+
filetype = HDF5.datatype(dset)
1488+
memtype_id = HDF5.h5t_get_native_type(filetype.id) # padded layout in memory
1489+
@assert sizeof(U) == HDF5.h5t_get_size(memtype_id) "Type sizes mismatch!"
1490+
1491+
buf = Array{U}(undef, size(dset))
1492+
1493+
HDF5.h5d_read(dset.id, memtype_id, HDF5.H5S_ALL, HDF5.H5S_ALL, HDF5.H5P_DEFAULT, buf)
1494+
out = do_normalize(U) ? normalize_types.(buf) : buf
1495+
1496+
if do_reclaim(U)
1497+
dspace = dataspace(dset)
1498+
# NOTE I have seen this call fail but I cannot reproduce
1499+
h5d_vlen_reclaim(memtype_id, dspace.id, H5P_DEFAULT, buf)
1500+
end
1501+
1502+
HDF5.h5t_close(memtype_id)
1503+
1504+
if T <: NamedTuple
1505+
return out[1]
1506+
else
1507+
return out
1508+
end
15291509
end
15301510

15311511
# Read OPAQUE datasets and attributes
@@ -2006,19 +1986,42 @@ function hdf5_to_julia_eltype(objtype)
20061986
super_id = h5t_get_super(objtype.id)
20071987
T = HDF5Vlen{hdf5_to_julia_eltype(HDF5Datatype(super_id))}
20081988
elseif class_id == H5T_COMPOUND
2009-
N = Int(h5t_get_nmembers(objtype.id))
2010-
# check if should be interpreted as complex
2011-
if COMPLEX_SUPPORT[] && N == 2
2012-
membernames = ntuple(N) do i
2013-
h5t_get_member_name(objtype.id, i-1)
2014-
end
2015-
membertypes = ntuple(N) do i
2016-
hdf5_to_julia_eltype(HDF5Datatype(h5t_get_member_type(objtype.id, i-1)))
1989+
N = h5t_get_nmembers(objtype.id)
1990+
1991+
membernames = ntuple(N) do i
1992+
h5t_get_member_name(objtype.id, i-1)
1993+
end
1994+
1995+
membertypes = ntuple(N) do i
1996+
dtype = HDF5Datatype(h5t_get_member_type(objtype.id, i-1))
1997+
ci = h5t_get_class(dtype.id)
1998+
1999+
if ci == H5T_STRING
2000+
if h5t_is_variable_str(dtype.id)
2001+
return Cstring
2002+
else
2003+
n = h5t_get_size(dtype.id)
2004+
return FixedString{Int(n)}
2005+
end
2006+
elseif ci == H5T_VLEN
2007+
superid = h5t_get_super(dtype.id)
2008+
T = VariableArray{hdf5_to_julia_eltype(HDF5Datatype(superid))}
2009+
else
2010+
return hdf5_to_julia_eltype(dtype)
20172011
end
2018-
iscomplex = (membernames == COMPLEX_FIELD_NAMES[]) && (membertypes[1] == membertypes[2]) && (membertypes[1] <: HDF5.HDF5Scalar)
2019-
T = iscomplex ? Complex{membertypes[1]} : HDF5Compound{N}
2012+
end
2013+
2014+
# check if should be interpreted as complex
2015+
iscomplex = COMPLEX_SUPPORT[] &&
2016+
N == 2 &&
2017+
(membernames == COMPLEX_FIELD_NAMES[]) &&
2018+
(membertypes[1] == membertypes[2]) &&
2019+
(membertypes[1] <: HDF5.HDF5Scalar)
2020+
2021+
if iscomplex
2022+
T = Complex{membertypes[1]}
20202023
else
2021-
T = HDF5Compound{N}
2024+
T = NamedTuple{Symbol.(membernames), Tuple{membertypes...}}
20222025
end
20232026
elseif class_id == H5T_ARRAY
20242027
T = hdf5array(objtype)
@@ -2423,7 +2426,7 @@ function hdf5array(objtype)
24232426
eltyp = HDF5Datatype(h5t_get_super(objtype.id))
24242427
T = hdf5_to_julia_eltype(eltyp)
24252428
dimsizes = ntuple(i -> Int(dims[nd-i+1]), nd) # reverse order
2426-
FixedArray{T, dimsizes}
2429+
FixedArray{T, dimsizes, prod(dimsizes)}
24272430
end
24282431

24292432
### Property manipulation ###

test/compound.jl

Lines changed: 95 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,95 @@
1+
using Random, Test, HDF5
2+
3+
import HDF5.datatype
4+
import Base.unsafe_convert
5+
6+
struct foo
7+
a::Float64
8+
b::String
9+
c::String
10+
d::Array{ComplexF64,2}
11+
e::Array{Int64,1}
12+
end
13+
14+
struct foo_hdf5
15+
a::Float64
16+
b::Cstring
17+
c::NTuple{10, Cchar}
18+
d::NTuple{9, ComplexF64}
19+
e::HDF5.Hvl_t
20+
end
21+
22+
function unsafe_convert(::Type{foo_hdf5}, x::foo)
23+
foo_hdf5(x.a,
24+
Base.unsafe_convert(Cstring, x.b),
25+
ntuple(i -> x.c[i], length(x.c)),
26+
ntuple(i -> x.d[i], length(x.d)),
27+
HDF5.Hvl_t(length(x.e), pointer(x.e))
28+
)
29+
end
30+
31+
function datatype(::Type{foo_hdf5})
32+
dtype = HDF5.h5t_create(HDF5.H5T_COMPOUND, sizeof(foo_hdf5))
33+
HDF5.h5t_insert(dtype, "a", fieldoffset(foo_hdf5, 1), datatype(Float64))
34+
35+
vlenstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1)
36+
HDF5.h5t_set_size(vlenstr_dtype, HDF5.H5T_VARIABLE)
37+
HDF5.h5t_set_cset(vlenstr_dtype, HDF5.H5T_CSET_UTF8)
38+
HDF5.h5t_insert(dtype, "b", fieldoffset(foo_hdf5, 2), vlenstr_dtype)
39+
40+
fixedstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1)
41+
HDF5.h5t_set_size(fixedstr_dtype, 10 * sizeof(Cchar))
42+
HDF5.h5t_set_cset(fixedstr_dtype, HDF5.H5T_CSET_UTF8)
43+
HDF5.h5t_insert(dtype, "c", fieldoffset(foo_hdf5, 3), fixedstr_dtype)
44+
45+
hsz = HDF5.Hsize[3,3]
46+
array_dtype = HDF5.h5t_array_create(datatype(ComplexF64).id, 2, hsz)
47+
HDF5.h5t_insert(dtype, "d", fieldoffset(foo_hdf5, 4), array_dtype)
48+
49+
vlen_dtype = HDF5.h5t_vlen_create(datatype(Int64))
50+
HDF5.h5t_insert(dtype, "e", fieldoffset(foo_hdf5, 5), vlen_dtype)
51+
52+
HDF5Datatype(dtype)
53+
end
54+
55+
@testset "compound" begin
56+
N = 10
57+
v = [foo(rand(),
58+
randstring(rand(10:100)),
59+
randstring(10),
60+
rand(ComplexF64, 3,3),
61+
rand(1:10, rand(10:100))
62+
)
63+
for _ in 1:N]
64+
v_write = unsafe_convert.(foo_hdf5, v)
65+
66+
fn = tempname()
67+
h5open(fn, "w") do h5f
68+
dtype = datatype(foo_hdf5)
69+
space = dataspace(v_write)
70+
dset = HDF5.h5d_create(h5f.id, "data", dtype.id, space.id)
71+
HDF5.h5d_write(dset, dtype.id, v_write)
72+
end
73+
74+
v_read = h5read(fn, "data")
75+
for field in (:a, :b, :c, :d, :e)
76+
f = x -> getfield(x, field)
77+
@test f.(v) == f.(v_read)
78+
end
79+
80+
T = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, Cstring}}
81+
TT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, T}}
82+
TTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TT}}
83+
TTTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TTT}}
84+
85+
@test HDF5.do_reclaim(TTTT) == true
86+
@test HDF5.do_normalize(TTTT) == true
87+
88+
T = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, HDF5.FixedArray}}
89+
TT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, T}}
90+
TTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TT}}
91+
TTTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TTT}}
92+
93+
@test HDF5.do_reclaim(TTTT) == false
94+
@test HDF5.do_normalize(TTTT) == true
95+
end

test/plain.jl

Lines changed: 4 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -320,12 +320,7 @@ rm(tmpdir, recursive=true)
320320
test_files = joinpath(@__DIR__, "test_files")
321321

322322
d = h5read(joinpath(test_files, "compound.h5"), "/data")
323-
@test typeof(d[1]) === HDF5.HDF5Compound{4}
324-
@test length(d) == 2
325-
dtypes = [typeof(x) for x in d[1].data]
326-
@test dtypes == [Float64, Vector{Float64}, Vector{Float64}, Float64]
327-
@test length(d[1].data[2]) == 3
328-
@test d[1].membername == ("wgt", "xyz", "uvw", "E")
323+
@test typeof(d[1]) == NamedTuple{(:wgt, :xyz, :uvw, :E), Tuple{Float64, Array{Float64, 1}, Array{Float64, 1}, Float64}}
329324

330325
# get-datasets
331326
fn = tempname()
@@ -450,12 +445,12 @@ end # testset plain
450445

451446
HDF5.disable_complex_support()
452447
z = read(fr, "ComplexF64")
453-
@test isa(z, HDF5.HDF5Compound{2})
448+
@test isa(z, NamedTuple{(:r, :i), Tuple{Float64, Float64}})
454449

455450
Acmplx32 = read(fr, "Acmplx32")
456-
@test eltype(Acmplx32) == HDF5.HDF5Compound{2}
451+
@test eltype(Acmplx32) == NamedTuple{(:r, :i), Tuple{Float32, Float32}}
457452
Acmplx64 = read(fr, "Acmplx64")
458-
@test eltype(Acmplx64) == HDF5.HDF5Compound{2}
453+
@test eltype(Acmplx64) == NamedTuple{(:r, :i), Tuple{Float64, Float64}}
459454

460455
close(fr)
461456

test/runtests.jl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ using Pkg
55
println("HDF5 version ", HDF5.h5_get_libversion())
66

77
include("plain.jl")
8+
include("compound.jl")
89
include("readremote.jl")
910
include("extend_test.jl")
1011
include("gc.jl")

0 commit comments

Comments
 (0)