-
Notifications
You must be signed in to change notification settings - Fork 142
improve performance/ergonomics of reading compound datatypes #592
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
82a6ffc
349eaef
31ab8d2
4e3f6c9
dfe1667
d52ea72
8f14a29
733f4d0
6c8bc40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -465,13 +465,6 @@ end | |||||
==(a::HDF5ReferenceObj, b::HDF5ReferenceObj) = a.r == b.r | ||||||
hash(x::HDF5ReferenceObj, h::UInt) = hash(x.r, h) | ||||||
|
||||||
# Compound types | ||||||
struct HDF5Compound{N} | ||||||
data::NTuple{N,Any} | ||||||
membername::NTuple{N,String} | ||||||
membertype::NTuple{N,Type} | ||||||
end | ||||||
|
||||||
# Opaque types | ||||||
struct HDF5Opaque | ||||||
data | ||||||
|
@@ -482,9 +475,22 @@ end | |||||
struct EmptyArray{T} end | ||||||
|
||||||
# Stub types to encode fixed-size arrays for H5T_ARRAY | ||||||
struct FixedArray{T,D} end | ||||||
size(::Type{FixedArray{T,D}}) where {T,D} = D | ||||||
eltype(::Type{FixedArray{T,D}}) where {T,D} = T | ||||||
struct FixedArray{T,D,L} | ||||||
data::NTuple{L, T} | ||||||
end | ||||||
size(::Type{FixedArray{T,D,L}}) where {T,D,L} = D | ||||||
eltype(::Type{FixedArray{T,D,L}}) where {T,D,L} = T | ||||||
|
||||||
struct FixedString{N} | ||||||
data::NTuple{N, Cchar} | ||||||
end | ||||||
length(::Type{FixedString{N}}) where N = N | ||||||
|
||||||
struct VariableArray{T} | ||||||
len::Csize_t | ||||||
p::Ptr{Cvoid} | ||||||
end | ||||||
eltype(::Type{VariableArray{T}}) where T = T | ||||||
|
||||||
# VLEN objects | ||||||
struct HDF5Vlen{T} | ||||||
|
@@ -1459,73 +1465,70 @@ function getindex(parent::Union{HDF5File, HDF5Group, HDF5Dataset}, r::HDF5Refere | |||||
h5object(obj_id, parent) | ||||||
end | ||||||
|
||||||
# Helper for reading compound types | ||||||
function read_row(io::IO, membertype, membersize) | ||||||
row = Any[] | ||||||
for (dtype, dsize) in zip(membertype, membersize) | ||||||
if dtype === String | ||||||
push!(row, unpad(read!(io, Vector{UInt8}(undef,dsize)), H5T_STR_NULLPAD)) | ||||||
elseif dtype<:HDF5.FixedArray && eltype(dtype)<:HDF5BitsKind | ||||||
val = read!(io, Vector{eltype(dtype)}(undef,prod(size(dtype)))) | ||||||
push!(row, reshape(val, size(dtype))) | ||||||
elseif dtype<:HDF5BitsKind | ||||||
push!(row, read(io, dtype)) | ||||||
else | ||||||
# for other types, just store the raw bytes and let the user | ||||||
# decide what to do | ||||||
push!(row, read!(io, Vector{UInt8}(undef,dsize))) | ||||||
end | ||||||
# convert Cstring/FixedString to String | ||||||
function normalize_types(x::NamedTuple{T}) where T | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where was this normalization obtained from? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this was taken from the current handling of strings/vlen types. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. except that for vlen arrays I use a non-owning unsafe wrap and then make a copy. I think this is the right thing to do since hdf5 wants to manage its own memory. |
||||||
|
||||||
vals = map(values(x)) do xi | ||||||
Ti = typeof(xi) | ||||||
if Ti == Cstring | ||||||
unsafe_string(xi) | ||||||
elseif Ti <: FixedString | ||||||
join(Char.(xi.data)) | ||||||
elseif Ti <: FixedArray | ||||||
reshape(collect(xi.data), size(Ti)...) | ||||||
elseif Ti <: VariableArray | ||||||
copy(unsafe_wrap(Array, convert(Ptr{eltype(xi)}, xi.p), xi.len, own=false)) | ||||||
elseif Ti <: NamedTuple | ||||||
normalize_types(xi) | ||||||
else | ||||||
xi | ||||||
end | ||||||
return (row...,) | ||||||
end | ||||||
|
||||||
NamedTuple{T}(vals) | ||||||
end | ||||||
|
||||||
# Read compound type | ||||||
function read(obj::HDF5Dataset, T::Union{Type{Array{HDF5Compound{N}}},Type{HDF5Compound{N}}}) where {N} | ||||||
t = datatype(obj) | ||||||
local sz = 0; local n; | ||||||
local membername; local membertype; | ||||||
local memberoffset; local memberfiletype; local membersize; | ||||||
try | ||||||
memberfiletype = Vector{HDF5Datatype}(undef,N) | ||||||
membertype = Vector{Type}(undef,N) | ||||||
membername = Vector{String}(undef,N) | ||||||
memberoffset = Vector{UInt64}(undef,N) | ||||||
membersize = Vector{UInt32}(undef,N) | ||||||
for i = 1:N | ||||||
filetype = HDF5Datatype(h5t_get_member_type(t.id, i-1)) | ||||||
memberfiletype[i] = filetype | ||||||
membertype[i] = hdf5_to_julia_eltype(filetype) | ||||||
memberoffset[i] = sz | ||||||
membersize[i] = sizeof(filetype) | ||||||
sz += sizeof(filetype) | ||||||
membername[i] = h5t_get_member_name(t.id, i-1) | ||||||
end | ||||||
finally | ||||||
close(t) | ||||||
end | ||||||
# Build the "memory type" | ||||||
memtype_id = h5t_create(H5T_COMPOUND, sz) | ||||||
for i = 1:N | ||||||
h5t_insert(memtype_id, membername[i], memberoffset[i], memberfiletype[i].id) # FIXME strings | ||||||
end | ||||||
# Read the raw data | ||||||
buf = Vector{UInt8}(undef,length(obj)*sz) | ||||||
h5d_read(obj.id, memtype_id, H5S_ALL, H5S_ALL, obj.xfer, buf) | ||||||
|
||||||
# Convert to the appropriate data format using iobuffer | ||||||
iobuff = IOBuffer(buf) | ||||||
data = Any[] | ||||||
while !eof(iobuff) | ||||||
push!(data, read_row(iobuff, membertype, membersize)) | ||||||
end | ||||||
# convert HDF5Compound type parameters to tuples | ||||||
membername = (membername...,) | ||||||
membertype = (membertype...,) | ||||||
if T === HDF5Compound{N} | ||||||
return HDF5Compound(data[1], membername, membertype) | ||||||
# get a vector of all the leaf types in a (possibly nested) named tuple | ||||||
function get_all_types(::Type{NamedTuple{T, U}}) where T where U | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm guessing there's a better way to do this that I can't currently think of, but seems fine for now.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems faster / better function get_all_types1(::Type{NamedTuple})
types = DataType[]
for Ui in U.types
if Ui <: NamedTuple
append!(types, get_all_types(Ui))
else
push!(types, Ui)
end
end
return types
end There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe this would be better/more julian?
the only reason I collect all the types is to see if I need to call reclaim or do an extra normalization step to turn the data into a native julia type. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah that might be simpler. It's certainly in our best favor to make the code run as fast as possible and be as simple as possible to read for maintenance. You can also benchmark performance. |
||||||
types = [] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should probably be
Suggested change
|
||||||
for i in 1:fieldcount(U) | ||||||
Ui = fieldtype(U, i) | ||||||
if Ui <: NamedTuple | ||||||
append!(types, get_all_types(Ui)) | ||||||
else | ||||||
return [HDF5Compound(elem, membername, membertype) for elem in data] | ||||||
push!(types, Ui) | ||||||
end | ||||||
end | ||||||
types | ||||||
end | ||||||
|
||||||
function read(dset::HDF5Dataset, T::Union{Type{Array{U}}, Type{U}}) where U <: NamedTuple | ||||||
filetype = HDF5.datatype(dset) | ||||||
memtype_id = HDF5.h5t_get_native_type(filetype.id) # padded layout in memory | ||||||
@assert sizeof(U) == HDF5.h5t_get_size(memtype_id) "Type sizes mismatch!" | ||||||
|
||||||
buf = Array{U}(undef, size(dset)) | ||||||
|
||||||
HDF5.h5d_read(dset.id, memtype_id, HDF5.H5S_ALL, HDF5.H5S_ALL, HDF5.H5P_DEFAULT, buf) | ||||||
|
||||||
types = get_all_types(U) | ||||||
normalize = any(t -> t <: Union{Cstring, FixedString, FixedArray, VariableArray}, types) | ||||||
out = normalize ? normalize_types.(buf) : buf | ||||||
|
||||||
reclaim = any(t -> t <: Union{Cstring, VariableArray}, types) | ||||||
if reclaim | ||||||
dspace = dataspace(dset) | ||||||
# NOTE I have seen this call fail but I cannot reproduce | ||||||
h5d_vlen_reclaim(memtype_id, dspace.id, H5P_DEFAULT, buf) | ||||||
end | ||||||
|
||||||
HDF5.h5t_close(memtype_id) | ||||||
|
||||||
if T <: NamedTuple | ||||||
return out[1] | ||||||
else | ||||||
return out | ||||||
end | ||||||
end | ||||||
|
||||||
# Read OPAQUE datasets and attributes | ||||||
|
@@ -1575,6 +1578,7 @@ function read(obj::DatasetOrAttribute, ::Type{HDF5Vlen{T}}) where {T<:Union{HDF5 | |||||
for i = 1:len | ||||||
h = structbuf[i] | ||||||
data[i] = p2a(convert(Ptr{T}, h.p), Int(h.len)) | ||||||
|
||||||
kleinhenz marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
end | ||||||
data | ||||||
end | ||||||
|
@@ -2007,18 +2011,41 @@ function hdf5_to_julia_eltype(objtype) | |||||
T = HDF5Vlen{hdf5_to_julia_eltype(HDF5Datatype(super_id))} | ||||||
elseif class_id == H5T_COMPOUND | ||||||
N = Int(h5t_get_nmembers(objtype.id)) | ||||||
kleinhenz marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
# check if should be interpreted as complex | ||||||
if COMPLEX_SUPPORT[] && N == 2 | ||||||
membernames = ntuple(N) do i | ||||||
h5t_get_member_name(objtype.id, i-1) | ||||||
end | ||||||
membertypes = ntuple(N) do i | ||||||
hdf5_to_julia_eltype(HDF5Datatype(h5t_get_member_type(objtype.id, i-1))) | ||||||
|
||||||
membernames = ntuple(N) do i | ||||||
h5t_get_member_name(objtype.id, i-1) | ||||||
end | ||||||
|
||||||
membertypes = ntuple(N) do i | ||||||
dtype = HDF5Datatype(h5t_get_member_type(objtype.id, i-1)) | ||||||
ci = h5t_get_class(dtype.id) | ||||||
|
||||||
if ci == H5T_STRING | ||||||
if h5t_is_variable_str(dtype.id) | ||||||
return Cstring | ||||||
else | ||||||
n = h5t_get_size(dtype.id) | ||||||
return FixedString{Int(n)} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again I'm guessing it's fine not do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one is actually necessary since NTuple requires the first type parameter to be |
||||||
end | ||||||
elseif ci == H5T_VLEN | ||||||
superid = h5t_get_super(dtype.id) | ||||||
T = VariableArray{hdf5_to_julia_eltype(HDF5Datatype(superid))} | ||||||
else | ||||||
return hdf5_to_julia_eltype(dtype) | ||||||
end | ||||||
iscomplex = (membernames == COMPLEX_FIELD_NAMES[]) && (membertypes[1] == membertypes[2]) && (membertypes[1] <: HDF5.HDF5Scalar) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unrelated changes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this part was refactored because now |
||||||
T = iscomplex ? Complex{membertypes[1]} : HDF5Compound{N} | ||||||
end | ||||||
|
||||||
# check if should be interpreted as complex | ||||||
iscomplex = COMPLEX_SUPPORT[] && | ||||||
N == 2 && | ||||||
(membernames == COMPLEX_FIELD_NAMES[]) && | ||||||
(membertypes[1] == membertypes[2]) && | ||||||
(membertypes[1] <: HDF5.HDF5Scalar) | ||||||
|
||||||
if iscomplex | ||||||
T = Complex{membertypes[1]} | ||||||
else | ||||||
T = HDF5Compound{N} | ||||||
T = NamedTuple{Symbol.(membernames), Tuple{membertypes...}} | ||||||
end | ||||||
elseif class_id == H5T_ARRAY | ||||||
T = hdf5array(objtype) | ||||||
|
@@ -2423,7 +2450,7 @@ function hdf5array(objtype) | |||||
eltyp = HDF5Datatype(h5t_get_super(objtype.id)) | ||||||
T = hdf5_to_julia_eltype(eltyp) | ||||||
dimsizes = ntuple(i -> Int(dims[nd-i+1]), nd) # reverse order | ||||||
FixedArray{T, dimsizes} | ||||||
FixedArray{T, dimsizes, prod(dimsizes)} | ||||||
end | ||||||
|
||||||
### Property manipulation ### | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
using Random, Test, HDF5 | ||
|
||
import HDF5.datatype | ||
import Base.unsafe_convert | ||
|
||
struct foo | ||
a::Float64 | ||
b::String | ||
c::String | ||
d::Array{ComplexF64,2} | ||
e::Array{Int64,1} | ||
end | ||
|
||
struct foo_hdf5 | ||
a::Float64 | ||
b::Cstring | ||
c::NTuple{10, Cchar} | ||
d::NTuple{9, ComplexF64} | ||
e::HDF5.Hvl_t | ||
end | ||
|
||
function unsafe_convert(::Type{foo_hdf5}, x::foo) | ||
foo_hdf5(x.a, | ||
Base.unsafe_convert(Cstring, x.b), | ||
ntuple(i -> x.c[i], length(x.c)), | ||
ntuple(i -> x.d[i], length(x.d)), | ||
HDF5.Hvl_t(convert(Csize_t, length(x.e)), convert(Ptr{Cvoid}, pointer(x.e))) | ||
kleinhenz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
end | ||
|
||
function datatype(::Type{foo_hdf5}) | ||
dtype = HDF5.h5t_create(HDF5.H5T_COMPOUND, sizeof(foo_hdf5)) | ||
HDF5.h5t_insert(dtype, "a", fieldoffset(foo_hdf5, 1), datatype(Float64)) | ||
|
||
vlenstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1) | ||
HDF5.h5t_set_size(vlenstr_dtype, HDF5.H5T_VARIABLE) | ||
HDF5.h5t_set_cset(vlenstr_dtype, HDF5.H5T_CSET_UTF8) | ||
HDF5.h5t_insert(dtype, "b", fieldoffset(foo_hdf5, 2), vlenstr_dtype) | ||
|
||
fixedstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1) | ||
HDF5.h5t_set_size(fixedstr_dtype, 10 * sizeof(Cchar)) | ||
HDF5.h5t_set_cset(fixedstr_dtype, HDF5.H5T_CSET_UTF8) | ||
HDF5.h5t_insert(dtype, "c", fieldoffset(foo_hdf5, 3), fixedstr_dtype) | ||
|
||
hsz = HDF5.Hsize[3,3] | ||
musm marked this conversation as resolved.
Show resolved
Hide resolved
|
||
array_dtype = HDF5.h5t_array_create(datatype(ComplexF64).id, 2, hsz) | ||
HDF5.h5t_insert(dtype, "d", fieldoffset(foo_hdf5, 4), array_dtype) | ||
|
||
vlen_dtype = HDF5.h5t_vlen_create(datatype(Int64)) | ||
HDF5.h5t_insert(dtype, "e", fieldoffset(foo_hdf5, 5), vlen_dtype) | ||
|
||
HDF5Datatype(dtype) | ||
end | ||
|
||
@testset "compound" begin | ||
N = 10 | ||
v = [foo(rand(), | ||
randstring(rand(10:100)), | ||
randstring(10), | ||
rand(ComplexF64, 3,3), | ||
rand(1:10, rand(10:100)) | ||
) | ||
for _ in 1:N] | ||
v_write = unsafe_convert.(foo_hdf5, v) | ||
|
||
fn = tempname() | ||
h5open(fn, "w") do h5f | ||
dtype = datatype(foo_hdf5) | ||
space = dataspace(v_write) | ||
dset = HDF5.h5d_create(h5f.id, "data", dtype.id, space.id) | ||
HDF5.h5d_write(dset, dtype.id, v_write) | ||
end | ||
|
||
v_read = h5read(fn, "data") | ||
for field in (:a, :b, :c, :d, :e) | ||
f = x -> getfield(x, field) | ||
@test f.(v) == f.(v_read) | ||
end | ||
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how / where's this field being used ?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is used so that the julia type you construct for an
H5T_ARRAY
is memory compatible and can be directly written into which is necessary for compound datatypes withH5T_ARRAY
members.