Open
Description
As with recent work on, e.g., issue #135, we were looking at whether there is any performance being left on the table with the ACSets set/get API.
Here, I looked at what's known in the docs as the "chaining" or "composition" syntax. I trimmed down two versions of an operation used in CombinatorialSpaces.jl. One loops over some data using the composed syntax (`loop_contracted!`), and the other uses the decomposed syntax (`loop_expanded!`). All allocations are optimized away by the compiler when using the decomposed syntax, but the same is not done for the composed case. I think that recent PRs will give us a good starting-off point.
julia> function loop_expanded!(buffer::Vector{Point3D}, sd::HasDeltaSet2D, ::Type{point_type}) where point_type
# Decomposed form: for every part `t` of :DualTri, perform three nested `sd[...]`
# lookups, innermost first (:D_∂e1, then :D_∂v1, then :dual_point), and store the
# result in `buffer[t]`.
# The `point_type` type parameter is accepted but not used in the body.
# @inbounds: assumes `buffer` has a slot for every :DualTri part — TODO confirm at call site.
@inbounds for t in parts(sd, :DualTri)
buffer[t] = sd[sd[sd[t, :D_∂e1], :D_∂v1], :dual_point]
end
end;
julia> function loop_contracted!(buffer::Vector{Point3D}, sd::HasDeltaSet2D, ::Type{point_type}) where point_type
# Composed form: for every part `t` of :DualTri, perform a single `sd[t, [...]]`
# lookup whose second index is a vector literal of the three attribute names
# (:D_∂e1, :D_∂v1, :dual_point), and store the result in `buffer[t]`.
# The `point_type` type parameter is accepted but not used in the body.
# NOTE(review): the vector literal sits inside the loop body; it may be one source of
# the allocations reported for this version — unverified.
# @inbounds: assumes `buffer` has a slot for every :DualTri part — TODO confirm at call site.
@inbounds for t in parts(sd, :DualTri)
buffer[t] = sd[t, [:D_∂e1, :D_∂v1, :dual_point]]
end
end;
# Single lookup at part 1, decomposed syntax.
# NOTE(review): `sd` is not `$`-interpolated in the two single-lookup calls; if it is a
# non-const global (presumably, at the REPL), part of the reported allocations may come
# from global access rather than from the lookup itself — confirm by re-running with `$sd`.
julia> @btime sd[sd[sd[1, :D_∂e1], :D_∂v1], :dual_point];
1.146 μs (9 allocations: 1.78 KiB)
# Single lookup at part 1, composed syntax.
julia> @btime sd[1, [:D_∂e1, :D_∂v1, :dual_point]];
1.240 μs (12 allocations: 3.08 KiB)
# Full loop over :DualTri, decomposed syntax; buffer is sized with nparts(sd, :DualTri).
julia> buffer = Vector{Point3D}(undef, nparts(sd, :DualTri));
julia> @btime loop_expanded!(buffer, sd, Point3{Float64});
4.073 ms (0 allocations: 0 bytes)
# Full loop over :DualTri, composed syntax, with a freshly constructed buffer.
julia> buffer = Vector{Point3D}(undef, nparts(sd, :DualTri));
julia> @btime loop_contracted!(buffer, sd, Point3{Float64});
892.627 ms (7669178 allocations: 1.44 GiB)