Skip to content

Commit 9d6f678

Browse files
yohjimane and claude committed
[xrAnimation] Implement true GPU instancing for bone rendering
Replaced CPU-side per-bone geometry generation with proper GPU instancing:

**Before:**
- Generated ~37,000+ vertices CPU-side every frame
- 47 bones × (24 octahedron verts + 2 spheres × 384 verts each)
- Heavy CPU load, inefficient memory transfers

**After (True GPU Instancing):**
- Generate unit geometry ONCE at initialization:
  - Unit octahedron: 24 vertices (generated once)
  - Unit sphere: ~384 vertices (generated once)
- Per-frame: Only upload 47 transform matrices + colors (InstanceData)
- GPU draws: 2 draw calls total (one for octahedrons, one for spheres)

**Performance Improvement:**
- Vertex data uploaded: ~37,000 verts → 408 verts (unit geometry)
- Per-frame uploads: 47 × 80 bytes = 3,760 bytes (transforms + colors only)
- Draw calls: Same bone shape rendered 47 times with ONE draw call

**Implementation Details:**
1. **New Shaders** (src/xrAnimation/tools/renderer/shaders/bone_instanced.{vert,frag}):
   - Per-vertex: position + normal (unit geometry)
   - Per-instance: mat4 transform + vec4 color
2. **Unit Geometry Generation** (GenerateUnitOctahedron/GenerateUnitSphere):
   - Called once in Initialize()
   - Stored in GPU buffers permanently
3. **Instance Buffer Management**:
   - Uploads per-instance transforms + colors each frame
   - Combined buffer for bones and spheres (drawn with different firstInstance)
4. **Instanced Draw Calls**:
   - vkCmdDraw(cmd, unit_octahedron_vertex_count, bone_count, 0, 0)
   - vkCmdDraw(cmd, unit_sphere_vertex_count, sphere_count, 0, bone_count)

**Files Changed:**
- src/xrAnimation/tools/renderer/DebugRenderer.{h,cpp}: GPU instancing implementation
- src/xrAnimation/tools/renderer/shaders/bone_instanced.{vert,frag}: Instanced shaders
- src/xrAnimation/tools/CMakeLists.txt: Added shader compilation for bone_instanced

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
1 parent 1eb0d44 commit 9d6f678

12 files changed

+743
-29
lines changed

src/xrAnimation/AnimationECS_Components.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,9 @@ struct SkeletonMetadata
251251
// Skeleton hierarchy (parent indices, -1 for roots)
252252
std::vector<int> joint_parents;
253253

254+
// Pre-computed children per joint (for O(1) child lookup instead of O(N) linear search)
255+
std::vector<std::vector<int>> joint_children;
256+
254257
// Extended bone metadata (physics shapes, IK constraints, rest lengths, etc.)
255258
XRay::Animation::ExtendedBoneMetadataCollection metadata;
256259

@@ -265,9 +268,26 @@ struct SkeletonMetadata
265268
void Clear()
266269
{
267270
joint_parents.clear();
271+
joint_children.clear();
268272
metadata.clear();
269273
skeleton = nullptr;
270274
}
275+
276+
// Build the joint_children map from joint_parents (call after populating joint_parents). Joints whose parent index is negative (roots) or out of range are not added to any child list.
277+
void BuildChildrenMap()
278+
{
279+
joint_children.clear();
280+
joint_children.resize(joint_parents.size());
281+
282+
for (size_t i = 0; i < joint_parents.size(); ++i)
283+
{
284+
int parent = joint_parents[i];
285+
if (parent >= 0 && static_cast<size_t>(parent) < joint_children.size())
286+
{
287+
joint_children[parent].push_back(static_cast<int>(i));
288+
}
289+
}
290+
}
271291
};
272292

273293
} // namespace AnimationECS

src/xrAnimation/AnimationECS_DebugRender.cpp

Lines changed: 56 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -47,19 +47,17 @@ void SkeletonDebugRenderSystem::Render(entt::registry& registry, IDebugDrawConte
4747
buffers.models.begin(), buffers.models.end());
4848

4949
// Render this skeleton instance
50-
RenderSkeleton(ctx, pose_models, metadata.joint_parents,
51-
inst_transform.world_transform, debug_state,
52-
metadata.metadata);
50+
RenderSkeleton(ctx, pose_models, metadata,
51+
inst_transform.world_transform, debug_state);
5352
}
5453
}
5554

5655
void SkeletonDebugRenderSystem::RenderSkeleton(
5756
IDebugDrawContext& ctx,
5857
const std::vector<ozz::math::Float4x4>& pose_models,
59-
const std::vector<int>& skeleton_parents,
58+
const SkeletonMetadata& skeleton_metadata,
6059
const ozz::math::Float4x4& instance_transform,
61-
const SkeletonDebugState& debug_state,
62-
const XRay::Animation::ExtendedBoneMetadataCollection& bone_metadata)
60+
const SkeletonDebugState& debug_state)
6361
{
6462
if (pose_models.empty())
6563
return;
@@ -70,6 +68,11 @@ void SkeletonDebugRenderSystem::RenderSkeleton(
7068
const ozz::math::Float4 joint_color{0.92f, 0.35f, 0.35f, 0.65f}; // Red-orange
7169
const ozz::math::Float4 link_color{0.65f, 0.65f, 0.95f, 0.55f}; // Purple-blue
7270

71+
// Get references for easier access
72+
const auto& skeleton_parents = skeleton_metadata.joint_parents;
73+
const auto& joint_children = skeleton_metadata.joint_children;
74+
const auto& bone_metadata = skeleton_metadata.metadata;
75+
7376
// Lambda to compute rest length as fallback
7477
auto compute_rest_length = [&](int bone_index) -> float
7578
{
@@ -90,10 +93,10 @@ void SkeletonDebugRenderSystem::RenderSkeleton(
9093
if (result > kSkeletonDebugEpsilon)
9194
return result;
9295

93-
// Try child distance
94-
for (size_t child = 0; child < skeleton_parents.size(); ++child)
96+
// Try child distance using pre-computed children (visits only this bone's children instead of scanning all N joints)
97+
if (bone_index < static_cast<int>(joint_children.size()))
9598
{
96-
if (skeleton_parents[child] == bone_index)
99+
for (int child : joint_children[bone_index])
97100
{
98101
result = std::max(result, DistanceBetween(
99102
pose_models[bone_index], pose_models[child]));
@@ -103,7 +106,17 @@ void SkeletonDebugRenderSystem::RenderSkeleton(
103106
return result;
104107
};
105108

106-
// Render each bone
109+
// Collect all bone instances for batched rendering
110+
std::vector<IDebugDrawContext::BoneInstance> bone_instances;
111+
bone_instances.reserve(pose_models.size());
112+
113+
std::vector<IDebugDrawContext::SphereInstance> sphere_instances;
114+
if (debug_state.show_joint_positions)
115+
{
116+
sphere_instances.reserve(pose_models.size() * 2); // head + tail per bone
117+
}
118+
119+
// First pass: collect all instances and draw lines
107120
for (size_t bone = 0; bone < pose_models.size(); ++bone)
108121
{
109122
const ozz::math::Float4x4& pose_transform = pose_models[bone];
@@ -147,17 +160,16 @@ void SkeletonDebugRenderSystem::RenderSkeleton(
147160
ozz::math::Float3 tail_position = bone_position;
148161
bool has_child = false;
149162

150-
// Find first child
151-
for (size_t child = 0; child < skeleton_parents.size(); ++child)
163+
// Find first child using pre-computed children map (O(1) instead of O(N))
164+
if (bone < joint_children.size())
152165
{
153-
if (skeleton_parents[child] == static_cast<int>(bone) &&
154-
child < pose_models.size())
166+
const auto& children = joint_children[bone];
167+
if (!children.empty() && static_cast<size_t>(children[0]) < pose_models.size())
155168
{
156169
const ozz::math::Float4x4 child_world =
157-
instance_transform * pose_models[child];
170+
instance_transform * pose_models[children[0]];
158171
tail_position = ExtractTranslation(child_world);
159172
has_child = true;
160-
break;
161173
}
162174
}
163175

@@ -208,15 +220,26 @@ void SkeletonDebugRenderSystem::RenderSkeleton(
208220
kSkeletonDefaultRadius * 0.4f,
209221
bone_length * 0.45f);
210222

211-
// Draw bone shape
212-
const ozz::math::Float4 draw_color = (bone == 0) ? root_color : bone_color;
213-
ctx.DrawBoneShape(bone_position, tail_position, joint_radius, draw_color);
223+
// Collect bone instance for batched rendering
224+
IDebugDrawContext::BoneInstance bone_inst;
225+
bone_inst.head = bone_position;
226+
bone_inst.tail = tail_position;
227+
bone_inst.radius = joint_radius;
228+
bone_inst.color = (bone == 0) ? root_color : bone_color;
229+
bone_instances.push_back(bone_inst);
214230

215-
// Draw joint spheres
231+
// Collect joint sphere instances if enabled
216232
if (debug_state.show_joint_positions)
217233
{
218-
ctx.DrawSphere(bone_position, joint_radius, joint_color, 24);
219-
ctx.DrawSphere(tail_position, joint_radius * 0.6f, joint_color, 24);
234+
IDebugDrawContext::SphereInstance sphere_inst;
235+
sphere_inst.center = bone_position;
236+
sphere_inst.radius = joint_radius;
237+
sphere_inst.color = joint_color;
238+
sphere_instances.push_back(sphere_inst);
239+
240+
sphere_inst.center = tail_position;
241+
sphere_inst.radius = joint_radius * 0.6f;
242+
sphere_instances.push_back(sphere_inst);
220243
}
221244

222245
// Draw axes at root bone
@@ -231,6 +254,17 @@ void SkeletonDebugRenderSystem::RenderSkeleton(
231254
ozz::math::Float4{0.3f, 0.6f, 1.0f, 1.0f}); // Blue Z
232255
}
233256
}
257+
258+
// Second pass: render all collected instances in batches
259+
if (!bone_instances.empty())
260+
{
261+
ctx.DrawBoneShapesInstanced(bone_instances.data(), bone_instances.size());
262+
}
263+
264+
if (!sphere_instances.empty())
265+
{
266+
ctx.DrawSpheresInstanced(sphere_instances.data(), sphere_instances.size());
267+
}
234268
}
235269

236270
float SkeletonDebugRenderSystem::ComputeRestLength(

src/xrAnimation/AnimationECS_DebugRender.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,17 +73,15 @@ class SkeletonDebugRenderSystem
7373
/// </summary>
7474
/// <param name="ctx">Debug draw context</param>
7575
/// <param name="pose_models">Model-space transforms for each joint (current pose)</param>
76-
/// <param name="skeleton_parents">Parent indices for each joint (-1 for roots)</param>
76+
/// <param name="skeleton_metadata">Skeleton hierarchy, children map, and extended metadata</param>
7777
/// <param name="instance_transform">World transform for this instance</param>
7878
/// <param name="debug_state">Visualization settings</param>
79-
/// <param name="bone_metadata">Extended metadata collection</param>
8079
static void RenderSkeleton(
8180
IDebugDrawContext& ctx,
8281
const std::vector<ozz::math::Float4x4>& pose_models,
83-
const std::vector<int>& skeleton_parents,
82+
const SkeletonMetadata& skeleton_metadata,
8483
const ozz::math::Float4x4& instance_transform,
85-
const SkeletonDebugState& debug_state,
86-
const XRay::Animation::ExtendedBoneMetadataCollection& bone_metadata);
84+
const SkeletonDebugState& debug_state);
8785

8886
/// <summary>
8987
/// Compute rest length for a bone (distance to parent or child).

src/xrAnimation/IDebugDrawContext.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,53 @@ class IDebugDrawContext
119119
const ozz::math::Float4& color_x,
120120
const ozz::math::Float4& color_y,
121121
const ozz::math::Float4& color_z) = 0;
122+
123+
// ========================================================================
124+
// BATCHED/INSTANCED RENDERING API
125+
// ========================================================================
126+
127+
/// <summary>
128+
/// Instance data for bone shape rendering.
129+
/// </summary>
130+
struct BoneInstance {
131+
ozz::math::Float3 head; // Bone start position (joint/parent)
132+
ozz::math::Float3 tail; // Bone end position (child)
133+
float radius; // Bone thickness
134+
ozz::math::Float4 color; // RGBA color
135+
};
136+
137+
/// <summary>
138+
/// Instance data for sphere rendering.
139+
/// </summary>
140+
struct SphereInstance {
141+
ozz::math::Float3 center; // Sphere center position
142+
float radius; // Sphere radius
143+
ozz::math::Float4 color; // RGBA color
144+
};
145+
146+
/// <summary>
147+
/// Draw multiple bone shapes in a single batched call.
148+
/// </summary>
149+
/// <param name="instances">Array of bone instances to render</param>
150+
/// <param name="count">Number of instances in the array</param>
151+
/// <remarks>
152+
/// Performance optimization: Reduces per-bone CPU overhead and consolidates
153+
/// geometry generation and draw calls. Implementations should batch all bones
154+
/// into a single vertex buffer and issue minimal draw calls.
155+
/// </remarks>
156+
virtual void DrawBoneShapesInstanced(const BoneInstance* instances, size_t count) = 0;
157+
158+
/// <summary>
159+
/// Draw multiple spheres in a single batched call.
160+
/// </summary>
161+
/// <param name="instances">Array of sphere instances to render</param>
162+
/// <param name="count">Number of instances in the array</param>
163+
/// <remarks>
164+
/// Performance optimization: Reduces per-sphere CPU overhead and consolidates
165+
/// geometry generation and draw calls. Implementations should batch all spheres
166+
/// into a single vertex buffer and issue minimal draw calls.
167+
/// </remarks>
168+
virtual void DrawSpheresInstanced(const SphereInstance* instances, size_t count) = 0;
122169
};
123170

124171
} // namespace AnimationECS

src/xrAnimation/tools/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,10 @@ if(_ozz_animation_viewer_supported AND Vulkan_FOUND)
203203
add_shader(ozz_animation_viewer shaded_bone.vert)
204204
add_shader(ozz_animation_viewer shaded_bone.frag)
205205

206+
# GPU instanced bone shaders
207+
add_shader(ozz_animation_viewer bone_instanced.vert)
208+
add_shader(ozz_animation_viewer bone_instanced.frag)
209+
206210
# Skinned mesh instanced shaders
207211
add_shader(ozz_animation_viewer skinned_mesh_instanced.vert)
208212
add_shader(ozz_animation_viewer skinned_mesh_instanced.frag)

src/xrAnimation/tools/ozz_animation_viewer.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2408,6 +2408,9 @@ void InitializeECSInstances(ViewerState& state, VulkanRenderer& renderer) {
24082408
skeleton_metadata.joint_parents[j] = static_cast<int>(parents[j]);
24092409
}
24102410

2411+
// Build pre-computed children map for O(1) child lookups
2412+
skeleton_metadata.BuildChildrenMap();
2413+
24112414
// Store skeleton pointer and metadata
24122415
skeleton_metadata.skeleton = &state.skeleton;
24132416

0 commit comments

Comments
 (0)