Skip to content

Commit

Permalink
blur shader improvements, use bilinear sampling for 2x fewer samples
Browse files Browse the repository at this point in the history
  • Loading branch information
mmozeiko authored and ryanfleury committed Jan 23, 2024
1 parent a77d457 commit 1705fdd
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 155 deletions.
73 changes: 38 additions & 35 deletions src/render/d3d11/generated/render_d3d11.meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,14 @@ str8_lit_comp(
"cbuffer Globals : register(b0)\n"
"{\n"
" float4 rect;\n"
" float2 viewport_size;\n"
" float blur_size;\n"
" float is_vertical;\n"
" float4 corner_radii_px;\n"
" float2 direction;\n"
" float2 viewport_size;\n"
" uint blur_count;\n"
"}\n"
"\n"
"cbuffer Kernel : register(b1)\n"
"{\n"
" float4 kernel[32];\n"
"}\n"
"\n"
Expand All @@ -221,7 +225,7 @@ str8_lit_comp(
"{\n"
" float4 position : SV_POSITION;\n"
" float2 texcoord : TEX;\n"
" float2 cornercoord : CRN;\n"
" float2 sdf_sample_pos : SDF;\n"
" float corner_radius : RAD;\n"
"};\n"
"\n"
Expand All @@ -238,12 +242,12 @@ str8_lit_comp(
"Vertex2Pixel\n"
"vs_main(CPU2Vertex c2v)\n"
"{\n"
" float4 vertex_positions__scrn[] =\n"
" float2 vertex_positions__scrn[] =\n"
" {\n"
" float4(rect.x, rect.w, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n"
" float4(rect.x, rect.y, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n"
" float4(rect.z, rect.w, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n"
" float4(rect.z, rect.y, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n"
" rect.xw,\n"
" rect.xy,\n"
" rect.zw,\n"
" rect.zy,\n"
" };\n"
" float corner_radii__px[] =\n"
" {\n"
Expand All @@ -252,22 +256,20 @@ str8_lit_comp(
" corner_radii_px.w,\n"
" corner_radii_px.z,\n"
" };\n"
" float2 cornercoords__pct[] =\n"
" {\n"
" float2(0, 1),\n"
" float2(0, 0),\n"
" float2(1, 1),\n"
" float2(1, 0),\n"
" };\n"
" float4 vertex_position__scrn = vertex_positions__scrn[c2v.vertex_id];\n"
" float4 vertex_position__clip = float4(2*vertex_position__scrn.x/viewport_size.x - 1,\n"
" 2*vertex_position__scrn.y/viewport_size.y - 1,\n"
" 0, 1);\n"
" float2 cornercoords__pct = float2(\n"
" (c2v.vertex_id >> 1) ? 1.f : 0.f,\n"
" (c2v.vertex_id & 1) ? 0.f : 1.f);\n"
"\n"
" float2 vertex_position__pct = vertex_positions__scrn[c2v.vertex_id] / viewport_size;\n"
" float2 vertex_position__scr = 2.f * vertex_position__pct - 1.f;\n"
"\n"
" float2 rect_half_size = float2((rect.z-rect.x)/2, (rect.w-rect.y)/2);\n"
"\n"
" Vertex2Pixel v2p;\n"
" {\n"
" v2p.position = vertex_position__clip;\n"
" v2p.texcoord = float2(vertex_position__scrn.x/viewport_size.x, 1 - vertex_position__scrn.y/viewport_size.y);\n"
" v2p.cornercoord = cornercoords__pct[c2v.vertex_id];\n"
" v2p.position = float4(vertex_position__scr.x, -vertex_position__scr.y, 0.f, 1.f);\n"
" v2p.texcoord = vertex_position__pct;\n"
" v2p.sdf_sample_pos = (2.f * cornercoords__pct - 1.f) * rect_half_size;\n"
" v2p.corner_radius = corner_radii__px[c2v.vertex_id];\n"
" }\n"
" return v2p;\n"
Expand All @@ -279,26 +281,27 @@ str8_lit_comp(
"ps_main(Vertex2Pixel v2p) : SV_TARGET\n"
"{\n"
" // rjf: blend weighted texture samples into color\n"
" float4 color = stage_t2d.Sample(stage_sampler, v2p.texcoord) * kernel[0].x;\n"
" float4 color = kernel[0].x * stage_t2d.Sample(stage_sampler, v2p.texcoord);\n"
" color.a = kernel[0].x;\n"
" for(float i = 1; i < blur_size; i += 1)\n"
"\n"
" for(uint i = 1; i < blur_count; i += 1)\n"
" {\n"
" float weight = ((float[4])kernel[uint(i)/4])[uint(i)%4];\n"
" float4 min_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord - float2(!is_vertical*i/viewport_size.x, is_vertical*i/viewport_size.y));\n"
" float4 max_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord + float2(!is_vertical*i/viewport_size.x, is_vertical*i/viewport_size.y));\n"
" min_sample.a = 1;\n"
" max_sample.a = 1;\n"
" color += min_sample*weight;\n"
" color += max_sample*weight;\n"
" float weight = kernel[i].x;\n"
" float offset = kernel[i].y;\n"
" float4 min_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord - offset * direction);\n"
" float4 max_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord + offset * direction);\n"
" min_sample.a = 1.f;\n"
" max_sample.a = 1.f;\n"
" color += min_sample * weight;\n"
" color += max_sample * weight;\n"
" }\n"
" \n"
" // rjf: determine SDF sample position\n"
" float2 rect_half_size = float2((rect.z-rect.x)/2, (rect.w-rect.y)/2);\n"
" float2 sdf_sample_pos = float2((2*v2p.cornercoord.x-1)*rect_half_size.x,\n"
" (2*v2p.cornercoord.y-1)*rect_half_size.y);\n"
" float2 sdf_sample_pos = v2p.sdf_sample_pos;\n"
" \n"
" // rjf: sample for corners\n"
" float corner_sdf_s = rect_sdf(sdf_sample_pos, rect_half_size - float2(2.f, 2.f), v2p.corner_radius);\n"
" float corner_sdf_s = rect_sdf(sdf_sample_pos, rect_half_size - 2.f, v2p.corner_radius);\n"
" float corner_sdf_t = 1-smoothstep(0, 2, corner_sdf_s);\n"
" \n"
" // rjf: weight output color by sdf\n"
Expand Down
203 changes: 122 additions & 81 deletions src/render/d3d11/render_d3d11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1183,99 +1183,140 @@ r_window_submit(OS_Handle window, R_Handle window_equip, R_PassList *passes)
case R_PassKind_Blur:
{
R_PassParams_Blur *params = pass->params_blur;
ID3D11SamplerState *sampler = r_d3d11_state->samplers[R_Tex2DSampleKind_Nearest];
ID3D11SamplerState *sampler = r_d3d11_state->samplers[R_Tex2DSampleKind_Linear];
ID3D11VertexShader *vshad = r_d3d11_state->vshads[R_D3D11_VShadKind_Blur];
ID3D11PixelShader *pshad = r_d3d11_state->pshads[R_D3D11_PShadKind_Blur];
ID3D11Buffer *uniforms_buffer = r_d3d11_state->uniform_type_kind_buffers[R_D3D11_VShadKind_Blur];

//- rjf: perform blur on each axis
ID3D11RenderTargetView *rtvs[Axis2_COUNT] =
{
wnd->stage_scratch_color_rtv,
wnd->stage_color_rtv,
};
ID3D11ShaderResourceView *srvs[Axis2_COUNT] =

// rjf: setup output merger
d_ctx->OMSetDepthStencilState(r_d3d11_state->noop_depth_stencil, 0);
d_ctx->OMSetBlendState(r_d3d11_state->main_blend_state, 0, 0xffffffff);

// rjf: set up viewport
Vec2S32 resolution = wnd->last_resolution;
D3D11_VIEWPORT viewport = { 0.0f, 0.0f, (F32)resolution.x, (F32)resolution.y, 0.0f, 1.0f };
d_ctx->RSSetViewports(1, &viewport);
d_ctx->RSSetState(r_d3d11_state->main_rasterizer);

// rjf: setup input assembly
d_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
d_ctx->IASetInputLayout(0);

// rjf: setup shaders
d_ctx->VSSetShader(vshad, 0, 0);
d_ctx->VSSetConstantBuffers(0, 1, &uniforms_buffer);
d_ctx->PSSetShader(pshad, 0, 0);
d_ctx->PSSetSamplers(0, 1, &sampler);

// rjf: setup scissor rect
{
wnd->stage_color_srv,
wnd->stage_scratch_color_srv,
};
for(Axis2 axis = (Axis2)0; axis < Axis2_COUNT; axis = (Axis2)(axis+1))
D3D11_RECT rect = { 0 };
rect.left = 0;
rect.right = (LONG)wnd->last_resolution.x;
rect.top = 0;
rect.bottom = (LONG)wnd->last_resolution.y;
d_ctx->RSSetScissorRects(1, &rect);
}

// rjf: set up uniforms
R_D3D11_Uniforms_Blur uniforms = { 0 };
{
// rjf: setup output merger
d_ctx->OMSetRenderTargets(1, &rtvs[axis], 0);
d_ctx->OMSetDepthStencilState(r_d3d11_state->noop_depth_stencil, 0);
d_ctx->OMSetBlendState(r_d3d11_state->main_blend_state, 0, 0xffffffff);

// rjf: set up viewport
Vec2S32 resolution = wnd->last_resolution;
D3D11_VIEWPORT viewport = { 0.0f, 0.0f, (F32)resolution.x, (F32)resolution.y, 0.0f, 1.0f };
d_ctx->RSSetViewports(1, &viewport);
d_ctx->RSSetState(r_d3d11_state->main_rasterizer);

// rjf: setup input assembly
d_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
d_ctx->IASetInputLayout(0);

// rjf: set up uniforms
F32 weights[ArrayCount(uniforms.kernel)*2] = {0};

F32 blur_size = Min(params->blur_size, ArrayCount(weights));
U64 blur_count = (U64)round_f32(blur_size);

F32 stdev = (blur_size-1.f)/2.f;
F32 one_over_root_2pi_stdev2 = 1/sqrt_f32(2*pi32*stdev*stdev);
F32 euler32 = 2.718281828459045f;

weights[0] = 1.f;
if(stdev > 0.f)
{
F32 stdev = (params->blur_size-1.f)/2.f;
F32 one_over_root_2pi_stdev2 = 1/sqrt_f32(2*pi32*stdev*stdev);
F32 euler32 = 2.718281828459045f;
R_D3D11_Uniforms_Blur uniforms = {0};
uniforms.viewport_size = v2f32(resolution.x, resolution.y);
uniforms.rect = params->rect;
uniforms.blur_size = params->blur_size;
uniforms.is_vertical = (F32)!!axis;
MemoryCopyArray(uniforms.corner_radii.v, params->corner_radii);
F32 kernel_x = 0;
uniforms.kernel[0].v[0] = 1.f;
if(stdev > 0.f)
{
for(U64 idx = 0; idx < ArrayCount(uniforms.kernel); idx += 1)
{
for(U64 v_idx = 0; v_idx < ArrayCount(uniforms.kernel[idx].v); v_idx += 1)
{
uniforms.kernel[idx].v[v_idx] = one_over_root_2pi_stdev2*pow_f32(euler32, -kernel_x*kernel_x/(2.f*stdev*stdev));
kernel_x += 1;
}
}
}
if(uniforms.kernel[0].v[0] > 1.f)
for(U64 idx = 0; idx < blur_count; idx += 1)
{
MemoryZeroArray(uniforms.kernel);
uniforms.kernel[0].v[0] = 1.f;
F32 kernel_x = (F32)idx;
weights[idx] = one_over_root_2pi_stdev2*pow_f32(euler32, -kernel_x*kernel_x/(2.f*stdev*stdev));
}
D3D11_MAPPED_SUBRESOURCE sub_rsrc = {0};
r_d3d11_state->device_ctx->Map(uniforms_buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &sub_rsrc);
MemoryCopy((U8 *)sub_rsrc.pData, &uniforms, sizeof(uniforms));
r_d3d11_state->device_ctx->Unmap(uniforms_buffer, 0);
}

// rjf: setup shaders
d_ctx->VSSetShader(vshad, 0, 0);
d_ctx->VSSetConstantBuffers(0, 1, &uniforms_buffer);
d_ctx->PSSetShader(pshad, 0, 0);
d_ctx->PSSetConstantBuffers(0, 1, &uniforms_buffer);
d_ctx->PSSetShaderResources(0, 1, &srvs[axis]);
d_ctx->PSSetSamplers(0, 1, &sampler);

// rjf: setup scissor rect
if(weights[0] > 1.f)
{
D3D11_RECT rect = {0};
rect.left = 0;
rect.right = (LONG)wnd->last_resolution.x;
rect.top = 0;
rect.bottom = (LONG)wnd->last_resolution.y;
d_ctx->RSSetScissorRects(1, &rect);
MemoryZeroArray(weights);
weights[0] = 1.f;
}

// rjf: draw
d_ctx->Draw(4, 0);

// rjf: unset srv
ID3D11ShaderResourceView *srv = 0;
d_ctx->PSSetShaderResources(0, 1, &srv);
else
{
// prepare weights & offsets for bilinear lookup
// blur filter wants to calculate w0*pixel[pos] + w1*pixel[pos+1] + ...
// with bilinear filter we can do this calculation by doing only w*sample(pos+t) = w*((1-t)*pixel[pos] + t*pixel[pos+1])
// we can see w0=w*(1-t) and w1=w*t
// thus w=w0+w1 and t=w1/w
for (U64 idx = 1; idx < blur_count; idx += 2)
{
F32 w0 = weights[idx + 0];
F32 w1 = weights[idx + 1];
F32 w = w0 + w1;
F32 t = w1 / w;

// each kernel element is float2(weight, offset)
// weights & offsets are adjusted for bilinear sampling
// zw elements are not used, a bit of waste but it allows for simpler shader code
uniforms.kernel[(idx+1)/2] = v4f32(w, (F32)idx + t, 0, 0);
}
uniforms.kernel[0].x = weights[0];
}

// technically only the direction needs to differ between the two passes,
// but each constant buffer chunk has 256 bytes of usable space anyway

uniforms.passes[Axis2_X].viewport_size = v2f32(resolution.x, resolution.y);
uniforms.passes[Axis2_X].rect = params->rect;
uniforms.passes[Axis2_X].direction = v2f32(1.f / resolution.x, 0);
uniforms.passes[Axis2_X].blur_count = 1 + blur_count / 2; // 2x smaller because of bilinear sampling
MemoryCopyArray(uniforms.passes[Axis2_X].corner_radii.v, params->corner_radii);

uniforms.passes[Axis2_Y].viewport_size = v2f32(resolution.x, resolution.y);
uniforms.passes[Axis2_Y].rect = params->rect;
uniforms.passes[Axis2_Y].direction = v2f32(0, 1.f / resolution.y);
uniforms.passes[Axis2_Y].blur_count = 1 + blur_count / 2; // 2x smaller because of bilinear sampling
MemoryCopyArray(uniforms.passes[Axis2_Y].corner_radii.v, params->corner_radii);

D3D11_MAPPED_SUBRESOURCE sub_rsrc = {0};
r_d3d11_state->device_ctx->Map(uniforms_buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &sub_rsrc);
MemoryCopy((U8 *)sub_rsrc.pData, &uniforms, sizeof(uniforms));
r_d3d11_state->device_ctx->Unmap(uniforms_buffer, 0);
}

ID3D11Buffer* uniforms_buffers[] = { uniforms_buffer, uniforms_buffer };

U32 uniform_offset[Axis2_COUNT][2] =
{
{ 0 * sizeof(R_D3D11_Uniforms_BlurPass) / 16, OffsetOf(R_D3D11_Uniforms_Blur, kernel) / 16 },
{ 1 * sizeof(R_D3D11_Uniforms_BlurPass) / 16, OffsetOf(R_D3D11_Uniforms_Blur, kernel) / 16 },
};

U32 uniform_count[Axis2_COUNT][2] =
{
{ sizeof(R_D3D11_Uniforms_BlurPass) / 16, sizeof(uniforms.kernel) / 16 },
{ sizeof(R_D3D11_Uniforms_BlurPass) / 16, sizeof(uniforms.kernel) / 16 },
};

// rjf: for unsetting srv
ID3D11ShaderResourceView* srv = 0;

// horizontal pass
d_ctx->OMSetRenderTargets(1, &wnd->stage_scratch_color_rtv, 0);
d_ctx->PSSetConstantBuffers1(0, ArrayCount(uniforms_buffers), uniforms_buffers, uniform_offset[Axis2_X], uniform_count[Axis2_X]);
d_ctx->PSSetShaderResources(0, 1, &wnd->stage_color_srv);
d_ctx->Draw(4, 0);
d_ctx->PSSetShaderResources(0, 1, &srv);

// vertical pass
d_ctx->OMSetRenderTargets(1, &wnd->stage_color_rtv, 0);
d_ctx->PSSetConstantBuffers1(0, ArrayCount(uniforms_buffers), uniforms_buffers, uniform_offset[Axis2_Y], uniform_count[Axis2_Y]);
d_ctx->PSSetShaderResources(0, 1, &wnd->stage_scratch_color_srv);
d_ctx->Draw(4, 0);
d_ctx->PSSetShaderResources(0, 1, &srv);
}break;


Expand Down
15 changes: 11 additions & 4 deletions src/render/d3d11/render_d3d11.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,20 @@ struct R_D3D11_Uniforms_Rect
Vec2F32 xform_scale;
};

struct R_D3D11_Uniforms_Blur
struct R_D3D11_Uniforms_BlurPass
{
Rng2F32 rect;
Vec2F32 viewport_size;
F32 blur_size;
F32 is_vertical;
Vec4F32 corner_radii;
Vec2F32 direction;
Vec2F32 viewport_size;
U32 blur_count;
U8 _padding0_[204];
};
StaticAssert(sizeof(R_D3D11_Uniforms_BlurPass) % 256 == 0, NotAligned); // constant count/offset must be aligned to 256 bytes

// Full uniform payload for the blur pass: the per-axis parameter chunks
// followed by the kernel table shared by both axes.
struct R_D3D11_Uniforms_Blur
{
// one parameter chunk per blur axis, indexed by Axis2 (Axis2_X / Axis2_Y)
R_D3D11_Uniforms_BlurPass passes[Axis2_COUNT];
// kernel table; the renderer fills each entry as (weight, offset, 0, 0)
Vec4F32 kernel[32];
};

Expand Down
Loading

0 comments on commit 1705fdd

Please sign in to comment.