Skip to content

Commit 1705fdd

Browse files
mmozeiko authored and ryanfleury committed
blur shader improvements, use bilinear sampling for 2x fewer samples
1 parent a77d457 commit 1705fdd

File tree

4 files changed

+209
-155
lines changed

4 files changed

+209
-155
lines changed

src/render/d3d11/generated/render_d3d11.meta.h

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -205,10 +205,14 @@ str8_lit_comp(
205205
"cbuffer Globals : register(b0)\n"
206206
"{\n"
207207
" float4 rect;\n"
208-
" float2 viewport_size;\n"
209-
" float blur_size;\n"
210-
" float is_vertical;\n"
211208
" float4 corner_radii_px;\n"
209+
" float2 direction;\n"
210+
" float2 viewport_size;\n"
211+
" uint blur_count;\n"
212+
"}\n"
213+
"\n"
214+
"cbuffer Kernel : register(b1)\n"
215+
"{\n"
212216
" float4 kernel[32];\n"
213217
"}\n"
214218
"\n"
@@ -221,7 +225,7 @@ str8_lit_comp(
221225
"{\n"
222226
" float4 position : SV_POSITION;\n"
223227
" float2 texcoord : TEX;\n"
224-
" float2 cornercoord : CRN;\n"
228+
" float2 sdf_sample_pos : SDF;\n"
225229
" float corner_radius : RAD;\n"
226230
"};\n"
227231
"\n"
@@ -238,12 +242,12 @@ str8_lit_comp(
238242
"Vertex2Pixel\n"
239243
"vs_main(CPU2Vertex c2v)\n"
240244
"{\n"
241-
" float4 vertex_positions__scrn[] =\n"
245+
" float2 vertex_positions__scrn[] =\n"
242246
" {\n"
243-
" float4(rect.x, rect.w, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n"
244-
" float4(rect.x, rect.y, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n"
245-
" float4(rect.z, rect.w, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n"
246-
" float4(rect.z, rect.y, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n"
247+
" rect.xw,\n"
248+
" rect.xy,\n"
249+
" rect.zw,\n"
250+
" rect.zy,\n"
247251
" };\n"
248252
" float corner_radii__px[] =\n"
249253
" {\n"
@@ -252,22 +256,20 @@ str8_lit_comp(
252256
" corner_radii_px.w,\n"
253257
" corner_radii_px.z,\n"
254258
" };\n"
255-
" float2 cornercoords__pct[] =\n"
256-
" {\n"
257-
" float2(0, 1),\n"
258-
" float2(0, 0),\n"
259-
" float2(1, 1),\n"
260-
" float2(1, 0),\n"
261-
" };\n"
262-
" float4 vertex_position__scrn = vertex_positions__scrn[c2v.vertex_id];\n"
263-
" float4 vertex_position__clip = float4(2*vertex_position__scrn.x/viewport_size.x - 1,\n"
264-
" 2*vertex_position__scrn.y/viewport_size.y - 1,\n"
265-
" 0, 1);\n"
259+
" float2 cornercoords__pct = float2(\n"
260+
" (c2v.vertex_id >> 1) ? 1.f : 0.f,\n"
261+
" (c2v.vertex_id & 1) ? 0.f : 1.f);\n"
262+
"\n"
263+
" float2 vertex_position__pct = vertex_positions__scrn[c2v.vertex_id] / viewport_size;\n"
264+
" float2 vertex_position__scr = 2.f * vertex_position__pct - 1.f;\n"
265+
"\n"
266+
" float2 rect_half_size = float2((rect.z-rect.x)/2, (rect.w-rect.y)/2);\n"
267+
"\n"
266268
" Vertex2Pixel v2p;\n"
267269
" {\n"
268-
" v2p.position = vertex_position__clip;\n"
269-
" v2p.texcoord = float2(vertex_position__scrn.x/viewport_size.x, 1 - vertex_position__scrn.y/viewport_size.y);\n"
270-
" v2p.cornercoord = cornercoords__pct[c2v.vertex_id];\n"
270+
" v2p.position = float4(vertex_position__scr.x, -vertex_position__scr.y, 0.f, 1.f);\n"
271+
" v2p.texcoord = vertex_position__pct;\n"
272+
" v2p.sdf_sample_pos = (2.f * cornercoords__pct - 1.f) * rect_half_size;\n"
271273
" v2p.corner_radius = corner_radii__px[c2v.vertex_id];\n"
272274
" }\n"
273275
" return v2p;\n"
@@ -279,26 +281,27 @@ str8_lit_comp(
279281
"ps_main(Vertex2Pixel v2p) : SV_TARGET\n"
280282
"{\n"
281283
" // rjf: blend weighted texture samples into color\n"
282-
" float4 color = stage_t2d.Sample(stage_sampler, v2p.texcoord) * kernel[0].x;\n"
284+
" float4 color = kernel[0].x * stage_t2d.Sample(stage_sampler, v2p.texcoord);\n"
283285
" color.a = kernel[0].x;\n"
284-
" for(float i = 1; i < blur_size; i += 1)\n"
286+
"\n"
287+
" for(uint i = 1; i < blur_count; i += 1)\n"
285288
" {\n"
286-
" float weight = ((float[4])kernel[uint(i)/4])[uint(i)%4];\n"
287-
" float4 min_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord - float2(!is_vertical*i/viewport_size.x, is_vertical*i/viewport_size.y));\n"
288-
" float4 max_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord + float2(!is_vertical*i/viewport_size.x, is_vertical*i/viewport_size.y));\n"
289-
" min_sample.a = 1;\n"
290-
" max_sample.a = 1;\n"
291-
" color += min_sample*weight;\n"
292-
" color += max_sample*weight;\n"
289+
" float weight = kernel[i].x;\n"
290+
" float offset = kernel[i].y;\n"
291+
" float4 min_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord - offset * direction);\n"
292+
" float4 max_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord + offset * direction);\n"
293+
" min_sample.a = 1.f;\n"
294+
" max_sample.a = 1.f;\n"
295+
" color += min_sample * weight;\n"
296+
" color += max_sample * weight;\n"
293297
" }\n"
294298
" \n"
295299
" // rjf: determine SDF sample position\n"
296300
" float2 rect_half_size = float2((rect.z-rect.x)/2, (rect.w-rect.y)/2);\n"
297-
" float2 sdf_sample_pos = float2((2*v2p.cornercoord.x-1)*rect_half_size.x,\n"
298-
" (2*v2p.cornercoord.y-1)*rect_half_size.y);\n"
301+
" float2 sdf_sample_pos = v2p.sdf_sample_pos;\n"
299302
" \n"
300303
" // rjf: sample for corners\n"
301-
" float corner_sdf_s = rect_sdf(sdf_sample_pos, rect_half_size - float2(2.f, 2.f), v2p.corner_radius);\n"
304+
" float corner_sdf_s = rect_sdf(sdf_sample_pos, rect_half_size - 2.f, v2p.corner_radius);\n"
302305
" float corner_sdf_t = 1-smoothstep(0, 2, corner_sdf_s);\n"
303306
" \n"
304307
" // rjf: weight output color by sdf\n"

src/render/d3d11/render_d3d11.cpp

Lines changed: 122 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1183,99 +1183,140 @@ r_window_submit(OS_Handle window, R_Handle window_equip, R_PassList *passes)
11831183
case R_PassKind_Blur:
11841184
{
11851185
R_PassParams_Blur *params = pass->params_blur;
1186-
ID3D11SamplerState *sampler = r_d3d11_state->samplers[R_Tex2DSampleKind_Nearest];
1186+
ID3D11SamplerState *sampler = r_d3d11_state->samplers[R_Tex2DSampleKind_Linear];
11871187
ID3D11VertexShader *vshad = r_d3d11_state->vshads[R_D3D11_VShadKind_Blur];
11881188
ID3D11PixelShader *pshad = r_d3d11_state->pshads[R_D3D11_PShadKind_Blur];
11891189
ID3D11Buffer *uniforms_buffer = r_d3d11_state->uniform_type_kind_buffers[R_D3D11_VShadKind_Blur];
1190-
1191-
//- rjf: perform blur on each axis
1192-
ID3D11RenderTargetView *rtvs[Axis2_COUNT] =
1193-
{
1194-
wnd->stage_scratch_color_rtv,
1195-
wnd->stage_color_rtv,
1196-
};
1197-
ID3D11ShaderResourceView *srvs[Axis2_COUNT] =
1190+
1191+
// rjf: setup output merger
1192+
d_ctx->OMSetDepthStencilState(r_d3d11_state->noop_depth_stencil, 0);
1193+
d_ctx->OMSetBlendState(r_d3d11_state->main_blend_state, 0, 0xffffffff);
1194+
1195+
// rjf: set up viewport
1196+
Vec2S32 resolution = wnd->last_resolution;
1197+
D3D11_VIEWPORT viewport = { 0.0f, 0.0f, (F32)resolution.x, (F32)resolution.y, 0.0f, 1.0f };
1198+
d_ctx->RSSetViewports(1, &viewport);
1199+
d_ctx->RSSetState(r_d3d11_state->main_rasterizer);
1200+
1201+
// rjf: setup input assembly
1202+
d_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
1203+
d_ctx->IASetInputLayout(0);
1204+
1205+
// rjf: setup shaders
1206+
d_ctx->VSSetShader(vshad, 0, 0);
1207+
d_ctx->VSSetConstantBuffers(0, 1, &uniforms_buffer);
1208+
d_ctx->PSSetShader(pshad, 0, 0);
1209+
d_ctx->PSSetSamplers(0, 1, &sampler);
1210+
1211+
// rjf: setup scissor rect
11981212
{
1199-
wnd->stage_color_srv,
1200-
wnd->stage_scratch_color_srv,
1201-
};
1202-
for(Axis2 axis = (Axis2)0; axis < Axis2_COUNT; axis = (Axis2)(axis+1))
1213+
D3D11_RECT rect = { 0 };
1214+
rect.left = 0;
1215+
rect.right = (LONG)wnd->last_resolution.x;
1216+
rect.top = 0;
1217+
rect.bottom = (LONG)wnd->last_resolution.y;
1218+
d_ctx->RSSetScissorRects(1, &rect);
1219+
}
1220+
1221+
// rjf: set up uniforms
1222+
R_D3D11_Uniforms_Blur uniforms = { 0 };
12031223
{
1204-
// rjf: setup output merger
1205-
d_ctx->OMSetRenderTargets(1, &rtvs[axis], 0);
1206-
d_ctx->OMSetDepthStencilState(r_d3d11_state->noop_depth_stencil, 0);
1207-
d_ctx->OMSetBlendState(r_d3d11_state->main_blend_state, 0, 0xffffffff);
1208-
1209-
// rjf: set up viewport
1210-
Vec2S32 resolution = wnd->last_resolution;
1211-
D3D11_VIEWPORT viewport = { 0.0f, 0.0f, (F32)resolution.x, (F32)resolution.y, 0.0f, 1.0f };
1212-
d_ctx->RSSetViewports(1, &viewport);
1213-
d_ctx->RSSetState(r_d3d11_state->main_rasterizer);
1214-
1215-
// rjf: setup input assembly
1216-
d_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
1217-
d_ctx->IASetInputLayout(0);
1218-
1219-
// rjf: set up uniforms
1224+
F32 weights[ArrayCount(uniforms.kernel)*2] = {0};
1225+
1226+
F32 blur_size = Min(params->blur_size, ArrayCount(weights));
1227+
U64 blur_count = (U64)round_f32(blur_size);
1228+
1229+
F32 stdev = (blur_size-1.f)/2.f;
1230+
F32 one_over_root_2pi_stdev2 = 1/sqrt_f32(2*pi32*stdev*stdev);
1231+
F32 euler32 = 2.718281828459045f;
1232+
1233+
weights[0] = 1.f;
1234+
if(stdev > 0.f)
12201235
{
1221-
F32 stdev = (params->blur_size-1.f)/2.f;
1222-
F32 one_over_root_2pi_stdev2 = 1/sqrt_f32(2*pi32*stdev*stdev);
1223-
F32 euler32 = 2.718281828459045f;
1224-
R_D3D11_Uniforms_Blur uniforms = {0};
1225-
uniforms.viewport_size = v2f32(resolution.x, resolution.y);
1226-
uniforms.rect = params->rect;
1227-
uniforms.blur_size = params->blur_size;
1228-
uniforms.is_vertical = (F32)!!axis;
1229-
MemoryCopyArray(uniforms.corner_radii.v, params->corner_radii);
1230-
F32 kernel_x = 0;
1231-
uniforms.kernel[0].v[0] = 1.f;
1232-
if(stdev > 0.f)
1233-
{
1234-
for(U64 idx = 0; idx < ArrayCount(uniforms.kernel); idx += 1)
1235-
{
1236-
for(U64 v_idx = 0; v_idx < ArrayCount(uniforms.kernel[idx].v); v_idx += 1)
1237-
{
1238-
uniforms.kernel[idx].v[v_idx] = one_over_root_2pi_stdev2*pow_f32(euler32, -kernel_x*kernel_x/(2.f*stdev*stdev));
1239-
kernel_x += 1;
1240-
}
1241-
}
1242-
}
1243-
if(uniforms.kernel[0].v[0] > 1.f)
1236+
for(U64 idx = 0; idx < blur_count; idx += 1)
12441237
{
1245-
MemoryZeroArray(uniforms.kernel);
1246-
uniforms.kernel[0].v[0] = 1.f;
1238+
F32 kernel_x = (F32)idx;
1239+
weights[idx] = one_over_root_2pi_stdev2*pow_f32(euler32, -kernel_x*kernel_x/(2.f*stdev*stdev));
12471240
}
1248-
D3D11_MAPPED_SUBRESOURCE sub_rsrc = {0};
1249-
r_d3d11_state->device_ctx->Map(uniforms_buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &sub_rsrc);
1250-
MemoryCopy((U8 *)sub_rsrc.pData, &uniforms, sizeof(uniforms));
1251-
r_d3d11_state->device_ctx->Unmap(uniforms_buffer, 0);
12521241
}
1253-
1254-
// rjf: setup shaders
1255-
d_ctx->VSSetShader(vshad, 0, 0);
1256-
d_ctx->VSSetConstantBuffers(0, 1, &uniforms_buffer);
1257-
d_ctx->PSSetShader(pshad, 0, 0);
1258-
d_ctx->PSSetConstantBuffers(0, 1, &uniforms_buffer);
1259-
d_ctx->PSSetShaderResources(0, 1, &srvs[axis]);
1260-
d_ctx->PSSetSamplers(0, 1, &sampler);
1261-
1262-
// rjf: setup scissor rect
1242+
if(weights[0] > 1.f)
12631243
{
1264-
D3D11_RECT rect = {0};
1265-
rect.left = 0;
1266-
rect.right = (LONG)wnd->last_resolution.x;
1267-
rect.top = 0;
1268-
rect.bottom = (LONG)wnd->last_resolution.y;
1269-
d_ctx->RSSetScissorRects(1, &rect);
1244+
MemoryZeroArray(weights);
1245+
weights[0] = 1.f;
12701246
}
1271-
1272-
// rjf: draw
1273-
d_ctx->Draw(4, 0);
1274-
1275-
// rjf: unset srv
1276-
ID3D11ShaderResourceView *srv = 0;
1277-
d_ctx->PSSetShaderResources(0, 1, &srv);
1247+
else
1248+
{
1249+
// prepare weights & offsets for bilinear lookup
1250+
// blur filter wants to calculate w0*pixel[pos] + w1*pixel[pos+1] + ...
1251+
// with a bilinear filter we can do this calculation with a single sample: w*sample(pos+t) = w*((1-t)*pixel[pos] + t*pixel[pos+1])
1252+
// we can see w0=w*(1-t) and w1=w*t
1253+
// thus w=w0+w1 and t=w1/w
1254+
for (U64 idx = 1; idx < blur_count; idx += 2)
1255+
{
1256+
F32 w0 = weights[idx + 0];
1257+
F32 w1 = weights[idx + 1];
1258+
F32 w = w0 + w1;
1259+
F32 t = w1 / w;
1260+
1261+
// each kernel element is float2(weight, offset)
1262+
// weights & offsets are adjusted for bilinear sampling
1263+
// zw elements are not used, a bit of waste but it allows for simpler shader code
1264+
uniforms.kernel[(idx+1)/2] = v4f32(w, (F32)idx + t, 0, 0);
1265+
}
1266+
uniforms.kernel[0].x = weights[0];
1267+
}
1268+
1269+
// technically only the direction needs to differ between the two passes
1270+
// but there are 256 bytes of usable space anyway for each constant buffer chunk
1271+
1272+
uniforms.passes[Axis2_X].viewport_size = v2f32(resolution.x, resolution.y);
1273+
uniforms.passes[Axis2_X].rect = params->rect;
1274+
uniforms.passes[Axis2_X].direction = v2f32(1.f / resolution.x, 0);
1275+
uniforms.passes[Axis2_X].blur_count = 1 + blur_count / 2; // 2x smaller because of bilinear sampling
1276+
MemoryCopyArray(uniforms.passes[Axis2_X].corner_radii.v, params->corner_radii);
1277+
1278+
uniforms.passes[Axis2_Y].viewport_size = v2f32(resolution.x, resolution.y);
1279+
uniforms.passes[Axis2_Y].rect = params->rect;
1280+
uniforms.passes[Axis2_Y].direction = v2f32(0, 1.f / resolution.y);
1281+
uniforms.passes[Axis2_Y].blur_count = 1 + blur_count / 2; // 2x smaller because of bilinear sampling
1282+
MemoryCopyArray(uniforms.passes[Axis2_Y].corner_radii.v, params->corner_radii);
1283+
1284+
D3D11_MAPPED_SUBRESOURCE sub_rsrc = {0};
1285+
r_d3d11_state->device_ctx->Map(uniforms_buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &sub_rsrc);
1286+
MemoryCopy((U8 *)sub_rsrc.pData, &uniforms, sizeof(uniforms));
1287+
r_d3d11_state->device_ctx->Unmap(uniforms_buffer, 0);
12781288
}
1289+
1290+
ID3D11Buffer* uniforms_buffers[] = { uniforms_buffer, uniforms_buffer };
1291+
1292+
U32 uniform_offset[Axis2_COUNT][2] =
1293+
{
1294+
{ 0 * sizeof(R_D3D11_Uniforms_BlurPass) / 16, OffsetOf(R_D3D11_Uniforms_Blur, kernel) / 16 },
1295+
{ 1 * sizeof(R_D3D11_Uniforms_BlurPass) / 16, OffsetOf(R_D3D11_Uniforms_Blur, kernel) / 16 },
1296+
};
1297+
1298+
U32 uniform_count[Axis2_COUNT][2] =
1299+
{
1300+
{ sizeof(R_D3D11_Uniforms_BlurPass) / 16, sizeof(uniforms.kernel) / 16 },
1301+
{ sizeof(R_D3D11_Uniforms_BlurPass) / 16, sizeof(uniforms.kernel) / 16 },
1302+
};
1303+
1304+
// rjf: for unsetting srv
1305+
ID3D11ShaderResourceView* srv = 0;
1306+
1307+
// horizontal pass
1308+
d_ctx->OMSetRenderTargets(1, &wnd->stage_scratch_color_rtv, 0);
1309+
d_ctx->PSSetConstantBuffers1(0, ArrayCount(uniforms_buffers), uniforms_buffers, uniform_offset[Axis2_X], uniform_count[Axis2_X]);
1310+
d_ctx->PSSetShaderResources(0, 1, &wnd->stage_color_srv);
1311+
d_ctx->Draw(4, 0);
1312+
d_ctx->PSSetShaderResources(0, 1, &srv);
1313+
1314+
// vertical pass
1315+
d_ctx->OMSetRenderTargets(1, &wnd->stage_color_rtv, 0);
1316+
d_ctx->PSSetConstantBuffers1(0, ArrayCount(uniforms_buffers), uniforms_buffers, uniform_offset[Axis2_Y], uniform_count[Axis2_Y]);
1317+
d_ctx->PSSetShaderResources(0, 1, &wnd->stage_scratch_color_srv);
1318+
d_ctx->Draw(4, 0);
1319+
d_ctx->PSSetShaderResources(0, 1, &srv);
12791320
}break;
12801321

12811322

src/render/d3d11/render_d3d11.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,20 @@ struct R_D3D11_Uniforms_Rect
3232
Vec2F32 xform_scale;
3333
};
3434

35-
struct R_D3D11_Uniforms_Blur
35+
struct R_D3D11_Uniforms_BlurPass
3636
{
3737
Rng2F32 rect;
38-
Vec2F32 viewport_size;
39-
F32 blur_size;
40-
F32 is_vertical;
4138
Vec4F32 corner_radii;
39+
Vec2F32 direction;
40+
Vec2F32 viewport_size;
41+
U32 blur_count;
42+
U8 _padding0_[204];
43+
};
44+
StaticAssert(sizeof(R_D3D11_Uniforms_BlurPass) % 256 == 0, NotAligned); // constant count/offset must be aligned to 256 bytes
45+
46+
struct R_D3D11_Uniforms_Blur
47+
{
48+
R_D3D11_Uniforms_BlurPass passes[Axis2_COUNT];
4249
Vec4F32 kernel[32];
4350
};
4451

0 commit comments

Comments
 (0)