Skip to content

Commit a1784a6

Browse files
Merge pull request #1374 from SixLabors/js/Block8x8F_TransposeAVX
Add Avx backed Block8x8F Transpose method
2 parents 4dadf24 + 685693a commit a1784a6

File tree

6 files changed

+172
-12
lines changed

6 files changed

+172
-12
lines changed

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
1010
{
1111
internal partial struct Block8x8F
1212
{
13-
/// <summary>
14-
/// Transpose the block into the destination block.
13+
/// <summary>
14+
/// Fallback method to transpose a block into the destination block on CPUs without AVX support.
1515
/// </summary>
1616
/// <param name="d">The destination block</param>
1717
[MethodImpl(InliningOptions.ShortMethod)]
18-
public void TransposeInto(ref Block8x8F d)
18+
public void TransposeIntoFallback(ref Block8x8F d)
1919
{
2020
d.V0L.X = V0L.X;
2121
d.V1L.X = V0L.Y;

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
2323
{
2424
internal partial struct Block8x8F
2525
{
26-
/// <summary>
27-
/// Transpose the block into the destination block.
26+
/// <summary>
27+
/// Fallback method to transpose a block into the destination block on CPUs without AVX support.
2828
/// </summary>
2929
/// <param name="d">The destination block</param>
3030
[MethodImpl(InliningOptions.ShortMethod)]
31-
public void TransposeInto(ref Block8x8F d)
31+
public void TransposeIntoFallback(ref Block8x8F d)
3232
{
3333
<#
3434
PushIndent(" ");

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
using System.Numerics;
77
using System.Runtime.CompilerServices;
88
using System.Runtime.InteropServices;
9+
#if SUPPORTS_RUNTIME_INTRINSICS
10+
using System.Runtime.Intrinsics;
11+
using System.Runtime.Intrinsics.X86;
12+
#endif
913
using System.Text;
1014

1115
// ReSharper disable InconsistentNaming
@@ -596,5 +600,98 @@ private static void GuardBlockIndex(int idx)
596600
DebugGuard.MustBeLessThan(idx, Size, nameof(idx));
597601
DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx));
598602
}
603+
604+
/// <summary>
605+
/// Transposes this block into the destination block, picking the implementation at runtime.
606+
/// </summary>
607+
/// <param name="d">The destination block</param>
/// <remarks>
/// When SUPPORTS_RUNTIME_INTRINSICS is defined and <c>Avx.IsSupported</c> is true, the call is
/// forwarded to <c>TransposeIntoAvx</c>; in every other case <c>TransposeIntoFallback</c> runs.
/// </remarks>
608+
[MethodImpl(InliningOptions.ShortMethod)]
609+
public void TransposeInto(ref Block8x8F d)
610+
{
611+
#if SUPPORTS_RUNTIME_INTRINSICS
612+
if (Avx.IsSupported)
613+
{
614+
this.TransposeIntoAvx(ref d);
615+
}
616+
else
617+
#endif
// When SUPPORTS_RUNTIME_INTRINSICS is not defined, the if/else above is compiled out and the
// braces below are a plain block, so the fallback runs unconditionally.
618+
{
619+
this.TransposeIntoFallback(ref d);
620+
}
621+
}
622+
623+
#if SUPPORTS_RUNTIME_INTRINSICS
624+
/// <summary>
625+
/// AVX-only variant for executing <see cref="TransposeInto(ref Block8x8F)"/>.
626+
/// <see href="https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536"/>
627+
/// </summary>
/// <remarks>
/// Each of the eight <c>r</c> vectors is built from two 128-bit fields of this block: the first
/// field is widened with <c>ToVector256()</c> (it occupies the lower half) and
/// <c>InsertVector128(..., 1)</c> places the second field in the upper half. Four
/// unpack/shuffle/blend groups then produce eight 256-bit results, each written through a
/// reference to one of <c>d.V0L</c> .. <c>d.V7L</c>.
/// NOTE(review): every 256-bit store covers 32 bytes starting at the named <c>L</c> field; this
/// presumably relies on the matching <c>R</c> field following it directly in memory. Confirm
/// against the struct layout, which is not visible here.
/// NOTE(review): this method does not check <c>Avx.IsSupported</c> itself; callers are presumably
/// expected to check first, as <c>TransposeInto</c> does.
/// </remarks>
/// <param name="d">The destination block</param>
628+
[MethodImpl(InliningOptions.ShortMethod)]
629+
public void TransposeIntoAvx(ref Block8x8F d)
630+
{
// r0..r3 pair field VnL (lower half) with V(n+4)L (upper half); r4..r7 do the same with the R fields.
631+
Vector256<float> r0 = Avx.InsertVector128(
632+
Unsafe.As<Vector4, Vector128<float>>(ref this.V0L).ToVector256(),
633+
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
634+
1);
635+
636+
Vector256<float> r1 = Avx.InsertVector128(
637+
Unsafe.As<Vector4, Vector128<float>>(ref this.V1L).ToVector256(),
638+
Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
639+
1);
640+
641+
Vector256<float> r2 = Avx.InsertVector128(
642+
Unsafe.As<Vector4, Vector128<float>>(ref this.V2L).ToVector256(),
643+
Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
644+
1);
645+
646+
Vector256<float> r3 = Avx.InsertVector128(
647+
Unsafe.As<Vector4, Vector128<float>>(ref this.V3L).ToVector256(),
648+
Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
649+
1);
650+
651+
Vector256<float> r4 = Avx.InsertVector128(
652+
Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
653+
Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
654+
1);
655+
656+
Vector256<float> r5 = Avx.InsertVector128(
657+
Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
658+
Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
659+
1);
660+
661+
Vector256<float> r6 = Avx.InsertVector128(
662+
Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
663+
Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
664+
1);
665+
666+
Vector256<float> r7 = Avx.InsertVector128(
667+
Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
668+
Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
669+
1);
670+
// UnpackLow interleaves the first two elements of each 128-bit lane of its operands; the shuffle
// (0x4E) and the two blends regroup them into the vectors stored at d.V0L and d.V1L.
671+
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
672+
Vector256<float> t2 = Avx.UnpackLow(r2, r3);
673+
Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
674+
Unsafe.As<Vector4, Vector256<float>>(ref d.V0L) = Avx.Blend(t0, v, 0xCC);
675+
Unsafe.As<Vector4, Vector256<float>>(ref d.V1L) = Avx.Blend(t2, v, 0x33);
676+
// Same UnpackLow pattern applied to r4..r7 (built from the R fields); stored at d.V4L and d.V5L.
677+
Vector256<float> t4 = Avx.UnpackLow(r4, r5);
678+
Vector256<float> t6 = Avx.UnpackLow(r6, r7);
679+
v = Avx.Shuffle(t4, t6, 0x4E);
680+
Unsafe.As<Vector4, Vector256<float>>(ref d.V4L) = Avx.Blend(t4, v, 0xCC);
681+
Unsafe.As<Vector4, Vector256<float>>(ref d.V5L) = Avx.Blend(t6, v, 0x33);
682+
// UnpackHigh interleaves the last two elements of each lane instead; stored at d.V2L and d.V3L.
683+
Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
684+
Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
685+
v = Avx.Shuffle(t1, t3, 0x4E);
686+
Unsafe.As<Vector4, Vector256<float>>(ref d.V2L) = Avx.Blend(t1, v, 0xCC);
687+
Unsafe.As<Vector4, Vector256<float>>(ref d.V3L) = Avx.Blend(t3, v, 0x33);
688+
// UnpackHigh pattern applied to r4..r7; stored at d.V6L and d.V7L.
689+
Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
690+
Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
691+
v = Avx.Shuffle(t5, t7, 0x4E);
692+
Unsafe.As<Vector4, Vector256<float>>(ref d.V6L) = Avx.Blend(t5, v, 0xCC);
693+
Unsafe.As<Vector4, Vector256<float>>(ref d.V7L) = Avx.Blend(t7, v, 0x33);
694+
}
695+
#endif
599696
}
600697
}

src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) Six Labors.
1+
// Copyright (c) Six Labors.
22
// Licensed under the Apache License, Version 2.0.
33

44
using System.Numerics;
@@ -50,8 +50,6 @@ internal static class FastFloatingPointDCT
5050
/// <param name="temp">Temporary block provided by the caller</param>
5151
public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
5252
{
53-
// TODO: Transpose is a bottleneck now. We need full AVX support to optimize it:
54-
// https://github.com/dotnet/corefx/issues/22940
5553
src.TransposeInto(ref temp);
5654

5755
IDCT8x4_LeftPart(ref temp, ref dest);
@@ -340,4 +338,4 @@ public static void TransformFDCT(
340338
dest.MultiplyInplace(C_0_125);
341339
}
342340
}
343-
}
341+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Apache License, Version 2.0.
3+
4+
using BenchmarkDotNet.Attributes;
5+
using SixLabors.ImageSharp.Formats.Jpeg.Components;
6+
7+
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
8+
{
9+
/// <summary>
/// Benchmarks comparing <c>Block8x8F.TransposeIntoFallback</c> (the baseline) with
/// <c>Block8x8F.TransposeIntoAvx</c>; the AVX benchmark is only compiled when
/// SUPPORTS_RUNTIME_INTRINSICS is defined.
/// </summary>
public class Block8x8F_Transpose
10+
{
// Input block shared by all benchmark methods; built once by Create8x8FloatData().
// NOTE(review): this is a static readonly field of a struct type. If the transpose methods are
// not readonly members, the compiler presumably copies the struct before each call, which would
// add the same copy cost to both benchmarks. Confirm whether that matters for the numbers.
11+
private static readonly Block8x8F Source = Create8x8FloatData();
12+
// Baseline: transpose via the fallback implementation into a fresh default block.
13+
[Benchmark(Baseline=true)]
14+
public void TransposeIntoVector4()
15+
{
16+
var dest = default(Block8x8F);
17+
Source.TransposeIntoFallback(ref dest);
18+
}
19+
// Candidate: transpose via the AVX implementation into a fresh default block.
20+
#if SUPPORTS_RUNTIME_INTRINSICS
21+
[Benchmark]
22+
public void TransposeIntoAvx()
23+
{
24+
var dest = default(Block8x8F);
25+
Source.TransposeIntoAvx(ref dest);
26+
}
27+
#endif
28+
// Builds the input: a 64-element array where index (i * 8) + j holds (i * 10) + j,
// loaded into a block through LoadFrom.
29+
private static Block8x8F Create8x8FloatData()
30+
{
31+
var result = new float[64];
32+
for (int i = 0; i < 8; i++)
33+
{
34+
for (int j = 0; j < 8; j++)
35+
{
36+
result[(i * 8) + j] = (i * 10) + j;
37+
}
38+
}
39+
40+
var source = default(Block8x8F);
41+
source.LoadFrom(result);
42+
return source;
43+
}
44+
}
45+
}

tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ public void Load_Store_IntArray()
163163
}
164164

165165
[Fact]
166-
public void TransposeInto()
166+
public void TransposeIntoFallback()
167167
{
168168
float[] expected = Create8x8FloatData();
169169
ReferenceImplementations.Transpose8x8(expected);
@@ -172,14 +172,34 @@ public void TransposeInto()
172172
source.LoadFrom(Create8x8FloatData());
173173

174174
var dest = default(Block8x8F);
175-
source.TransposeInto(ref dest);
175+
source.TransposeIntoFallback(ref dest);
176176

177177
float[] actual = new float[64];
178178
dest.ScaledCopyTo(actual);
179179

180180
Assert.Equal(expected, actual);
181181
}
182182

183+
#if SUPPORTS_RUNTIME_INTRINSICS
184+
// Checks that TransposeIntoAvx produces the same 64 values as
// ReferenceImplementations.Transpose8x8 applied to the same input data.
// NOTE(review): no Avx.IsSupported check is visible in this test; it presumably fails on
// hardware without AVX. Confirm whether a guard or skip is needed.
[Fact]
185+
public void TransposeIntoAvx()
186+
{
187+
float[] expected = Create8x8FloatData();
188+
ReferenceImplementations.Transpose8x8(expected);
189+
190+
var source = default(Block8x8F);
191+
source.LoadFrom(Create8x8FloatData());
192+
193+
var dest = default(Block8x8F);
194+
source.TransposeIntoAvx(ref dest);
195+
196+
float[] actual = new float[64];
197+
dest.ScaledCopyTo(actual);
198+
// Exact element-wise comparison of the two float arrays.
199+
Assert.Equal(expected, actual);
200+
}
201+
#endif
202+
183203
private class BufferHolder
184204
{
185205
public Block8x8F Buffer;

0 commit comments

Comments
 (0)