-
Notifications
You must be signed in to change notification settings - Fork 371
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add x32 AVX512 GEMM microkernels. These are considerably faster than the corresponding x16 microkernels (benchmark results below).

f32_gemm_minmax_ukernel_1x16__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 1029482 ns 1029201 ns 1364 FLOPS=32.5935G/s
f32_gemm_minmax_ukernel_4x16__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 586536 ns 586291 ns 2407 FLOPS=57.2078G/s
f32_gemm_minmax_ukernel_5x16__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 541205 ns 541143 ns 2581 FLOPS=61.9994G/s
f32_gemm_minmax_ukernel_6x16__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 566296 ns 566175 ns 2466 FLOPS=59.2525G/s
f32_gemm_minmax_ukernel_7x16__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 557455 ns 557275 ns 2464 FLOPS=60.1921G/s
f32_gemm_minmax_ukernel_8x16__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 586344 ns 586240 ns 2396 FLOPS=57.2265G/s
f32_gemm_minmax_ukernel_1x32__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 568475 ns 568317 ns 2494 FLOPS=59.0254G/s
f32_gemm_minmax_ukernel_4x32__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 343299 ns 343171 ns 3967 FLOPS=97.7411G/s
f32_gemm_minmax_ukernel_5x32__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 326208 ns 326123 ns 4317 FLOPS=102.862G/s
f32_gemm_minmax_ukernel_6x32__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 324746 ns 324675 ns 4317 FLOPS=103.325G/s
f32_gemm_minmax_ukernel_7x32__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 319434 ns 319344 ns 4337 FLOPS=105.043G/s
f32_gemm_minmax_ukernel_8x32__avx512f_broadcast/selfiesegmentation_full/M:16384/N:32/K:32/real_time 358816 ns 358735 ns 4029 FLOPS=93.5143G/s

PiperOrigin-RevId: 692901687
- Loading branch information
1 parent
9dc7da7
commit 1fed338
Showing
37 changed files
with
4,979 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
// Auto-generated file. Do not edit! | ||
// Template: src/f32-gemm/avx512-broadcast.c.in | ||
// Generator: tools/xngen | ||
// | ||
// Copyright 2019 Google LLC | ||
// | ||
// This source code is licensed under the BSD-style license found in the | ||
// LICENSE file in the root directory of this source tree. | ||
|
||
#include <assert.h> | ||
|
||
#include <immintrin.h> | ||
|
||
#include "xnnpack/gemm.h" | ||
#include "xnnpack/intrinsics-polyfill.h" | ||
|
||
|
||
void xnn_f32_gemm_minmax_ukernel_1x32__avx512f_broadcast( | ||
size_t mr, | ||
size_t nc, | ||
size_t kc, | ||
const float* restrict a, | ||
size_t a_stride, | ||
const float* restrict w, | ||
float* restrict c, | ||
size_t cm_stride, | ||
size_t cn_stride, | ||
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) | ||
{ | ||
assert(mr != 0); | ||
assert(mr <= 1); | ||
assert(nc != 0); | ||
assert(kc != 0); | ||
assert(kc % sizeof(float) == 0); | ||
assert(a != NULL); | ||
assert(w != NULL); | ||
assert(c != NULL); | ||
|
||
const float* a0 = a; | ||
float* c0 = c; | ||
do { | ||
__m512 vacc0x0 = _mm512_load_ps(w); | ||
__m512 vacc0x1 = _mm512_load_ps(w + 16); | ||
w += 32; | ||
|
||
size_t k = kc; | ||
do { | ||
const __m512 vb0 = _mm512_load_ps(w); | ||
const __m512 vb1 = _mm512_loadu_ps(w + 16); | ||
w += 32; | ||
|
||
const __m512 va0 = _mm512_set1_ps(*a0); | ||
vacc0x0 = _mm512_fmadd_ps(va0, vb0, vacc0x0); | ||
vacc0x1 = _mm512_fmadd_ps(va0, vb1, vacc0x1); | ||
|
||
a0 += 1; | ||
|
||
k -= sizeof(float); | ||
} while (k != 0); | ||
|
||
const __m512 vmin = _mm512_set1_ps(params->scalar.min); | ||
vacc0x0 = _mm512_max_ps(vmin, vacc0x0); | ||
vacc0x1 = _mm512_max_ps(vmin, vacc0x1); | ||
|
||
const __m512 vmax = _mm512_set1_ps(params->scalar.max); | ||
vacc0x0 = _mm512_min_ps(vmax, vacc0x0); | ||
vacc0x1 = _mm512_min_ps(vmax, vacc0x1); | ||
|
||
if XNN_LIKELY(nc >= 32) { | ||
_mm512_storeu_ps(c0, vacc0x0); | ||
_mm512_storeu_ps(c0 + 16, vacc0x1); | ||
c0 = (float*) ((uintptr_t) c0 + cn_stride); | ||
|
||
a0 = (const float*) ((uintptr_t) a0 - kc); | ||
|
||
nc -= 32; | ||
} else { | ||
if (nc & 16) { | ||
_mm512_storeu_ps(c0, vacc0x0); | ||
|
||
vacc0x0 = vacc0x1; | ||
|
||
c0 += 16; | ||
} | ||
if (nc & 15) { | ||
// Prepare mask for valid 32-bit elements (depends on nc). | ||
const __mmask16 vmask = _cvtu32_mask16((uint32_t) (UINT32_C(1) << (nc & 15)) - UINT32_C(1)); | ||
_mm512_mask_storeu_ps(c0, vmask, vacc0x0); | ||
} | ||
nc = 0; | ||
} | ||
} while (nc != 0); | ||
} |
Oops, something went wrong.