From 02c138f8d1d6ca7152823d44ad5709d13bcd06ee Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Wed, 25 Sep 2024 14:34:00 +0100
Subject: [PATCH] [AArch64] Implement intrinsics for SME2 FSCALE (#100128)

This patch implements these intrinsics:

FSCALE SINGLE AND MULTI
```
// Variants are also available for:
// [_single_f32_x2], [_single_f64_x2],
// [_single_f16_x4], [_single_f32_x4], [_single_f64_x4]
svfloat16x2_t svscale[_single_f16_x2](svfloat16x2_t zd, svint16_t zm) __arm_streaming;

// Variants are also available for:
// [_f32_x2], [_f64_x2],
// [_f16_x4], [_f32_x4], [_f64_x4]
svfloat16x2_t svscale[_f16_x2](svfloat16x2_t zd, svint16x2_t zm) __arm_streaming;
```
(cf. https://github.com/ARM-software/acle/pull/323)

Co-authored-by: Caroline Concatto
---
 clang/include/clang/Basic/arm_sve.td          |  10 +
 .../acle_sme2_fp8_scale.c                     | 452 ++++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  25 +
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  28 ++
 .../CodeGen/AArch64/sme2-intrinsics-fscale.ll | 186 +++++++
 5 files changed, 701 insertions(+)
 create mode 100644 clang/test/CodeGen/aarch64-fp8-intrinsics/acle_sme2_fp8_scale.c
 create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-fscale.ll

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index edf73d9022b064..da496e30fbb523 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2418,6 +2418,16 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in {
   def SVUUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>;
 }
 
+//
+// Multi-vector scaling
+//
+let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in {
+  def FSCALE_SINGLE_X2 : Inst<"svscale[_single_{d}_x2]", "22x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x2", [IsStreaming],[]>;
+  def FSCALE_SINGLE_X4 : Inst<"svscale[_single_{d}_x4]", "44x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x4", [IsStreaming],[]>;
+  def FSCALE_X2 : Inst<"svscale[_{d}_x2]", "222.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x2", [IsStreaming],[]>;
+  def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>;
+}
+
 let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in {
 // == BFloat16 multiply-subtract ==
   def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone, VerifyRuntimeMode], []>;
diff --git a/clang/test/CodeGen/aarch64-fp8-intrinsics/acle_sme2_fp8_scale.c b/clang/test/CodeGen/aarch64-fp8-intrinsics/acle_sme2_fp8_scale.c
new file mode 100644
index 00000000000000..b733e772ba3075
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-fp8-intrinsics/acle_sme2_fp8_scale.c
@@ -0,0 +1,452 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s |
opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + + +// Single x2 +// CHECK-LABEL: @test_svscale_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svscale_single_f16_x213svfloat16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svscale_single_f16_x2(svfloat16x2_t op1, svint16_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_single_f16_x2)(op1, op2); +} + +// CHECK-LABEL: @test_svscale_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svscale_single_f32_x213svfloat32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1]], i64 4) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svscale_single_f32_x2(svfloat32x2_t op1, svint32_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_single_f32_x2)(op1, op2); +} + +// CHECK-LABEL: @test_svscale_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svscale_single_f64_x213svfloat64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svscale_single_f64_x2(svfloat64x2_t op1, svint64_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_single_f64_x2)(op1, op2); +} + +// Single x4 +// CHECK-LABEL: @test_svscale_single_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// 
CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svscale_single_f16_x413svfloat16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svscale_single_f16_x4(svfloat16x4_t op1, svint16_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_single_f16_x4)(op1, op2); +} + +// CHECK-LABEL: @test_svscale_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svscale_single_f32_x413svfloat32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 8) +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svscale_single_f32_x4(svfloat32x4_t op1, svint32_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_single_f32_x4)(op1, op2); +} + +// CHECK-LABEL: @test_svscale_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svscale_single_f64_x413svfloat64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = 
extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svscale_single_f64_x4(svfloat64x4_t op1, svint64_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_single_f64_x4)(op1, op2); +} + +// Multi x2 +// CHECK-LABEL: @test_svscale_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP2:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP2]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svscale_f16_x213svfloat16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP2]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svscale_f16_x2(svfloat16x2_t op1, svint16x2_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_f16_x2)(op1, op2); +} + +// CHECK-LABEL: @test_svscale_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP2:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP2]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], 
[[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svscale_f32_x213svfloat32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP2]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svscale_f32_x2(svfloat32x2_t op1, svint32x2_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_f32_x2)(op1, op2); +} + +// CHECK-LABEL: @test_svscale_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP2:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP2]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svscale_f64_x213svfloat64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP2]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.fp8.scale.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svscale_f64_x2(svfloat64x2_t op1, svint64x2_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_f64_x2)(op1, op2); +} + +// Multi x4 +// CHECK-LABEL: @test_svscale_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP2]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP2]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP2]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z19test_svscale_f16_x413svfloat16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP2]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP2]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP2]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svscale_f16_x4(svfloat16x4_t op1, svint16x4_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_f16_x4)(op1, op2); +} + +// 
CHECK-LABEL: @test_svscale_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP2]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP2]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP2]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z19test_svscale_f32_x413svfloat32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP2]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP2]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP2]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], 
i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svscale_f32_x4(svfloat32x4_t op1, svint32x4_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_f32_x4)(op1, op2); +} + +// CHECK-LABEL: @test_svscale_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP2]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP2]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP2]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z19test_svscale_f64_x413svfloat64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP2]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP2]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP2]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// 
CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svscale_f64_x4(svfloat64x4_t op1, svint64x4_t op2) __arm_streaming +{ + return SVE_ACLE_FUNC(svscale,_f64_x4)(op1, op2); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 8ffa2d0878e116..b2a2e11240186d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3762,6 +3762,31 @@ let TargetPrefix = "aarch64" in { : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty], [ImmArg>, ImmArg>, IntrReadMem]>; + + // + // Register scaling + // + def int_aarch64_sme_fp8_scale_single_x2 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>], + [IntrNoMem]>; + + def int_aarch64_sme_fp8_scale_single_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>], + [IntrNoMem]>; + + def int_aarch64_sme_fp8_scale_x2 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, + LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], + [IntrNoMem]>; + + def int_aarch64_sme_fp8_scale_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, + LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], + [IntrNoMem]>; } // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 69806c9c3fdbf7..dfb6b08b1f73b2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5640,6 +5640,34 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::SQDMULH_VG4_4Z4Z_S, AArch64::SQDMULH_VG4_4Z4Z_D})) SelectDestructiveMultiIntrinsic(Node, 4, true, Op); return; + case Intrinsic::aarch64_sme_fp8_scale_single_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FSCALE_2ZZ_H, AArch64::FSCALE_2ZZ_S, + AArch64::FSCALE_2ZZ_D})) + SelectDestructiveMultiIntrinsic(Node, 2, false, Op); + return; + case Intrinsic::aarch64_sme_fp8_scale_single_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FSCALE_4ZZ_H, AArch64::FSCALE_4ZZ_S, + AArch64::FSCALE_4ZZ_D})) + SelectDestructiveMultiIntrinsic(Node, 4, false, Op); + return; + case Intrinsic::aarch64_sme_fp8_scale_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FSCALE_2Z2Z_H, AArch64::FSCALE_2Z2Z_S, + AArch64::FSCALE_2Z2Z_D})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op); + return; + case Intrinsic::aarch64_sme_fp8_scale_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FSCALE_4Z4Z_H, AArch64::FSCALE_4Z4Z_S, + AArch64::FSCALE_4Z4Z_D})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op); + return; case Intrinsic::aarch64_sve_whilege_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), diff --git 
a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fscale.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fscale.ll new file mode 100644 index 00000000000000..591fe8da6b79cd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fscale.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+fp8 -force-streaming -verify-machineinstrs < %s | FileCheck %s + +; FSCALE (Single, x2) + +define { , } @multi_vec_scale_single_x2_half( %zdn1, %zdn2, %zm) { +; CHECK-LABEL: multi_vec_scale_single_x2_half: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fscale { z0.h, z1.h }, { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv8f16( %zdn1, %zdn2, %zm) + ret { , } %res +} + +define { , } @multi_vec_scale_single_x2_float( %zdn1, %zdn2, %zm) { +; CHECK-LABEL: multi_vec_scale_single_x2_float: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fscale { z0.s, z1.s }, { z0.s, z1.s }, z2.s +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv4f32( %zdn1, %zdn2, %zm) + ret { , } %res +} + +define { , } @multi_vec_scale_single_x2_double( %zdn1, %zdn2, %zm) { +; CHECK-LABEL: multi_vec_scale_single_x2_double: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fscale { z0.d, z1.d }, { z0.d, z1.d }, z2.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv2f64( %zdn1, %zdn2, %zm) + ret { , } %res +} + +; FSCALE (Single, x4) + +define { , , , } @multi_vec_scale_single_x4_half( %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +; CHECK-LABEL: multi_vec_scale_single_x4_half: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fscale { z0.h - z3.h }, { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm) + ret { , , , } %res +} + +define { , , , } @multi_vec_scale_single_x4_float( %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +; CHECK-LABEL: multi_vec_scale_single_x4_float: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fscale { z0.s - z3.s }, { z0.s - z3.s }, z4.s +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm) + ret { , , , } %res +} + +define { , , , } @multi_vec_scale_single_x4_double( %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +; CHECK-LABEL: multi_vec_scale_single_x4_double: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 
killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fscale { z0.d - z3.d }, { z0.d - z3.d }, z4.d +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, %zm) + ret { , , , } %res +} + +; FSCALE (Multi, x2) +define { , } @multi_vec_scale_x2_half( %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_scale_x2_half: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: fscale { z0.h, z1.h }, { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.fp8.scale.x2.nxv8f16( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_scale_x2_float( %zdn1, %zdn2, %zm1, %zm2 ) { +; CHECK-LABEL: multi_vec_scale_x2_float: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: fscale { z0.s, z1.s }, { z0.s, z1.s }, { z2.s, z3.s } +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.fp8.scale.x2.nxv4f32( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_scale_x2_double( %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_scale_x2_double: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: fscale { z0.d, z1.d }, { z0.d, z1.d }, { z2.d, z3.d } +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.fp8.scale.x2.nxv2f64( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +; FSCALE (Multi, x4) +define { , , , } @multi_vec_scale_x4_half( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_scale_x4_half: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: fscale { z0.h - z3.h }, { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_scale_x4_float( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_scale_x4_float: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // 
kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: fscale { z0.s - z3.s }, { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_scale_x4_double( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_scale_x4_double: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: fscale { z0.d - z3.d }, { z0.d - z3.d }, { z4.d - z7.d } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +declare { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv8f16(, , ) +declare { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv4f32(, , ) +declare { , } @llvm.aarch64.sme.fp8.scale.single.x2.nxv2f64(, , ) + +declare { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv8f16(, ,, , ) +declare { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv4f32(, ,, , ) +declare { , , , } @llvm.aarch64.sme.fp8.scale.single.x4.nxv2f64(, ,, , ) + +declare { , } @llvm.aarch64.sme.fp8.scale.x2.nxv8f16(, , , ) +declare { , } @llvm.aarch64.sme.fp8.scale.x2.nxv4f32(, , , ) +declare { , } @llvm.aarch64.sme.fp8.scale.x2.nxv2f64(, , , ) + +declare { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv8f16(, ,, , , , , ) +declare { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv4f32(, ,, , , , , ) +declare { , , , } @llvm.aarch64.sme.fp8.scale.x4.nxv2f64(, ,, , , , , )
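
Usage note (editorial, not part of the patch): the new builtins follow the usual SME2 multi-vector ACLE pattern, with FSCALE behaving roughly like a per-element `ldexp` that scales each floating-point element by two raised to the corresponding signed-integer exponent. A minimal sketch of how the non-overloaded forms might be called is shown below; the wrapper function names are illustrative, and it assumes `arm_sme.h`, the `sme2` and `fp8` target features, and a streaming caller.

```c
#include <arm_sme.h>

// Single-vector form: one exponent vector is applied to both halves of
// the f32 pair (illustrative sketch; wrapper names are not from the patch).
svfloat32x2_t scale_pair_shared_exp(svfloat32x2_t vals, svint32_t exps)
    __arm_streaming {
  return svscale_single_f32_x2(vals, exps);
}

// Multi-vector form: each half of `vals` is scaled by the matching half
// of `exps`.
svfloat32x2_t scale_pair_per_half(svfloat32x2_t vals, svint32x2_t exps)
    __arm_streaming {
  return svscale_f32_x2(vals, exps);
}
```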