[AArch64] Implement NEON vscale intrinsics #100347

Lukacma · 2024-07-24T11:19:25Z

This patch implements following intrinsics:

float16x4_t vscale_f16(float16x4_t vn, int16x4_t vm)	
float16x8_t vscaleq_f16(float16x8_t vn, int16x8_t vm)
float32x2_t vscale_f32(float32x2_t vn, int32x2_t vm)
float32x4_t vscaleq_f32(float32x4_t vn, int32x4_t vm)
float64x2_t vscaleq_f64(float64x2_t vn, int64x2_t vm)

as defined in ARM-software/acle#323

Co-authored-by: Hassnaa Hamdi [email protected]

llvmbot · 2024-07-24T11:19:57Z

@llvm/pr-subscribers-clang-codegen

@llvm/pr-subscribers-llvm-ir

Author: None (Lukacma)

Changes

This patch implements following intrinsics:

float16x4_t vscale_f16(float16x4_t vn, int16x4_t vm)	
float16x8_t vscaleq_f16(float16x8_t vn, int16x8_t vm)
float32x2_t vscale_f32(float32x2_t vn, int32x2_t vm)
float32x4_t vscaleq_f32(float32x4_t vn, int32x4_t vm)
float64x2_t vscaleq_f64(float64x2_t vn, int64x2_t vm)

as defined in ARM-software/acle#323

Co-authored-by: Hassnaa Hamdi <[email protected]>

Full diff: https://github.com/llvm/llvm-project/pull/100347.diff

7 Files Affected:

(modified) clang/include/clang/Basic/arm_neon.td (+6)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+8)
(added) clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c (+58)
(modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+7)
(modified) llvm/lib/Target/AArch64/AArch64InstrFormats.td (+21)
(modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+1-1)
(added) llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll (+54)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 3098fa67e6a51..f930c62a79280 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2096,3 +2096,9 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r
   def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
   def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
 }
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8" in {
+  // fscale
+  def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
+  def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
+}
\ No newline at end of file
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5639239359ab8..816899e5c11e3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13491,6 +13491,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_suqadd;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
   }
+  case NEON::BI__builtin_neon_vscale_f16:
+  case NEON::BI__builtin_neon_vscaleq_f16:
+  case NEON::BI__builtin_neon_vscale_f32:
+  case NEON::BI__builtin_neon_vscaleq_f32:
+  case NEON::BI__builtin_neon_vscaleq_f64: {
+    Int = Intrinsic::aarch64_neon_fp8_fscale;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fscale");
+  }
   }
 }
 
diff --git a/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
new file mode 100644
index 0000000000000..b50d30876a7c5
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
@@ -0,0 +1,58 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+#include <arm_neon.h>
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vscale_f16(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> [[VN]], <4 x i16> [[VM]])
+// CHECK-NEXT:    ret <4 x half> [[FSCALE2_I]]
+//
+float16x4_t test_vscale_f16(float16x4_t vn, int16x4_t vm) {
+  return vscale_f16(vn, vm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vscaleq_f16(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> [[VN]], <8 x i16> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[FSCALE2_I]]
+//
+float16x8_t test_vscaleq_f16(float16x8_t vn, int16x8_t vm) {
+  return vscaleq_f16(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vscale_f32(
+// CHECK-SAME: <2 x float> noundef [[VN:%.*]], <2 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> [[VN]], <2 x i32> [[VM]])
+// CHECK-NEXT:    ret <2 x float> [[FSCALE2_I]]
+//
+float32x2_t test_vscale_f32(float32x2_t vn, int32x2_t vm) {
+  return vscale_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vscaleq_f32(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> [[VN]], <4 x i32> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[FSCALE2_I]]
+//
+float32x4_t test_vscaleq_f32(float32x4_t vn, int32x4_t vm) {
+  return vscaleq_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x double> @test_vscale_f64(
+// CHECK-SAME: <2 x double> noundef [[VN:%.*]], <2 x i64> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> [[VN]], <2 x i64> [[VM]])
+// CHECK-NEXT:    ret <2 x double> [[FSCALE2_I]]
+//
+float64x2_t test_vscale_f64(float64x2_t vn, int64x2_t vm) {
+  return vscaleq_f64(vn, vm);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 3735bf5222fce..1f1691a6235b8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -563,6 +563,13 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
   def int_aarch64_neon_vcmla_rot90  : AdvSIMD_3VectorArg_Intrinsic;
   def int_aarch64_neon_vcmla_rot180 : AdvSIMD_3VectorArg_Intrinsic;
   def int_aarch64_neon_vcmla_rot270 : AdvSIMD_3VectorArg_Intrinsic;
+  
+  // FP8 fscale
+  def int_aarch64_neon_fp8_fscale : DefaultAttrsIntrinsic<
+                                    [llvm_anyvector_ty],
+                                    [LLVMMatchType<0>,
+                                    LLVMVectorOfBitcastsToInt<0>],
+                                    [IntrNoMem]>;
 }
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index e1ecc5a57dd26..46902fd9f8b0b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5985,6 +5985,27 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
         [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
 }
 
+// As above, but only floating point elements supported.
+let mayRaiseFPException = 1, Uses = [FPCR] in
+multiclass SIMDThreeVectorFP<bit U, bit S, bits<3> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+                                      asm, ".4h",
+        [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+                                      asm, ".8h",
+        [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
+                                      asm, ".2s",
+        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
+                                      asm, ".4s",
+        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
+                                      asm, ".2d",
+        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
 let mayRaiseFPException = 1, Uses = [FPCR] in
 multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
                                     string asm,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1053ba9242768..1fa21278657ae 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10128,7 +10128,7 @@ let Predicates = [HasFP8] in {
   defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
   defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
   defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
-  defm FSCALE : SIMDThreeSameVectorFP<0b1, 0b1, 0b111, "fscale", null_frag>;
+  defm FSCALE : SIMDThreeVectorFP<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
 } // End let Predicates = [HasFP8]
 
 let Predicates = [HasFAMINMAX] in {
diff --git a/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll b/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll
new file mode 100644
index 0000000000000..da0e365db2d31
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s
+
+
+define <4 x half> @test_fscale_f16(<4 x half> %vn, <4 x i16> %vm) {
+; CHECK-LABEL: test_fscale_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %res = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> %vn, <4 x i16> %vm)
+  ret <4 x half> %res
+}
+
+define <8 x half> @test_fscaleq_f16(<8 x half> %vn, <8 x i16> %vm) {
+; CHECK-LABEL: test_fscaleq_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %res = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> %vn, <8 x i16> %vm)
+  ret <8 x half> %res
+}
+
+define <2 x float> @test_fscale_f32(<2 x float> %vn, <2 x i32> %vm) {
+; CHECK-LABEL: test_fscale_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %res = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> %vn, <2 x i32> %vm)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_fscaleq_f32(<4 x float> %vn, <4 x i32> %vm) {
+; CHECK-LABEL: test_fscaleq_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %res = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> %vn, <4 x i32> %vm)
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_fscaleq_f64(<2 x double> %vn, <2 x i64> %vm) {
+; CHECK-LABEL: test_fscaleq_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %res = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> %vn, <2 x i64> %vm)
+  ret <2 x double> %res
+}
+
+declare <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half>, <4 x i16>)
+declare <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half>, <8 x i16>)
+declare <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float>, <2 x i32>)
+declare <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float>, <4 x i32>)
+declare <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double>, <2 x i64>)

llvmbot · 2024-07-24T11:19:58Z

@llvm/pr-subscribers-backend-aarch64

Author: None (Lukacma)

Changes

This patch implements following intrinsics:

float16x4_t vscale_f16(float16x4_t vn, int16x4_t vm)	
float16x8_t vscaleq_f16(float16x8_t vn, int16x8_t vm)
float32x2_t vscale_f32(float32x2_t vn, int32x2_t vm)
float32x4_t vscaleq_f32(float32x4_t vn, int32x4_t vm)
float64x2_t vscaleq_f64(float64x2_t vn, int64x2_t vm)

as defined in ARM-software/acle#323

Co-authored-by: Hassnaa Hamdi <[email protected]>

Full diff: https://github.com/llvm/llvm-project/pull/100347.diff

7 Files Affected:

(modified) clang/include/clang/Basic/arm_neon.td (+6)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+8)
(added) clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c (+58)
(modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+7)
(modified) llvm/lib/Target/AArch64/AArch64InstrFormats.td (+21)
(modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+1-1)
(added) llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll (+54)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 3098fa67e6a51..f930c62a79280 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2096,3 +2096,9 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r
   def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
   def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
 }
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8" in {
+  // fscale
+  def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
+  def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
+}
\ No newline at end of file
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5639239359ab8..816899e5c11e3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13491,6 +13491,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_suqadd;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
   }
+  case NEON::BI__builtin_neon_vscale_f16:
+  case NEON::BI__builtin_neon_vscaleq_f16:
+  case NEON::BI__builtin_neon_vscale_f32:
+  case NEON::BI__builtin_neon_vscaleq_f32:
+  case NEON::BI__builtin_neon_vscaleq_f64: {
+    Int = Intrinsic::aarch64_neon_fp8_fscale;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fscale");
+  }
   }
 }
 
diff --git a/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
new file mode 100644
index 0000000000000..b50d30876a7c5
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
@@ -0,0 +1,58 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+#include <arm_neon.h>
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vscale_f16(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> [[VN]], <4 x i16> [[VM]])
+// CHECK-NEXT:    ret <4 x half> [[FSCALE2_I]]
+//
+float16x4_t test_vscale_f16(float16x4_t vn, int16x4_t vm) {
+  return vscale_f16(vn, vm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vscaleq_f16(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> [[VN]], <8 x i16> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[FSCALE2_I]]
+//
+float16x8_t test_vscaleq_f16(float16x8_t vn, int16x8_t vm) {
+  return vscaleq_f16(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vscale_f32(
+// CHECK-SAME: <2 x float> noundef [[VN:%.*]], <2 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> [[VN]], <2 x i32> [[VM]])
+// CHECK-NEXT:    ret <2 x float> [[FSCALE2_I]]
+//
+float32x2_t test_vscale_f32(float32x2_t vn, int32x2_t vm) {
+  return vscale_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vscaleq_f32(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> [[VN]], <4 x i32> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[FSCALE2_I]]
+//
+float32x4_t test_vscaleq_f32(float32x4_t vn, int32x4_t vm) {
+  return vscaleq_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x double> @test_vscale_f64(
+// CHECK-SAME: <2 x double> noundef [[VN:%.*]], <2 x i64> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> [[VN]], <2 x i64> [[VM]])
+// CHECK-NEXT:    ret <2 x double> [[FSCALE2_I]]
+//
+float64x2_t test_vscale_f64(float64x2_t vn, int64x2_t vm) {
+  return vscaleq_f64(vn, vm);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 3735bf5222fce..1f1691a6235b8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -563,6 +563,13 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
   def int_aarch64_neon_vcmla_rot90  : AdvSIMD_3VectorArg_Intrinsic;
   def int_aarch64_neon_vcmla_rot180 : AdvSIMD_3VectorArg_Intrinsic;
   def int_aarch64_neon_vcmla_rot270 : AdvSIMD_3VectorArg_Intrinsic;
+  
+  // FP8 fscale
+  def int_aarch64_neon_fp8_fscale : DefaultAttrsIntrinsic<
+                                    [llvm_anyvector_ty],
+                                    [LLVMMatchType<0>,
+                                    LLVMVectorOfBitcastsToInt<0>],
+                                    [IntrNoMem]>;
 }
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index e1ecc5a57dd26..46902fd9f8b0b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5985,6 +5985,27 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
         [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
 }
 
+// As above, but only floating point elements supported.
+let mayRaiseFPException = 1, Uses = [FPCR] in
+multiclass SIMDThreeVectorFP<bit U, bit S, bits<3> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+                                      asm, ".4h",
+        [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+                                      asm, ".8h",
+        [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
+                                      asm, ".2s",
+        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
+                                      asm, ".4s",
+        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
+                                      asm, ".2d",
+        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
 let mayRaiseFPException = 1, Uses = [FPCR] in
 multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
                                     string asm,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1053ba9242768..1fa21278657ae 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10128,7 +10128,7 @@ let Predicates = [HasFP8] in {
   defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
   defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
   defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
-  defm FSCALE : SIMDThreeSameVectorFP<0b1, 0b1, 0b111, "fscale", null_frag>;
+  defm FSCALE : SIMDThreeVectorFP<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
 } // End let Predicates = [HasFP8]
 
 let Predicates = [HasFAMINMAX] in {
diff --git a/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll b/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll
new file mode 100644
index 0000000000000..da0e365db2d31
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s
+
+
+define <4 x half> @test_fscale_f16(<4 x half> %vn, <4 x i16> %vm) {
+; CHECK-LABEL: test_fscale_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %res = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> %vn, <4 x i16> %vm)
+  ret <4 x half> %res
+}
+
+define <8 x half> @test_fscaleq_f16(<8 x half> %vn, <8 x i16> %vm) {
+; CHECK-LABEL: test_fscaleq_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %res = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> %vn, <8 x i16> %vm)
+  ret <8 x half> %res
+}
+
+define <2 x float> @test_fscale_f32(<2 x float> %vn, <2 x i32> %vm) {
+; CHECK-LABEL: test_fscale_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %res = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> %vn, <2 x i32> %vm)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_fscaleq_f32(<4 x float> %vn, <4 x i32> %vm) {
+; CHECK-LABEL: test_fscaleq_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %res = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> %vn, <4 x i32> %vm)
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_fscaleq_f64(<2 x double> %vn, <2 x i64> %vm) {
+; CHECK-LABEL: test_fscaleq_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %res = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> %vn, <2 x i64> %vm)
+  ret <2 x double> %res
+}
+
+declare <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half>, <4 x i16>)
+declare <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half>, <8 x i16>)
+declare <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float>, <2 x i32>)
+declare <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float>, <4 x i32>)
+declare <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double>, <2 x i64>)

llvmbot · 2024-07-24T11:19:58Z

@llvm/pr-subscribers-clang

Author: None (Lukacma)

Changes

This patch implements following intrinsics:

float16x4_t vscale_f16(float16x4_t vn, int16x4_t vm)	
float16x8_t vscaleq_f16(float16x8_t vn, int16x8_t vm)
float32x2_t vscale_f32(float32x2_t vn, int32x2_t vm)
float32x4_t vscaleq_f32(float32x4_t vn, int32x4_t vm)
float64x2_t vscaleq_f64(float64x2_t vn, int64x2_t vm)

as defined in ARM-software/acle#323

Co-authored-by: Hassnaa Hamdi <[email protected]>

Full diff: https://github.com/llvm/llvm-project/pull/100347.diff

7 Files Affected:

(modified) clang/include/clang/Basic/arm_neon.td (+6)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+8)
(added) clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c (+58)
(modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+7)
(modified) llvm/lib/Target/AArch64/AArch64InstrFormats.td (+21)
(modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+1-1)
(added) llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll (+54)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 3098fa67e6a51..f930c62a79280 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2096,3 +2096,9 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r
   def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
   def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
 }
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8" in {
+  // fscale
+  def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
+  def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
+}
\ No newline at end of file
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5639239359ab8..816899e5c11e3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13491,6 +13491,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_suqadd;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
   }
+  case NEON::BI__builtin_neon_vscale_f16:
+  case NEON::BI__builtin_neon_vscaleq_f16:
+  case NEON::BI__builtin_neon_vscale_f32:
+  case NEON::BI__builtin_neon_vscaleq_f32:
+  case NEON::BI__builtin_neon_vscaleq_f64: {
+    Int = Intrinsic::aarch64_neon_fp8_fscale;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fscale");
+  }
   }
 }
 
diff --git a/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
new file mode 100644
index 0000000000000..b50d30876a7c5
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
@@ -0,0 +1,58 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+#include <arm_neon.h>
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vscale_f16(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> [[VN]], <4 x i16> [[VM]])
+// CHECK-NEXT:    ret <4 x half> [[FSCALE2_I]]
+//
+float16x4_t test_vscale_f16(float16x4_t vn, int16x4_t vm) {
+  return vscale_f16(vn, vm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vscaleq_f16(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> [[VN]], <8 x i16> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[FSCALE2_I]]
+//
+float16x8_t test_vscaleq_f16(float16x8_t vn, int16x8_t vm) {
+  return vscaleq_f16(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vscale_f32(
+// CHECK-SAME: <2 x float> noundef [[VN:%.*]], <2 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> [[VN]], <2 x i32> [[VM]])
+// CHECK-NEXT:    ret <2 x float> [[FSCALE2_I]]
+//
+float32x2_t test_vscale_f32(float32x2_t vn, int32x2_t vm) {
+  return vscale_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vscaleq_f32(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> [[VN]], <4 x i32> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[FSCALE2_I]]
+//
+float32x4_t test_vscaleq_f32(float32x4_t vn, int32x4_t vm) {
+  return vscaleq_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x double> @test_vscale_f64(
+// CHECK-SAME: <2 x double> noundef [[VN:%.*]], <2 x i64> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> [[VN]], <2 x i64> [[VM]])
+// CHECK-NEXT:    ret <2 x double> [[FSCALE2_I]]
+//
+float64x2_t test_vscale_f64(float64x2_t vn, int64x2_t vm) {
+  return vscaleq_f64(vn, vm);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 3735bf5222fce..1f1691a6235b8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -563,6 +563,13 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
   def int_aarch64_neon_vcmla_rot90  : AdvSIMD_3VectorArg_Intrinsic;
   def int_aarch64_neon_vcmla_rot180 : AdvSIMD_3VectorArg_Intrinsic;
   def int_aarch64_neon_vcmla_rot270 : AdvSIMD_3VectorArg_Intrinsic;
+  
+  // FP8 fscale
+  def int_aarch64_neon_fp8_fscale : DefaultAttrsIntrinsic<
+                                    [llvm_anyvector_ty],
+                                    [LLVMMatchType<0>,
+                                    LLVMVectorOfBitcastsToInt<0>],
+                                    [IntrNoMem]>;
 }
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index e1ecc5a57dd26..46902fd9f8b0b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5985,6 +5985,27 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
         [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
 }
 
+// As above, but only floating point elements supported.
+let mayRaiseFPException = 1, Uses = [FPCR] in
+multiclass SIMDThreeVectorFP<bit U, bit S, bits<3> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+                                      asm, ".4h",
+        [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+                                      asm, ".8h",
+        [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
+                                      asm, ".2s",
+        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
+                                      asm, ".4s",
+        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
+                                      asm, ".2d",
+        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
 let mayRaiseFPException = 1, Uses = [FPCR] in
 multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
                                     string asm,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1053ba9242768..1fa21278657ae 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10128,7 +10128,7 @@ let Predicates = [HasFP8] in {
   defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
   defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
   defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
-  defm FSCALE : SIMDThreeSameVectorFP<0b1, 0b1, 0b111, "fscale", null_frag>;
+  defm FSCALE : SIMDThreeVectorFP<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
 } // End let Predicates = [HasFP8]
 
 let Predicates = [HasFAMINMAX] in {
diff --git a/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll b/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll
new file mode 100644
index 0000000000000..da0e365db2d31
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s
+
+
+define <4 x half> @test_fscale_f16(<4 x half> %vn, <4 x i16> %vm) {
+; CHECK-LABEL: test_fscale_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %res = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> %vn, <4 x i16> %vm)
+  ret <4 x half> %res
+}
+
+define <8 x half> @test_fscaleq_f16(<8 x half> %vn, <8 x i16> %vm) {
+; CHECK-LABEL: test_fscaleq_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %res = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> %vn, <8 x i16> %vm)
+  ret <8 x half> %res
+}
+
+define <2 x float> @test_fscale_f32(<2 x float> %vn, <2 x i32> %vm) {
+; CHECK-LABEL: test_fscale_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %res = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> %vn, <2 x i32> %vm)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_fscaleq_f32(<4 x float> %vn, <4 x i32> %vm) {
+; CHECK-LABEL: test_fscaleq_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %res = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> %vn, <4 x i32> %vm)
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_fscaleq_f64(<2 x double> %vn, <2 x i64> %vm) {
+; CHECK-LABEL: test_fscaleq_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %res = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> %vn, <2 x i64> %vm)
+  ret <2 x double> %res
+}
+
+declare <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half>, <4 x i16>)
+declare <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half>, <8 x i16>)
+declare <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float>, <2 x i32>)
+declare <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float>, <4 x i32>)
+declare <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double>, <2 x i64>)

CarolineConcatto · 2024-07-26T15:12:11Z

llvm/lib/Target/AArch64/AArch64InstrFormats.td

@@ -5985,6 +5985,27 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
 }

+// As above, but only floating point elements supported.


nit: I believe we can remove this comments.
Looks like a c&p from the class above, that is also only supporting FP.

I think the wrong comment was removed here.

SpencerAbson · 2024-08-30T16:58:31Z

clang/include/clang/Basic/arm_neon.td

@@ -2096,3 +2096,9 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r
  def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
  def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
 }
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8" in {


Should this be TargetGuard = "fp8,neon"?

SpencerAbson · 2024-08-30T17:16:13Z

llvm/lib/Target/AArch64/AArch64InstrFormats.td

@@ -5985,6 +5985,27 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
 }

+// As above, but only floating point elements supported.


I think the wrong comment was removed here.

SpencerAbson · 2024-08-30T17:17:09Z

llvm/lib/Target/AArch64/AArch64InstrFormats.td

@@ -5985,6 +5984,27 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
 }

+// As above, but only floating point elements supported.
+let mayRaiseFPException = 1, Uses = [FPCR] in
+multiclass SIMDThreeVectorFP<bit U, bit S, bits<3> opc,


Could the name of this class be changed to reflect that it's second operand's elements must be integers?

Renamed and moved.

SpencerAbson

Thanks, LGTM.

llvm-ci · 2024-09-26T15:51:02Z

LLVM Buildbot has detected a new failure on builder llvm-clang-x86_64-sie-ubuntu-fast running on sie-linux-worker while building clang,llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/144/builds/8043