forked from llvm/llvm-project
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AArch64] Implement NEON vscale intrinsics (llvm#100347)
This patch implements the following intrinsics:
```
float16x4_t vscale_f16(float16x4_t vn, int16x4_t vm)
float16x8_t vscaleq_f16(float16x8_t vn, int16x8_t vm)
float32x2_t vscale_f32(float32x2_t vn, int32x2_t vm)
float32x4_t vscaleq_f32(float32x4_t vn, int32x4_t vm)
float64x2_t vscaleq_f64(float64x2_t vn, int64x2_t vm)
```
as defined in ARM-software/acle#323.
Co-authored-by: Hassnaa Hamdi <[email protected]>
- Loading branch information
1 parent
08d1a4a
commit ef2b513
Showing
7 changed files
with
154 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
58 changes: 58 additions & 0 deletions
58
clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 | ||
#include <arm_neon.h> | ||
|
||
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -O3 -emit-llvm -o - %s | FileCheck %s | ||
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -S -O3 -o /dev/null %s | ||
|
||
// Verifies that vscale_f16 is emitted as one call to @llvm.aarch64.neon.fp8.fscale.v4f16 on the unmodified arguments, with the call result returned directly. | ||
// CHECK-LABEL: define dso_local <4 x half> @test_vscale_f16( | ||
// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> [[VN]], <4 x i16> [[VM]]) | ||
// CHECK-NEXT: ret <4 x half> [[FSCALE2_I]] | ||
// | ||
float16x4_t test_vscale_f16(float16x4_t vn, int16x4_t vm) { | ||
return vscale_f16(vn, vm); | ||
} | ||
|
||
// Verifies that vscaleq_f16 is emitted as one call to @llvm.aarch64.neon.fp8.fscale.v8f16 on the unmodified arguments, with the call result returned directly. | ||
// CHECK-LABEL: define dso_local <8 x half> @test_vscaleq_f16( | ||
// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> [[VN]], <8 x i16> [[VM]]) | ||
// CHECK-NEXT: ret <8 x half> [[FSCALE2_I]] | ||
// | ||
float16x8_t test_vscaleq_f16(float16x8_t vn, int16x8_t vm) { | ||
return vscaleq_f16(vn, vm); | ||
|
||
} | ||
|
||
// Verifies that vscale_f32 is emitted as one call to @llvm.aarch64.neon.fp8.fscale.v2f32 on the unmodified arguments, with the call result returned directly. | ||
// CHECK-LABEL: define dso_local <2 x float> @test_vscale_f32( | ||
// CHECK-SAME: <2 x float> noundef [[VN:%.*]], <2 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> [[VN]], <2 x i32> [[VM]]) | ||
// CHECK-NEXT: ret <2 x float> [[FSCALE2_I]] | ||
// | ||
float32x2_t test_vscale_f32(float32x2_t vn, int32x2_t vm) { | ||
return vscale_f32(vn, vm); | ||
|
||
} | ||
|
||
// Verifies that vscaleq_f32 is emitted as one call to @llvm.aarch64.neon.fp8.fscale.v4f32 on the unmodified arguments, with the call result returned directly. | ||
// CHECK-LABEL: define dso_local <4 x float> @test_vscaleq_f32( | ||
// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> [[VN]], <4 x i32> [[VM]]) | ||
// CHECK-NEXT: ret <4 x float> [[FSCALE2_I]] | ||
// | ||
float32x4_t test_vscaleq_f32(float32x4_t vn, int32x4_t vm) { | ||
return vscaleq_f32(vn, vm); | ||
|
||
} | ||
|
||
// Verifies that vscaleq_f64 is emitted as one call to @llvm.aarch64.neon.fp8.fscale.v2f64 on the unmodified arguments, with the call result returned directly. | ||
// The test is named after the q-form intrinsic it exercises, matching the other test_vscaleq_* cases in this file. | ||
// CHECK-LABEL: define dso_local <2 x double> @test_vscaleq_f64( | ||
// CHECK-SAME: <2 x double> noundef [[VN:%.*]], <2 x i64> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> [[VN]], <2 x i64> [[VM]]) | ||
// CHECK-NEXT: ret <2 x double> [[FSCALE2_I]] | ||
// | ||
float64x2_t test_vscaleq_f64(float64x2_t vn, int64x2_t vm) { | ||
return vscaleq_f64(vn, vm); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 | ||
; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s | ||
|
||
|
||
; Verifies that the v4f16 overload of the fp8 fscale intrinsic is selected as `fscale v0.4h, v0.4h, v1.4h` followed directly by `ret`. | ||
define <4 x half> @test_fscale_f16(<4 x half> %vn, <4 x i16> %vm) { | ||
; CHECK-LABEL: test_fscale_f16: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: fscale v0.4h, v0.4h, v1.4h | ||
; CHECK-NEXT: ret | ||
%res = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> %vn, <4 x i16> %vm) | ||
ret <4 x half> %res | ||
} | ||
|
||
; Verifies that the v8f16 overload of the fp8 fscale intrinsic is selected as `fscale v0.8h, v0.8h, v1.8h` followed directly by `ret`. | ||
define <8 x half> @test_fscaleq_f16(<8 x half> %vn, <8 x i16> %vm) { | ||
; CHECK-LABEL: test_fscaleq_f16: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: fscale v0.8h, v0.8h, v1.8h | ||
; CHECK-NEXT: ret | ||
%res = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> %vn, <8 x i16> %vm) | ||
ret <8 x half> %res | ||
} | ||
|
||
; Verifies that the v2f32 overload of the fp8 fscale intrinsic is selected as `fscale v0.2s, v0.2s, v1.2s` followed directly by `ret`. | ||
define <2 x float> @test_fscale_f32(<2 x float> %vn, <2 x i32> %vm) { | ||
; CHECK-LABEL: test_fscale_f32: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: fscale v0.2s, v0.2s, v1.2s | ||
; CHECK-NEXT: ret | ||
%res = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> %vn, <2 x i32> %vm) | ||
ret <2 x float> %res | ||
} | ||
|
||
; Verifies that the v4f32 overload of the fp8 fscale intrinsic is selected as `fscale v0.4s, v0.4s, v1.4s` followed directly by `ret`. | ||
define <4 x float> @test_fscaleq_f32(<4 x float> %vn, <4 x i32> %vm) { | ||
; CHECK-LABEL: test_fscaleq_f32: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: fscale v0.4s, v0.4s, v1.4s | ||
; CHECK-NEXT: ret | ||
%res = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> %vn, <4 x i32> %vm) | ||
ret <4 x float> %res | ||
} | ||
|
||
; Verifies that the v2f64 overload of the fp8 fscale intrinsic is selected as `fscale v0.2d, v0.2d, v1.2d` followed directly by `ret`. | ||
define <2 x double> @test_fscaleq_f64(<2 x double> %vn, <2 x i64> %vm) { | ||
; CHECK-LABEL: test_fscaleq_f64: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: fscale v0.2d, v0.2d, v1.2d | ||
; CHECK-NEXT: ret | ||
%res = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> %vn, <2 x i64> %vm) | ||
ret <2 x double> %res | ||
} | ||
|
||
; Declarations of the five fp8 fscale intrinsic overloads called by the tests above; each takes a floating-point vector and an integer vector with the same lane count and element width. | ||
declare <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half>, <4 x i16>) | ||
declare <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half>, <8 x i16>) | ||
declare <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float>, <2 x i32>) | ||
declare <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float>, <4 x i32>) | ||
declare <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double>, <2 x i64>) |