diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp index 0217eb3741..25779a82a6 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp index 08c7251d0b..8a526b2812 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 +// REQUIRES: matrix-xmx8,gpu // Only runs on DPAS because AMX implementation does not support half data type // yet // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp index 8462dff815..2fa63de99a 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp index 32f5e0138c..464d946ef1 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: gpu diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp index df7a479e06..7332417f86 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // this code calculates the sum of rows into a global array of number of rows // elements. First, partial reduction is computed inside each SG, then atomic diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp index c64a99fee5..baf2963867 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp index ee084fd400..170154dc10 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp index a616e9a0b0..1acaa3beff 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp index 8e7b701941..d559747053 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp index 3c2736ddd1..556ac61a56 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 +// REQUIRES: matrix-xmx8,gpu // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // Only run on the GPU because half is not supported on AMX hardware // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp index 272f22e554..8f96fdb3fd 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp index fc9536ff78..0d10afea9b 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp index d1ab5e003c..476606b1dd 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp index b07ad6ec28..b3bc8d7c68 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp index a14b259b4c..1a9e6bdcda 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp index a81f0e255f..b67c04d519 100644 --- a/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp +++ b/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp index bd8becff4c..79f2a72341 100644 --- a/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp +++ b/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: matrix,gpu // Only runs on DPAS because AMX implementation does not support half data type // yet // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp index 49a16d3964..e7f82611fd 100644 --- a/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp +++ b/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp index d3e38f638b..ba97bdcb60 100644 --- a/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp +++ b/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: gpu diff --git a/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp index 2b7895ea5b..269d1f7592 100644 --- a/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp +++ b/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // this code calculates the sum of rows into a global array of number of rows // elements. First, partial reduction is computed inside each SG, then atomic diff --git a/SYCL/Matrix/Legacy/element_wise_ops.cpp b/SYCL/Matrix/Legacy/element_wise_ops.cpp index d9a407e131..a432215bee 100644 --- a/SYCL/Matrix/Legacy/element_wise_ops.cpp +++ b/SYCL/Matrix/Legacy/element_wise_ops.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp b/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp index b02e8cfc07..97d6fd22b4 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp index fd926ca1c6..6402fb72fa 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp index ea94bfcae1..65f8b84b62 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 6274064463..aa7d80dba0 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp index ec64f32cfa..65a8fe1e23 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of row major layout for matrix B which does automatic VNNI // transform. This is currently only available on AMX diff --git a/SYCL/Matrix/Legacy/joint_matrix_half.cpp b/SYCL/Matrix/Legacy/joint_matrix_half.cpp index d88a9f0b1b..19f6a021ce 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_half.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_half.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: matrix,gpu // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // Only run on the GPU because half is not supported on AMX hardware // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp index bccb0d23d9..9b0db278da 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp index c16d3ad726..525ef4fe51 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: gpu diff --git a/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp index afdfd28feb..4ddb428713 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp index 7c12200762..b7edf91804 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp index 935606cbe6..6e999a6b62 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp index 054f3aaae5..2a7f289df0 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp index b12923536a..a65e36554f 100644 --- a/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp +++ b/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp index 62fd63cc88..bbb1b8fab1 100644 --- a/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp +++ b/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 +// REQUIRES: matrix-xmx8,gpu // Only runs on DPAS because AMX implementation does not support half data type // yet // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp index 2605e89e30..ad85e6abad 100644 --- a/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp +++ b/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp index 7124332923..286884bf43 100644 --- a/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp +++ b/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This test stores the matrix B that is VNNIed (packed) in a row major fashion. // This is expected to fail on the GPU because the implementation does not diff --git a/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp index cc3dd78a63..d89af2d7ac 100644 --- a/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp +++ b/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // this code calculates the sum of rows into a global array of number of rows // elements. First, partial reduction is computed inside each SG, then atomic diff --git a/SYCL/Matrix/XMX8/element_wise_ops.cpp b/SYCL/Matrix/XMX8/element_wise_ops.cpp index 1d7b64e406..b1acf17705 100644 --- a/SYCL/Matrix/XMX8/element_wise_ops.cpp +++ b/SYCL/Matrix/XMX8/element_wise_ops.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp b/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp index e1f67c435e..e2d4f82a6a 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp index e7c1b42dd7..7dfcd2efc4 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/XMX8/joint_matrix_half.cpp b/SYCL/Matrix/XMX8/joint_matrix_half.cpp index 355fef88e2..c84fc0d203 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_half.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_half.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 +// REQUIRES: matrix-xmx8,gpu // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // Only run on the GPU because half is not supported on AMX hardware // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp index 0af6a21b85..2b69e13785 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp index 86d7f75308..212799fa07 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp index 252a647f5d..8c6a15445f 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp index e74e7ad46b..124cc97f53 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp index 06934de225..84dc77f931 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/element_wise_all_ops_bf16.cpp index b871f13e56..439c069652 100644 --- a/SYCL/Matrix/element_wise_all_ops_bf16.cpp +++ b/SYCL/Matrix/element_wise_all_ops_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/element_wise_all_ops_half.cpp b/SYCL/Matrix/element_wise_all_ops_half.cpp index e860180c03..331920e9ea 100644 --- a/SYCL/Matrix/element_wise_all_ops_half.cpp +++ b/SYCL/Matrix/element_wise_all_ops_half.cpp @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: matrix,gpu // Only runs on DPAS because AMX implementation does not support half data type // yet // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/element_wise_all_ops_int8.cpp b/SYCL/Matrix/element_wise_all_ops_int8.cpp index adcee2a750..45f2ef4bda 100644 --- a/SYCL/Matrix/element_wise_all_ops_int8.cpp +++ b/SYCL/Matrix/element_wise_all_ops_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp index 6008079449..f33cc2cc12 100644 --- a/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp +++ b/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This test stores the matrix B that is VNNIed (packed) in a row major fashion. // This is expected to fail on the GPU because the implementation does not diff --git a/SYCL/Matrix/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/element_wise_irreg_sum_rows.cpp index 76e24de5c6..7aebe7e031 100644 --- a/SYCL/Matrix/element_wise_irreg_sum_rows.cpp +++ b/SYCL/Matrix/element_wise_irreg_sum_rows.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This code calculates the sum of rows into a global array of number of rows // elements. First, partial reduction is computed inside each SG, then atomic diff --git a/SYCL/Matrix/element_wise_ops.cpp b/SYCL/Matrix/element_wise_ops.cpp index c3b949fd9f..d686ff4ec6 100644 --- a/SYCL/Matrix/element_wise_ops.cpp +++ b/SYCL/Matrix/element_wise_ops.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_bfloat16.cpp b/SYCL/Matrix/joint_matrix_bfloat16.cpp index 77f1be403d..b8008a8c98 100644 --- a/SYCL/Matrix/joint_matrix_bfloat16.cpp +++ b/SYCL/Matrix/joint_matrix_bfloat16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp index 5c955ec422..4bc0084172 100644 --- a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp +++ b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 7f6b5c9e4f..97cf4c97d2 100644 --- a/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp b/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp index ae2bd6b99b..064fbfd6cf 100644 --- a/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp +++ b/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of row major layout for matrix B which does automatic VNNI // transform. This is currently only available on AMX diff --git a/SYCL/Matrix/joint_matrix_half.cpp b/SYCL/Matrix/joint_matrix_half.cpp index 1d131a64a8..720920e1cb 100644 --- a/SYCL/Matrix/joint_matrix_half.cpp +++ b/SYCL/Matrix/joint_matrix_half.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: matrix,gpu // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // Only run on the GPU because half is not supported on AMX hardware // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp index c572e53c34..1859b57f8b 100644 --- a/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/SYCL/Matrix/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/joint_matrix_int8_vnni.cpp index f8ae1a8cf7..7c77be8984 100644 --- a/SYCL/Matrix/joint_matrix_int8_vnni.cpp +++ b/SYCL/Matrix/joint_matrix_int8_vnni.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: gpu diff --git a/SYCL/Matrix/joint_matrix_ss_int8.cpp b/SYCL/Matrix/joint_matrix_ss_int8.cpp index 860e590357..dac9ea719b 100644 --- a/SYCL/Matrix/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/joint_matrix_ss_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_su_int8.cpp b/SYCL/Matrix/joint_matrix_su_int8.cpp index bd89977fc3..83de8f45d0 100644 --- a/SYCL/Matrix/joint_matrix_su_int8.cpp +++ b/SYCL/Matrix/joint_matrix_su_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_us_int8.cpp b/SYCL/Matrix/joint_matrix_us_int8.cpp index 0690636c59..c4cd631fda 100644 --- a/SYCL/Matrix/joint_matrix_us_int8.cpp +++ b/SYCL/Matrix/joint_matrix_us_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_uu_int8.cpp b/SYCL/Matrix/joint_matrix_uu_int8.cpp index 42f2ff8fe6..33a5372c33 100644 --- a/SYCL/Matrix/joint_matrix_uu_int8.cpp +++ b/SYCL/Matrix/joint_matrix_uu_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include