diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp
index 0217eb3741..25779a82a6 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp
index 08c7251d0b..8a526b2812 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
+// REQUIRES: matrix-xmx8,gpu
 
 // Only runs on DPAS because AMX implementation does not support half data type
 // yet
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp
index 8462dff815..2fa63de99a 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp
index 32f5e0138c..464d946ef1 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: gpu
 
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp
index df7a479e06..7332417f86 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // this code calculates the sum of rows into a global array of number of rows
 // elements. First, partial reduction is computed inside each SG, then atomic
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp
index c64a99fee5..baf2963867 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp
index ee084fd400..170154dc10 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp
index a616e9a0b0..1acaa3beff 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp
index 8e7b701941..d559747053 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp
index 3c2736ddd1..556ac61a56 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp
@@ -5,11 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
+// REQUIRES: matrix-xmx8,gpu
 
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // Only run on the GPU because half is not supported on AMX hardware
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp
index 272f22e554..8f96fdb3fd 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp
index fc9536ff78..0d10afea9b 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp
index d1ab5e003c..476606b1dd 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp
index b07ad6ec28..b3bc8d7c68 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp
index a14b259b4c..1a9e6bdcda 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp
index a81f0e255f..b67c04d519 100644
--- a/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp
index bd8becff4c..79f2a72341 100644
--- a/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: matrix,gpu
 
 // Only runs on DPAS because AMX implementation does not support half data type
 // yet
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp
index 49a16d3964..e7f82611fd 100644
--- a/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp
index d3e38f638b..ba97bdcb60 100644
--- a/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: gpu
 
diff --git a/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp
index 2b7895ea5b..269d1f7592 100644
--- a/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // this code calculates the sum of rows into a global array of number of rows
 // elements. First, partial reduction is computed inside each SG, then atomic
diff --git a/SYCL/Matrix/Legacy/element_wise_ops.cpp b/SYCL/Matrix/Legacy/element_wise_ops.cpp
index d9a407e131..a432215bee 100644
--- a/SYCL/Matrix/Legacy/element_wise_ops.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_ops.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp b/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp
index b02e8cfc07..97d6fd22b4 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp
index fd926ca1c6..6402fb72fa 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp
index ea94bfcae1..65f8b84b62 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 6274064463..aa7d80dba0 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of col major layout for matrix B which does transpose and
 // then VNNI transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
index ec64f32cfa..65a8fe1e23 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of row major layout for matrix B which does automatic VNNI
 // transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/Legacy/joint_matrix_half.cpp b/SYCL/Matrix/Legacy/joint_matrix_half.cpp
index d88a9f0b1b..19f6a021ce 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_half.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_half.cpp
@@ -5,11 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: matrix,gpu
 
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // Only run on the GPU because half is not supported on AMX hardware
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp
index bccb0d23d9..9b0db278da 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of col major layout for matrix B which does transpose and
 // then VNNI transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp
index c16d3ad726..525ef4fe51 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: gpu
 
diff --git a/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp
index afdfd28feb..4ddb428713 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp
index 7c12200762..b7edf91804 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp
index 935606cbe6..6e999a6b62 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp
index 054f3aaae5..2a7f289df0 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp
index b12923536a..a65e36554f 100644
--- a/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp
index 62fd63cc88..bbb1b8fab1 100644
--- a/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
+// REQUIRES: matrix-xmx8,gpu
 
 // Only runs on DPAS because AMX implementation does not support half data type
 // yet
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp
index 2605e89e30..ad85e6abad 100644
--- a/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
index 7124332923..286884bf43 100644
--- a/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This test stores the matrix B that is VNNIed (packed) in a row major fashion.
 // This is expected to fail on the GPU because the implementation does not
diff --git a/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp
index cc3dd78a63..d89af2d7ac 100644
--- a/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // this code calculates the sum of rows into a global array of number of rows
 // elements. First, partial reduction is computed inside each SG, then atomic
diff --git a/SYCL/Matrix/XMX8/element_wise_ops.cpp b/SYCL/Matrix/XMX8/element_wise_ops.cpp
index 1d7b64e406..b1acf17705 100644
--- a/SYCL/Matrix/XMX8/element_wise_ops.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_ops.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp b/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp
index e1f67c435e..e2d4f82a6a 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
index e7c1b42dd7..7dfcd2efc4 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/XMX8/joint_matrix_half.cpp b/SYCL/Matrix/XMX8/joint_matrix_half.cpp
index 355fef88e2..c84fc0d203 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_half.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_half.cpp
@@ -5,11 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
+// REQUIRES: matrix-xmx8,gpu
 
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // Only run on the GPU because half is not supported on AMX hardware
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp
index 0af6a21b85..2b69e13785 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp
index 86d7f75308..212799fa07 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp
index 252a647f5d..8c6a15445f 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp
index e74e7ad46b..124cc97f53 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp
index 06934de225..84dc77f931 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/element_wise_all_ops_bf16.cpp
index b871f13e56..439c069652 100644
--- a/SYCL/Matrix/element_wise_all_ops_bf16.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/element_wise_all_ops_half.cpp b/SYCL/Matrix/element_wise_all_ops_half.cpp
index e860180c03..331920e9ea 100644
--- a/SYCL/Matrix/element_wise_all_ops_half.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_half.cpp
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: matrix,gpu
 
 // Only runs on DPAS because AMX implementation does not support half data type
 // yet
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/element_wise_all_ops_int8.cpp b/SYCL/Matrix/element_wise_all_ops_int8.cpp
index adcee2a750..45f2ef4bda 100644
--- a/SYCL/Matrix/element_wise_all_ops_int8.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp
index 6008079449..f33cc2cc12 100644
--- a/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This test stores the matrix B that is VNNIed (packed) in a row major fashion.
 // This is expected to fail on the GPU because the implementation does not
diff --git a/SYCL/Matrix/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/element_wise_irreg_sum_rows.cpp
index 76e24de5c6..7aebe7e031 100644
--- a/SYCL/Matrix/element_wise_irreg_sum_rows.cpp
+++ b/SYCL/Matrix/element_wise_irreg_sum_rows.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This code calculates the sum of rows into a global array of number of rows
 // elements. First, partial reduction is computed inside each SG, then atomic
diff --git a/SYCL/Matrix/element_wise_ops.cpp b/SYCL/Matrix/element_wise_ops.cpp
index c3b949fd9f..d686ff4ec6 100644
--- a/SYCL/Matrix/element_wise_ops.cpp
+++ b/SYCL/Matrix/element_wise_ops.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_bfloat16.cpp b/SYCL/Matrix/joint_matrix_bfloat16.cpp
index 77f1be403d..b8008a8c98 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
index 5c955ec422..4bc0084172 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 7f6b5c9e4f..97cf4c97d2 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of col major layout for matrix B which does transpose and
 // then VNNI transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp b/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
index ae2bd6b99b..064fbfd6cf 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of row major layout for matrix B which does automatic VNNI
 // transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/joint_matrix_half.cpp b/SYCL/Matrix/joint_matrix_half.cpp
index 1d131a64a8..720920e1cb 100644
--- a/SYCL/Matrix/joint_matrix_half.cpp
+++ b/SYCL/Matrix/joint_matrix_half.cpp
@@ -5,11 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: matrix,gpu
 
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // Only run on the GPU because half is not supported on AMX hardware
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
index c572e53c34..1859b57f8b 100644
--- a/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of col major layout for matrix B which does transpose and
 // then VNNI transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/joint_matrix_int8_vnni.cpp
index f8ae1a8cf7..7c77be8984 100644
--- a/SYCL/Matrix/joint_matrix_int8_vnni.cpp
+++ b/SYCL/Matrix/joint_matrix_int8_vnni.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: gpu
 
diff --git a/SYCL/Matrix/joint_matrix_ss_int8.cpp b/SYCL/Matrix/joint_matrix_ss_int8.cpp
index 860e590357..dac9ea719b 100644
--- a/SYCL/Matrix/joint_matrix_ss_int8.cpp
+++ b/SYCL/Matrix/joint_matrix_ss_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_su_int8.cpp b/SYCL/Matrix/joint_matrix_su_int8.cpp
index bd89977fc3..83de8f45d0 100644
--- a/SYCL/Matrix/joint_matrix_su_int8.cpp
+++ b/SYCL/Matrix/joint_matrix_su_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_us_int8.cpp b/SYCL/Matrix/joint_matrix_us_int8.cpp
index 0690636c59..c4cd631fda 100644
--- a/SYCL/Matrix/joint_matrix_us_int8.cpp
+++ b/SYCL/Matrix/joint_matrix_us_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_uu_int8.cpp b/SYCL/Matrix/joint_matrix_uu_int8.cpp
index 42f2ff8fe6..33a5372c33 100644
--- a/SYCL/Matrix/joint_matrix_uu_int8.cpp
+++ b/SYCL/Matrix/joint_matrix_uu_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>