-
Notifications
You must be signed in to change notification settings - Fork 558
MetalPerformancePrimitives macOS xcode26.4 b2
Rolf Bjarne Kvinge edited this page Feb 27, 2026
·
1 revision
# MetalPerformancePrimitives.framework
diff -ruN /Applications/Xcode_26.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h /Applications/Xcode_26.4.0-beta2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h
--- /Applications/Xcode_26.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h 2025-11-09 03:56:31
+++ /Applications/Xcode_26.4.0-beta2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h 2026-02-16 07:20:12
@@ -10,34 +10,55 @@
// C can be tensor_handle, tensor_offset, tensor_inline or cooperative_tensor.
// Data type combinations supported by this operation are as follows:
//
-// A B C
-// ---------------------------
-// half half half
-// half int8_t half
-// int8_t half half
-// half half float
-// half float float
-// half int8_t float
-// float half float
-// float float float
-// float int8_t float
-// int8_t half float
-// int8_t float float
-// int8_t int8_t int32_t
-// bfloat bfloat bfloat
-// bfloat bfloat float
-// bfloat float float
-// bfloat int8_t bfloat
-// bfloat int8_t float
-// float bfloat float
-// int8_t bfloat bfloat
-// int8_t bfloat float
-// bfloat half bfloat
-// bfloat half half
-// bfloat half float
-// half bfloat bfloat
-// half bfloat half
-// half bfloat float
+// Left Right Destination
+// ------- ------------- -----------
+// half half half
+// half int8_t half
+// half uint8_t half
+// int8_t half half
+// uint8_t half half
+// half half float
+// half float float
+// half int8_t float
+// half uint8_t float
+// float half float
+// float float float
+// float int8_t float
+// float uint8_t float
+// int8_t half float
+// uint8_t half float
+// int8_t float float
+// uint8_t float float
+// int8_t int8_t int32_t
+// uint8_t uint8_t int32_t
+// bfloat bfloat bfloat
+// bfloat bfloat float
+// bfloat float float
+// bfloat int8_t bfloat
+// bfloat int8_t float
+// float bfloat float
+// int8_t bfloat bfloat
+// int8_t bfloat float
+// bfloat half bfloat
+// bfloat half half
+// bfloat half float
+// half bfloat bfloat
+// half bfloat half
+// half bfloat float
+// bfloat uint8_t bfloat
+// bfloat uint8_t float
+// uint8_t bfloat bfloat
+// uint8_t bfloat float
+// half int4b_format half
+// half int4b_format float
+// half uint4b_format half
+// half uint4b_format float
+// int8_t int4b_format int32_t
+// uint8_t uint4b_format int32_t
+// bfloat int4b_format bfloat
+// bfloat uint4b_format bfloat
+// bfloat int4b_format float
+// bfloat uint4b_format float
//
// Basic usage is in the following example which takes M x K matrix A of type
// half, K x N matrix B of type half, both in device memory and produces M x N
diff -ruN /Applications/Xcode_26.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h /Applications/Xcode_26.4.0-beta2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h
--- /Applications/Xcode_26.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h 2025-11-09 04:29:53
+++ /Applications/Xcode_26.4.0-beta2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h 2026-02-16 07:09:43
@@ -4213,7 +4213,10 @@
extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR uint16_t
__tensorops_impl_conv2d_cooperative_destination_tensor_num_elements(
const thread convolution2d_descriptor &descriptor,
- __tensor_ops_detail::__const_thread_void_t, int threads);
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype, int threads);
extern "C" TENSOROPS_EXPORT
EXTERNALLY_DEFINED_ATTR __tensor_ops_detail::__thread_void_t
__tensorops_impl_conv2d_cooperative_destination_tensor_elements(
@@ -4225,6 +4228,9 @@
__tensor_ops_detail::__thread_void_t, uint16_t,
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype, int threads);
extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_conv2d_cooperative_destination_tensor_init(
@@ -4256,6 +4262,9 @@
__tensorops_impl_conv2d_cooperative_destination_tensor_is_valid_element(
const thread convolution2d_descriptor &,
__tensor_ops_detail::__thread_void_t, uint16_t,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype, int threads);
extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR uint16_t
__tensorops_impl_conv2d_cooperative_destination_tensor_get_element_index(
@@ -4266,98 +4275,148 @@
thread convolution2d_descriptor &desc, thread void *storage,
const thread void *source,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
- int sourceRank, int threads);
+ int sourceRank, __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f32(
thread convolution2d_descriptor &desc, thread void *storage,
const thread void *source,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
- int sourceRank, int threads);
+ int sourceRank, __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_i32(
thread convolution2d_descriptor &desc, thread void *storage,
const thread void *source,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
- int sourceRank, int threads);
+ int sourceRank, __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_i32(
thread convolution2d_descriptor &desc, thread void *storage,
const thread void *source,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
- int sourceRank, int threads);
+ int sourceRank, __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f16(
thread convolution2d_descriptor &desc, thread void *storage,
const thread void *source,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
- int sourceRank, int threads);
+ int sourceRank, __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f16(
thread convolution2d_descriptor &desc, thread void *storage,
const thread void *source,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
- int sourceRank, int threads);
+ int sourceRank, __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_bf(
thread convolution2d_descriptor &desc, thread void *storage,
const thread void *source,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
- int sourceRank, int threads);
+ int sourceRank,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_bf(
thread convolution2d_descriptor &desc, thread void *storage,
const thread void *source,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
- int sourceRank, int threads);
+ int sourceRank,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f32(
thread convolution2d_descriptor &desc, const thread void *storage,
const thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
- int threads);
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f32(
thread convolution2d_descriptor &desc, const thread void *storage,
const thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
- int threads);
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_i32(
thread convolution2d_descriptor &desc, const thread void *storage,
const thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
- int threads);
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_i32(
thread convolution2d_descriptor &desc, const thread void *storage,
const thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
- int threads);
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f16(
thread convolution2d_descriptor &desc, const thread void *storage,
const thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
- int threads);
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f16(
thread convolution2d_descriptor &desc, const thread void *storage,
const thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
- int threads);
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_bf(
thread convolution2d_descriptor &desc, const thread void *storage,
const thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
- int threads);
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_bf(
thread convolution2d_descriptor &desc, const thread void *storage,
const thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
- int threads);
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
template <convolution2d_descriptor descriptor,
convolution2d_cooperative_operand operand, typename scope,
@@ -4424,8 +4483,14 @@
{
metal::execution_threads t = scope();
int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_value_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_value_type>::value;
return __tensorops_impl_conv2d_cooperative_destination_tensor_num_elements(
- descriptor, storage, threads);
+ descriptor, storage, d_data_type, a_data_type, w_data_type, threads);
}
static void construct(thread void *this_)
@@ -4544,16 +4609,23 @@
tensorType>();
const thread void *source = (const thread void *)(&sourceT);
+
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_value_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_value_type>::value;
if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
{
if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f16(
- desc, storage, source, sourceDescType, sourceRank, threads);
+ desc, storage, source, sourceDescType, sourceRank, d_data_type, a_data_type, w_data_type, threads);
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
sourcePtrType>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f16(
- desc, storage, source, sourceDescType, sourceRank, threads);
+ desc, storage, source, sourceDescType, sourceRank, d_data_type, a_data_type, w_data_type, threads);
else
static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
"Unsupported address space");
@@ -4562,11 +4634,11 @@
{
if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_i32(
- desc, storage, source, sourceDescType, sourceRank, threads);
+ desc, storage, source, sourceDescType, sourceRank, d_data_type, a_data_type, w_data_type, threads);
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
sourcePtrType>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_i32(
- desc, storage, source, sourceDescType, sourceRank, threads);
+ desc, storage, source, sourceDescType, sourceRank, d_data_type, a_data_type, w_data_type, threads);
else
static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
"Unsupported address space");
@@ -4575,11 +4647,11 @@
{
if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f32(
- desc, storage, source, sourceDescType, sourceRank, threads);
+ desc, storage, source, sourceDescType, sourceRank, d_data_type, a_data_type, w_data_type, threads);
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
sourcePtrType>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f32(
- desc, storage, source, sourceDescType, sourceRank, threads);
+ desc, storage, source, sourceDescType, sourceRank, d_data_type, a_data_type, w_data_type, threads);
else
static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
"Unsupported address space");
@@ -4589,11 +4661,11 @@
{
if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_bf(
- desc, storage, source, sourceDescType, sourceRank, threads);
+ desc, storage, source, sourceDescType, sourceRank, d_data_type, a_data_type, w_data_type, threads);
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
sourcePtrType>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_bf(
- desc, storage, source, sourceDescType, sourceRank, threads);
+ desc, storage, source, sourceDescType, sourceRank, d_data_type, a_data_type, w_data_type, threads);
else
static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
"Unsupported address space");
@@ -4631,17 +4703,24 @@
tensorType>();
const thread void *destination = (const thread void *)(&destinationT);
+
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_value_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_value_type>::value;
if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
{
if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
destination_ptr_type>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f16(
- desc, storage, destination, destination_desc_type, threads);
+ desc, storage, destination, destination_desc_type, d_data_type, a_data_type, w_data_type, threads);
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
destination_ptr_type>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f16(
- desc, storage, destination, destination_desc_type, threads);
+ desc, storage, destination, destination_desc_type, d_data_type, a_data_type, w_data_type, threads);
else
static_assert(
__tensor_ops_detail::__assert_false_v<destination_ptr_type>,
@@ -4652,11 +4731,11 @@
if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
destination_ptr_type>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_i32(
- desc, storage, destination, destination_desc_type, threads);
+ desc, storage, destination, destination_desc_type, d_data_type, a_data_type, w_data_type, threads);
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
destination_ptr_type>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_i32(
- desc, storage, destination, destination_desc_type, threads);
+ desc, storage, destination, destination_desc_type, d_data_type, a_data_type, w_data_type, threads);
else
static_assert(
__tensor_ops_detail::__assert_false_v<destination_ptr_type>,
@@ -4667,11 +4746,11 @@
if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
destination_ptr_type>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f32(
- desc, storage, destination, destination_desc_type, threads);
+ desc, storage, destination, destination_desc_type, d_data_type, a_data_type, w_data_type, threads);
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
destination_ptr_type>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f32(
- desc, storage, destination, destination_desc_type, threads);
+ desc, storage, destination, destination_desc_type, d_data_type, a_data_type, w_data_type, threads);
else
static_assert(
__tensor_ops_detail::__assert_false_v<destination_ptr_type>,
@@ -4683,11 +4762,11 @@
if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
destination_ptr_type>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_bf(
- desc, storage, destination, destination_desc_type, threads);
+ desc, storage, destination, destination_desc_type, d_data_type, a_data_type, w_data_type, threads);
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
destination_ptr_type>)
__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_bf(
- desc, storage, destination, destination_desc_type, threads);
+ desc, storage, destination, destination_desc_type, d_data_type, a_data_type, w_data_type, threads);
else
static_assert(
__tensor_ops_detail::__assert_false_v<destination_ptr_type>,
@@ -4726,9 +4805,16 @@
int threads = t.size();
__tensor_ops_detail::__tensor_ops_datatype dataType = __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_value_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_value_type>::value;
+
return __tensorops_impl_conv2d_cooperative_destination_tensor_is_valid_element(
descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
- dataType, threads);
+ dataType, d_data_type, a_data_type, w_data_type, threads);
}
template <typename index_t, __tensor_ops_detail::__rank_t rank>
@@ -4742,13 +4828,21 @@
static_assert(rank == 4, "multidimensional_indices returns 4D indices");
__tensor_ops_detail::__tensor_ops_datatype dataType = __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_value_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_value_type>::value;
+
if constexpr (__tensor_ops_detail::__is_same_v<coord_t, ushort>)
{
ushort coords[4];
__tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint16,
- threads);
+ d_data_type, a_data_type, w_data_type, threads);
return {coords[0], coords[1], coords[2], coords[3]};
}
else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, short>)
@@ -4757,7 +4851,7 @@
__tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int16,
- threads);
+ d_data_type, a_data_type, w_data_type, threads);
return {coords[0], coords[1], coords[2], coords[3]};
}
else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, uint>)
@@ -4766,7 +4860,7 @@
__tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint32,
- threads);
+ d_data_type, a_data_type, w_data_type, threads);
return {coords[0], coords[1], coords[2], coords[3]};
}
else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, int>)
@@ -4775,7 +4869,7 @@
__tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int32,
- threads);
+ d_data_type, a_data_type, w_data_type, threads);
return {coords[0], coords[1], coords[2], coords[3]};
}
}
diff -ruN /Applications/Xcode_26.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h /Applications/Xcode_26.4.0-beta2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h
--- /Applications/Xcode_26.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h 2025-11-09 03:56:31
+++ /Applications/Xcode_26.4.0-beta2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h 2026-02-16 09:54:25
@@ -208,6 +208,30 @@
__tensor_ops_detail::__tensor_ops_datatype,
int);
extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_ui8(
+ __matmul2d_cooperative_operand_index,
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ int,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_ui8(
+ __matmul2d_cooperative_operand_index,
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ int,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_f32(
__matmul2d_cooperative_operand_index,
__matmul2d_descriptor,
@@ -323,6 +347,28 @@
__tensor_ops_detail::__tensor_ops_datatype,
int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_ui8(
+ __matmul2d_cooperative_operand_index,
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_ui8(
+ __matmul2d_cooperative_operand_index,
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_f32(
__matmul2d_cooperative_operand_index,
__matmul2d_descriptor,
@@ -745,6 +791,33 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
@@ -772,6 +845,33 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
@@ -853,6 +953,33 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
@@ -934,6 +1061,33 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
@@ -961,6 +1115,33 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
@@ -988,6 +1169,33 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
@@ -1015,6 +1223,33 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
@@ -1393,14 +1628,295 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i4_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i4_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1408,14 +1924,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1423,14 +1932,15 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1438,14 +1948,15 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1453,14 +1964,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1468,14 +1972,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1483,14 +1980,15 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1498,14 +1996,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1513,14 +2004,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1528,14 +2012,15 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1543,14 +2028,15 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1558,14 +2044,15 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1573,14 +2060,15 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_ui8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_ui8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_ui8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_ui8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1588,14 +2076,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1603,14 +2084,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1618,14 +2092,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1633,14 +2100,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1648,14 +2108,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f32_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1663,14 +2116,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1678,14 +2124,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_i8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1693,14 +2132,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1708,14 +2140,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1723,14 +2148,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_b16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1738,14 +2156,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1753,14 +2164,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1768,14 +2172,7 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
-extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_tg_f16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
@@ -1783,6 +2180,39 @@
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
template <__matmul2d_descriptor descriptor,
__matmul2d_cooperative_operand_index operand_index,
@@ -1794,23 +2224,33 @@
typename... args>
struct __operand_layout
{
- static_assert(__tensor_ops_detail::__is_same_v<left_element_type, int8_t> ||
+ static_assert(__tensor_ops_detail::__is_same_v<left_element_type, uint8_t> ||
+ __tensor_ops_detail::__is_same_v<left_element_type, int8_t> ||
+#if __HAVE_INT4B_FORMAT_TYPE__
+ __tensor_ops_detail::__is_same_v<left_element_type, metal::uint4b_format> ||
+ __tensor_ops_detail::__is_same_v<left_element_type, metal::int4b_format> ||
+#endif
__tensor_ops_detail::__is_same_v<left_element_type, float> ||
#if __HAVE_BFLOAT__
__tensor_ops_detail::__is_same_v<left_element_type, bfloat> ||
#endif
__tensor_ops_detail::__is_same_v<left_element_type, half>,
"cooperative tensor source data type can only be one of "
- "int8_t/float/half/bfloat");
+ "uint8_t/int8_t/uint4b_format/int4b_format/float/half/bfloat");
- static_assert(__tensor_ops_detail::__is_same_v<right_element_type, int8_t> ||
+ static_assert(__tensor_ops_detail::__is_same_v<right_element_type, uint8_t> ||
+ __tensor_ops_detail::__is_same_v<right_element_type, int8_t> ||
+#if __HAVE_INT4B_FORMAT_TYPE__
+ __tensor_ops_detail::__is_same_v<right_element_type, metal::uint4b_format> ||
+ __tensor_ops_detail::__is_same_v<right_element_type, metal::int4b_format> ||
+#endif
__tensor_ops_detail::__is_same_v<right_element_type, float> ||
#if __HAVE_BFLOAT__
__tensor_ops_detail::__is_same_v<right_element_type, bfloat> ||
#endif
__tensor_ops_detail::__is_same_v<right_element_type, half>,
"cooperative tensor source data type can only be one of "
- "int8_t/float/half/bfloat");
+ "uint8_t/int8_t/uint4b_format/int4b_format/float/half/bfloat");
static_assert(__tensor_ops_detail::__is_same_v<destination_element_type, float> ||
__tensor_ops_detail::__is_same_v<destination_element_type, half> ||
@@ -2021,6 +2461,21 @@
static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
"Unsupported address space");
}
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, uint8_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_ui8(
+ operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType,
+ rightDataType, destinationDataType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_ui8(
+ operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType,
+ rightDataType, destinationDataType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
else
static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
"Unsupported type");
@@ -2141,6 +2596,22 @@
static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
"Unsupported address space");
}
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, uint8_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_ui8(
+ operand_index, desc, storage, destination, destinationDescType, leftDataType,
+ rightDataType, destinationDataType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_ui8(
+ operand_index, desc, storage, destination, destinationDescType, leftDataType,
+ rightDataType, destinationDataType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
else
static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
"Unsupported type");
@@ -2296,8 +2767,8 @@
__cooperative_tensor_t<descriptor,
__matmul2d_cooperative_operand_index::destination,
scope,
- typename __tensor_ops_detail::__remove_addrspace_t<__tensor_ops_detail::__remove_reference_t<left_operand>>::value_type,
- typename __tensor_ops_detail::__remove_addrspace_t<__tensor_ops_detail::__remove_reference_t<right_operand>>::value_type,
+ typename __tensor_ops_detail::__remove_addrspace_t<typename __tensor_ops_detail::__remove_addrspace_t<__tensor_ops_detail::__remove_reference_t<left_operand>>::element_type>,
+ typename __tensor_ops_detail::__remove_addrspace_t<typename __tensor_ops_detail::__remove_addrspace_t<__tensor_ops_detail::__remove_reference_t<right_operand>>::element_type>,
element_type, coord_type, args...>;
template <__matmul2d_descriptor descriptor, typename scope,
@@ -2331,6 +2802,11 @@
"Input cooperative tensors require a single SIMD group");
static_assert(__tensor_ops_detail::__is_same_v<coord_type, int>, "coord_type must be int");
+#if __HAVE_INT4B_FORMAT_TYPE__
+ static_assert(!metal::is_numeric_format_v<left_element_type>,
+ "Input cooperative tensor element type cannot be a format type");
+#endif
+
return __cooperative_tensor_left_input_t<descriptor, scope, left_element_type, right_element_type,
element_type, coord_type, args...>();
}
@@ -2424,6 +2900,11 @@
static_assert(__tensor_ops_detail::__is_same_v<scope, metal::execution_simdgroup>,
"Input cooperative tensors require a single SIMD group");
static_assert(__tensor_ops_detail::__is_same_v<coord_type, int>, "coord_type must be int");
+
+#if __HAVE_INT4B_FORMAT_TYPE__
+ static_assert(!metal::is_numeric_format_v<right_element_type>,
+ "Input cooperative tensor element type cannot be a format type");
+#endif
return __cooperative_tensor_right_input_t<descriptor, scope, left_element_type, right_element_type,
element_type, coord_type, args...>();
@@ -2646,8 +3127,8 @@
using left_elem_t = typename left_t::element_type;
using right_elem_t = typename right_t::element_type;
- using left_value_t = typename left_t::value_type;
- using right_value_t = typename right_t::value_type;
+ using left_value_t = __tensor_ops_detail::__remove_addrspace_t<left_elem_t>;
+ using right_value_t = __tensor_ops_detail::__remove_addrspace_t<right_elem_t>;
static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
"scope should be of type __tensorops_scope");
@@ -3035,7 +3516,7 @@
__tensor_ops_detail::__type_to_tensor_ops_datatype<right_value_t>::value;
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_init(
- (__tensor_ops_detail::__thread_void_t)storage, descriptor,
+ (__tensor_ops_detail::__thread_void_t)storage, descriptor,
reduction_dim, leftDataType, rightDataType, elementDataType, threads);
}
@@ -3277,1407 +3758,798 @@
static_assert(descriptor.k == 16 || descriptor.k == 32, "K must be 16 or 32 if both inputs are cooperative tensors");
}
- // single thread
- if constexpr (__tensor_ops_detail::__is_same_v<scope, metal::execution_thread>)
- {
- if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v<
- leftTensorType> &&
- !__tensor_ops_detail::__is_cooperative_tensor_type_v<
- rightTensorType> &&
- !__tensor_ops_detail::__is_cooperative_tensor_type_v<
- destinationTensorType>)
- {
- thread void *left = (thread void *)(&leftIn);
- thread void *right = (thread void *)(&rightIn);
- thread void *destination = (thread void *)(&destinationT);
+ if constexpr (!__tensor_ops_detail::__is_same_v<scope, metal::execution_thread>) {
+ // SIMD group(s) scope
+ static_assert((descriptor.m % 8) == 0 || (descriptor.m % 16) == 0, "M must be a multiple of 8 or 16");
+ static_assert((descriptor.n % 8) == 0 || (descriptor.n % 16) == 0, "N must be a multiple of 8 or 16");
+ static_assert((descriptor.m % 16) == 0 || (descriptor.n % 16) == 0, "At least one of M or N must be a multiple of 16");
- const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType =
- __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
- leftTensorType>();
- const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType =
- __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
- rightTensorType>();
- const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
- destinationDescType =
- __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
- destinationTensorType>();
-
- if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
- __tensor_ops_detail::__is_same_v<rightValueType, half> &&
- __tensor_ops_detail::__is_same_v<destinationValueType,
- half>)
+ if constexpr (descriptor.k != static_cast<int>(metal::dynamic_extent) && descriptor.k != dynamic_length_v<int>) {
+ if constexpr (metal::is_same_v<leftValueType, metal::int4b_format> || metal::is_same_v<leftValueType, metal::uint4b_format> ||
+ metal::is_same_v<rightValueType, metal::int4b_format> || metal::is_same_v<rightValueType, metal::uint4b_format>)
{
- if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else
- static_assert(
- __tensor_ops_detail::__assert_false_v<destinationPtrType>,
- "Unsupported address space");
+ static_assert((descriptor.k % 32) == 0, "K must be dynamic or a multiple of 32 with sub-byte element types");
}
- else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
- half> &&
- __tensor_ops_detail::__is_same_v<rightValueType,
- int8_t> &&
- __tensor_ops_detail::__is_same_v<
- destinationValueType, half>)
- {
- if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else
- static_assert(
- __tensor_ops_detail::__assert_false_v<destinationPtrType>,
- "Unsupported address space");
+ else
+ static_assert((descriptor.k % 16) == 0, "K must be dynamic or a multiple of 16");
}
- else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
- int8_t> &&
- __tensor_ops_detail::__is_same_v<rightValueType,
- half> &&
- __tensor_ops_detail::__is_same_v<
- destinationValueType, half>)
+ }
+ else {
+ // Single thread scope
+ static_assert(descriptor.m == 1 || descriptor.m == 2 || descriptor.m == 4 || (descriptor.m % 8) == 0,
+ "M must be 1, 2, 4, or a multiple of 8 with execution_thread");
+ static_assert(descriptor.n == 1 || descriptor.n == 2 || descriptor.n == 4 || (descriptor.n % 8) == 0,
+ "N must be 1, 2, 4, or a multiple of 8 with execution_thread");
+
+ if constexpr (descriptor.k != static_cast<int>(metal::dynamic_extent) && descriptor.k != dynamic_length_v<int>)
+ static_assert((descriptor.k % 16) == 0, "K must be dynamic or a multiple of 16");
+ }
+
+ // single thread
+ if constexpr (__tensor_ops_detail::__is_same_v<scope, metal::execution_thread>)
+ {
+ if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v<leftTensorType> && !__tensor_ops_detail::__is_cooperative_tensor_type_v<rightTensorType> && !__tensor_ops_detail::__is_cooperative_tensor_type_v<destinationTensorType>)
{
- if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
- else if constexpr (
- __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
- __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(
- desc, left, leftDescType, right, rightDescType, destination,
- destinationDescType);
+ const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<leftTensorType>();
+ const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<rightTensorType>();
+ const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<destinationTensorType>();
+
+ thread void *left = (thread void *)(&leftIn);
+ thread void *right = (thread void *)(&rightIn);
+ thread void *destination = (thread void *)(&destinationT);
+
+ if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> && __tensor_ops_detail::__is_same_v<rightValueType, half> && __tensor_ops_detail::__is_same_v<destinationValueType, half>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> && __tensor_ops_detail::__is_same_v<rightValueType, int8_t> && __tensor_ops_detail::__is_same_v<destinationValueType, half>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> && __tensor_ops_detail::__is_same_v<rightValueType, uint8_t> && __tensor_ops_detail::__is_same_v<destinationValueType, half>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> && __tensor_ops_detail::__is_same_v<rightValueType, half> && __tensor_ops_detail::__is_same_v<destinationValueType, half>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, uint8_t> && __tensor_ops_detail::__is_same_v<rightValueType, half> && __tensor_ops_detail::__is_same_v<destinationValueType, half>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> && __tensor_ops_detail::__is_same_v<rightValueType, half> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> && __tensor_ops_detail::__is_same_v<rightValueType, float> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> && __tensor_ops_detail::__is_same_v<rightValueType, int8_t> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> && __tensor_ops_detail::__is_same_v<rightValueType, uint8_t> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> && __tensor_ops_detail::__is_same_v<rightValueType, half> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> && __tensor_ops_detail::__is_same_v<rightValueType, float> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> && __tensor_ops_detail::__is_same_v<rightValueType, int8_t> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> && __tensor_ops_detail::__is_same_v<rightValueType, uint8_t> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> && __tensor_ops_detail::__is_same_v<rightValueType, half> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, uint8_t> && __tensor_ops_detail::__is_same_v<rightValueType, half> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> && __tensor_ops_detail::__is_same_v<rightValueType, float> && __tensor_ops_detail::__is_same_v<destinationValueType, float>) {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> && __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> && __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_