Support i1 datatype with an experimental flag. #18713

Open · wants to merge 1 commit into base: main
20 changes: 20 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -2915,6 +2915,26 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
}
}

// Make sure the innermost tile size times the element bit width is a
// multiple of a byte (8 bits). This is required for now because we do not
// fully support sub-byte vector stores; once sub-byte vector stores are
// supported, this rounding can be removed. Note that emulating sub-byte
// vector loads and stores has a performance impact.
auto resultTypes = rootOperation->getResultTypes();
if (commonVecTileSizes.size() != 0 && !resultTypes.empty()) {
auto elementTypeSize =
cast<ShapedType>(rootOperation->getResultTypes().front())
.getElementType()
.getIntOrFloatBitWidth();
// For now, only enable this for i1.
if (elementTypeSize == 1) {
auto innermostTileSize = commonVecTileSizes.back();
commonVecTileSizes.back() =
llvm::alignTo(innermostTileSize * elementTypeSize, 8) /
elementTypeSize;
}
}

// Set the lowering configs with new tile sizes.
for (auto op : computeOps) {
int numLoops = cast<TilingInterface>(op).getLoopIteratorTypes().size();
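To see what the rounding does for i1: the innermost tile size n becomes llvm::alignTo(n * 1, 8) / 1, i.e. n rounded up to the next multiple of 8, so every tile covers whole bytes. A minimal standalone sketch of that rule, assuming only LLVM's MathExtras header (the surrounding IREE code is not reproduced):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  unsigned elementTypeSize = 1; // i1
  // A tile of 13 i1 elements occupies 13 bits; rounding yields 16 elements,
  // which covers exactly two bytes.
  int64_t tile = 13;
  int64_t rounded = llvm::alignTo(tile * elementTypeSize, 8) / elementTypeSize;
  assert(rounded == 16);
  // Tiles that are already byte-aligned are unchanged: 8 i1 elements -> 8.
  assert(llvm::alignTo(8 * elementTypeSize, 8) / elementTypeSize == 8);
  return 0;
}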
@@ -1958,3 +1958,28 @@ func.func @test_tiling_cpu_default(%arg0: tensor<256x256xi8>, %arg1: tensor<256x
// CHECK: func @test_tiling_cpu_default(
// CHECK-SAME: translation_info = #[[TRANSLATION_INFO]]
// CHECK: linalg.quantized_matmul {lowering_config = #[[CONFIG0]]}

// -----

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
func.func @i1_type() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<8xi1>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<8xi1>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<8xi1>
%5 = tensor.empty() : tensor<8xi1>
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<8xi1>, tensor<8xi1>) outs(%5 : tensor<8xi1>) {
^bb0(%in: i1, %in_0: i1, %out: i1):
%7 = arith.xori %in, %in_0 : i1
linalg.yield %7 : i1
} -> tensor<8xi1>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [8], strides = [1] : tensor<8xi1> -> !flow.dispatch.tensor<writeonly:tensor<8xi1>>
return
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8], [8], [0], [0]]>
// CHECK: func @i1_type()
// CHECK: linalg.generic {
// CHECK-SAME: {lowering_config = #[[CONFIG]]}
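Note on the CHECK above: for tensor<8xi1>, a tile of 8 i1 elements is exactly one byte, so the byte-alignment rounding introduced in KernelDispatch.cpp leaves the [8] tile sizes unchanged.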
@@ -29,6 +29,7 @@ iree_lit_test_suite(
"encode_device_tensors_packing.mlir",
"encode_host_tensors.mlir",
"encode_host_tensors_packing.mlir",
"encode_host_tensors_packing_i1.mlir",
"fold_globals.mlir",
"fold_uniform_operands.mlir",
"fuse_dispatch_bindings.mlir",
@@ -27,6 +27,7 @@ iree_lit_test_suite(
"encode_device_tensors_packing.mlir"
"encode_host_tensors.mlir"
"encode_host_tensors_packing.mlir"
"encode_host_tensors_packing_i1.mlir"
"fold_globals.mlir"
"fold_uniform_operands.mlir"
"fuse_dispatch_bindings.mlir"
@@ -85,7 +85,7 @@ util.func public @denseTensorSizeOfDynamic(%arg0: index) -> index {
// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: %[[MUL:.+]] = arith.muli %arg0, %[[C5]] : index
-// CHECK: %[[DIV:.+]] = arith.divui %[[MUL]], %[[C2]] : index
+// CHECK: %[[DIV:.+]] = arith.ceildivui %[[MUL]], %[[C2]] : index
%0 = stream.tensor.sizeof tensor<?x5xi4>{%arg0} : index
// CHECK: util.return %[[DIV]]
util.return %0 : index
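Worked example of why the ceiling division matters: with %arg0 = 3, tensor<?x5xi4> holds 15 i4 values, i.e. 7.5 bytes; arith.divui would truncate to 7 bytes and under-allocate, while arith.ceildivui rounds up to 8.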
20 changes: 20 additions & 0 deletions encode_host_tensors_packing_i1.mlir
@@ -0,0 +1,20 @@
// RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --iree-experimental-packed-i1-storage %s | FileCheck %s

func.func @unaligned_i1_size() -> index {
%0 = stream.tensor.sizeof tensor<12xi1> : index
return %0 : index
}
// CHECK: func @unaligned_i1_size() -> index {
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: return %[[C2]] : index

// -----

func.func @aligned_i1_size() -> index {
%0 = stream.tensor.sizeof tensor<24xi1> : index
return %0 : index
}

// CHECK: func @aligned_i1_size() -> index {
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
// CHECK: return %[[C3]] : index
23 changes: 16 additions & 7 deletions compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp
@@ -15,7 +15,17 @@

namespace mlir::iree_compiler {

llvm::cl::opt<bool> clEnableI1Support(
"iree-experimental-packed-i1-storage",
llvm::cl::desc(
"Experimental feature: enable i1 data type support in codegen"),
llvm::cl::init(false));

bool needToPackSubByteElementBitWidth(unsigned bitWidth) {
// Enable i1 support if requested.
if (clEnableI1Support && bitWidth == 1) {
return true;
}
// Require the original bit width to be some power of two for now to avoid
// trickiness and weirdness of packing and cross-byte access.
// Also disallow boolean values for now--they may require separate interface
@@ -114,15 +124,14 @@ Value calculateStorageElementCountInBytes(Location loc,
if (needToPackSubByteElementBitWidth(elementBits)) {
assert(8 % elementBits == 0);
unsigned byteElements = 8 / elementBits;
-  // Perform some basic sanity check to make sure the total count is byte
-  // aligned for fully static shapes.
-  if (paddedDynamicDims.empty() && (staticCount * elementBits) % 8 != 0) {
-    return nullptr;
-  }
-  auto divisor = builder.create<arith::ConstantIndexOp>(loc, byteElements);
-  // TODO(antiagainst): We may want to emit runtime check to make sure this is
-  // divisible.
-  value = builder.createOrFold<arith::DivUIOp>(loc, value, divisor);
+  auto divisor = builder.create<arith::ConstantIndexOp>(loc, byteElements);
+  if (!clEnableI1Support && paddedDynamicDims.empty() &&
+      (staticCount * elementBits) % 8 != 0) {
+    return nullptr;
+  }
+  value = builder.createOrFold<arith::CeilDivUIOp>(loc, value, divisor);
}

return value;
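For intuition, the static byte count now follows a ceiling-division rule. A minimal standalone sketch, using a hypothetical helper packedByteCount rather than IREE's actual API, that mirrors the CeilDivUIOp above and reproduces the lit tests earlier in this diff:

#include <cassert>
#include <cstdint>

// Hypothetical helper: bytes needed to store elementCount elements of
// bitWidth bits each, packed contiguously.
uint64_t packedByteCount(uint64_t elementCount, unsigned bitWidth) {
  unsigned byteElements = 8 / bitWidth; // 8 elements per byte for i1
  return (elementCount + byteElements - 1) / byteElements; // ceiling division
}

int main() {
  assert(packedByteCount(12, 1) == 2); // matches @unaligned_i1_size
  assert(packedByteCount(24, 1) == 3); // matches @aligned_i1_size
  return 0;
}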
50 changes: 50 additions & 0 deletions tests/e2e/subbyte_types/BUILD.bazel
@@ -0,0 +1,50 @@
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Tests of end-to-end IREE support for sub-byte data types (e.g. i1).
# Tests should be written using the IREE Check framework.
# See https://iree.dev/developers/general/testing-guide/#iree-core-end-to-end-e2e-tests.

load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
load("//build_tools/bazel:iree_check_test.bzl", "iree_check_single_backend_test_suite")

package(
features = ["layering_check"],
licenses = ["notice"], # Apache 2.0
)

LLVM_SRCS = enforce_glob(
# keep sorted
[
"subbyte_types.mlir",
],
include = ["*.mlir"],
exclude = [],
)

iree_check_single_backend_test_suite(
name = "check_llvm-cpu_subbyte_emulation",
srcs = LLVM_SRCS,
compiler_flags = [
"--iree-llvmcpu-target-cpu=generic",
"--iree-experimental-packed-i1-storage",
],
driver = "local-task",
tags = [
# Sub-byte support for WebAssembly is not a priority.
"nowasm",
],
target_backend = "llvm-cpu",
)

test_suite(
name = "check",
tests = [
":check_llvm-cpu_subbyte_emulation",
],
)
29 changes: 29 additions & 0 deletions tests/e2e/subbyte_types/CMakeLists.txt
@@ -0,0 +1,29 @@
################################################################################
# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
# tests/e2e/subbyte_types/BUILD.bazel #
# #
# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
# CMake-only content. #
# #
# To disable autogeneration for this file entirely, delete this header. #
################################################################################

iree_add_all_subdirs()

iree_check_single_backend_test_suite(
NAME
check_llvm-cpu_subbyte_emulation
SRCS
"subbyte_types.mlir"
TARGET_BACKEND
"llvm-cpu"
DRIVER
"local-task"
COMPILER_FLAGS
"--iree-llvmcpu-target-cpu=generic"
"--iree-experimental-packed-i1-storage"
LABELS
"nowasm"
)

### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
28 changes: 28 additions & 0 deletions tests/e2e/subbyte_types/subbyte_types.mlir
@@ -0,0 +1,28 @@
func.func @i1_type() {
%c0 = arith.constant 0 : index
%c255 = arith.constant 255 : i8
%input1 = util.unfoldable_constant dense<[85]> : tensor<1xi8> // b01010101
%input2 = util.unfoldable_constant dense<[170]> : tensor<1xi8> // b10101010
%lhs = flow.tensor.bitcast %input1 : tensor<1xi8> -> tensor<8xi1>
%rhs = flow.tensor.bitcast %input2 : tensor<1xi8> -> tensor<8xi1>
%empty = tensor.empty() : tensor<8xi1>
%res = linalg.generic
{indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
ins(%lhs, %rhs : tensor<8xi1>, tensor<8xi1>) outs(%empty: tensor<8xi1>) {
^bb0(%inlhs: i1, %inrhs: i1, %out: i1):
%inres = arith.xori %inlhs, %inrhs: i1
linalg.yield %inres : i1
} -> tensor<8xi1>
%tensor_res = flow.tensor.bitcast %res : tensor<8xi1> -> tensor<1xi8>
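  // 0b01010101 xor 0b10101010 == 0b11111111, so the packed result byte is 255.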
check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8>
return
}

func.func @i1_type_slice() {
%input = util.unfoldable_constant dense<[0, 255, 0]> : tensor<3xi8>
%flat_input_all = flow.tensor.bitcast %input : tensor<3xi8> -> tensor<24xi1>
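  // Bits [8, 16) of the 24-bit packed buffer are the middle input byte, 255.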
%slice = tensor.extract_slice %flat_input_all[8][8][1] : tensor<24xi1> to tensor<8xi1>
%tensor_res = flow.tensor.bitcast %slice : tensor<8xi1> -> tensor<1xi8>
check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8>
return
}