From 569e8887a3ade9c0f4743e025afd04498c744a40 Mon Sep 17 00:00:00 2001
From: aidansander <aisander@xsjaisander40x.xlnx.xilinx.com>
Date: Fri, 9 Aug 2024 10:53:42 -0700
Subject: [PATCH 1/5] Added pass for getInvBf16 and corresponding unit test

Still need to add conversion test
---
 .../Transforms/VectorToAIEVecConversions.cpp  | 48 ++++++++++++++++++-
 .../bf16_inv_lut/bf16_inv_lut.mlir            |  2 +-
 .../bf16_inv_lut_mlir_to_llvm.mlir            | 34 +++++++++++++
 .../aievec_tests/bf16_inv_lut/dut_simple.cc   |  1 +
 .../aievec_tests/bf16_inv_lut/testbench.cc    |  6 +++
 5 files changed, 88 insertions(+), 3 deletions(-)
 create mode 100644 test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut_mlir_to_llvm.mlir
 create mode 100644 test/unit_tests/aievec_tests/bf16_inv_lut/dut_simple.cc
diff --git a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
index 3c2301e4b1..5618ecbb42 100644
--- a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
+++ b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
@@ -2017,6 +2017,49 @@ struct ComputeExpOpByLUTPattern : OpConversionPattern<math::ExpOp> {
 //  %1 = arith.truncf %0 : f32 to bf16
 // to -
 //  %0 = emitc.call "getInvBf16"(%0) : f32 -> bf16;
+struct ComputeInvOpByLUTLLVMPattern : OpConversionPattern<arith::DivFOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(arith::DivFOp divOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Type srcType = adaptor.getLhs().getType();
+    if (!divOp->hasOneUse() || isa<VectorType>(srcType) ||
+        !isa<FloatType>(srcType))
+      return failure();
+
+    if (!isNarrowingOp(*divOp->getUsers().begin()))
+      return failure();
+
+    auto fType = cast<FloatType>(srcType);
+    if (fType.getWidth() != 32)
+      return failure();
+
+    auto constOp = dyn_cast<arith::ConstantOp>(divOp.getLhs().getDefiningOp());
+    if (!constOp ||
+        cast<FloatAttr>(constOp.getValue()).getValue().convertToDouble() !=
+            1.0f)
+      return failure();
+
+    StringRef funcName = "getInvBf16";
+    auto moduleOp = divOp->getParentOfType<mlir::ModuleOp>();
+    Type floatTy = rewriter.getF32Type();
+    Type bfloat16Ty = rewriter.getBF16Type();
+    func::FuncOp fn_op =
+        getOrGenerateFuncOp(rewriter, moduleOp, funcName, TypeRange{floatTy},
+                            TypeRange{bfloat16Ty});
+
+    auto truncOp = cast<arith::TruncFOp>(*divOp->getUsers().begin());
+
+    rewriter.setInsertionPoint(truncOp);
+    SmallVector<Value> invOperands = {adaptor.getRhs()};
+    rewriter.replaceOpWithNewOp<func::CallOp>(truncOp, fn_op, invOperands);
+    rewriter.eraseOp(divOp);
+
+    return success();
+  }
+};
+
 struct ComputeInvOpByLUTPattern : OpConversionPattern<arith::DivFOp> {
   using OpConversionPattern::OpConversionPattern;
 
@@ -3095,6 +3138,7 @@ static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
       >(patterns.getContext(), 128, 1024, 256, 1024);
     patterns.add<
         ComputeExpOpByLUTPattern,
+        ComputeInvOpByLUTPattern,
         LowerVectorAddFOpToAIEVecAddElemOp,
         LowerVectorSubFOpToAIEVecSubElemOp,
         LowerVectorAddIOpToAIEVecAddElemOp,
@@ -3102,11 +3146,11 @@ static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
       >(patterns.getContext());
   } else if (backend == TargetBackend::LLVMIR){
       patterns.add<
-      ComputeExpOpByLUTLLVMPattern
+      ComputeExpOpByLUTLLVMPattern,
+      ComputeInvOpByLUTLLVMPattern
       >(patterns.getContext());
   }
   patterns.add<
-      ComputeInvOpByLUTPattern,
       ComputeTanhOpByLUTPattern,
       ComputeSqrtOpPattern,
       ComputeRsqrtOpPattern,
diff --git a/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut.mlir b/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut.mlir
index 9d0b435cd7..e0a786efa3 100644
--- a/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut.mlir
+++ b/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut.mlir
@@ -2,11 +2,11 @@
 // Copyright (C) 2023, Advanced Micro Devices, Inc.
 
 // REQUIRES: valid_xchess_license
+// RUN: mkdir -p %t/data; cd %t
 // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" --convert-vector-to-aievec="aie-target=aie2" -lower-affine | aie-translate -aie2=true --aievec-to-cpp -o dut.cc
 // RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. -c %aie_runtime_lib%/AIE2/lut_based_ops.cpp -o lut_based_ops.o
 // RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. -c dut.cc -o dut.o
 // RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc work/dut.o work/lut_based_ops.o
-// RUN: mkdir -p data
 // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
diff --git a/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut_mlir_to_llvm.mlir b/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut_mlir_to_llvm.mlir
new file mode 100644
index 0000000000..f7e4c88b48
--- /dev/null
+++ b/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut_mlir_to_llvm.mlir
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// REQUIRES: peano 
+// RUN: mkdir -p %t/data; cd %t
+// RUN: aie-opt %s --mlir-print-ir-after-all -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir >& mlir_passes.txt
+// RUN: aie-translate --mlir-to-llvmir llvmir.mlir -o dut_part.ll
+// RUN: %PEANO_INSTALL_DIR/bin/clang -S -emit-llvm %clang_aie2_lib_args -I%aie_runtime_lib%/AIE2/ -c %S/dut_simple.cc -o lut_based_ops.ll
+// RUN: %PEANO_INSTALL_DIR/bin/clang -S -emit-llvm %clang_aie2_lib_args -c %aie_runtime_lib%/AIE2/lut_based_ops.cpp -o lut_constants.ll
+// RUN: llvm-link -S lut_based_ops.ll dut_part.ll -o dut_functions.ll
+// RUN: llvm-link -S lut_constants.ll dut_functions.ll -o dut.ll
+// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -I %aietools/include -DTO_LLVM -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.o 
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module {
+  func.func @dut(%arg0: memref<1024xbf16>{llvm.noalias}, %arg1: f32, %arg2: memref<1024xbf16>{llvm.noalias}) {
+    memref.assume_alignment %arg0, 32 : memref<1024xbf16>
+    memref.assume_alignment %arg2, 32 : memref<1024xbf16>
+    %cst = arith.constant 1.000000e+00 : f32
+    %0 = arith.divf %cst, %arg1 : f32
+    %1 = arith.truncf %0 : f32 to bf16
+    affine.for %arg3 = 0 to 1024 {
+      %2 = affine.load %arg0[%arg3] : memref<1024xbf16>
+      %3 = arith.mulf %1, %2 : bf16
+      affine.store %3, %arg2[%arg3] : memref<1024xbf16>
+    }
+    return
+  }
+}
+
diff --git a/test/unit_tests/aievec_tests/bf16_inv_lut/dut_simple.cc b/test/unit_tests/aievec_tests/bf16_inv_lut/dut_simple.cc
new file mode 100644
index 0000000000..2f34961ad5
--- /dev/null
+++ b/test/unit_tests/aievec_tests/bf16_inv_lut/dut_simple.cc
@@ -0,0 +1 @@
+#include "lut_based_ops.h"
diff --git a/test/unit_tests/aievec_tests/bf16_inv_lut/testbench.cc b/test/unit_tests/aievec_tests/bf16_inv_lut/testbench.cc
index 1f0dd03cc8..39f5f3a8dc 100644
--- a/test/unit_tests/aievec_tests/bf16_inv_lut/testbench.cc
+++ b/test/unit_tests/aievec_tests/bf16_inv_lut/testbench.cc
@@ -5,7 +5,13 @@
 #include <cstdio>
 #include <cstdlib>
 
+#ifdef TO_LLVM
+extern "C" {
+#endif
 void dut(bfloat16 *restrict in0, float sum, bfloat16 *restrict out0);
+#ifdef TO_LLVM
+}
+#endif
 void dut_ref(bfloat16 *in0, float sum, bfloat16 *out0);
 
 alignas(32) bfloat16 g_in0[IN0_SIZE];

From 32b04ceb2a2d9d8768de8f35b0e32450216b2234 Mon Sep 17 00:00:00 2001
From: aidansander <aisander@xsjaisander40x.xlnx.xilinx.com>
Date: Thu, 15 Aug 2024 10:36:27 -0700
Subject: [PATCH 2/5] Renamed test for consistency and updated helper name

---
 .../Transforms/VectorToAIEVecConversions.cpp     | 16 ++++++++--------
 ..._mlir_to_llvm.mlir => bf16_inv_lut-llvm.mlir} |  0
 2 files changed, 8 insertions(+), 8 deletions(-)
 rename test/unit_tests/aievec_tests/bf16_inv_lut/{bf16_inv_lut_mlir_to_llvm.mlir => bf16_inv_lut-llvm.mlir} (100%)

diff --git a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
index 5618ecbb42..12c505a9f4 100644
--- a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
+++ b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
@@ -2010,13 +2010,6 @@ struct ComputeExpOpByLUTPattern : OpConversionPattern<math::ExpOp> {
   }
 };
 
-// Lower the inverse of a float to a function call
-// Convert the pattern-
-//  %cst = arith.constant 1.000000e+00 : f32
-//  %0 = arith.divf %cst, %arg1 : f32
-//  %1 = arith.truncf %0 : f32 to bf16
-// to -
-//  %0 = emitc.call "getInvBf16"(%0) : f32 -> bf16;
 struct ComputeInvOpByLUTLLVMPattern : OpConversionPattern<arith::DivFOp> {
   using OpConversionPattern::OpConversionPattern;
 
@@ -2046,7 +2039,7 @@ struct ComputeInvOpByLUTLLVMPattern : OpConversionPattern<arith::DivFOp> {
     Type floatTy = rewriter.getF32Type();
     Type bfloat16Ty = rewriter.getBF16Type();
     func::FuncOp fn_op =
-        getOrGenerateFuncOp(rewriter, moduleOp, funcName, TypeRange{floatTy},
+        getOrInsertFuncDecl(rewriter, moduleOp, funcName, TypeRange{floatTy},
                             TypeRange{bfloat16Ty});
 
     auto truncOp = cast<arith::TruncFOp>(*divOp->getUsers().begin());
@@ -2060,6 +2053,13 @@ struct ComputeInvOpByLUTLLVMPattern : OpConversionPattern<arith::DivFOp> {
   }
 };
 
+// Lower the inverse of a float to a function call
+// Convert the pattern-
+//  %cst = arith.constant 1.000000e+00 : f32
+//  %0 = arith.divf %cst, %arg1 : f32
+//  %1 = arith.truncf %0 : f32 to bf16
+// to -
+//  %1 = emitc.call "getInvBf16"(%arg1) : f32 -> bf16;
 struct ComputeInvOpByLUTPattern : OpConversionPattern<arith::DivFOp> {
   using OpConversionPattern::OpConversionPattern;
 
diff --git a/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut_mlir_to_llvm.mlir b/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut-llvm.mlir
similarity index 100%
rename from test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut_mlir_to_llvm.mlir
rename to test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut-llvm.mlir

From 10b071e5b95530b7cb59c7b28dff7dfa0fa43431 Mon Sep 17 00:00:00 2001
From: aidansander <aisander@xsjaisander40x.xlnx.xilinx.com>
Date: Fri, 9 Aug 2024 11:19:18 -0700
Subject: [PATCH 3/5] Added conversion test for getInvBf16 llvm pass

---
 .../VectorToAIEVec/test_lut_based_ops.mlir      | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/test/Conversion/VectorToAIEVec/test_lut_based_ops.mlir b/test/Conversion/VectorToAIEVec/test_lut_based_ops.mlir
index 282b1c3dc0..6a51f5ba2b 100644
--- a/test/Conversion/VectorToAIEVec/test_lut_based_ops.mlir
+++ b/test/Conversion/VectorToAIEVec/test_lut_based_ops.mlir
@@ -4,6 +4,7 @@
 // CHECK-LABEL: func private @getExpBf16(vector<16xbf16>) -> vector<8xi64>
 // CHECK-LABEL: func @test_exp_lut
 // CHECK-SAME: %[[A:[A-Za-z0-9]+]]: vector<16xbf16>
+module{
 func.func @test_exp_lut(%a: vector<16xbf16>) -> vector<16xbf16> {
     // CHECK: %[[C0:.*]] = arith.constant 0 : i32
     // CHECK: %[[CALL:.*]] = call @getExpBf16(%[[A]]) : (vector<16xbf16>) -> vector<8xi64>
@@ -13,3 +14,19 @@ func.func @test_exp_lut(%a: vector<16xbf16>) -> vector<16xbf16> {
     // CHECK: return %[[SRS]] : vector<16xbf16>
     return %0 : vector<16xbf16>
 }
+
+}
+
+module{
+// CHECK-LABEL: func private @getInvBf16(f32) -> bf16
+// CHECK-LABEL: func @test_inv_lut
+// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: f32
+func.func @test_inv_lut(%a: f32) -> bf16{
+    // CHECK: %[[RET:.*]] = call @getInvBf16(%[[A]]) : (f32) -> bf16
+    %cst = arith.constant 1.000000e+00 : f32
+    %0 = arith.divf %cst, %a : f32
+    %1 = arith.truncf %0 : f32 to bf16
+    // CHECK: return %[[RET]] : bf16
+    return %1 : bf16
+}
+}

From 6271f6da2acb6ae28fd4ecf428d4aa1d528bf18a Mon Sep 17 00:00:00 2001
From: aidansander <aisander@xsjaisander40x.xlnx.xilinx.com>
Date: Thu, 15 Aug 2024 12:04:21 -0700
Subject: [PATCH 4/5] Moved duplicated match() code for inv LUT into helper

---
 .../Transforms/VectorToAIEVecConversions.cpp  | 57 ++++++++-----------
 1 file changed, 24 insertions(+), 33 deletions(-)

diff --git a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
index 12c505a9f4..cce5c6ca9f 100644
--- a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
+++ b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
@@ -491,6 +491,28 @@ static bool matchExpOpForLUT(math::ExpOp::Adaptor adaptor) {
 
   return true;
 }
+
+// Returns true when `divOp` qualifies for the inverse LUT lowering: a scalar
+// 32-bit float division whose result has a single user accepted by
+// isNarrowingOp and whose dividend is an arith.constant equal to 1.0.
+static bool matchInvOpForLUT(arith::DivFOp::Adaptor adaptor,
+                             arith::DivFOp divOp) {
+  // Scalar 32-bit floats only; a VectorType is never a FloatType.
+  Type lhsTy = adaptor.getLhs().getType();
+  if (!isa<FloatType>(lhsTy) || cast<FloatType>(lhsTy).getWidth() != 32)
+    return false;
+
+  // The quotient needs exactly one use, and that user must be a narrowing op.
+  if (!divOp->hasOneUse() || !isNarrowingOp(*divOp->getUsers().begin()))
+    return false;
+
+  // getDefiningOp<T>() yields null for block arguments and non-constant ops.
+  auto oneCst = divOp.getLhs().getDefiningOp<arith::ConstantOp>();
+  if (!oneCst)
+    return false;
+  auto dividend = cast<FloatAttr>(oneCst.getValue()).getValue();
+  return dividend.convertToDouble() == 1.0f;
+}
 
 //===----------------------------------------------------------------------===//
 // Rewrite patterns
@@ -2016,22 +2038,7 @@ struct ComputeInvOpByLUTLLVMPattern : OpConversionPattern<arith::DivFOp> {
   LogicalResult
   matchAndRewrite(arith::DivFOp divOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Type srcType = adaptor.getLhs().getType();
-    if (!divOp->hasOneUse() || isa<VectorType>(srcType) ||
-        !isa<FloatType>(srcType))
-      return failure();
-
-    if (!isNarrowingOp(*divOp->getUsers().begin()))
-      return failure();
-
-    auto fType = cast<FloatType>(srcType);
-    if (fType.getWidth() != 32)
-      return failure();
-
-    auto constOp = dyn_cast<arith::ConstantOp>(divOp.getLhs().getDefiningOp());
-    if (!constOp ||
-        cast<FloatAttr>(constOp.getValue()).getValue().convertToDouble() !=
-            1.0f)
+    if (!matchInvOpForLUT(adaptor, divOp))
       return failure();
 
     StringRef funcName = "getInvBf16";
@@ -2066,24 +2073,8 @@ struct ComputeInvOpByLUTPattern : OpConversionPattern<arith::DivFOp> {
   LogicalResult
   matchAndRewrite(arith::DivFOp divOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Type srcType = adaptor.getLhs().getType();
-    if (!divOp->hasOneUse() || isa<VectorType>(srcType) ||
-        !isa<FloatType>(srcType))
-      return failure();
-
-    if (!isNarrowingOp(*divOp->getUsers().begin()))
+    if (!matchInvOpForLUT(adaptor, divOp))
       return failure();
-
-    auto fType = cast<FloatType>(srcType);
-    if (fType.getWidth() != 32)
-      return failure();
-
-    auto constOp = dyn_cast<arith::ConstantOp>(divOp.getLhs().getDefiningOp());
-    if (!constOp ||
-        cast<FloatAttr>(constOp.getValue()).getValue().convertToDouble() !=
-            1.0f)
-      return failure();
-
     StringRef includeName = "lut_based_ops.h";
     auto moduleOp = divOp->getParentOfType<mlir::ModuleOp>();
     rewriter.setInsertionPointToStart(

From 6a2ec41c46beb5963dac57749353ad1cc0c1f2a0 Mon Sep 17 00:00:00 2001
From: aidansander <aisander@xsjaisander40x.xlnx.xilinx.com>
Date: Thu, 15 Aug 2024 12:38:21 -0700
Subject: [PATCH 5/5] Adding appropriate requires to bf16_inv_lut-llvm test

---
 .../unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut-llvm.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut-llvm.mlir b/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut-llvm.mlir
index f7e4c88b48..2b964184eb 100644
--- a/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut-llvm.mlir
+++ b/test/unit_tests/aievec_tests/bf16_inv_lut/bf16_inv_lut-llvm.mlir
@@ -2,7 +2,7 @@
 // Copyright (C) 2023, Advanced Micro Devices, Inc.
 
 // REQUIRES: valid_xchess_license
-// REQUIRES: peano 
+// REQUIRES: peano, peano_and_chess
 // RUN: mkdir -p %t/data; cd %t
 // RUN: aie-opt %s --mlir-print-ir-after-all -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir >& mlir_passes.txt
 // RUN: aie-translate --mlir-to-llvmir llvmir.mlir -o dut_part.ll