Commit 733cfef

python3kgae and Xiang Li authored
Support gather scatter with mod (#284)
When adding or multiplying two PtrStates, if either PtrState is not structured, set the shape to 0 and clear the modulo. This enables gather/scatter when a modulo result is multiplied with an unstructured PtrState.

---------

Co-authored-by: Xiang Li <[email protected]>
1 parent 837fb98 commit 733cfef
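In practice the rule reads as follows. This is a minimal standalone sketch; SimplePtrState and its fields are hypothetical stand-ins for the real PtrState declared in PtrAnalysis.h below.

// Hypothetical, simplified model of the rule in the commit message;
// the real PtrState is richer and lives in PtrAnalysis.h.
#include <algorithm>
#include <cstdint>
#include <vector>

struct SimplePtrState {
  std::vector<int64_t> shape;   // nonzero entry == modulo bound on that dim
  std::vector<int64_t> strides;
  bool structured = true;

  // Combine (add or mul) with another state. If either side is
  // unstructured, zero out the shape, i.e. clear any pending modulo.
  void combine(const SimplePtrState &other) {
    if (!structured || !other.structured) {
      structured = false;
      std::fill(shape.begin(), shape.end(), 0);
    }
  }
};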

File tree

12 files changed: +885 −244 lines

include/triton-shared/AnalysisStructured/PtrAnalysis.h

Lines changed: 41 additions & 2 deletions
@@ -112,11 +112,15 @@ struct PtrState {
 
   // Process addition of two PtrStates.
   LogicalResult addState(const PtrState &lhsState, const PtrState &rhsState,
-                         Operation *op, OpBuilder &builder);
+                         bool isAnalysisingUnstructured, Operation *op,
+                         OpBuilder &builder);
 
   // Process multiplication of two PtrStates
   LogicalResult mulState(const PtrState &lhsState, const PtrState &rhsState,
-                         Operation *op, OpBuilder &builder);
+                         bool isAnalysisingUnstructured, Operation *op,
+                         OpBuilder &builder);
+
+  LogicalResult mergeUnstructuredState(const PtrState &other, Operation *op);
 
   tts::MakeTensorPtrOp createTTSMakeTensorPtrOp(OpBuilder &builder,
                                                 Location loc);
@@ -147,6 +151,41 @@ class PtrAnalysis {
 
   DenseSet<Value> maybeStructuredArgs;
   const bool enableMakeGatherScatterTensorPtr;
+  // If false, PtrAnalysis analyzes structured ptrs and only identifies
+  // unstructured ptrs.
+  // If true, PtrAnalysis calculates strides and offsets for unstructured
+  // pointers. This is used to support gather/scatter access.
+  // The default mode is false; it is set to true only when calculating
+  // unstructured pointers for gather/scatter access.
+  // The reason for having two modes is to support cases like:
+  //
+  //   ptr + (row_offsets[:, None] % mod_offset + some_number) +
+  //   row_indices[:, None]
+  //
+  // (row_offsets[:, None] % mod_offset + some_number) is structured and
+  // has a modulo.
+  // row_indices[:, None] is unstructured.
+  // When visiting the add operation, we need to apply the modulo to
+  // (row_offsets[:, None] % mod_offset + some_number), but we don't have
+  // the information about how to apply it.
+  //
+  // To simplify the analysis, we do the work in two modes:
+  // 1. Analyze to identify the unstructured pointers.
+  // 2. Analyze to calculate the strides and offsets for unstructured
+  //    pointers.
+  // In mode 1, isAnalysisingUnstructured is false, so we only identify the
+  // unstructured pointers and do not calculate their strides and offsets.
+  // When visiting the operand again to calculate the offsets and strides
+  // for the unstructured state, we set isAnalysisingUnstructured to true.
+  // This means we have switched to mode 2 and are analyzing the
+  // unstructured pointers and calculating their strides and offsets. In
+  // mode 2, we know the pointer is unstructured, so we can use the value of
+  // arith::RemSIOp directly as the offset. Once the analysis is done, we
+  // switch back to mode 1.
+  //
+  // Note that this might be a temporary solution; we may need to revisit
+  // it in the future to support more complex cases.
+  bool isAnalysisingUnstructured = false;
 
 public:
   PtrAnalysis(bool enableMakeGatherScatterTensorPtr)
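The two-mode scheme in the new comment can be summarized as a standalone sketch. MiniPtrAnalysis and its methods are hypothetical illustrations of the control flow, not the real PtrAnalysis API:

// Hypothetical sketch of the two-mode control flow described in the
// comment above; not the actual PtrAnalysis implementation.
struct MiniPtrAnalysis {
  bool isAnalysisingUnstructured = false; // same flag name as the real class

  bool analyzeOperand() {
    // Mode 1: walk the operand and only classify it. Strides/offsets for
    // unstructured pointers are not computed in this mode.
    bool unstructured = classifyOperand();
    if (!unstructured)
      return true; // structured pointers are fully handled in mode 1

    // Mode 2: revisit the same operand. Because we already know it is
    // unstructured, an arith::RemSIOp result can be used directly as an
    // offset instead of being folded into a shape/modulo.
    isAnalysisingUnstructured = true;
    bool ok = computeUnstructuredOffsetsAndStrides();
    isAnalysisingUnstructured = false; // switch back to mode 1
    return ok;
  }

  // Placeholders standing in for the real analysis work.
  bool classifyOperand() { return true; }
  bool computeUnstructuredOffsetsAndStrides() { return true; }
};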

lib/Analysis/OpFoldResultUtils.cpp

Lines changed: 28 additions & 3 deletions
@@ -14,6 +14,9 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
+#include "llvm/Support/Debug.h"
+#define DEBUG_TYPE "triton-ptr-analysis"
+
 namespace mlir {
 
 std::optional<int64_t> getIntAttr(const OpFoldResult ofr) {
@@ -115,11 +118,32 @@ OpFoldResult expandOFRIndex(OpFoldResult ofr, OpFoldResult targetForTy,
     v = indexTypeCast(v, targetEltTy, loc, b);
     return b.create<triton::SplatOp>(loc, targetTy, v).getResult();
   } else if (targetShapedTy && shapedTy) {
-    // TODO: support ShapedType to ShapedType.
     Type targetEltTy = targetShapedTy.getElementType();
     Type eltTy = shapedTy.getElementType();
-    if (targetShapedTy.getShape() != shapedTy.getShape())
-      llvm_unreachable("ShapedType to ShapedType must have same shape");
+    if (targetShapedTy.getShape() != shapedTy.getShape()) {
+      assert(targetEltTy == eltTy &&
+             "Only cast between same element type shaped types");
+      // This path is for cases like:
+      //   input_ptr + (row_indices[:, None] +
+      //                row_offsets[:, None] % mod_offset) * stride_m +
+      //   col_offsets[None, :] * stride_n
+      // The modulo result has shape [ROW_SIZE, 1] while row_indices has
+      // shape [ROW_SIZE].
+      LLVM_DEBUG({
+        llvm::dbgs() << "Reshaping ";
+        shapedTy.dump();
+        llvm::dbgs() << " to ";
+        targetShapedTy.dump();
+      });
+      SmallVector<Value> shapeValues;
+      for (auto dim : targetShapedTy.getShape()) {
+        shapeValues.push_back(
+            b.create<arith::ConstantOp>(loc, b.getIndexAttr(dim)));
+      }
+      RankedTensorType targetShapeTensorTy = RankedTensorType::get(
+          targetShapedTy.getShape().size(), b.getIndexType());
+      auto shapeTensor = b.create<tensor::FromElementsOp>(
+          loc, targetShapeTensorTy, shapeValues);
+      return b.create<triton::ReshapeOp>(loc, targetTy, v, shapeTensor)
+          .getResult();
+    }
     if (isa<IndexType>(targetEltTy) || isa<IndexType>(eltTy)) {
       assert((isa<IntegerType>(targetEltTy) || isa<IntegerType>(eltTy)) &&
              "Only cast between index type and integer type");
@@ -351,4 +375,5 @@ OpFoldResult compareOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
   auto selectOp = b.create<arith::SelectOp>(loc, cmpOp, trueValue, falseValue);
   return selectOp.getResult();
 }
+
 } // namespace mlir
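For context, a hedged sketch of how the new reshape path gets exercised; the variable names and setup are assumed, and only the expandOFRIndex signature from the hunk header above is taken from the source:

// Hypothetical call site for the new path (setup assumed).
// modOfr:    OpFoldResult holding a tensor<ROW_SIZEx1xindex> modulo result.
// rowIdxOfr: OpFoldResult holding a tensor<ROW_SIZExindex> of row indices.
// Before this change, the shape mismatch hit llvm_unreachable; now the
// first operand is reshaped (via tt.reshape) to the target's shape.
OpFoldResult expanded = expandOFRIndex(modOfr, rowIdxOfr, loc, builder);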
