-
Notifications
You must be signed in to change notification settings - Fork 38
[Reprogram] Modify controlcode-lowering and lower-to-aie for DMA reprogramming #1330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
…ogramming -- This commit includes modifications to `controlcode-lowering` and `lower-to-aie` passes for DMA reprogramming. -- This is being added to AMDAIE dialect to make [DMA reprogramming](#1287) work. Signed-off-by: Abhishek Varma <[email protected]>
|
||
/// Utility to fold the provided repetition count, unit dims, linear dims and | ||
/// to convert the sizes and strides into static versions and return them. | ||
LogicalResult foldDimsAndReturnAsStatic( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A few fairly trivial utilities to be pulled out to a common place.
There are a few other utilities which are a bit involved and can be addressed for de-duplication later.
if (npuDmaUsers.size() > 1) { | ||
return emitOpError() << "only a single `amdaie.npu.circular_dma_cpy_nd` " | ||
"user supported currently, but got: " | ||
<< npuDmaUsers.size(); | ||
} | ||
return npuDmaUsers[0]; | ||
if (npuDmaUsers.size() == 1) return npuDmaUsers[0]; | ||
return failure(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change doesn't look needed?
// nuances. | ||
using BDDimLayoutAndLength = std::pair<AMDAIE::BDDimLayoutArrayAttr, int64_t>; | ||
|
||
BDDimLayoutAndLength convertSizeStrideToBDDimLayoutArrayAttr( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Isn't this function exactly the same, so it can be extracted?
std::optional<uint8_t> pktId) { | ||
OpBuilder::InsertionGuard g(rewriter); | ||
|
||
// Create DMA channel. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know this is copied, but maybe update this comment to "Create DMA start on channel".
// AIEDeviceBuilder utilities | ||
//===----------------------------------------------------------------------===// | ||
// TODO(avarma): Copied from LowerToAIE. Templatize it later because of a few | ||
// nuances. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you specify the nuances?
// Find the last index with a zero stride. All dimensions before and including | ||
// this one will be converted into separate DMA ops, while the dimensions | ||
// after this will be included in the access pattern within a DMA op. This is | ||
// needed becaused low-level DMA BD configurations currently don't support | ||
// zero stride and/or because more dimensions are needed than available. | ||
int64_t lastZeroStrideIndex{-1}; | ||
for (size_t i = 0; i < strides.size(); i++) | ||
if (strides[i] == 0) lastZeroStrideIndex = i; | ||
|
||
// Convert all dimensions after the last index with zero stride to a | ||
// `BDDimLayoutArrayAttr` as these are the inner/intra DMA dimensions. | ||
auto [dims, transferLength] = convertSizeStrideToBDDimLayoutArrayAttr( | ||
rewriter, ArrayRef<int64_t>(sizes).drop_front(lastZeroStrideIndex + 1), | ||
ArrayRef<int64_t>(strides).drop_front(lastZeroStrideIndex + 1)); | ||
|
||
SmallVector<size_t> indexRange(lastZeroStrideIndex + 1); | ||
std::iota(indexRange.begin(), indexRange.end(), 0); | ||
// Compute the total number of iterations of all dimensions up till | ||
// `lastZeroStrideIndex`. | ||
int64_t numIters = std::accumulate( | ||
sizes.begin(), sizes.begin() + indexRange.size(), 1, std::multiplies<>()); | ||
// Compute the divisors to be used to get the indices for every dimension from | ||
// the total number of iterations (as if all dimensions are coalesced). | ||
SmallVector<int64_t> cartesianDivisors(indexRange.size(), 1); | ||
for (int64_t i = indexRange.size() - 2; i >= 0; i--) | ||
cartesianDivisors[i] = cartesianDivisors[i + 1] * sizes[i + 1]; | ||
|
||
// Create blocks with DMA ops. | ||
Block *succ = nullptr, *curr = bdBlock; | ||
for (size_t blockIndex = 0; blockIndex < bufferOps.size(); ++blockIndex) { | ||
// Iterate through the cartesian product of all dimension up to the last | ||
// dimension with zero strides to create a DMA chain of `dma_bd` ops. | ||
for (int64_t index = 0; index < numIters; index++) { | ||
SmallVector<int64_t> indices = llvm::map_to_vector( | ||
indexRange, | ||
[&](size_t i) { return (index / cartesianDivisors[i]) % sizes[i]; }); | ||
bool isFirst = llvm::all_of(indices, [](int64_t v) { return v == 0; }); | ||
bool isLast = llvm::all_of( | ||
indexRange, [&](size_t i) { return indices[i] == (sizes[i] - 1); }); | ||
if (blockIndex == bufferOps.size() - 1 && isLast) { | ||
succ = &endBlock; | ||
} else { | ||
succ = rewriter.createBlock(&endBlock); | ||
llvm::outs() << "8 = " << dmaStartOp << "\n"; | ||
} | ||
rewriter.setInsertionPointToStart(curr); | ||
int64_t addOffset = 0; | ||
for (size_t i = 0; i < indexRange.size(); i++) | ||
addOffset += (indices[i] * strides[i]); | ||
createDMAOps(succ, bufferOps[blockIndex], dims, isFirst, isLast, | ||
transferLength, offset + addOffset); | ||
curr = succ; | ||
} | ||
} | ||
return success(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unless there is some difference here, I think this could be extracted.
if (!bdIdOp) { | ||
if (failed(halfDmaToDmaStartBlocks(rewriter, deviceModel, halfDmaOp, | ||
controlCodeOp, connectionIndex, | ||
tileToMemOpMap))) { | ||
return WalkResult::interrupt(); | ||
} | ||
eraseCandidate(halfDmaOp); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Avoid the nesting:
if (!bdIdOp) { | |
if (failed(halfDmaToDmaStartBlocks(rewriter, deviceModel, halfDmaOp, | |
controlCodeOp, connectionIndex, | |
tileToMemOpMap))) { | |
return WalkResult::interrupt(); | |
} | |
eraseCandidate(halfDmaOp); | |
} | |
if (bdIdOp) return WalkResult::advance(); | |
if (failed(halfDmaToDmaStartBlocks(rewriter, deviceModel, halfDmaOp, | |
controlCodeOp, connectionIndex, | |
tileToMemOpMap))) { | |
return WalkResult::interrupt(); | |
} | |
eraseCandidate(halfDmaOp); |
}); | ||
if (res.wasInterrupted()) return failure(); | ||
|
||
for (Operation *op : toBeErased) rewriter.eraseOp(op); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're going through toBeErased
again?
return success(); | ||
}; | ||
|
||
auto processTarget = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you move processSource
and processTarget
into standalone functions?
return connectionOp.emitOpError() | ||
<< "expected target to be an logical objFifo-like op"; | ||
} | ||
// TODO(avarma): Need to set correct insertion point. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still an issue?
@@ -127,6 +122,7 @@ class AIEDeviceBuilder { | |||
connectionToSourceTargetMemOps; | |||
/// Map from connection ops to the flow ops they have been converted into. | |||
DenseMap<AMDAIE::ConnectionOp, SmallVector<Operation *>> connectionToFlowOps; | |||
bool reprogramDmas; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add some doc?
-- This commit includes modifications to
controlcode-lowering
andlower-to-aie
passes for DMA reprogramming.-- This is being added to AMDAIE dialect to make DMA reprogramming work.
Signed-off-by: Abhishek Varma [email protected]