diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -79,13 +79,14 @@ AffineMap *map, SmallVectorImpl *operands, OpBuilder &builder); -/// Skew the operations in the body of a 'affine.for' operation with the +/// Skew the operations in the body of an affine.for operation with the /// specified operation-wise shifts. The shifts are with respect to the /// original execution order, and are multiplied by the loop 'step' before being -/// applied. +/// applied. If `unrollPrologueEpilogue` is set, fully unroll the prologue and +/// epilogue loops when possible. LLVM_NODISCARD -LogicalResult instBodySkew(AffineForOp forOp, ArrayRef shifts, - bool unrollPrologueEpilogue = false); +LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef shifts, + bool unrollPrologueEpilogue = false); /// Tiles the specified band of perfectly nested loops creating tile-space loops /// and intra-tile loops. A band is a contiguous set of loops. diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp --- a/mlir/lib/Transforms/PipelineDataTransfer.cpp +++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp @@ -22,6 +22,7 @@ #include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Debug.h" + #define DEBUG_TYPE "affine-pipeline-data-transfer" using namespace mlir; @@ -46,9 +47,9 @@ // Returns the position of the tag memref operand given a DMA operation. // Temporary utility: will be replaced when DmaStart/DmaFinish abstract op's are // added. TODO(b/117228571) -static unsigned getTagMemRefPos(Operation &dmaInst) { - assert(isa(dmaInst) || isa(dmaInst)); - if (auto dmaStartOp = dyn_cast(dmaInst)) { +static unsigned getTagMemRefPos(Operation &dmaOp) { + assert(isa(dmaOp) || isa(dmaOp)); + if (auto dmaStartOp = dyn_cast(dmaOp)) { return dmaStartOp.getTagMemRefOperandIndex(); } // First operand for a dma finish operation. @@ -79,21 +80,20 @@ auto oldMemRefType = oldMemRef.getType().cast(); auto newMemRefType = doubleShape(oldMemRefType); - // The double buffer is allocated right before 'forInst'. - auto *forInst = forOp.getOperation(); - OpBuilder bOuter(forInst); + // The double buffer is allocated right before 'forOp'. + OpBuilder bOuter(forOp); // Put together alloc operands for any dynamic dimensions of the memref. SmallVector allocOperands; unsigned dynamicDimCount = 0; for (auto dimSize : oldMemRefType.getShape()) { if (dimSize == -1) - allocOperands.push_back(bOuter.create(forInst->getLoc(), oldMemRef, - dynamicDimCount++)); + allocOperands.push_back( + bOuter.create(forOp.getLoc(), oldMemRef, dynamicDimCount++)); } // Create and place the alloc right before the 'affine.for' operation. Value newMemRef = - bOuter.create(forInst->getLoc(), newMemRefType, allocOperands); + bOuter.create(forOp.getLoc(), newMemRefType, allocOperands); // Create 'iv mod 2' value to index the leading dimension. auto d0 = bInner.getAffineDimExpr(0); @@ -118,8 +118,8 @@ return false; } // Insert the dealloc op right after the for loop. - bOuter.setInsertionPointAfter(forInst); - bOuter.create(forInst->getLoc(), newMemRef); + bOuter.setInsertionPointAfter(forOp); + bOuter.create(forOp.getLoc(), newMemRef); return true; } @@ -219,11 +219,11 @@ } // For each start operation, we look for a matching finish operation. 
- for (auto *dmaStartInst : dmaStartInsts) { - for (auto *dmaFinishInst : dmaFinishInsts) { - if (checkTagMatch(cast(dmaStartInst), - cast(dmaFinishInst))) { - startWaitPairs.push_back({dmaStartInst, dmaFinishInst}); + for (auto *dmaStartOp : dmaStartInsts) { + for (auto *dmaFinishOp : dmaFinishInsts) { + if (checkTagMatch(cast(dmaStartOp), + cast(dmaFinishOp))) { + startWaitPairs.push_back({dmaStartOp, dmaFinishOp}); break; } } @@ -236,8 +236,7 @@ void PipelineDataTransfer::runOnAffineForOp(AffineForOp forOp) { auto mayBeConstTripCount = getConstantTripCount(forOp); if (!mayBeConstTripCount.hasValue()) { - LLVM_DEBUG( - forOp.emitRemark("won't pipeline due to unknown trip count loop")); + LLVM_DEBUG(forOp.emitRemark("won't pipeline due to unknown trip count")); return; } @@ -258,14 +257,14 @@ // the dimension we are adding here for the double buffering is the outermost // dimension. for (auto &pair : startWaitPairs) { - auto *dmaStartInst = pair.first; - Value oldMemRef = dmaStartInst->getOperand( - cast(dmaStartInst).getFasterMemPos()); + auto *dmaStartOp = pair.first; + Value oldMemRef = dmaStartOp->getOperand( + cast(dmaStartOp).getFasterMemPos()); if (!doubleBuffer(oldMemRef, forOp)) { // Normally, double buffering should not fail because we already checked // that there are no uses outside. LLVM_DEBUG(llvm::dbgs() - << "double buffering failed for" << dmaStartInst << "\n";); + << "double buffering failed for" << dmaStartOp << "\n";); // IR still valid and semantically correct. return; } @@ -275,13 +274,13 @@ // order to create the double buffer above.) // '-canonicalize' does this in a more general way, but we'll anyway do the // simple/common case so that the output / test cases looks clear. - if (auto *allocInst = oldMemRef.getDefiningOp()) { + if (auto *allocOp = oldMemRef.getDefiningOp()) { if (oldMemRef.use_empty()) { - allocInst->erase(); + allocOp->erase(); } else if (oldMemRef.hasOneUse()) { if (auto dealloc = dyn_cast(*oldMemRef.user_begin())) { dealloc.erase(); - allocInst->erase(); + allocOp->erase(); } } } @@ -289,22 +288,21 @@ // Double the buffers for tag memrefs. for (auto &pair : startWaitPairs) { - auto *dmaFinishInst = pair.second; - Value oldTagMemRef = - dmaFinishInst->getOperand(getTagMemRefPos(*dmaFinishInst)); + auto *dmaFinishOp = pair.second; + Value oldTagMemRef = dmaFinishOp->getOperand(getTagMemRefPos(*dmaFinishOp)); if (!doubleBuffer(oldTagMemRef, forOp)) { LLVM_DEBUG(llvm::dbgs() << "tag double buffering failed\n";); return; } // If the old tag has no uses or a single dealloc use, remove it. // (canonicalization handles more complex cases). - if (auto *tagAllocInst = oldTagMemRef.getDefiningOp()) { + if (auto *tagAllocOp = oldTagMemRef.getDefiningOp()) { if (oldTagMemRef.use_empty()) { - tagAllocInst->erase(); + tagAllocOp->erase(); } else if (oldTagMemRef.hasOneUse()) { if (auto dealloc = dyn_cast(*oldTagMemRef.user_begin())) { dealloc.erase(); - tagAllocInst->erase(); + tagAllocOp->erase(); } } } @@ -317,12 +315,12 @@ // Store shift for operation for later lookup for AffineApplyOp's. DenseMap instShiftMap; for (auto &pair : startWaitPairs) { - auto *dmaStartInst = pair.first; - assert(isa(dmaStartInst)); - instShiftMap[dmaStartInst] = 0; + auto *dmaStartOp = pair.first; + assert(isa(dmaStartOp)); + instShiftMap[dmaStartOp] = 0; // Set shifts for DMA start op's affine operand computation slices to 0. 
SmallVector sliceOps; - mlir::createAffineComputationSlice(dmaStartInst, &sliceOps); + mlir::createAffineComputationSlice(dmaStartOp, &sliceOps); if (!sliceOps.empty()) { for (auto sliceOp : sliceOps) { instShiftMap[sliceOp.getOperation()] = 0; @@ -331,7 +329,7 @@ // If a slice wasn't created, the reachable affine.apply op's from its // operands are the ones that go with it. SmallVector affineApplyInsts; - SmallVector operands(dmaStartInst->getOperands()); + SmallVector operands(dmaStartOp->getOperands()); getReachableAffineApplyOps(operands, affineApplyInsts); for (auto *op : affineApplyInsts) { instShiftMap[op] = 0; @@ -339,16 +337,14 @@ } } // Everything else (including compute ops and dma finish) are shifted by one. - for (auto &op : *forOp.getBody()) { - if (instShiftMap.find(&op) == instShiftMap.end()) { + for (auto &op : forOp.getBody()->without_terminator()) + if (instShiftMap.find(&op) == instShiftMap.end()) instShiftMap[&op] = 1; - } - } // Get shifts stored in map. std::vector shifts(forOp.getBody()->getOperations().size()); unsigned s = 0; - for (auto &op : *forOp.getBody()) { + for (auto &op : forOp.getBody()->without_terminator()) { assert(instShiftMap.find(&op) != instShiftMap.end()); shifts[s++] = instShiftMap[&op]; @@ -365,7 +361,7 @@ return; } - if (failed(instBodySkew(forOp, shifts))) { + if (failed(affineForOpBodySkew(forOp, shifts))) { LLVM_DEBUG(llvm::dbgs() << "op body skewing failed - unexpected\n";); return; } diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -156,65 +156,57 @@ f.walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); }); } -/// Generates a 'affine.for' op with the specified lower and upper bounds -/// while generating the right IV remappings for the shifted operations. The -/// operation blocks that go into the loop are specified in instGroupQueue -/// starting from the specified offset, and in that order; the first element of -/// the pair specifies the shift applied to that group of operations; note -/// that the shift is multiplied by the loop step before being applied. Returns -/// nullptr if the generated loop simplifies to a single iteration one. -static AffineForOp -generateLoop(AffineMap lbMap, AffineMap ubMap, - const std::vector>> - &instGroupQueue, - unsigned offset, AffineForOp srcForInst, OpBuilder b) { - auto lbOperands = srcForInst.getLowerBoundOperands(); - auto ubOperands = srcForInst.getUpperBoundOperands(); +/// Generates an affine.for op with the specified lower and upper bounds +/// while generating the right IV remappings to realize shifts for operations in +/// its body. The operations that go into the loop body are specified in +/// opGroupQueue starting from the specified offset, and in that order. The +/// first element of the pair specifies the shift applied to that group of +/// operations; the shift is multiplied by the loop step before being applied. +/// Returns nullptr if the generated loop simplifies to a single iteration one. 
+static AffineForOp generateShiftedLoop( + AffineMap lbMap, AffineMap ubMap, + const std::vector>> &opGroupQueue, + unsigned offset, AffineForOp srcForOp, OpBuilder b) { + auto lbOperands = srcForOp.getLowerBoundOperands(); + auto ubOperands = srcForOp.getUpperBoundOperands(); assert(lbMap.getNumInputs() == lbOperands.size()); assert(ubMap.getNumInputs() == ubOperands.size()); - auto loopChunk = - b.create(srcForInst.getLoc(), lbOperands, lbMap, ubOperands, - ubMap, srcForInst.getStep()); + auto loopChunk = b.create(srcForOp.getLoc(), lbOperands, lbMap, + ubOperands, ubMap, srcForOp.getStep()); auto loopChunkIV = loopChunk.getInductionVar(); - auto srcIV = srcForInst.getInductionVar(); + auto srcIV = srcForOp.getInductionVar(); BlockAndValueMapping operandMap; OpBuilder bodyBuilder = loopChunk.getBodyBuilder(); - for (auto it = instGroupQueue.begin() + offset, e = instGroupQueue.end(); - it != e; ++it) { + for (auto it = opGroupQueue.begin() + offset, e = opGroupQueue.end(); it != e; + ++it) { uint64_t shift = it->first; - auto insts = it->second; + auto ops = it->second; // All 'same shift' operations get added with their operands being // remapped to results of cloned operations, and their IV used remapped. // Generate the remapping if the shift is not zero: remappedIV = newIV - // shift. if (!srcIV.use_empty() && shift != 0) { auto ivRemap = bodyBuilder.create( - srcForInst.getLoc(), + srcForOp.getLoc(), bodyBuilder.getSingleDimShiftAffineMap( - -static_cast(srcForInst.getStep() * shift)), + -static_cast(srcForOp.getStep() * shift)), loopChunkIV); operandMap.map(srcIV, ivRemap); } else { operandMap.map(srcIV, loopChunkIV); } - for (auto *op : insts) { - if (!isa(op)) - bodyBuilder.clone(*op, operandMap); - } + for (auto *op : ops) + bodyBuilder.clone(*op, operandMap); }; if (succeeded(promoteIfSingleIteration(loopChunk))) return AffineForOp(); return loopChunk; } -/// Skew the operations in the body of a 'affine.for' operation with the -/// specified operation-wise shifts. The shifts are with respect to the -/// original execution order, and are multiplied by the loop 'step' before being -/// applied. A shift of zero for each operation will lead to no change. // The skewing of operations with respect to one another can be used for // example to allow overlap of asynchronous operations (such as DMA // communication) with computation, or just relative shifting of operations @@ -226,8 +218,9 @@ // asserts preservation of SSA dominance. A check for that as well as that for // memory-based dependence preservation check rests with the users of this // method. -LogicalResult mlir::instBodySkew(AffineForOp forOp, ArrayRef shifts, - bool unrollPrologueEpilogue) { +LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp, + ArrayRef shifts, + bool unrollPrologueEpilogue) { if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end())) return success(); @@ -263,11 +256,11 @@ // An array of operation groups sorted by shift amount; each group has all // operations with the same shift in the order in which they appear in the // body of the 'affine.for' op. 
- std::vector> sortedInstGroups(maxShift + 1); + std::vector> sortedOpGroups(maxShift + 1); unsigned pos = 0; - for (auto &op : *forOp.getBody()) { + for (auto &op : forOp.getBody()->without_terminator()) { auto shift = shifts[pos++]; - sortedInstGroups[shift].push_back(&op); + sortedOpGroups[shift].push_back(&op); } // Unless the shifts have a specific pattern (which actually would be the @@ -275,40 +268,39 @@ // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first // loop generated as the prologue and the last as epilogue and unroll these // fully. - AffineForOp prologue; - AffineForOp epilogue; + AffineForOp prologue, epilogue; // Do a sweep over the sorted shifts while storing open groups in a // vector, and generating loop portions as necessary during the sweep. A block // of operations is paired with its shift. - std::vector>> instGroupQueue; + std::vector>> opGroupQueue; auto origLbMap = forOp.getLowerBoundMap(); uint64_t lbShift = 0; OpBuilder b(forOp.getOperation()); - for (uint64_t d = 0, e = sortedInstGroups.size(); d < e; ++d) { + for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) { // If nothing is shifted by d, continue. - if (sortedInstGroups[d].empty()) + if (sortedOpGroups[d].empty()) continue; - if (!instGroupQueue.empty()) { + if (!opGroupQueue.empty()) { assert(d >= 1 && "Queue expected to be empty when the first block is found"); // The interval for which the loop needs to be generated here is: // [lbShift, min(lbShift + tripCount, d)) and the body of the - // loop needs to have all operations in instQueue in that order. + // loop needs to have all operations in opQueue in that order. AffineForOp res; if (lbShift + tripCount * step < d * step) { - res = generateLoop( + res = generateShiftedLoop( b.getShiftedAffineMap(origLbMap, lbShift), b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step), - instGroupQueue, 0, forOp, b); + opGroupQueue, /*offset=*/0, forOp, b); // Entire loop for the queued op groups generated, empty it. - instGroupQueue.clear(); + opGroupQueue.clear(); lbShift += tripCount * step; } else { - res = generateLoop(b.getShiftedAffineMap(origLbMap, lbShift), - b.getShiftedAffineMap(origLbMap, d), instGroupQueue, - 0, forOp, b); + res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift), + b.getShiftedAffineMap(origLbMap, d), + opGroupQueue, /*offset=*/0, forOp, b); lbShift = d * step; } if (!prologue && res) @@ -319,16 +311,16 @@ lbShift = d * step; } // Augment the list of operations that get into the current open interval. - instGroupQueue.push_back({d, sortedInstGroups[d]}); + opGroupQueue.push_back({d, sortedOpGroups[d]}); } // Those operations groups left in the queue now need to be processed (FIFO) // and their loops completed. 
- for (unsigned i = 0, e = instGroupQueue.size(); i < e; ++i) { - uint64_t ubShift = (instGroupQueue[i].first + tripCount) * step; - epilogue = generateLoop(b.getShiftedAffineMap(origLbMap, lbShift), - b.getShiftedAffineMap(origLbMap, ubShift), - instGroupQueue, i, forOp, b); + for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) { + uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step; + epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift), + b.getShiftedAffineMap(origLbMap, ubShift), + opGroupQueue, /*offset=*/i, forOp, b); lbShift = ubShift; if (!prologue) prologue = epilogue; diff --git a/mlir/test/Transforms/pipeline-data-transfer.mlir b/mlir/test/Transforms/pipeline-data-transfer.mlir --- a/mlir/test/Transforms/pipeline-data-transfer.mlir +++ b/mlir/test/Transforms/pipeline-data-transfer.mlir @@ -36,23 +36,23 @@ // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> // CHECK-NEXT: affine.for %{{.*}} = 1 to 8 { // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> -// CHECK-NEXT: %{{.*}} = affine.apply [[MAP_MINUS_1]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[MOD_2]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MAP_MINUS_1]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MOD_2]](%{{.*}}) // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32> -// CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> -// CHECK-NEXT: %{{.*}} = "compute"(%{{.*}}) : (f32) -> f32 +// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> +// CHECK-NEXT: "compute"(%{{.*}}) : (f32) -> f32 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 { // CHECK-NEXT: "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> () // CHECK-NEXT: } // CHECK-NEXT: } -// CHECK-NEXT: %{{.*}} = affine.apply [[MAP_MINUS_1]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[MOD_2]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MAP_MINUS_1]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MOD_2]](%{{.*}}) // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32> -// CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> -// CHECK-NEXT: %{{.*}} = "compute"(%{{.*}}) : (f32) -> f32 +// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> +// CHECK-NEXT: "compute"(%{{.*}}) : (f32) -> f32 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 { // CHECK-NEXT: "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> () @@ -89,8 +89,8 @@ // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32> // CHECK-NEXT: affine.for %{{.*}} = 4 to 512 step 4 { // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, 
memref<2x1xi32> -// CHECK-NEXT: %{{.*}} = affine.apply [[REMAP_SHIFT_MINUS_4]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[FLOOR_MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[REMAP_SHIFT_MINUS_4]](%{{.*}}) +// CHECK-NEXT: affine.apply [[FLOOR_MOD_2]](%{{.*}}) // CHECK: affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32> // CHECK-NEXT: "compute"(%{{.*}}) : (index) -> () // CHECK-NEXT: } @@ -313,7 +313,7 @@ dealloc %tag : memref<1 x i32> dealloc %Av : memref<32 x 32 x f32, 2> return %v : f32 -// CHECK: %{{[0-9]+}} = affine.load %{{[0-9]+}}[%{{.*}}, %{{.*}}] : memref<32x32xf32, 2> +// CHECK: affine.load %{{[0-9]+}}[%{{.*}}, %{{.*}}] : memref<32x32xf32, 2> // CHECK: return } @@ -329,10 +329,10 @@ %tag = alloc() : memref<1 x i32> // Double buffering for dynamic shaped buffer. -// CHECK: %{{.*}} = alloc(%{{.*}}, %{{.*}}) : memref -// CHECK-NEXT: %{{.*}} = dim %{{.*}}, 0 : memref -// CHECK-NEXT: %{{.*}} = dim %{{.*}}, 1 : memref -// CHECK-NEXT: %{{.*}} = alloc(%{{.*}}, %{{.*}}) : memref<2x?x?xf32, 2> +// CHECK: alloc(%{{.*}}, %{{.*}}) : memref +// CHECK-NEXT: dim %{{.*}}, 0 : memref +// CHECK-NEXT: dim %{{.*}}, 1 : memref +// CHECK-NEXT: alloc(%{{.*}}, %{{.*}}) : memref<2x?x?xf32, 2> // CHECK: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} affine.for %kTT = 0 to 16 { affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
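For reference, below is a minimal sketch (not part of the patch) of how a client pass could drive the renamed affineForOpBodySkew utility to overlap DMA starts with computation, mirroring the shift assignment done in PipelineDataTransfer::runOnAffineForOp above. The affine-ops header path and the AffineDmaStartOp spelling are assumptions for this revision (the quoted patch has its template arguments stripped); the rest follows the declarations shown in the patch.

// Sketch: give DMA start ops shift 0 and everything else in the body shift 1,
// then skew the loop body so the next iteration's DMA overlaps this
// iteration's compute. Header paths below are assumed for this revision.
#include "mlir/Dialect/AffineOps/AffineOps.h" // assumed location of AffineForOp / AffineDmaStartOp
#include "mlir/Transforms/LoopUtils.h"        // declares affineForOpBodySkew after this patch
#include "llvm/ADT/DenseMap.h"
#include <vector>

using namespace mlir;

static void skewDmaStartsByOneIteration(AffineForOp forOp) {
  // Record a shift per body op (terminator excluded), as the pass does.
  llvm::DenseMap<Operation *, uint64_t> shiftMap;
  for (Operation &op : forOp.getBody()->without_terminator())
    shiftMap[&op] = isa<AffineDmaStartOp>(op) ? 0 : 1;

  // Flatten into the positional shift vector, sized the same way as in the
  // pass (one slot per body operation).
  std::vector<uint64_t> shifts(forOp.getBody()->getOperations().size());
  unsigned s = 0;
  for (Operation &op : forOp.getBody()->without_terminator())
    shifts[s++] = shiftMap[&op];

  // Skew the body; prologue/epilogue unrolling is left at its default
  // (false). On failure the IR is left valid and unchanged, matching the
  // pass's behavior.
  if (failed(affineForOpBodySkew(forOp, shifts)))
    return;
}

In the pass itself the shift-0 set is slightly larger (it also covers the affine.apply slices feeding each DMA start), but the call sequence into affineForOpBodySkew is the same.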