diff --git a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h --- a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h +++ b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h @@ -15,6 +15,7 @@ namespace mlir { class FuncOp; template class OpPassBase; +class Pass; /// Create a pass that converts loop nests into GPU kernels. It considers /// top-level affine.for and linalg.for operations as roots of loop nests and @@ -36,6 +37,13 @@ std::unique_ptr> createLoopToGPUPass(ArrayRef numWorkGroups, ArrayRef workGroupSize); + +/// Creates a pass that converts loop.parallel operations into a gpu.launch +/// operation. The mapping of loop dimensions to launch dimensions is derived +/// from mapping attributes. See ParallelToGpuLaunchLowering::matchAndRewrite +/// for a description of the used attributes. +std::unique_ptr createParallelLoopToGpuPass(); + } // namespace mlir #endif // MLIR_CONVERSION_LOOPSTOGPU_LOOPSTOGPUPASS_H_ diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -533,8 +533,12 @@ /// Tries to derive a static upper bound from the defining operation of /// `upperBound`. -static Value deriveStaticUpperBound(Value upperBound) { +static Value deriveStaticUpperBound(Value upperBound, + PatternRewriter &rewriter) { Value constantBound = {}; + // We cannot rely on canonicalization at the moment, so we have to + // dig through the expression. + // See https://bugs.llvm.org/show_bug.cgi?id=45008. 
if (AffineMinOp minOp = dyn_cast_or_null(upperBound.getDefiningOp())) { auto map = minOp.map(); @@ -549,6 +553,11 @@ break; } } + if (AffineConstantExpr constExpr = expr.dyn_cast()) { + constantBound = rewriter.create(minOp.getLoc(), + constExpr.getValue()); + break; + } } } return constantBound; @@ -615,46 +624,62 @@ if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) { // Use the corresponding thread/grid index as replacement for the loop iv. - // TODO(herhut): Make the iv calculation depend on lower & upper bound. Value operand = launchOp.body().front().getArgument(annotation.processor); - Value appliedMap = - rewriter.create(loc, annotation.indexMap, operand); - // Add the lower bound, as the maps are 0 based but the loop might not be. - // TODO(herhut): Maybe move this explicitly into the maps? - newIndex = rewriter.create( - loc, appliedMap, cloningMap.lookupOrDefault(lowerBound)); + // Take the indexmap and add the lower bound and step computations in. + // This computes operand * step + lowerBound. + // Use an affine map here so that it composes nicely with the provided + // annotation. + AffineMap lowerAndStep = AffineMap::get( + 1, 2, + rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + + rewriter.getAffineSymbolExpr(1)); + newIndex = rewriter.create( + loc, annotation.indexMap.compose(lowerAndStep), + ValueRange{operand, step, lowerBound}); // If there was also a bound, insert that, too. // TODO(herhut): Check that we do not assign bounds twice. if (annotation.boundMap) { // We pass as the single opererand to the bound-map the number of - // iterations, which is upperBound - lowerBound. To support inner loops - // with dynamic upper bounds (as generated by e.g. tiling), try to - // derive a max for the bounds. If the used bound for the hardware id is - // inprecise, wrap the contained code into a conditional. - // If the lower-bound is constant or defined before the launch, we can - // use it in the launch bounds. Otherwise fail. 
+ // iterations, which is (upperBound - lowerBound) ceilDiv step. To + // support inner loops with dynamic upper bounds (as generated by e.g. + // tiling), try to derive a max for the bounds. If the used bound for + // the hardware id is imprecise, wrap the contained code into a + // conditional. If the lower-bound is constant or defined before the + // launch, we can use it in the launch bounds. Otherwise fail. if (!launchIndependent(lowerBound) && !isa(lowerBound.getDefiningOp())) return failure(); + // The step must also be constant or defined outside of the loop nest. + if (!launchIndependent(step) && !isa(step.getDefiningOp())) + return failure(); // If the upper-bound is constant or defined before the launch, we can // use it in the launch bounds directly. Otherwise try derive a bound. bool boundIsPrecise = launchIndependent(upperBound) || isa(upperBound.getDefiningOp()); - if (!boundIsPrecise) { - upperBound = deriveStaticUpperBound(upperBound); - if (!upperBound) - return failure(); - } { PatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(launchOp); - - Value iterations = rewriter.create( - loc, - ensureLaunchIndependent(cloningMap.lookupOrDefault(upperBound)), - ensureLaunchIndependent(cloningMap.lookupOrDefault(lowerBound))); + if (!boundIsPrecise) { + upperBound = deriveStaticUpperBound(upperBound, rewriter); + if (!upperBound) + return failure(); + } + // Compute the number of iterations needed. We compute this as an + // affine expression ceilDiv (upperBound - lowerBound) step. We use + // affine here so that it composes nicely with the provided map. 
+ AffineMap stepMap = + AffineMap::get(0, 3, + (rewriter.getAffineSymbolExpr(0) - + rewriter.getAffineSymbolExpr(1).ceilDiv( + rewriter.getAffineSymbolExpr(2)))); Value launchBound = rewriter.create( - loc, annotation.boundMap, iterations); + loc, annotation.boundMap.compose(stepMap), + ValueRange{ + ensureLaunchIndependent( + cloningMap.lookupOrDefault(upperBound)), + ensureLaunchIndependent( + cloningMap.lookupOrDefault(lowerBound)), + ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); launchOp.setOperand(annotation.processor, launchBound); } if (!boundIsPrecise) { @@ -748,8 +773,6 @@ bool leftNestingScope = false; while (!worklist.empty()) { Operation *op = worklist.pop_back_val(); - launchOp.dump(); - // Now walk over the body and clone it. // TODO: This is only correct if there either is no further loop.parallel // nested or this code is side-effect free. Otherwise we might need @@ -788,30 +811,7 @@ return matchSuccess(); } -namespace { -struct ParallelLoopToGpuPass : public OperationPass { - void runOnOperation() override; -}; -} // namespace - void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns, MLIRContext *ctx) { patterns.insert(ctx); } - -void ParallelLoopToGpuPass::runOnOperation() { - OwningRewritePatternList patterns; - populateParallelLoopToGPUPatterns(patterns, &getContext()); - ConversionTarget target(getContext()); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addIllegalOp(); - if (failed(applyPartialConversion(getOperation(), target, patterns))) - signalPassFailure(); -} - -static PassRegistration - pass("convert-parallel-loops-to-gpu", "Convert mapped loop.parallel ops" - " to gpu launch operations."); diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp @@ -9,9 
+9,11 @@ #include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h" #include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h" #include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/CommandLine.h" @@ -115,6 +117,21 @@ SmallVector workGroupSize; }; +struct ParallelLoopToGpuPass : public OperationPass { + void runOnOperation() override { + OwningRewritePatternList patterns; + populateParallelLoopToGPUPatterns(patterns, &getContext()); + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addIllegalOp(); + if (failed(applyPartialConversion(getOperation(), target, patterns))) + signalPassFailure(); + } +}; + } // namespace std::unique_ptr> @@ -130,6 +147,10 @@ workGroupSize); } +std::unique_ptr mlir::createParallelLoopToGpuPass() { + return std::make_unique(); +} + static PassRegistration registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] { return std::make_unique(clNumBlockDims.getValue(), @@ -145,3 +166,7 @@ return std::make_unique(numWorkGroups, workGroupSize); }); + +static PassRegistration + pass("convert-parallel-loops-to-gpu", "Convert mapped loop.parallel ops" + " to gpu launch operations."); diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir --- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir +++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir @@ -15,24 +15,21 @@ return } -// CHECK: #map0 = affine_map<(d0) -> (d0)> -// CHECK: module { +// CHECK: #map0 = affine_map<()[s0, s1, s2] -> (s0 - s1 ceildiv s2)> +// CHECK: #map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)> +// CHECK: module { // CHECK-LABEL: func 
@parallel_loop_bidy_bidx( -// CHECK-SAME: [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: memref, [[VAL_6:%.*]]: memref) { +// CHECK-SAME: [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: memref, [[VAL_6:%.*]]: memref) { // CHECK: [[VAL_7:%.*]] = constant 2 : index // CHECK: [[VAL_8:%.*]] = constant 1 : index -// CHECK: [[VAL_9:%.*]] = subi [[VAL_2]], [[VAL_0]] : index -// CHECK: [[VAL_10:%.*]] = affine.apply #map0([[VAL_9]]) -// CHECK: [[VAL_11:%.*]] = subi [[VAL_3]], [[VAL_1]] : index -// CHECK: [[VAL_12:%.*]] = affine.apply #map0([[VAL_11]]) -// CHECK: gpu.launch blocks([[VAL_13:%.*]], [[VAL_14:%.*]], [[VAL_15:%.*]]) in ([[VAL_16:%.*]] = [[VAL_12]], [[VAL_17:%.*]] = [[VAL_10]], [[VAL_18:%.*]] = [[VAL_8]]) threads([[VAL_19:%.*]], [[VAL_20:%.*]], [[VAL_21:%.*]]) in ([[VAL_22:%.*]] = [[VAL_8]], [[VAL_23:%.*]] = [[VAL_8]], [[VAL_24:%.*]] = [[VAL_8]]) { -// CHECK: [[VAL_25:%.*]] = affine.apply #map0([[VAL_14]]) -// CHECK: [[VAL_26:%.*]] = addi [[VAL_25]], [[VAL_0]] : index -// CHECK: [[VAL_27:%.*]] = affine.apply #map0([[VAL_13]]) -// CHECK: [[VAL_28:%.*]] = addi [[VAL_27]], [[VAL_1]] : index -// CHECK: [[VAL_29:%.*]] = load [[VAL_5]]{{\[}}[[VAL_26]], [[VAL_28]]] : memref -// CHECK: store [[VAL_29]], [[VAL_6]]{{\[}}[[VAL_28]], [[VAL_26]]] : memref +// CHECK: [[VAL_9:%.*]] = affine.apply #map0(){{\[}}[[VAL_2]], [[VAL_0]], [[VAL_4]]] +// CHECK: [[VAL_10:%.*]] = affine.apply #map0(){{\[}}[[VAL_3]], [[VAL_1]], [[VAL_7]]] +// CHECK: gpu.launch blocks([[VAL_11:%.*]], [[VAL_12:%.*]], [[VAL_13:%.*]]) in ([[VAL_14:%.*]] = [[VAL_10]], [[VAL_15:%.*]] = [[VAL_9]], [[VAL_16:%.*]] = [[VAL_8]]) threads([[VAL_17:%.*]], [[VAL_18:%.*]], [[VAL_19:%.*]]) in ([[VAL_20:%.*]] = [[VAL_8]], [[VAL_21:%.*]] = [[VAL_8]], [[VAL_22:%.*]] = [[VAL_8]]) { +// CHECK: [[VAL_23:%.*]] = affine.apply #map1([[VAL_12]]){{\[}}[[VAL_4]], [[VAL_0]]] +// 
CHECK: [[VAL_24:%.*]] = affine.apply #map1([[VAL_11]]){{\[}}[[VAL_7]], [[VAL_1]]] +// CHECK: [[VAL_25:%.*]] = load [[VAL_5]]{{\[}}[[VAL_23]], [[VAL_24]]] : memref +// CHECK: store [[VAL_25]], [[VAL_6]]{{\[}}[[VAL_24]], [[VAL_23]]] : memref // CHECK: gpu.terminator // CHECK: } // CHECK: return @@ -69,36 +66,29 @@ return } -// CHECK: #map0 = affine_map<(d0) -> (d0)> -// CHECK: module { +// CHECK: #map0 = affine_map<()[s0, s1, s2] -> (s0 - s1 ceildiv s2)> +// CHECK: #map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)> +// CHECK: module { // CHECK-LABEL: func @parallel_loop_tiled( -// CHECK-SAME: [[VAL_30:%.*]]: index, [[VAL_31:%.*]]: index, [[VAL_32:%.*]]: index, [[VAL_33:%.*]]: index, [[VAL_34:%.*]]: memref, [[VAL_35:%.*]]: memref) { -// CHECK: [[VAL_36:%.*]] = constant 0 : index -// CHECK: [[VAL_37:%.*]] = constant 1 : index -// CHECK: [[VAL_38:%.*]] = constant 4 : index -// CHECK: [[VAL_39:%.*]] = constant 1 : index -// CHECK: [[VAL_40:%.*]] = subi [[VAL_32]], [[VAL_30]] : index -// CHECK: [[VAL_41:%.*]] = affine.apply #map0([[VAL_40]]) -// CHECK: [[VAL_42:%.*]] = subi [[VAL_33]], [[VAL_31]] : index -// CHECK: [[VAL_43:%.*]] = affine.apply #map0([[VAL_42]]) -// CHECK: [[VAL_44:%.*]] = subi [[VAL_38]], [[VAL_36]] : index -// CHECK: [[VAL_45:%.*]] = affine.apply #map0([[VAL_44]]) -// CHECK: [[VAL_46:%.*]] = subi [[VAL_38]], [[VAL_36]] : index -// CHECK: [[VAL_47:%.*]] = affine.apply #map0([[VAL_46]]) -// CHECK: gpu.launch blocks([[VAL_48:%.*]], [[VAL_49:%.*]], [[VAL_50:%.*]]) in ([[VAL_51:%.*]] = [[VAL_43]], [[VAL_52:%.*]] = [[VAL_41]], [[VAL_53:%.*]] = [[VAL_39]]) threads([[VAL_54:%.*]], [[VAL_55:%.*]], [[VAL_56:%.*]]) in ([[VAL_57:%.*]] = [[VAL_47]], [[VAL_58:%.*]] = [[VAL_45]], [[VAL_59:%.*]] = [[VAL_39]]) { -// CHECK: [[VAL_60:%.*]] = affine.apply #map0([[VAL_49]]) -// CHECK: [[VAL_61:%.*]] = addi [[VAL_60]], [[VAL_30]] : index -// CHECK: [[VAL_62:%.*]] = affine.apply #map0([[VAL_48]]) -// CHECK: [[VAL_63:%.*]] = addi [[VAL_62]], [[VAL_31]] : index -// CHECK: 
[[VAL_64:%.*]] = affine.apply #map0([[VAL_55]]) -// CHECK: [[VAL_65:%.*]] = addi [[VAL_64]], [[VAL_36]] : index -// CHECK: [[VAL_66:%.*]] = affine.apply #map0([[VAL_54]]) -// CHECK: [[VAL_67:%.*]] = addi [[VAL_66]], [[VAL_36]] : index -// CHECK: [[VAL_68:%.*]] = addi [[VAL_61]], [[VAL_65]] : index -// CHECK: [[VAL_69:%.*]] = addi [[VAL_63]], [[VAL_67]] : index -// CHECK: [[VAL_70:%.*]] = load [[VAL_34]]{{\[}}[[VAL_68]], [[VAL_69]]] : memref -// CHECK: store [[VAL_70]], [[VAL_35]]{{\[}}[[VAL_69]], [[VAL_68]]] : memref +// CHECK-SAME: [[VAL_26:%.*]]: index, [[VAL_27:%.*]]: index, [[VAL_28:%.*]]: index, [[VAL_29:%.*]]: index, [[VAL_30:%.*]]: memref, [[VAL_31:%.*]]: memref) { +// CHECK: [[VAL_32:%.*]] = constant 0 : index +// CHECK: [[VAL_33:%.*]] = constant 1 : index +// CHECK: [[VAL_34:%.*]] = constant 4 : index +// CHECK: [[VAL_35:%.*]] = constant 1 : index +// CHECK: [[VAL_36:%.*]] = affine.apply #map0(){{\[}}[[VAL_28]], [[VAL_26]], [[VAL_34]]] +// CHECK: [[VAL_37:%.*]] = affine.apply #map0(){{\[}}[[VAL_29]], [[VAL_27]], [[VAL_34]]] +// CHECK: [[VAL_38:%.*]] = affine.apply #map0(){{\[}}[[VAL_34]], [[VAL_32]], [[VAL_33]]] +// CHECK: [[VAL_39:%.*]] = affine.apply #map0(){{\[}}[[VAL_34]], [[VAL_32]], [[VAL_33]]] +// CHECK: gpu.launch blocks([[VAL_40:%.*]], [[VAL_41:%.*]], [[VAL_42:%.*]]) in ([[VAL_43:%.*]] = [[VAL_37]], [[VAL_44:%.*]] = [[VAL_36]], [[VAL_45:%.*]] = [[VAL_35]]) threads([[VAL_46:%.*]], [[VAL_47:%.*]], [[VAL_48:%.*]]) in ([[VAL_49:%.*]] = [[VAL_39]], [[VAL_50:%.*]] = [[VAL_38]], [[VAL_51:%.*]] = [[VAL_35]]) { +// CHECK: [[VAL_52:%.*]] = affine.apply #map1([[VAL_41]]){{\[}}[[VAL_34]], [[VAL_26]]] +// CHECK: [[VAL_53:%.*]] = affine.apply #map1([[VAL_40]]){{\[}}[[VAL_34]], [[VAL_27]]] +// CHECK: [[VAL_54:%.*]] = affine.apply #map1([[VAL_47]]){{\[}}[[VAL_33]], [[VAL_32]]] +// CHECK: [[VAL_55:%.*]] = affine.apply #map1([[VAL_46]]){{\[}}[[VAL_33]], [[VAL_32]]] +// CHECK: [[VAL_56:%.*]] = addi [[VAL_52]], [[VAL_54]] : index +// CHECK: [[VAL_57:%.*]] = addi 
[[VAL_53]], [[VAL_55]] : index +// CHECK: [[VAL_58:%.*]] = load [[VAL_30]]{{\[}}[[VAL_56]], [[VAL_57]]] : memref +// CHECK: store [[VAL_58]], [[VAL_31]]{{\[}}[[VAL_57]], [[VAL_56]]] : memref // CHECK: gpu.terminator // CHECK: } // CHECK: return @@ -125,21 +115,20 @@ return } -// CHECK: #map0 = affine_map<(d0) -> (d0)> -// CHECK: module { +// CHECK: #map0 = affine_map<()[s0, s1, s2] -> (s0 - s1 ceildiv s2)> +// CHECK: #map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)> +// CHECK: module { // CHECK-LABEL: func @parallel_loop_bidy_seq( -// CHECK-SAME: [[VAL_71:%.*]]: index, [[VAL_72:%.*]]: index, [[VAL_73:%.*]]: index, [[VAL_74:%.*]]: index, [[VAL_75:%.*]]: index, [[VAL_76:%.*]]: memref, [[VAL_77:%.*]]: memref) { -// CHECK: [[VAL_78:%.*]] = constant 2 : index -// CHECK: [[VAL_79:%.*]] = constant 1 : index -// CHECK: [[VAL_80:%.*]] = subi [[VAL_73]], [[VAL_71]] : index -// CHECK: [[VAL_81:%.*]] = affine.apply #map0([[VAL_80]]) -// CHECK: gpu.launch blocks([[VAL_82:%.*]], [[VAL_83:%.*]], [[VAL_84:%.*]]) in ([[VAL_85:%.*]] = [[VAL_79]], [[VAL_86:%.*]] = [[VAL_81]], [[VAL_87:%.*]] = [[VAL_79]]) threads([[VAL_88:%.*]], [[VAL_89:%.*]], [[VAL_90:%.*]]) in ([[VAL_91:%.*]] = [[VAL_79]], [[VAL_92:%.*]] = [[VAL_79]], [[VAL_93:%.*]] = [[VAL_79]]) { -// CHECK: [[VAL_94:%.*]] = affine.apply #map0([[VAL_83]]) -// CHECK: [[VAL_95:%.*]] = addi [[VAL_94]], [[VAL_71]] : index -// CHECK: loop.for [[VAL_96:%.*]] = [[VAL_72]] to [[VAL_74]] step [[VAL_78]] { -// CHECK: [[VAL_97:%.*]] = load [[VAL_76]]{{\[}}[[VAL_95]], [[VAL_96]]] : memref -// CHECK: store [[VAL_97]], [[VAL_77]]{{\[}}[[VAL_96]], [[VAL_95]]] : memref +// CHECK-SAME: [[VAL_59:%.*]]: index, [[VAL_60:%.*]]: index, [[VAL_61:%.*]]: index, [[VAL_62:%.*]]: index, [[VAL_63:%.*]]: index, [[VAL_64:%.*]]: memref, [[VAL_65:%.*]]: memref) { +// CHECK: [[VAL_66:%.*]] = constant 2 : index +// CHECK: [[VAL_67:%.*]] = constant 1 : index +// CHECK: [[VAL_68:%.*]] = affine.apply #map0(){{\[}}[[VAL_61]], [[VAL_59]], [[VAL_63]]] +// CHECK: 
gpu.launch blocks([[VAL_69:%.*]], [[VAL_70:%.*]], [[VAL_71:%.*]]) in ([[VAL_72:%.*]] = [[VAL_67]], [[VAL_73:%.*]] = [[VAL_68]], [[VAL_74:%.*]] = [[VAL_67]]) threads([[VAL_75:%.*]], [[VAL_76:%.*]], [[VAL_77:%.*]]) in ([[VAL_78:%.*]] = [[VAL_67]], [[VAL_79:%.*]] = [[VAL_67]], [[VAL_80:%.*]] = [[VAL_67]]) { +// CHECK: [[VAL_81:%.*]] = affine.apply #map1([[VAL_70]]){{\[}}[[VAL_63]], [[VAL_59]]] +// CHECK: loop.for [[VAL_82:%.*]] = [[VAL_60]] to [[VAL_62]] step [[VAL_66]] { +// CHECK: [[VAL_83:%.*]] = load [[VAL_64]]{{\[}}[[VAL_81]], [[VAL_82]]] : memref +// CHECK: store [[VAL_83]], [[VAL_65]]{{\[}}[[VAL_82]], [[VAL_81]]] : memref // CHECK: } // CHECK: gpu.terminator // CHECK: } @@ -177,30 +166,27 @@ return } -// CHECK: #map0 = affine_map<(d0) -> (d0)> -// CHECK: module { +// CHECK: #map0 = affine_map<()[s0, s1, s2] -> (s0 - s1 ceildiv s2)> +// CHECK: #map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)> +// CHECK: module { // CHECK-LABEL: func @parallel_loop_tiled_seq( -// CHECK-SAME: [[VAL_98:%.*]]: index, [[VAL_99:%.*]]: index, [[VAL_100:%.*]]: index, [[VAL_101:%.*]]: index, [[VAL_102:%.*]]: memref, [[VAL_103:%.*]]: memref) { -// CHECK: [[VAL_104:%.*]] = constant 0 : index -// CHECK: [[VAL_105:%.*]] = constant 1 : index -// CHECK: [[VAL_106:%.*]] = constant 4 : index -// CHECK: [[VAL_107:%.*]] = constant 1 : index -// CHECK: [[VAL_108:%.*]] = subi [[VAL_100]], [[VAL_98]] : index -// CHECK: [[VAL_109:%.*]] = affine.apply #map0([[VAL_108]]) -// CHECK: [[VAL_110:%.*]] = subi [[VAL_106]], [[VAL_104]] : index -// CHECK: [[VAL_111:%.*]] = affine.apply #map0([[VAL_110]]) -// CHECK: gpu.launch blocks([[VAL_112:%.*]], [[VAL_113:%.*]], [[VAL_114:%.*]]) in ([[VAL_115:%.*]] = [[VAL_107]], [[VAL_116:%.*]] = [[VAL_109]], [[VAL_117:%.*]] = [[VAL_107]]) threads([[VAL_118:%.*]], [[VAL_119:%.*]], [[VAL_120:%.*]]) in ([[VAL_121:%.*]] = [[VAL_107]], [[VAL_122:%.*]] = [[VAL_111]], [[VAL_123:%.*]] = [[VAL_107]]) { -// CHECK: [[VAL_124:%.*]] = affine.apply #map0([[VAL_113]]) -// CHECK: 
[[VAL_125:%.*]] = addi [[VAL_124]], [[VAL_98]] : index -// CHECK: loop.for [[VAL_126:%.*]] = [[VAL_99]] to [[VAL_101]] step [[VAL_106]] { -// CHECK: [[VAL_127:%.*]] = affine.apply #map0([[VAL_119]]) -// CHECK: [[VAL_128:%.*]] = addi [[VAL_127]], [[VAL_104]] : index -// CHECK: loop.for [[VAL_129:%.*]] = [[VAL_104]] to [[VAL_106]] step [[VAL_105]] { -// CHECK: [[VAL_130:%.*]] = addi [[VAL_125]], [[VAL_128]] : index -// CHECK: [[VAL_131:%.*]] = addi [[VAL_126]], [[VAL_129]] : index -// CHECK: [[VAL_132:%.*]] = load [[VAL_102]]{{\[}}[[VAL_130]], [[VAL_131]]] : memref -// CHECK: store [[VAL_132]], [[VAL_103]]{{\[}}[[VAL_131]], [[VAL_130]]] : memref +// CHECK-SAME: [[VAL_84:%.*]]: index, [[VAL_85:%.*]]: index, [[VAL_86:%.*]]: index, [[VAL_87:%.*]]: index, [[VAL_88:%.*]]: memref, [[VAL_89:%.*]]: memref) { +// CHECK: [[VAL_90:%.*]] = constant 0 : index +// CHECK: [[VAL_91:%.*]] = constant 1 : index +// CHECK: [[VAL_92:%.*]] = constant 4 : index +// CHECK: [[VAL_93:%.*]] = constant 1 : index +// CHECK: [[VAL_94:%.*]] = affine.apply #map0(){{\[}}[[VAL_86]], [[VAL_84]], [[VAL_92]]] +// CHECK: [[VAL_95:%.*]] = affine.apply #map0(){{\[}}[[VAL_92]], [[VAL_90]], [[VAL_91]]] +// CHECK: gpu.launch blocks([[VAL_96:%.*]], [[VAL_97:%.*]], [[VAL_98:%.*]]) in ([[VAL_99:%.*]] = [[VAL_93]], [[VAL_100:%.*]] = [[VAL_94]], [[VAL_101:%.*]] = [[VAL_93]]) threads([[VAL_102:%.*]], [[VAL_103:%.*]], [[VAL_104:%.*]]) in ([[VAL_105:%.*]] = [[VAL_93]], [[VAL_106:%.*]] = [[VAL_95]], [[VAL_107:%.*]] = [[VAL_93]]) { +// CHECK: [[VAL_108:%.*]] = affine.apply #map1([[VAL_97]]){{\[}}[[VAL_92]], [[VAL_84]]] +// CHECK: loop.for [[VAL_109:%.*]] = [[VAL_85]] to [[VAL_87]] step [[VAL_92]] { +// CHECK: [[VAL_110:%.*]] = affine.apply #map1([[VAL_103]]){{\[}}[[VAL_91]], [[VAL_90]]] +// CHECK: loop.for [[VAL_111:%.*]] = [[VAL_90]] to [[VAL_92]] step [[VAL_91]] { +// CHECK: [[VAL_112:%.*]] = addi [[VAL_108]], [[VAL_110]] : index +// CHECK: [[VAL_113:%.*]] = addi [[VAL_109]], [[VAL_111]] : index +// CHECK: 
[[VAL_114:%.*]] = load [[VAL_88]]{{\[}}[[VAL_112]], [[VAL_113]]] : memref +// CHECK: store [[VAL_114]], [[VAL_89]]{{\[}}[[VAL_113]], [[VAL_112]]] : memref // CHECK: } // CHECK: } // CHECK: gpu.terminator @@ -211,6 +197,9 @@ // ----- +// This and the below are the same but with affine.min canonicalized (see https://bugs.llvm.org/show_bug.cgi?id=45008). +// Once the bug is fixed, this version (and supporting code) can be removed. + #map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)> #map2 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)> @@ -261,61 +250,54 @@ } // CHECK: #map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> -// CHECK: #map1 = affine_map<(d0) -> (d0)> -// CHECK: #map2 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)> -// CHECK: #map3 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)> +// CHECK: #map1 = affine_map<()[s0, s1, s2] -> (s0 - s1 ceildiv s2)> +// CHECK: #map2 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)> +// CHECK: #map3 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)> +// CHECK: #map4 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)> // CHECK: module { // CHECK-LABEL: func @sum( -// CHECK-SAME: [[VAL_133:%.*]]: memref, [[VAL_134:%.*]]: memref, [[VAL_135:%.*]]: memref) { -// CHECK: [[VAL_136:%.*]] = constant 1 : index -// CHECK: [[VAL_137:%.*]] = constant 0 : index -// CHECK: [[VAL_138:%.*]] = constant 3 : index -// CHECK: [[VAL_139:%.*]] = constant 2 : index -// CHECK: [[VAL_140:%.*]] = dim [[VAL_133]], 0 : memref -// CHECK: [[VAL_141:%.*]] = dim [[VAL_133]], 1 : memref -// CHECK: [[VAL_142:%.*]] = constant 1 : index -// CHECK: [[VAL_143:%.*]] = subi [[VAL_140]], [[VAL_137]] : index -// CHECK: [[VAL_144:%.*]] = affine.apply #map1([[VAL_143]]) -// CHECK: [[VAL_145:%.*]] = subi [[VAL_141]], [[VAL_137]] : index -// CHECK: [[VAL_146:%.*]] = affine.apply #map1([[VAL_145]]) -// CHECK: [[VAL_148:%.*]] = subi [[VAL_139]], [[VAL_137]] : index -// CHECK: 
[[VAL_149:%.*]] = affine.apply #map1([[VAL_148]]) -// CHECK: [[VAL_151:%.*]] = subi [[VAL_138]], [[VAL_137]] : index -// CHECK: [[VAL_152:%.*]] = affine.apply #map1([[VAL_151]]) -// CHECK: gpu.launch blocks([[VAL_153:%.*]], [[VAL_154:%.*]], [[VAL_155:%.*]]) in ([[VAL_156:%.*]] = [[VAL_144]], [[VAL_157:%.*]] = [[VAL_146]], [[VAL_158:%.*]] = [[VAL_142]]) threads([[VAL_159:%.*]], [[VAL_160:%.*]], [[VAL_161:%.*]]) in ([[VAL_162:%.*]] = [[VAL_149]], [[VAL_163:%.*]] = [[VAL_152]], [[VAL_164:%.*]] = [[VAL_142]]) { -// CHECK: [[VAL_165:%.*]] = affine.apply #map1([[VAL_153]]) -// CHECK: [[VAL_166:%.*]] = addi [[VAL_165]], [[VAL_137]] : index -// CHECK: [[VAL_167:%.*]] = affine.apply #map1([[VAL_154]]) -// CHECK: [[VAL_168:%.*]] = addi [[VAL_167]], [[VAL_137]] : index -// CHECK: [[VAL_169:%.*]] = dim [[VAL_133]], 0 : memref -// CHECK: [[VAL_170:%.*]] = affine.min #map2([[VAL_139]], [[VAL_169]], [[VAL_166]]) -// CHECK: [[VAL_171:%.*]] = dim [[VAL_133]], 1 : memref -// CHECK: [[VAL_172:%.*]] = affine.min #map2([[VAL_138]], [[VAL_171]], [[VAL_168]]) -// CHECK: [[VAL_173:%.*]] = std.subview [[VAL_133]]{{\[}}[[VAL_166]], [[VAL_168]]]{{\[}}[[VAL_170]], [[VAL_172]]]{{\[}}[[VAL_136]], [[VAL_136]]] : memref to memref -// CHECK: [[VAL_174:%.*]] = dim [[VAL_134]], 0 : memref -// CHECK: [[VAL_175:%.*]] = affine.min #map2([[VAL_139]], [[VAL_174]], [[VAL_166]]) -// CHECK: [[VAL_176:%.*]] = dim [[VAL_134]], 1 : memref -// CHECK: [[VAL_177:%.*]] = affine.min #map2([[VAL_138]], [[VAL_176]], [[VAL_168]]) -// CHECK: [[VAL_178:%.*]] = std.subview [[VAL_134]]{{\[}}[[VAL_166]], [[VAL_168]]]{{\[}}[[VAL_175]], [[VAL_177]]]{{\[}}[[VAL_136]], [[VAL_136]]] : memref to memref -// CHECK: [[VAL_179:%.*]] = dim [[VAL_135]], 0 : memref -// CHECK: [[VAL_180:%.*]] = affine.min #map2([[VAL_139]], [[VAL_179]], [[VAL_166]]) -// CHECK: [[VAL_181:%.*]] = dim [[VAL_135]], 1 : memref -// CHECK: [[VAL_182:%.*]] = affine.min #map2([[VAL_138]], [[VAL_181]], [[VAL_168]]) -// CHECK: [[VAL_183:%.*]] = std.subview 
[[VAL_135]]{{\[}}[[VAL_166]], [[VAL_168]]]{{\[}}[[VAL_180]], [[VAL_182]]]{{\[}}[[VAL_136]], [[VAL_136]]] : memref to memref -// CHECK: [[VAL_184:%.*]] = affine.apply #map1([[VAL_159]]) -// CHECK: [[VAL_185:%.*]] = addi [[VAL_184]], [[VAL_137]] : index -// CHECK: [[VAL_186:%.*]] = cmpi "slt", [[VAL_185]], [[VAL_170]] : index -// CHECK: loop.if [[VAL_186]] { -// CHECK: [[VAL_187:%.*]] = affine.apply #map1([[VAL_160]]) -// CHECK: [[VAL_188:%.*]] = addi [[VAL_187]], [[VAL_137]] : index -// CHECK: [[VAL_189:%.*]] = cmpi "slt", [[VAL_188]], [[VAL_172]] : index -// CHECK: loop.if [[VAL_189]] { -// CHECK: [[VAL_190:%.*]] = load [[VAL_173]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref -// CHECK: [[VAL_191:%.*]] = load [[VAL_178]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref -// CHECK: [[VAL_192:%.*]] = load [[VAL_183]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref -// CHECK: [[VAL_193:%.*]] = addf [[VAL_190]], [[VAL_191]] : f32 -// CHECK: store [[VAL_193]], [[VAL_183]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref +// CHECK-SAME: [[VAL_115:%.*]]: memref, [[VAL_116:%.*]]: memref, [[VAL_117:%.*]]: memref) { +// CHECK: [[VAL_118:%.*]] = constant 1 : index +// CHECK: [[VAL_119:%.*]] = constant 0 : index +// CHECK: [[VAL_120:%.*]] = constant 3 : index +// CHECK: [[VAL_121:%.*]] = constant 2 : index +// CHECK: [[VAL_122:%.*]] = dim [[VAL_115]], 0 : memref +// CHECK: [[VAL_123:%.*]] = dim [[VAL_115]], 1 : memref +// CHECK: [[VAL_124:%.*]] = constant 1 : index +// CHECK: [[VAL_125:%.*]] = affine.apply #map1(){{\[}}[[VAL_122]], [[VAL_119]], [[VAL_121]]] +// CHECK: [[VAL_126:%.*]] = affine.apply #map1(){{\[}}[[VAL_123]], [[VAL_119]], [[VAL_120]]] +// CHECK: [[VAL_127:%.*]] = affine.apply #map1(){{\[}}[[VAL_121]], [[VAL_119]], [[VAL_118]]] +// CHECK: [[VAL_128:%.*]] = affine.apply #map1(){{\[}}[[VAL_120]], [[VAL_119]], [[VAL_118]]] +// CHECK: gpu.launch blocks([[VAL_129:%.*]], [[VAL_130:%.*]], [[VAL_131:%.*]]) in ([[VAL_132:%.*]] = [[VAL_125]], [[VAL_133:%.*]] = [[VAL_126]], [[VAL_134:%.*]] = [[VAL_124]]) 
threads([[VAL_135:%.*]], [[VAL_136:%.*]], [[VAL_137:%.*]]) in ([[VAL_138:%.*]] = [[VAL_127]], [[VAL_139:%.*]] = [[VAL_128]], [[VAL_140:%.*]] = [[VAL_124]]) { +// CHECK: [[VAL_141:%.*]] = affine.apply #map2([[VAL_129]]){{\[}}[[VAL_121]], [[VAL_119]]] +// CHECK: [[VAL_142:%.*]] = affine.apply #map2([[VAL_130]]){{\[}}[[VAL_120]], [[VAL_119]]] +// CHECK: [[VAL_143:%.*]] = dim [[VAL_115]], 0 : memref +// CHECK: [[VAL_144:%.*]] = affine.min #map3([[VAL_121]], [[VAL_143]], [[VAL_141]]) +// CHECK: [[VAL_145:%.*]] = dim [[VAL_115]], 1 : memref +// CHECK: [[VAL_146:%.*]] = affine.min #map3([[VAL_120]], [[VAL_145]], [[VAL_142]]) +// CHECK: [[VAL_147:%.*]] = std.subview [[VAL_115]]{{\[}}[[VAL_141]], [[VAL_142]]]{{\[}}[[VAL_144]], [[VAL_146]]]{{\[}}[[VAL_118]], [[VAL_118]]] : memref to memref +// CHECK: [[VAL_148:%.*]] = dim [[VAL_116]], 0 : memref +// CHECK: [[VAL_149:%.*]] = affine.min #map3([[VAL_121]], [[VAL_148]], [[VAL_141]]) +// CHECK: [[VAL_150:%.*]] = dim [[VAL_116]], 1 : memref +// CHECK: [[VAL_151:%.*]] = affine.min #map3([[VAL_120]], [[VAL_150]], [[VAL_142]]) +// CHECK: [[VAL_152:%.*]] = std.subview [[VAL_116]]{{\[}}[[VAL_141]], [[VAL_142]]]{{\[}}[[VAL_149]], [[VAL_151]]]{{\[}}[[VAL_118]], [[VAL_118]]] : memref to memref +// CHECK: [[VAL_153:%.*]] = dim [[VAL_117]], 0 : memref +// CHECK: [[VAL_154:%.*]] = affine.min #map3([[VAL_121]], [[VAL_153]], [[VAL_141]]) +// CHECK: [[VAL_155:%.*]] = dim [[VAL_117]], 1 : memref +// CHECK: [[VAL_156:%.*]] = affine.min #map3([[VAL_120]], [[VAL_155]], [[VAL_142]]) +// CHECK: [[VAL_157:%.*]] = std.subview [[VAL_117]]{{\[}}[[VAL_141]], [[VAL_142]]]{{\[}}[[VAL_154]], [[VAL_156]]]{{\[}}[[VAL_118]], [[VAL_118]]] : memref to memref +// CHECK: [[VAL_158:%.*]] = affine.apply #map2([[VAL_135]]){{\[}}[[VAL_118]], [[VAL_119]]] +// CHECK: [[VAL_159:%.*]] = cmpi "slt", [[VAL_158]], [[VAL_144]] : index +// CHECK: loop.if [[VAL_159]] { +// CHECK: [[VAL_160:%.*]] = affine.apply #map2([[VAL_136]]){{\[}}[[VAL_118]], [[VAL_119]]] +// CHECK: 
[[VAL_161:%.*]] = cmpi "slt", [[VAL_160]], [[VAL_146]] : index +// CHECK: loop.if [[VAL_161]] { +// CHECK: [[VAL_162:%.*]] = load [[VAL_147]]{{\[}}[[VAL_158]], [[VAL_160]]] : memref +// CHECK: [[VAL_163:%.*]] = load [[VAL_152]]{{\[}}[[VAL_158]], [[VAL_160]]] : memref +// CHECK: [[VAL_164:%.*]] = load [[VAL_157]]{{\[}}[[VAL_158]], [[VAL_160]]] : memref +// CHECK: [[VAL_165:%.*]] = addf [[VAL_162]], [[VAL_163]] : f32 +// CHECK: store [[VAL_165]], [[VAL_157]]{{\[}}[[VAL_158]], [[VAL_160]]] : memref // CHECK: } // CHECK: } // CHECK: gpu.terminator @@ -324,3 +306,115 @@ // CHECK: } // CHECK: } +// ----- + +#map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)> +#map2 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)> +#map3 = affine_map<(d0) -> (d0)> +#map4 = affine_map<(d1, d2) -> (2, d1 - d2)> +#map5 = affine_map<(d1, d2) -> (3, d1 - d2)> + + +module { + func @sum2(%arg0: memref, %arg1: memref, %arg2: memref) { + %c1 = constant 1 : index + %c0 = constant 0 : index + %c3 = constant 3 : index + %c2 = constant 2 : index + %0 = dim %arg0, 0 : memref + %1 = dim %arg0, 1 : memref + loop.parallel (%arg3, %arg4) = (%c0, %c0) to (%0, %1) step (%c2, %c3) { + %2 = dim %arg0, 0 : memref + %3 = affine.min #map4(%2, %arg3) + %4 = dim %arg0, 1 : memref + %5 = affine.min #map5(%4, %arg4) + %6 = std.subview %arg0[%arg3, %arg4][%3, %5][%c1, %c1] : memref to memref + %7 = dim %arg1, 0 : memref + %8 = affine.min #map4(%7, %arg3) + %9 = dim %arg1, 1 : memref + %10 = affine.min #map5(%9, %arg4) + %11 = std.subview %arg1[%arg3, %arg4][%8, %10][%c1, %c1] : memref to memref + %12 = dim %arg2, 0 : memref + %13 = affine.min #map4(%12, %arg3) + %14 = dim %arg2, 1 : memref + %15 = affine.min #map5(%14, %arg4) + %16 = std.subview %arg2[%arg3, %arg4][%13, %15][%c1, %c1] : memref to memref + loop.parallel (%arg5, %arg6) = (%c0, %c0) to (%3, %5) step (%c1, %c1) { + %17 = load %6[%arg5, %arg6] : memref + %18 = load %11[%arg5, %arg6] : 
memref + %19 = load %16[%arg5, %arg6] : memref + %20 = addf %17, %18 : f32 + store %20, %16[%arg5, %arg6] : memref + loop.yield + } { mapping = [ + {processor = 3, map = #map3, bound = #map3}, + {processor = 4, map = #map3, bound = #map3} + ] } + loop.yield + } { mapping = [ + {processor = 0, map = #map3, bound = #map3}, + {processor = 1, map = #map3, bound = #map3} + ] } + return + } +} + +// CHECK: #map1 = affine_map<()[s0, s1, s2] -> (s0 - s1 ceildiv s2)> +// CHECK: #map2 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)> +// CHECK: #map3 = affine_map<(d0, d1) -> (2, d0 - d1)> +// CHECK: #map4 = affine_map<(d0, d1) -> (3, d0 - d1)> +// CHECK: #map5 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)> + +// CHECK: module { +// CHECK-LABEL: func @sum2( +// CHECK-SAME: [[VAL_166:%.*]]: memref, [[VAL_167:%.*]]: memref, [[VAL_168:%.*]]: memref) { +// CHECK: [[VAL_169:%.*]] = constant 1 : index +// CHECK: [[VAL_170:%.*]] = constant 0 : index +// CHECK: [[VAL_171:%.*]] = constant 3 : index +// CHECK: [[VAL_172:%.*]] = constant 2 : index +// CHECK: [[VAL_173:%.*]] = dim [[VAL_166]], 0 : memref +// CHECK: [[VAL_174:%.*]] = dim [[VAL_166]], 1 : memref +// CHECK: [[VAL_175:%.*]] = constant 1 : index +// CHECK: [[VAL_176:%.*]] = affine.apply #map1(){{\[}}[[VAL_173]], [[VAL_170]], [[VAL_172]]] +// CHECK: [[VAL_177:%.*]] = affine.apply #map1(){{\[}}[[VAL_174]], [[VAL_170]], [[VAL_171]]] +// CHECK: [[VAL_178:%.*]] = constant 2 : index +// CHECK: [[VAL_179:%.*]] = affine.apply #map1(){{\[}}[[VAL_178]], [[VAL_170]], [[VAL_169]]] +// CHECK: [[VAL_180:%.*]] = constant 3 : index +// CHECK: [[VAL_181:%.*]] = affine.apply #map1(){{\[}}[[VAL_180]], [[VAL_170]], [[VAL_169]]] +// CHECK: gpu.launch blocks([[VAL_182:%.*]], [[VAL_183:%.*]], [[VAL_184:%.*]]) in ([[VAL_185:%.*]] = [[VAL_176]], [[VAL_186:%.*]] = [[VAL_177]], [[VAL_187:%.*]] = [[VAL_175]]) threads([[VAL_188:%.*]], [[VAL_189:%.*]], [[VAL_190:%.*]]) in ([[VAL_191:%.*]] = [[VAL_179]], [[VAL_192:%.*]] = [[VAL_181]], 
[[VAL_193:%.*]] = [[VAL_175]]) { +// CHECK: [[VAL_194:%.*]] = affine.apply #map2([[VAL_182]]){{\[}}[[VAL_172]], [[VAL_170]]] +// CHECK: [[VAL_195:%.*]] = affine.apply #map2([[VAL_183]]){{\[}}[[VAL_171]], [[VAL_170]]] +// CHECK: [[VAL_196:%.*]] = dim [[VAL_166]], 0 : memref +// CHECK: [[VAL_197:%.*]] = affine.min #map3([[VAL_196]], [[VAL_194]]) +// CHECK: [[VAL_198:%.*]] = dim [[VAL_166]], 1 : memref +// CHECK: [[VAL_199:%.*]] = affine.min #map4([[VAL_198]], [[VAL_195]]) +// CHECK: [[VAL_200:%.*]] = std.subview [[VAL_166]]{{\[}}[[VAL_194]], [[VAL_195]]]{{\[}}[[VAL_197]], [[VAL_199]]]{{\[}}[[VAL_169]], [[VAL_169]]] : memref to memref +// CHECK: [[VAL_201:%.*]] = dim [[VAL_167]], 0 : memref +// CHECK: [[VAL_202:%.*]] = affine.min #map3([[VAL_201]], [[VAL_194]]) +// CHECK: [[VAL_203:%.*]] = dim [[VAL_167]], 1 : memref +// CHECK: [[VAL_204:%.*]] = affine.min #map4([[VAL_203]], [[VAL_195]]) +// CHECK: [[VAL_205:%.*]] = std.subview [[VAL_167]]{{\[}}[[VAL_194]], [[VAL_195]]]{{\[}}[[VAL_202]], [[VAL_204]]]{{\[}}[[VAL_169]], [[VAL_169]]] : memref to memref +// CHECK: [[VAL_206:%.*]] = dim [[VAL_168]], 0 : memref +// CHECK: [[VAL_207:%.*]] = affine.min #map3([[VAL_206]], [[VAL_194]]) +// CHECK: [[VAL_208:%.*]] = dim [[VAL_168]], 1 : memref +// CHECK: [[VAL_209:%.*]] = affine.min #map4([[VAL_208]], [[VAL_195]]) +// CHECK: [[VAL_210:%.*]] = std.subview [[VAL_168]]{{\[}}[[VAL_194]], [[VAL_195]]]{{\[}}[[VAL_207]], [[VAL_209]]]{{\[}}[[VAL_169]], [[VAL_169]]] : memref to memref +// CHECK: [[VAL_211:%.*]] = affine.apply #map2([[VAL_188]]){{\[}}[[VAL_169]], [[VAL_170]]] +// CHECK: [[VAL_212:%.*]] = cmpi "slt", [[VAL_211]], [[VAL_197]] : index +// CHECK: loop.if [[VAL_212]] { +// CHECK: [[VAL_213:%.*]] = affine.apply #map2([[VAL_189]]){{\[}}[[VAL_169]], [[VAL_170]]] +// CHECK: [[VAL_214:%.*]] = cmpi "slt", [[VAL_213]], [[VAL_199]] : index +// CHECK: loop.if [[VAL_214]] { +// CHECK: [[VAL_215:%.*]] = load [[VAL_200]]{{\[}}[[VAL_211]], [[VAL_213]]] : memref +// CHECK: [[VAL_216:%.*]] = 
load [[VAL_205]]{{\[}}[[VAL_211]], [[VAL_213]]] : memref +// CHECK: [[VAL_217:%.*]] = load [[VAL_210]]{{\[}}[[VAL_211]], [[VAL_213]]] : memref +// CHECK: [[VAL_218:%.*]] = addf [[VAL_215]], [[VAL_216]] : f32 +// CHECK: store [[VAL_218]], [[VAL_210]]{{\[}}[[VAL_211]], [[VAL_213]]] : memref +// CHECK: } +// CHECK: } +// CHECK: gpu.terminator +// CHECK: } +// CHECK: return +// CHECK: } +// CHECK: }