diff --git a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h
--- a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h
+++ b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h
@@ -12,6 +12,8 @@
 namespace mlir {
 class AffineForOp;
+class MLIRContext;
+class OwningRewritePatternList;
 struct LogicalResult;
 class Value;
@@ -72,6 +74,11 @@
                          ArrayRef<Value> numWorkGroups,
                          ArrayRef<Value> workGroupSizes);
+/// Adds the conversion pattern from `loop.parallel` to `gpu.launch` to the
+/// provided pattern list.
+void populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
+                                       MLIRContext *ctx);
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_LOOPSTOGPU_LOOPSTOGPU_H_
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -20,8 +20,12 @@
 #include "mlir/Dialect/LoopOps/LoopOps.h"
 #include "mlir/Dialect/StandardOps/Ops.h"
 #include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/Support/Debug.h"
@@ -487,3 +491,307 @@
                                ArrayRef<Value> workGroupSizes) {
   return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
 }
+
+namespace {
+struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
+  using OpRewritePattern<ParallelOp>::OpRewritePattern;
+
+  PatternMatchResult matchAndRewrite(ParallelOp parallelOp,
+                                     PatternRewriter &rewriter) const override;
+};
+
+struct MappingAnnotation {
+  unsigned processor;
+  AffineMap indexMap;
+  AffineMap boundMap;
+};
+
+} // namespace
+
+static constexpr const char *kProcessorEntryName = "processor";
+static constexpr const char *kIndexMapEntryName = "map";
+static constexpr const char *kBoundMapEntryName = "bound";
+
+/// Extracts the mapping annotations from the provided attribute. The attribute
+/// is expected to be of the form
+///   { processor = <integer>, map = <affine map>, bound = <affine map> }
+/// where the bound is optional.
+static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
+  DictionaryAttr dict = attribute.cast<DictionaryAttr>();
+  unsigned processor = dict.get(kProcessorEntryName)
+                           .cast<IntegerAttr>()
+                           .getValue()
+                           .getSExtValue();
+  AffineMap map = dict.get(kIndexMapEntryName).cast<AffineMapAttr>().getValue();
+  AffineMapAttr boundAttr =
+      dict.get(kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
+  AffineMap bound;
+  if (boundAttr)
+    bound = boundAttr.getValue();
+  return {processor, map, bound};
+}
+
+/// Tries to derive a static upper bound from the defining operation of
+/// `upperBound`.
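+/// For example, given an upper bound defined by an operation such as
+///   %ub = affine.min affine_map<(d0, d1, d2) -> (d0, d1 - d2)>(%c2, %n, %i)
+/// the constant %c2 feeding the first map result would be returned.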
+static Value deriveStaticUpperBound(Value upperBound) {
+  Value constantBound = {};
+  if (AffineMinOp minOp =
+          dyn_cast_or_null<AffineMinOp>(upperBound.getDefiningOp())) {
+    auto map = minOp.map();
+    auto operands = minOp.operands();
+    for (int sub = 0, e = map.getNumResults(); sub < e; ++sub) {
+      AffineExpr expr = map.getResult(sub);
+      if (AffineDimExpr dimExpr = expr.dyn_cast<AffineDimExpr>()) {
+        auto dimOperand = operands[dimExpr.getPosition()];
+        auto defOp = dimOperand.getDefiningOp();
+        if (ConstantOp constOp = dyn_cast_or_null<ConstantOp>(defOp)) {
+          constantBound = constOp;
+          break;
+        }
+      }
+    }
+  }
+  return constantBound;
+}
+
+/// Modifies the current transformation state to capture the effect of the
+/// given `loop.parallel` operation on index substitutions and the operations
+/// to be inserted.
+/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
+/// this function will
+/// - compute the loop index based on the hardware id and the affine map from
+///   the mapping and update `cloningMap` to substitute all uses.
+/// - derive a new upper bound for the hardware id and augment the provided
+///   `gpu.launch` operation accordingly.
+/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
+///   and update the rewriter to insert into the conditional's body.
+/// If the dimension is mapped to sequential,
+/// - insert a for loop into the body and update the rewriter to insert into
+///   the for loop's body.
+/// - update the `cloningMap` to replace uses of the index with the index of
+///   the new for loop.
+/// In either case,
+/// - append the operations from the loop's body to the worklist, in reverse
+///   order.
+/// To mark the end of the current scope in case a loop or conditional was
+/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
+/// worklist. This signals the processor of the worklist to pop the rewriter
+/// one scope-level up.
+static LogicalResult processParallelLoop(ParallelOp parallelOp,
+                                         gpu::LaunchOp launchOp,
+                                         BlockAndValueMapping &cloningMap,
+                                         SmallVectorImpl<Operation *> &worklist,
+                                         PatternRewriter &rewriter) {
+  // TODO(herhut): Verify that this is a valid GPU mapping.
+  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
+  ArrayAttr mapping = parallelOp.getAttrOfType<ArrayAttr>("mapping");
+
+  // TODO(herhut): Support reductions.
+  if (!mapping || parallelOp.getNumResults() != 0)
+    return failure();
+
+  Location loc = parallelOp.getLoc();
+
+  auto launchIndependent = [&launchOp](Value val) {
+    return val.getParentRegion()->isAncestor(launchOp.getParentRegion());
+  };
+
+  auto ensureLaunchIndependent = [&launchOp, &rewriter,
+                                  launchIndependent](Value val) -> Value {
+    if (launchIndependent(val))
+      return val;
+    if (ConstantOp constOp = dyn_cast_or_null<ConstantOp>(val.getDefiningOp()))
+      return rewriter.create<ConstantOp>(constOp.getLoc(), constOp.getValue());
+    return {};
+  };
+
+  for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(),
+                               parallelOp.lowerBound(), parallelOp.upperBound(),
+                               parallelOp.step())) {
+    MappingAnnotation annotation =
+        extractMappingAnnotation(std::get<0>(config));
+    Value newIndex;
+
+    if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) {
+      // Use the corresponding thread/grid index as replacement for the loop iv.
+      // TODO(herhut): Make the iv calculation depend on lower & upper bound.
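+      // The replacement index is computed as
+      //   newIndex = indexMap(hardware_id) + lowerBound
+      // e.g. with the identity map, hardware id %bx and lower bound %lb this
+      // becomes %bx + %lb (the affine.apply / addi pair emitted below).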
+      Value operand = launchOp.body().front().getArgument(annotation.processor);
+      Value appliedMap =
+          rewriter.create<AffineApplyOp>(loc, annotation.indexMap, operand);
+      // Add the lower bound, as the maps are 0 based but the loop might not be.
+      // TODO(herhut): Maybe move this explicitly into the maps?
+      newIndex = rewriter.create<AddIOp>(
+          loc, appliedMap, cloningMap.lookupOrDefault(std::get<2>(config)));
+      // If there was also a bound, insert that, too.
+      // TODO(herhut): Check that we do not assign bounds twice.
+      if (annotation.boundMap) {
+        auto save = rewriter.saveInsertionPoint();
+        rewriter.setInsertionPoint(launchOp);
+        // We pass as the single operand to the bound-map the number of
+        // iterations, which is upperBound - lowerBound. To support inner loops
+        // with dynamic upper bounds (as generated by e.g. tiling), try to
+        // derive a max for the bounds. If the used bound for the hardware id
+        // is imprecise, wrap the contained code into a conditional.
+        Value lowerBound = std::get<2>(config);
+        Value upperBound = std::get<3>(config);
+        // If the lower-bound is constant or defined before the launch, we can
+        // use it in the launch bounds. Otherwise fail.
+        if (!launchIndependent(lowerBound) &&
+            !isa<ConstantOp>(lowerBound.getDefiningOp()))
+          return failure();
+        // If the upper-bound is constant or defined before the launch, we can
+        // use it in the launch bounds directly. Otherwise try to derive a
+        // bound.
+        if (!launchIndependent(upperBound) &&
+            !isa<ConstantOp>(upperBound.getDefiningOp())) {
+          upperBound = deriveStaticUpperBound(upperBound);
+          if (!upperBound)
+            return failure();
+        }
+        Value iterations = rewriter.create<SubIOp>(
+            loc,
+            ensureLaunchIndependent(cloningMap.lookupOrDefault(upperBound)),
+            ensureLaunchIndependent(cloningMap.lookupOrDefault(lowerBound)));
+        Value newBound = rewriter.create<AffineApplyOp>(
+            loc, annotation.boundMap, iterations);
+        launchOp.setOperand(annotation.processor, newBound);
+        rewriter.restoreInsertionPoint(save);
+        // TODO(herhut): It would be better to compare the underlying value
+        // here.
+        if (upperBound != std::get<3>(config)) {
+          // We are using an approximation, create a surrounding conditional.
+          CmpIOp pred = rewriter.create<CmpIOp>(
+              loc, CmpIPredicate::slt, newIndex,
+              cloningMap.lookupOrDefault(std::get<3>(config)));
+          loop::IfOp ifOp = rewriter.create<loop::IfOp>(loc, pred, false);
+          rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());
+          // Put a sentinel into the worklist so we know when to pop out of the
+          // if body again. We use the launchOp here, as that cannot be part of
+          // the body's operations.
+          worklist.push_back(launchOp.getOperation());
+        }
+      }
+    } else {
+      // Create a sequential for loop.
+      auto loopOp = rewriter.create<ForOp>(
+          loc, cloningMap.lookupOrDefault(std::get<2>(config)),
+          cloningMap.lookupOrDefault(std::get<3>(config)),
+          cloningMap.lookupOrDefault(std::get<4>(config)));
+      newIndex = loopOp.getInductionVar();
+      rewriter.setInsertionPointToStart(loopOp.getBody());
+      // Put a sentinel into the worklist so we know when to pop out of the
+      // loop body again. We use the launchOp here, as that cannot be part of
+      // the body's operations.
+      worklist.push_back(launchOp.getOperation());
+    }
+    cloningMap.map(std::get<1>(config), newIndex);
+  }
+  Block *body = parallelOp.getBody();
+  worklist.reserve(worklist.size() + body->getOperations().size());
+  for (Operation &op : llvm::reverse(body->without_terminator()))
+    worklist.push_back(&op);
+  return success();
+}
+
+/// Lower a `loop.parallel` operation into a corresponding `gpu.launch`
+/// operation.
+///
+/// This essentially transforms a loop nest into a corresponding SIMT function.
+/// The conversion is driven by mapping annotations on the `loop.parallel`
+/// operations. The mapping is provided via an array attribute named `mapping`
+/// that contains one `DictionaryAttr` per mapped loop dimension, each with
+/// three entries:
+///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
+///               thread dimensions and 6 is sequential.
+///  - map : An affine map that is used to pre-process hardware ids before
+///          substitution.
+///  - bound : An affine map that is used to compute the bound of the hardware
+///            id based on an upper bound of the number of iterations.
+/// If the `loop.parallel` contains nested `loop.parallel` operations, those
+/// need to be annotated as well. Structurally, the transformation works by
+/// splicing all operations from nested `loop.parallel` operations into a
+/// single sequence. Indices mapped to hardware ids are substituted with those
+/// ids, whereas sequential mappings result in a sequential for-loop. To allow
+/// more flexibility when mapping code to hardware ids, the transform supports
+/// two affine maps. The first, `map`, is used to compute the actual index for
+/// substitution from the hardware id. The second, `bound`, is used to compute
+/// the launch dimension for the hardware id from the number of iterations the
+/// mapped loop is performing. Note that the number of iterations might be
+/// imprecise if the corresponding loop-bounds are loop-dependent. In such
+/// cases, the hardware id might iterate over additional indices. The
+/// transformation caters for this by predicating the created sequence of
+/// instructions on the actual loop bound. This only works if a static upper
+/// bound for the dynamic loop bound can be derived, currently via analyzing
+/// `affine.min` operations.
+PatternMatchResult
+ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
+                                             PatternRewriter &rewriter) const {
+  // Create a launch operation. We start with bound one for all grid/block
+  // sizes. Those will be refined later as we discover them from mappings.
+  Location loc = parallelOp.getLoc();
+  Value constantOne = rewriter.create<ConstantIndexOp>(parallelOp.getLoc(), 1);
+  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
+      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
+      constantOne, constantOne);
+  rewriter.setInsertionPointToEnd(&launchOp.body().front());
+  rewriter.create<gpu::TerminatorOp>(loc);
+  rewriter.setInsertionPointToStart(&launchOp.body().front());
+
+  BlockAndValueMapping cloningMap;
+  SmallVector<Operation *, 16> worklist;
+  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
+                                 rewriter)))
+    return matchFailure();
+
+  while (!worklist.empty()) {
+    Operation *op = worklist.pop_back_val();
+
+    // Now walk over the body and clone it.
+    // TODO: This is only correct if either there is no further loop.parallel
+    //       nested or this code is side-effect free. Otherwise we might need
+    //       predication.
+    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
+      // A nested loop.parallel needs insertion of code to compute indices.
+      // Insert that now.
+      processParallelLoop(nestedParallel, launchOp, cloningMap, worklist,
+                          rewriter);
+    } else if (op == launchOp.getOperation()) {
+      // Found our sentinel value. We have finished the operations from one
+      // nesting level, pop one level back up.
+      auto parent = rewriter.getInsertionPoint()->getParentOp();
+      rewriter.setInsertionPointAfter(parent);
+    } else {
+      // Otherwise we copy it over.
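+      // The cloning map remaps any value that was already rewritten, in
+      // particular loop induction variables that were replaced by hardware
+      // ids or newly created loop indices above.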
+      Operation *clone = rewriter.clone(*op, cloningMap);
+      cloningMap.map(op->getResults(), clone->getResults());
+    }
+  }
+
+  rewriter.eraseOp(parallelOp);
+  return matchSuccess();
+}
+
+namespace {
+struct ParallelLoopToGpuPass : public OperationPass<ParallelLoopToGpuPass> {
+  void runOnOperation() override;
+};
+} // namespace
+
+void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
+                                             MLIRContext *ctx) {
+  patterns.insert<ParallelToGpuLaunchLowering>(ctx);
+}
+
+void ParallelLoopToGpuPass::runOnOperation() {
+  OwningRewritePatternList patterns;
+  populateParallelLoopToGPUPatterns(patterns, &getContext());
+  ConversionTarget target(getContext());
+  target.addLegalDialect<StandardOpsDialect>();
+  target.addLegalDialect<AffineOpsDialect>();
+  target.addLegalDialect<gpu::GPUDialect>();
+  target.addLegalDialect<loop::LoopOpsDialect>();
+  target.addIllegalOp<ParallelOp>();
+  if (failed(applyPartialConversion(getOperation(), target, patterns)))
+    signalPassFailure();
+}
+
+static PassRegistration<ParallelLoopToGpuPass>
+    pass("convert-parallel-loops-to-gpu", "Convert mapped loop.parallel ops"
+                                          " to gpu launch operations.");
\ No newline at end of file
diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
@@ -0,0 +1,326 @@
+// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure
+
+// 2-d parallel loop mapped to block.y and block.x
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index, %arg4 : index,
+                    %buf : memref<?x?xf32>,
+                    %res : memref<?x?xf32>) {
+  %step = constant 2 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                            step (%arg4, %step) {
+    %val = load %buf[%i0, %i1] : memref<?x?xf32>
+    store %val, %res[%i1, %i0] : memref<?x?xf32>
+  } { mapping = [{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}] }
+  return
+}
+
+// CHECK: #map0 = affine_map<(d0) -> (d0)>
+// CHECK: module {
+
+// CHECK-LABEL: func @parallel_loop(
+// CHECK-SAME: [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: memref<?x?xf32>, [[VAL_6:%.*]]: memref<?x?xf32>) {
+// CHECK: [[VAL_7:%.*]] = constant 2 : index
+// CHECK: [[VAL_8:%.*]] = constant 1 : index
+// CHECK: [[VAL_9:%.*]] = subi [[VAL_2]], [[VAL_0]] : index
+// CHECK: [[VAL_10:%.*]] = affine.apply #map0([[VAL_9]])
+// CHECK: [[VAL_11:%.*]] = subi [[VAL_3]], [[VAL_1]] : index
+// CHECK: [[VAL_12:%.*]] = affine.apply #map0([[VAL_11]])
+// CHECK: gpu.launch blocks([[VAL_13:%.*]], [[VAL_14:%.*]], [[VAL_15:%.*]]) in ([[VAL_16:%.*]] = [[VAL_12]], [[VAL_17:%.*]] = [[VAL_10]], [[VAL_18:%.*]] = [[VAL_8]]) threads([[VAL_19:%.*]], [[VAL_20:%.*]], [[VAL_21:%.*]]) in ([[VAL_22:%.*]] = [[VAL_8]], [[VAL_23:%.*]] = [[VAL_8]], [[VAL_24:%.*]] = [[VAL_8]]) {
+// CHECK: [[VAL_25:%.*]] = affine.apply #map0([[VAL_14]])
+// CHECK: [[VAL_26:%.*]] = addi [[VAL_25]], [[VAL_0]] : index
+// CHECK: [[VAL_27:%.*]] = affine.apply #map0([[VAL_13]])
+// CHECK: [[VAL_28:%.*]] = addi [[VAL_27]], [[VAL_1]] : index
+// CHECK: [[VAL_29:%.*]] = load [[VAL_5]]{{\[}}[[VAL_26]], [[VAL_28]]] : memref<?x?xf32>
+// CHECK: store [[VAL_29]], [[VAL_6]]{{\[}}[[VAL_28]], [[VAL_26]]] : memref<?x?xf32>
+// CHECK: gpu.terminator
+// CHECK: }
+// CHECK: return
+// CHECK: }
+// CHECK: }
+
+// -----
+
+// tiled 2-d parallel loop mapped to block.y and block.x and thread.y and thread.x.
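+// Both loops are annotated: the outer loop maps to block ids, the inner loop
+// to thread ids. The inner loop has constant bounds (0 to 4 with step 1), so
+// the derived thread launch bounds are exact and no predication is inserted.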
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index,
+                    %buf : memref<?x?xf32>,
+                    %res : memref<?x?xf32>) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                            step (%four, %four) {
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
+                                                step (%one, %one) {
+      %idx0 = addi %i0, %si0 : index
+      %idx1 = addi %i1, %si1 : index
+      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
+      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
+    } { mapping = [
+        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+        {processor = 3, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+      ] }
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// CHECK: #map0 = affine_map<(d0) -> (d0)>
+// CHECK: module {
+
+// CHECK-LABEL: func @parallel_loop(
+// CHECK-SAME: [[VAL_30:%.*]]: index, [[VAL_31:%.*]]: index, [[VAL_32:%.*]]: index, [[VAL_33:%.*]]: index, [[VAL_34:%.*]]: memref<?x?xf32>, [[VAL_35:%.*]]: memref<?x?xf32>) {
+// CHECK: [[VAL_36:%.*]] = constant 0 : index
+// CHECK: [[VAL_37:%.*]] = constant 1 : index
+// CHECK: [[VAL_38:%.*]] = constant 4 : index
+// CHECK: [[VAL_39:%.*]] = constant 1 : index
+// CHECK: [[VAL_40:%.*]] = subi [[VAL_32]], [[VAL_30]] : index
+// CHECK: [[VAL_41:%.*]] = affine.apply #map0([[VAL_40]])
+// CHECK: [[VAL_42:%.*]] = subi [[VAL_33]], [[VAL_31]] : index
+// CHECK: [[VAL_43:%.*]] = affine.apply #map0([[VAL_42]])
+// CHECK: [[VAL_44:%.*]] = subi [[VAL_38]], [[VAL_36]] : index
+// CHECK: [[VAL_45:%.*]] = affine.apply #map0([[VAL_44]])
+// CHECK: [[VAL_46:%.*]] = subi [[VAL_38]], [[VAL_36]] : index
+// CHECK: [[VAL_47:%.*]] = affine.apply #map0([[VAL_46]])
+// CHECK: gpu.launch blocks([[VAL_48:%.*]], [[VAL_49:%.*]], [[VAL_50:%.*]]) in ([[VAL_51:%.*]] = [[VAL_43]], [[VAL_52:%.*]] = [[VAL_41]], [[VAL_53:%.*]] = [[VAL_39]]) threads([[VAL_54:%.*]], [[VAL_55:%.*]], [[VAL_56:%.*]]) in ([[VAL_57:%.*]] = [[VAL_47]], [[VAL_58:%.*]] = [[VAL_45]], [[VAL_59:%.*]] = [[VAL_39]]) {
+// CHECK: [[VAL_60:%.*]] = affine.apply #map0([[VAL_49]])
+// CHECK: [[VAL_61:%.*]] = addi [[VAL_60]], [[VAL_30]] : index
+// CHECK: [[VAL_62:%.*]] = affine.apply #map0([[VAL_48]])
+// CHECK: [[VAL_63:%.*]] = addi [[VAL_62]], [[VAL_31]] : index
+// CHECK: [[VAL_64:%.*]] = affine.apply #map0([[VAL_55]])
+// CHECK: [[VAL_65:%.*]] = addi [[VAL_64]], [[VAL_36]] : index
+// CHECK: [[VAL_66:%.*]] = affine.apply #map0([[VAL_54]])
+// CHECK: [[VAL_67:%.*]] = addi [[VAL_66]], [[VAL_36]] : index
+// CHECK: [[VAL_68:%.*]] = addi [[VAL_61]], [[VAL_65]] : index
+// CHECK: [[VAL_69:%.*]] = addi [[VAL_63]], [[VAL_67]] : index
+// CHECK: [[VAL_70:%.*]] = load [[VAL_34]]{{\[}}[[VAL_68]], [[VAL_69]]] : memref<?x?xf32>
+// CHECK: store [[VAL_70]], [[VAL_35]]{{\[}}[[VAL_69]], [[VAL_68]]] : memref<?x?xf32>
+// CHECK: gpu.terminator
+// CHECK: }
+// CHECK: return
+// CHECK: }
+// CHECK: }
+
+// -----
+
+// 2-d parallel loop mapped to block.y and sequential
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index, %arg4 : index,
+                    %buf : memref<?x?xf32>,
+                    %res : memref<?x?xf32>) {
+  %step = constant 2 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                            step (%arg4, %step) {
+    %val = load %buf[%i0, %i1] : memref<?x?xf32>
+    store %val, %res[%i1, %i0] : memref<?x?xf32>
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// CHECK: #map0 = affine_map<(d0) -> (d0)>
+// CHECK: module {
+
+// CHECK-LABEL: func @parallel_loop(
+// CHECK-SAME: [[VAL_71:%.*]]: index, [[VAL_72:%.*]]: index, [[VAL_73:%.*]]: index, [[VAL_74:%.*]]: index, [[VAL_75:%.*]]: index, [[VAL_76:%.*]]: memref<?x?xf32>, [[VAL_77:%.*]]: memref<?x?xf32>) {
+// CHECK: [[VAL_78:%.*]] = constant 2 : index
+// CHECK: [[VAL_79:%.*]] = constant 1 : index
+// CHECK: [[VAL_80:%.*]] = subi [[VAL_73]], [[VAL_71]] : index
+// CHECK: [[VAL_81:%.*]] = affine.apply #map0([[VAL_80]])
+// CHECK: gpu.launch blocks([[VAL_82:%.*]], [[VAL_83:%.*]], [[VAL_84:%.*]]) in ([[VAL_85:%.*]] = [[VAL_79]], [[VAL_86:%.*]] = [[VAL_81]], [[VAL_87:%.*]] = [[VAL_79]]) threads([[VAL_88:%.*]], [[VAL_89:%.*]], [[VAL_90:%.*]]) in ([[VAL_91:%.*]] = [[VAL_79]], [[VAL_92:%.*]] = [[VAL_79]], [[VAL_93:%.*]] = [[VAL_79]]) {
+// CHECK: [[VAL_94:%.*]] = affine.apply #map0([[VAL_83]])
+// CHECK: [[VAL_95:%.*]] = addi [[VAL_94]], [[VAL_71]] : index
+// CHECK: loop.for [[VAL_96:%.*]] = [[VAL_72]] to [[VAL_74]] step [[VAL_78]] {
+// CHECK: [[VAL_97:%.*]] = load [[VAL_76]]{{\[}}[[VAL_95]], [[VAL_96]]] : memref<?x?xf32>
+// CHECK: store [[VAL_97]], [[VAL_77]]{{\[}}[[VAL_96]], [[VAL_95]]] : memref<?x?xf32>
+// CHECK: }
+// CHECK: gpu.terminator
+// CHECK: }
+// CHECK: return
+// CHECK: }
+// CHECK: }
+
+// -----
+
+// tiled 2-d parallel loop mapped to block.y and seq. and thread.y and seq.
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index,
+                    %buf : memref<?x?xf32>,
+                    %res : memref<?x?xf32>) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                            step (%four, %four) {
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
+                                                step (%one, %one) {
+      %idx0 = addi %i0, %si0 : index
+      %idx1 = addi %i1, %si1 : index
+      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
+      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
+    } { mapping = [
+        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+        {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+      ] }
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// CHECK: #map0 = affine_map<(d0) -> (d0)>
+// CHECK: module {
+
+// CHECK-LABEL: func @parallel_loop(
+// CHECK-SAME: [[VAL_98:%.*]]: index, [[VAL_99:%.*]]: index, [[VAL_100:%.*]]: index, [[VAL_101:%.*]]: index, [[VAL_102:%.*]]: memref<?x?xf32>, [[VAL_103:%.*]]: memref<?x?xf32>) {
+// CHECK: [[VAL_104:%.*]] = constant 0 : index
+// CHECK: [[VAL_105:%.*]] = constant 1 : index
+// CHECK: [[VAL_106:%.*]] = constant 4 : index
+// CHECK: [[VAL_107:%.*]] = constant 1 : index
+// CHECK: [[VAL_108:%.*]] = subi [[VAL_100]], [[VAL_98]] : index
+// CHECK: [[VAL_109:%.*]] = affine.apply #map0([[VAL_108]])
+// CHECK: [[VAL_110:%.*]] = subi [[VAL_106]], [[VAL_104]] : index
+// CHECK: [[VAL_111:%.*]] = affine.apply #map0([[VAL_110]])
+// CHECK: gpu.launch blocks([[VAL_112:%.*]], [[VAL_113:%.*]], [[VAL_114:%.*]]) in ([[VAL_115:%.*]] = [[VAL_107]], [[VAL_116:%.*]] = [[VAL_109]], [[VAL_117:%.*]] = [[VAL_107]]) threads([[VAL_118:%.*]], [[VAL_119:%.*]], [[VAL_120:%.*]]) in ([[VAL_121:%.*]] = [[VAL_107]], [[VAL_122:%.*]] = [[VAL_111]], [[VAL_123:%.*]] = [[VAL_107]]) {
+// CHECK: [[VAL_124:%.*]] = affine.apply #map0([[VAL_113]])
+// CHECK: [[VAL_125:%.*]] = addi [[VAL_124]], [[VAL_98]] : index
+// CHECK: loop.for [[VAL_126:%.*]] = [[VAL_99]] to [[VAL_101]] step [[VAL_106]] {
+// CHECK: [[VAL_127:%.*]] = affine.apply #map0([[VAL_119]])
+// CHECK: [[VAL_128:%.*]] = addi [[VAL_127]], [[VAL_104]] : index
+// CHECK: loop.for [[VAL_129:%.*]] = [[VAL_104]] to [[VAL_106]] step [[VAL_105]] {
+// CHECK: [[VAL_130:%.*]] = addi [[VAL_125]], [[VAL_128]] : index
+// CHECK: [[VAL_131:%.*]] = addi [[VAL_126]], [[VAL_129]] : index
+// CHECK: [[VAL_132:%.*]] = load [[VAL_102]]{{\[}}[[VAL_130]], [[VAL_131]]] : memref<?x?xf32>
+// CHECK: store [[VAL_132]], [[VAL_103]]{{\[}}[[VAL_131]], [[VAL_130]]] : memref<?x?xf32>
+// CHECK: }
+// CHECK: }
+// CHECK: gpu.terminator
+// CHECK: }
+// CHECK: return
+// CHECK: }
+// CHECK: }
+
+// -----
+
+#map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>
+#map2 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
+#map3 = affine_map<(d0) -> (d0)>
+
+module {
+  func @sum(%arg0: memref<?x?xf32, #map0>, %arg1: memref<?x?xf32, #map0>, %arg2: memref<?x?xf32, #map0>) {
+    %c1 = constant 1 : index
+    %c0 = constant 0 : index
+    %c3 = constant 3 : index
+    %c2 = constant 2 : index
+    %0 = dim %arg0, 0 : memref<?x?xf32, #map0>
+    %1 = dim %arg0, 1 : memref<?x?xf32, #map0>
+    loop.parallel (%arg3, %arg4) = (%c0, %c0) to (%0, %1) step (%c2, %c3) {
+      %2 = dim %arg0, 0 : memref<?x?xf32, #map0>
+      %3 = affine.min #map1(%c2, %2, %arg3)
+      %4 = dim %arg0, 1 : memref<?x?xf32, #map0>
+      %5 = affine.min #map1(%c3, %4, %arg4)
+      %6 = std.subview %arg0[%arg3, %arg4][%3, %5][%c1, %c1] : memref<?x?xf32, #map0> to memref<?x?xf32, #map2>
+      %7 = dim %arg1, 0 : memref<?x?xf32, #map0>
+      %8 = affine.min #map1(%c2, %7, %arg3)
+      %9 = dim %arg1, 1 : memref<?x?xf32, #map0>
+      %10 = affine.min #map1(%c3, %9, %arg4)
+      %11 = std.subview %arg1[%arg3, %arg4][%8, %10][%c1, %c1] : memref<?x?xf32, #map0> to memref<?x?xf32, #map2>
+      %12 = dim %arg2, 0 : memref<?x?xf32, #map0>
+      %13 = affine.min #map1(%c2, %12, %arg3)
+      %14 = dim %arg2, 1 : memref<?x?xf32, #map0>
+      %15 = affine.min #map1(%c3, %14, %arg4)
+      %16 = std.subview %arg2[%arg3, %arg4][%13, %15][%c1, %c1] : memref<?x?xf32, #map0> to memref<?x?xf32, #map2>
+      loop.parallel (%arg5, %arg6) = (%c0, %c0) to (%3, %5) step (%c1, %c1) {
+        %17 = load %6[%arg5, %arg6] : memref<?x?xf32, #map2>
+        %18 = load %11[%arg5, %arg6] : memref<?x?xf32, #map2>
+        %19 = load %16[%arg5, %arg6] : memref<?x?xf32, #map2>
+        %20 = addf %17, %18 : f32
+        store %20, %16[%arg5, %arg6] : memref<?x?xf32, #map2>
+        "loop.terminator"() : () -> ()
+      } { mapping = [
+          {processor = 3, map = #map3, bound = #map3},
+          {processor = 4, map = #map3, bound = #map3}
+        ] }
+      "loop.terminator"() : () -> ()
+    } { mapping = [
+        {processor = 0, map = #map3, bound = #map3},
+        {processor = 1, map = #map3, bound = #map3}
+      ] }
+    return
+  }
+}
+
+// CHECK: #map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+// CHECK: #map1 = affine_map<(d0) -> (d0)>
+// CHECK: #map2 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>
+// CHECK: #map3 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
+// CHECK: module {
+
+// CHECK-LABEL: func @sum(
+// CHECK-SAME: [[VAL_133:%.*]]: memref<?x?xf32, #map0>, [[VAL_134:%.*]]: memref<?x?xf32, #map0>, [[VAL_135:%.*]]: memref<?x?xf32, #map0>) {
+// CHECK: [[VAL_136:%.*]] = constant 1 : index
+// CHECK: [[VAL_137:%.*]] = constant 0 : index
+// CHECK: [[VAL_138:%.*]] = constant 3 : index
+// CHECK: [[VAL_139:%.*]] = constant 2 : index
+// CHECK: [[VAL_140:%.*]] = dim [[VAL_133]], 0 : memref<?x?xf32, #map0>
+// CHECK: [[VAL_141:%.*]] = dim [[VAL_133]], 1 : memref<?x?xf32, #map0>
+// CHECK: [[VAL_142:%.*]] = constant 1 : index
+// CHECK: [[VAL_143:%.*]] = subi [[VAL_140]], [[VAL_137]] : index
+// CHECK: [[VAL_144:%.*]] = affine.apply #map1([[VAL_143]])
+// CHECK: [[VAL_145:%.*]] = subi [[VAL_141]], [[VAL_137]] : index
+// CHECK: [[VAL_146:%.*]] = affine.apply #map1([[VAL_145]])
+// CHECK: [[VAL_148:%.*]] = subi [[VAL_139]], [[VAL_137]] : index
+// CHECK: [[VAL_149:%.*]] = affine.apply #map1([[VAL_148]])
+// CHECK: [[VAL_151:%.*]] = subi [[VAL_138]], [[VAL_137]] : index
+// CHECK: [[VAL_152:%.*]] = affine.apply #map1([[VAL_151]])
+// CHECK: gpu.launch blocks([[VAL_153:%.*]], [[VAL_154:%.*]], [[VAL_155:%.*]]) in ([[VAL_156:%.*]] = [[VAL_144]], [[VAL_157:%.*]] = [[VAL_146]], [[VAL_158:%.*]] = [[VAL_142]]) threads([[VAL_159:%.*]], [[VAL_160:%.*]], [[VAL_161:%.*]]) in ([[VAL_162:%.*]] = [[VAL_149]], [[VAL_163:%.*]] = [[VAL_152]], [[VAL_164:%.*]] = [[VAL_142]]) {
+// CHECK: [[VAL_165:%.*]] = affine.apply #map1([[VAL_153]])
+// CHECK: [[VAL_166:%.*]] = addi [[VAL_165]], [[VAL_137]] : index
+// CHECK: [[VAL_167:%.*]] = affine.apply #map1([[VAL_154]])
+// CHECK: [[VAL_168:%.*]] = addi [[VAL_167]], [[VAL_137]] : index
+// CHECK: [[VAL_169:%.*]] = dim [[VAL_133]], 0 : memref<?x?xf32, #map0>
+// CHECK: [[VAL_170:%.*]] = affine.min #map2([[VAL_139]], [[VAL_169]], [[VAL_166]])
+// CHECK: [[VAL_171:%.*]] = dim [[VAL_133]], 1 : memref<?x?xf32, #map0>
+// CHECK: [[VAL_172:%.*]] = affine.min #map2([[VAL_138]], [[VAL_171]], [[VAL_168]])
+// CHECK: [[VAL_173:%.*]] = std.subview [[VAL_133]]{{\[}}[[VAL_166]], [[VAL_168]]]{{\[}}[[VAL_170]], [[VAL_172]]]{{\[}}[[VAL_136]], [[VAL_136]]] : memref<?x?xf32, #map0> to memref<?x?xf32, #map3>
+// CHECK: [[VAL_174:%.*]] = dim [[VAL_134]], 0 : memref<?x?xf32, #map0>
+// CHECK: [[VAL_175:%.*]] = affine.min #map2([[VAL_139]], [[VAL_174]], [[VAL_166]])
+// CHECK: [[VAL_176:%.*]] = dim [[VAL_134]], 1 : memref<?x?xf32, #map0>
+// CHECK: [[VAL_177:%.*]] = affine.min #map2([[VAL_138]], [[VAL_176]], [[VAL_168]])
+// CHECK: [[VAL_178:%.*]] = std.subview [[VAL_134]]{{\[}}[[VAL_166]], [[VAL_168]]]{{\[}}[[VAL_175]], [[VAL_177]]]{{\[}}[[VAL_136]], [[VAL_136]]] : memref<?x?xf32, #map0> to memref<?x?xf32, #map3>
+// CHECK: [[VAL_179:%.*]] = dim [[VAL_135]], 0 : memref<?x?xf32, #map0>
+// CHECK: [[VAL_180:%.*]] = affine.min #map2([[VAL_139]], [[VAL_179]], [[VAL_166]])
+// CHECK: [[VAL_181:%.*]] = dim [[VAL_135]], 1 : memref<?x?xf32, #map0>
+// CHECK: [[VAL_182:%.*]] = affine.min #map2([[VAL_138]], [[VAL_181]], [[VAL_168]])
+// CHECK: [[VAL_183:%.*]] = std.subview [[VAL_135]]{{\[}}[[VAL_166]], [[VAL_168]]]{{\[}}[[VAL_180]], [[VAL_182]]]{{\[}}[[VAL_136]], [[VAL_136]]] : memref<?x?xf32, #map0> to memref<?x?xf32, #map3>
+// CHECK: [[VAL_184:%.*]] = affine.apply #map1([[VAL_159]])
+// CHECK: [[VAL_185:%.*]] = addi [[VAL_184]], [[VAL_137]] : index
+// CHECK: [[VAL_186:%.*]] = cmpi "slt", [[VAL_185]], [[VAL_170]] : index
+// CHECK: loop.if [[VAL_186]] {
+// CHECK: [[VAL_187:%.*]] = affine.apply #map1([[VAL_160]])
+// CHECK: [[VAL_188:%.*]] = addi [[VAL_187]], [[VAL_137]] : index
+// CHECK: [[VAL_189:%.*]] = cmpi "slt", [[VAL_188]], [[VAL_172]] : index
+// CHECK: loop.if [[VAL_189]] {
+// CHECK: [[VAL_190:%.*]] = load [[VAL_173]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref<?x?xf32, #map3>
+// CHECK: [[VAL_191:%.*]] = load [[VAL_178]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref<?x?xf32, #map3>
+// CHECK: [[VAL_192:%.*]] = load [[VAL_183]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref<?x?xf32, #map3>
+// CHECK: [[VAL_193:%.*]] = addf [[VAL_190]], [[VAL_191]] : f32
+// CHECK: store [[VAL_193]], [[VAL_183]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref<?x?xf32, #map3>
+// CHECK: }
+// CHECK: }
+// CHECK: gpu.terminator
+// CHECK: }
+// CHECK: return
+// CHECK: }
+// CHECK: }