diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -20,8 +20,12 @@ #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/StandardOps/Ops.h" #include "mlir/IR/AffineExpr.h" +#include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/Sequence.h" #include "llvm/Support/Debug.h" @@ -487,3 +491,159 @@ ArrayRef workGroupSizes) { return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes); } + +namespace { /* Rewrite pattern for a mapped ParallelOp; matchAndRewrite is only declared here and is defined out of line further down in this file. */ +struct ParallelToGpuLaunchLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(ParallelOp parallelOp, + PatternRewriter &rewriter) const override; +}; + /* Unpacks one entry of the mapping array attribute into a (processor, map, bound) tuple. The processor and map entries are read unconditionally; the bound entry is optional and the returned bound stays a default-constructed AffineMap when it is absent. NOTE(review): the results of the casts on the dictionary, processor and map entries are used without a null check, so a malformed entry presumably crashes here rather than failing the match - confirm that something upstream guarantees the shape. */ +std::tuple +extractMapAndOperand(Attribute attribute) { + DictionaryAttr dict = attribute.dyn_cast(); + unsigned processor = dict.get("processor").dyn_cast().getValue().getSExtValue(); + AffineMap map = dict.get("map").dyn_cast().getValue(); + AffineMapAttr boundAttr = dict.get("bound").dyn_cast_or_null(); + AffineMap bound; + if (boundAttr) bound = boundAttr.getValue(); + return {processor, map, bound}; +} + /* Emits the index computation for one ParallelOp at the current insertion point of the rewriter and queues the loop body for cloning. For each (mapping entry, induction variable, lower bound, upper bound, step): a processor id below gpu::LaunchOp::kNumConfigOperands takes the launch body block argument with that index, applies the map to it and adds the lower bound; when a bound map is present, launch operand number processor is set to bound(upperBound - lowerBound), computed right before the launch op. Any other processor id creates a sequential for loop, moves the insertion point to the start of its body and pushes launchOp onto the worklist as an end-of-loop sentinel. The induction variable is then mapped to the new index in cloning_map. Finally the body ops (without the terminator) are pushed in reverse, so popping from the back of the worklist yields them in program order. Returns failure() when the op has no mapping attribute or has results, success() otherwise. */ +LogicalResult processParallelLoop(ParallelOp parallelOp, gpu::LaunchOp launchOp, + BlockAndValueMapping &cloning_map, + SmallVectorImpl &worklist, + PatternRewriter &rewriter) { + // TODO(herhut): Verify that this is a valid GPU mapping. + // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential + ArrayAttr mapping = parallelOp.getAttrOfType("mapping"); + // TODO(herhut): Support reductions. 
+ if (!mapping || parallelOp.getNumResults() != 0) + return failure(); + + Location loc = parallelOp.getLoc(); + + for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(), parallelOp.lowerBound(), parallelOp.upperBound(), parallelOp.step())) { + unsigned processor; + AffineMap map; + AffineMap bound; + std::tie(processor, map, bound) = extractMapAndOperand(std::get<0>(config)); + Value newIndex; + if (processor < gpu::LaunchOp::kNumConfigOperands) { + // Use the corresponding thread/grid index as replacement for the loop iv. + Value operand = launchOp.body().front().getArgument(processor); + Value appliedMap = rewriter.create(loc, map, operand); + // Add the lower bound, as the maps are 0 based but the loop might not be. + newIndex = rewriter.create( + loc, appliedMap, cloning_map.lookupOrDefault(std::get<2>(config))); + // If there was also a bound, insert that, too. + // TODO(herhut): Check that we do not assign bounds twice. + if (bound) { + auto save = rewriter.saveInsertionPoint(); + rewriter.setInsertionPoint(launchOp); + // We pass as the single operand to the bound-map the number of + // iterations, which is upperBound - lowerBound. + Value iterations = rewriter.create( + loc, cloning_map.lookupOrDefault(std::get<3>(config)), + cloning_map.lookupOrDefault(std::get<2>(config))); + Value newBound = rewriter.create(loc, bound, iterations); + launchOp.setOperand(processor, newBound); + rewriter.restoreInsertionPoint(save); + } + } else { + // Create a sequential for loop. + auto loopOp = rewriter.create( + loc, cloning_map.lookupOrDefault(std::get<2>(config)), + cloning_map.lookupOrDefault(std::get<3>(config)), + cloning_map.lookupOrDefault(std::get<4>(config))); + newIndex = loopOp.getInductionVar(); + rewriter.setInsertionPointToStart(loopOp.getBody()); + // Put a sentinel into the worklist so we know when to pop out of the loop + // body again. We use the launchOp here, as that cannot be one of the + // instructions in the body. 
+ worklist.push_back(launchOp.getOperation()); + } + cloning_map.map(std::get<1>(config), newIndex); + } + Block *body = parallelOp.getBody(); + worklist.reserve(worklist.size() + body->getOperations().size()); + for (Operation &op : llvm::reverse(body->without_terminator())) + worklist.push_back(&op); + return success(); +} + +} // namespace + /* Lowers one mapped ParallelOp. Creates a gpu::LaunchOp whose six size operands all start as the constant 1, gives its body a terminator, and then drains a worklist seeded by processParallelLoop: a nested ParallelOp is expanded through processParallelLoop, the launchOp sentinel moves the insertion point to just after the op that encloses the current insertion point, and every other op is cloned with its results recorded in cloning_map. Finally the original op is erased. */ +PatternMatchResult +ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, + PatternRewriter &rewriter) const { + // Create a launch operation. We start with bound one for all grid/block + // sizes. Those will be refined later as we discover them from mappings. + Location loc = parallelOp.getLoc(); + Value constantOne = rewriter.create(parallelOp.getLoc(), 1); + gpu::LaunchOp launchOp = rewriter.create( + parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne, + constantOne, constantOne); + rewriter.setInsertionPointToEnd(&launchOp.body().front()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(&launchOp.body().front()); + + BlockAndValueMapping cloning_map; + SmallVector worklist; + /* NOTE(review): the constant and the launch op already exist when this returns matchFailure(); confirm the rewriter driver tolerates IR changes on a failed match, or hoist the checks above the creation. */ if (failed(processParallelLoop(parallelOp, launchOp, cloning_map, worklist, + rewriter))) + return matchFailure(); + + while (!worklist.empty()) { + Operation *op = worklist.pop_back_val(); + + // Now walk over the body and clone it. + // TODO: This is only correct if there either is no further loop.parallel + // nested + // or this code is side-effect free. Otherwise we might need + // predication. + if (auto nestedParallel = dyn_cast(op)) { + // A nested loop.parallel needs insertion of code to compute indices. + // Insert that now. + /* NOTE(review): the LogicalResult is dropped here. When the nested loop has no mapping attribute or has results, processParallelLoop returns before queueing anything, so that loop is neither lowered nor cloned - confirm this is intended. */ processParallelLoop(nestedParallel, launchOp, cloning_map, worklist, + rewriter); + } else if (op == launchOp.getOperation()) { + // Found our sentinel value. We have finished the operations from one + // nesting level, pop one level back up. 
+ auto parent = rewriter.getInsertionPoint()->getParentOp(); + rewriter.setInsertionPointAfter(parent); + } else { + // Otherwise we copy it over. + Operation *clone = rewriter.clone(*op, cloning_map); + // TODO(herhut) Use generalized BlockAndValueMapping::map once landed. + for (auto pair : llvm::zip(op->getResults(), clone->getResults())) + cloning_map.map(std::get<0>(pair), std::get<1>(pair)); + } + } + + rewriter.eraseOp(parallelOp); + return matchSuccess(); +} + +namespace { /* Pass wrapper; all of its work happens in runOnOperation, defined out of line right below. */ + struct ParallelLoopToGpuPass : public OperationPass { + void runOnOperation() override; +}; +} + /* Builds a pattern list and a conversion target (three legal dialects, one illegal op), runs applyPartialConversion on the current operation, and signals pass failure if the conversion fails. */ +void ParallelLoopToGpuPass::runOnOperation() { + OwningRewritePatternList patterns; + patterns.insert(&getContext()); + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addIllegalOp(); + if (failed(applyPartialConversion(getOperation(), target, patterns))) + signalPassFailure(); +} + /* Registers the pass under the flag convert-parallel-loop-to-gpu. NOTE(review): the description string below says loop,parallel where loop.parallel is presumably meant; the string is runtime text and is left untouched here. */ +static PassRegistration + pass("convert-parallel-loop-to-gpu", "Convert mapped loop,parallel op to " + "gpu launch operations."); \ No newline at end of file diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir @@ -0,0 +1,178 @@ +// RUN: mlir-opt -convert-parallel-loop-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure + +// 2-d parallel loop mapped to block.y and block.x + +func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, + %buf : memref, + %res : memref) { + %step = constant 2 : index + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%arg4, %step) { + %val = load %buf[%i0, %i1] : memref + store %val, %res[%i1, %i0] : memref + } { mapping = [{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) 
-> (d0)>}] } + return +} + +// CHECK-LABEL: func @parallel_loop( +// CHECK-SAME: [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: memref, [[VAL_6:%.*]]: memref) { +// CHECK: [[VAL_7:%.*]] = constant 2 : index +// CHECK: [[VAL_8:%.*]] = constant 1 : index +// CHECK: [[VAL_9:%.*]] = subi [[VAL_2]], [[VAL_0]] : index +// CHECK: [[VAL_10:%.*]] = subi [[VAL_3]], [[VAL_1]] : index +// CHECK: gpu.launch blocks([[VAL_11:%.*]], [[VAL_12:%.*]], [[VAL_13:%.*]]) in ([[VAL_14:%.*]] = [[VAL_10]], [[VAL_15:%.*]] = [[VAL_9]], [[VAL_16:%.*]] = [[VAL_8]]) threads([[VAL_17:%.*]], [[VAL_18:%.*]], [[VAL_19:%.*]]) in ([[VAL_20:%.*]] = [[VAL_8]], [[VAL_21:%.*]] = [[VAL_8]], [[VAL_22:%.*]] = [[VAL_8]]) { +// CHECK: [[VAL_23:%.*]] = addi [[VAL_12]], [[VAL_0]] : index +// CHECK: [[VAL_24:%.*]] = addi [[VAL_11]], [[VAL_1]] : index +// CHECK: [[VAL_25:%.*]] = load [[VAL_5]]{{\[}}[[VAL_23]], [[VAL_24]]] : memref +// CHECK: store [[VAL_25]], [[VAL_6]]{{\[}}[[VAL_24]], [[VAL_23]]] : memref +// CHECK: gpu.terminator +// CHECK: } +// CHECK: return +// CHECK: } +// CHECK: } + +// ----- + +// tiled 2-d parallel loop mapped to block.y and block.x and thread.y and thread.x. 
+ +func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, + %buf : memref, + %res : memref) { + %zero = constant 0 : index + %one = constant 1 : index + %four = constant 4 : index + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%four, %four) { + loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four) + step (%one, %one) { + %idx0 = addi %i0, %si0 : index + %idx1 = addi %i1, %si1 : index + %val = load %buf[%idx0, %idx1] : memref + store %val, %res[%idx1, %idx0] : memref + } { mapping = [ + {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 3, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + } { mapping = [ + {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + return +} + +// CHECK-LABEL: func @parallel_loop( +// CHECK-SAME: [[VAL_26:%.*]]: index, [[VAL_27:%.*]]: index, [[VAL_28:%.*]]: index, [[VAL_29:%.*]]: index, [[VAL_30:%.*]]: memref, [[VAL_31:%.*]]: memref) { +// CHECK: [[VAL_32:%.*]] = constant 0 : index +// CHECK: [[VAL_33:%.*]] = constant 1 : index +// CHECK: [[VAL_34:%.*]] = constant 4 : index +// CHECK: [[VAL_35:%.*]] = constant 1 : index +// CHECK: [[VAL_36:%.*]] = subi [[VAL_28]], [[VAL_26]] : index +// CHECK: [[VAL_37:%.*]] = subi [[VAL_29]], [[VAL_27]] : index +// CHECK: [[VAL_38:%.*]] = subi [[VAL_34]], [[VAL_32]] : index +// CHECK: [[VAL_39:%.*]] = subi [[VAL_34]], [[VAL_32]] : index +// CHECK: gpu.launch blocks([[VAL_40:%.*]], [[VAL_41:%.*]], [[VAL_42:%.*]]) in ([[VAL_43:%.*]] = [[VAL_37]], [[VAL_44:%.*]] = [[VAL_36]], [[VAL_45:%.*]] = [[VAL_35]]) threads([[VAL_46:%.*]], [[VAL_47:%.*]], [[VAL_48:%.*]]) in ([[VAL_49:%.*]] = [[VAL_39]], [[VAL_50:%.*]] = [[VAL_38]], [[VAL_51:%.*]] = [[VAL_35]]) { +// CHECK: [[VAL_52:%.*]] = addi [[VAL_41]], [[VAL_26]] : index +// CHECK: [[VAL_53:%.*]] = addi [[VAL_40]], 
[[VAL_27]] : index +// CHECK: [[VAL_54:%.*]] = addi [[VAL_47]], [[VAL_32]] : index +// CHECK: [[VAL_55:%.*]] = addi [[VAL_46]], [[VAL_32]] : index +// CHECK: [[VAL_56:%.*]] = addi [[VAL_52]], [[VAL_54]] : index +// CHECK: [[VAL_57:%.*]] = addi [[VAL_53]], [[VAL_55]] : index +// CHECK: [[VAL_58:%.*]] = load [[VAL_30]]{{\[}}[[VAL_56]], [[VAL_57]]] : memref +// CHECK: store [[VAL_58]], [[VAL_31]]{{\[}}[[VAL_57]], [[VAL_56]]] : memref +// CHECK: gpu.terminator +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +// 2-d parallel loop mapped to block.y and sequential + +func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, + %buf : memref, + %res : memref) { + %step = constant 2 : index + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%arg4, %step) { + %val = load %buf[%i0, %i1] : memref + store %val, %res[%i1, %i0] : memref + } { mapping = [ + {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + return +} + +// CHECK-LABEL: func @parallel_loop( +// CHECK-SAME: [[VAL_59:%.*]]: index, [[VAL_60:%.*]]: index, [[VAL_61:%.*]]: index, [[VAL_62:%.*]]: index, [[VAL_63:%.*]]: index, [[VAL_64:%.*]]: memref, [[VAL_65:%.*]]: memref) { +// CHECK: [[VAL_66:%.*]] = constant 2 : index +// CHECK: [[VAL_67:%.*]] = constant 1 : index +// CHECK: [[VAL_68:%.*]] = subi [[VAL_61]], [[VAL_59]] : index +// CHECK: gpu.launch blocks([[VAL_69:%.*]], [[VAL_70:%.*]], [[VAL_71:%.*]]) in ([[VAL_72:%.*]] = [[VAL_67]], [[VAL_73:%.*]] = [[VAL_68]], [[VAL_74:%.*]] = [[VAL_67]]) threads([[VAL_75:%.*]], [[VAL_76:%.*]], [[VAL_77:%.*]]) in ([[VAL_78:%.*]] = [[VAL_67]], [[VAL_79:%.*]] = [[VAL_67]], [[VAL_80:%.*]] = [[VAL_67]]) { +// CHECK: [[VAL_81:%.*]] = addi [[VAL_70]], [[VAL_59]] : index +// CHECK: loop.for [[VAL_82:%.*]] = [[VAL_60]] to [[VAL_62]] step [[VAL_66]] { +// CHECK: [[VAL_83:%.*]] = load 
[[VAL_64]]{{\[}}[[VAL_81]], [[VAL_82]]] : memref +// CHECK: store [[VAL_83]], [[VAL_65]]{{\[}}[[VAL_82]], [[VAL_81]]] : memref +// CHECK: } +// CHECK: gpu.terminator +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +// tiled 2-d parallel loop mapped to block.y and seq. and thread.y and seq. + +func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, + %buf : memref, + %res : memref) { + %zero = constant 0 : index + %one = constant 1 : index + %four = constant 4 : index + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%four, %four) { + loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four) + step (%one, %one) { + %idx0 = addi %i0, %si0 : index + %idx1 = addi %i1, %si1 : index + %val = load %buf[%idx0, %idx1] : memref + store %val, %res[%idx1, %idx0] : memref + } { mapping = [ + {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + } { mapping = [ + {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + return +} + +// CHECK-LABEL: func @parallel_loop( +// CHECK-SAME: [[VAL_84:%.*]]: index, [[VAL_85:%.*]]: index, [[VAL_86:%.*]]: index, [[VAL_87:%.*]]: index, [[VAL_88:%.*]]: memref, [[VAL_89:%.*]]: memref) { +// CHECK: [[VAL_90:%.*]] = constant 0 : index +// CHECK: [[VAL_91:%.*]] = constant 1 : index +// CHECK: [[VAL_92:%.*]] = constant 4 : index +// CHECK: [[VAL_93:%.*]] = constant 1 : index +// CHECK: [[VAL_94:%.*]] = subi [[VAL_86]], [[VAL_84]] : index +// CHECK: [[VAL_95:%.*]] = subi [[VAL_92]], [[VAL_90]] : index +// CHECK: gpu.launch blocks([[VAL_96:%.*]], [[VAL_97:%.*]], [[VAL_98:%.*]]) in ([[VAL_99:%.*]] = [[VAL_93]], [[VAL_100:%.*]] = [[VAL_94]], [[VAL_101:%.*]] = [[VAL_93]]) threads([[VAL_102:%.*]], [[VAL_103:%.*]], [[VAL_104:%.*]]) in ([[VAL_105:%.*]] = 
[[VAL_93]], [[VAL_106:%.*]] = [[VAL_95]], [[VAL_107:%.*]] = [[VAL_93]]) { +// CHECK: [[VAL_108:%.*]] = addi [[VAL_97]], [[VAL_84]] : index +// CHECK: loop.for [[VAL_109:%.*]] = [[VAL_85]] to [[VAL_87]] step [[VAL_92]] { +// CHECK: [[VAL_110:%.*]] = addi [[VAL_103]], [[VAL_90]] : index +// CHECK: loop.for [[VAL_111:%.*]] = [[VAL_90]] to [[VAL_92]] step [[VAL_91]] { +// CHECK: [[VAL_112:%.*]] = addi [[VAL_108]], [[VAL_110]] : index +// CHECK: [[VAL_113:%.*]] = addi [[VAL_109]], [[VAL_111]] : index +// CHECK: [[VAL_114:%.*]] = load [[VAL_88]]{{\[}}[[VAL_112]], [[VAL_113]]] : memref +// CHECK: store [[VAL_114]], [[VAL_89]]{{\[}}[[VAL_113]], [[VAL_112]]] : memref +// CHECK: } +// CHECK: } +// CHECK: gpu.terminator +// CHECK: } +// CHECK: return +// CHECK: } +