diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -32,6 +33,8 @@
 #include "llvm/ADT/Sequence.h"
 #include "llvm/Support/Debug.h"
 #include <optional>
+#include <iterator>
+
 #define DEBUG_TYPE "loops-to-gpu"
@@ -408,14 +411,15 @@
 static LogicalResult processParallelLoop(
     ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
     SmallVectorImpl<Operation *> &worklist,
-    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
+    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter,
+    DenseMap<Operation *, SmallVector<Value>> &loopReductionOperands) {
   // TODO: Verify that this is a valid GPU mapping.
   // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
   ArrayAttr mapping =
       parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

-  // TODO: Support reductions.
-  if (!mapping || parallelOp.getNumResults() != 0)
+  if (!mapping)
     return failure();

   Location loc = parallelOp.getLoc();
@@ -521,18 +525,68 @@
       }
       bounds[processor] = launchBound;
     }
-    if (!boundIsPrecise) {
-      // We are using an approximation, create a surrounding conditional.
-      Value originalBound = std::get<3>(config);
-      arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::slt, newIndex,
-          cloningMap.lookupOrDefault(originalBound));
-      scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
+
+    // Thread-level reductions require special handling because of the
+    // loop-bounds guard that we insert to ensure that threads outside
+    // of the loop iteration space have no effects. If a thread-mapped
+    // loop has reductions, we always construct an if-else pair (for
+    // simplicity), where the if branch yields all operands of the
+    // reduction blocks in the thread-level reduction loop, and the else
+    // branch yields the initial values of the loop. This makes the
+    // semantics of a thread-level reduction easy to explain: every
+    // thread contributes a value to the reduction, and threads outside
+    // of the iteration space contribute the initial value.
+    if (parallelOp.getNumReductions() > 0 &&
+        processor == gpu::Processor::ThreadX) {
+      assert(!loopReductionOperands.contains(parallelOp));
+
+      // If we can prove that the iteration space bounds are tight,
+      // generate a vacuously true check.
+      mlir::Value pred;
+      if (boundIsPrecise) {
+        pred = rewriter.create<arith::ConstantIntOp>(loc, /*value=*/1,
+                                                     /*width=*/1);
+      } else {
+        Value originalBound = std::get<3>(config);
+        pred = rewriter.create<arith::CmpIOp>(
+            loc, arith::CmpIPredicate::slt, newIndex,
+            cloningMap.lookupOrDefault(originalBound));
+      }
+      auto ifOp = rewriter.create<scf::IfOp>(
+          loc, parallelOp.getResults().getTypes(), pred,
+          /*withElseRegion=*/true);
+
+      // Map the loop reduction operands of this parallel loop to the
+      // results of the if.
+      for (auto result : ifOp.getResults())
+        loopReductionOperands[parallelOp].push_back(result);
+
+      // TODO (rohany): This feels pretty ugly to me.
+      // The way that this instruction copying is set up (in the driver
+      // function below) makes it hard to know, while copying an
+      // instruction, whether we should yield it from the if here.
+      // Instead, when we process the reduction operations later, we'll
+      // reach in and add the operands to this yield.
+      rewriter.setInsertionPointToEnd(ifOp.thenBlock());
+      rewriter.create<scf::YieldOp>(loc);
+
+      // Initialize the else block to just yield the initial values.
+      rewriter.setInsertionPointToStart(ifOp.elseBlock());
+      rewriter.create<scf::YieldOp>(loc, parallelOp.getInitVals());
+
       rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
-      // Put a sentinel into the worklist so we know when to pop out of the
-      // if body again. We use the launchOp here, as that cannot be part of
-      // the bodies instruction.
       worklist.push_back(launchOp.getOperation());
+    } else if (!boundIsPrecise) {
+      // We are using an approximation, create a surrounding conditional.
+      Value originalBound = std::get<3>(config);
+      arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::slt, newIndex,
+          cloningMap.lookupOrDefault(originalBound));
+      scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
+      rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      // Put a sentinel into the worklist so we know when to pop out of the
+      // if body again. We use the launchOp here, as that cannot be part of
+      // the body's instructions.
+      worklist.push_back(launchOp.getOperation());
+    }
     }
   } else {
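For a thread-mapped reduction loop with an imprecise bound, the guard built by the hunk above should look roughly like the following hand-written sketch (%tid, %upper, %body_val, and %init are placeholder names, not pass output; the then-branch yield starts out empty and is filled in when the scf.reduce is processed later):

    %in_bounds = arith.cmpi slt, %tid, %upper : index
    %red_operand = scf.if %in_bounds -> (f32) {
      // Cloned loop body; its reduction operand is appended to this yield
      // when the scf.reduce inside the loop is visited.
      scf.yield %body_val : f32
    } else {
      // Threads outside the iteration space contribute the initial value.
      scf.yield %init : f32
    }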
@@ -567,6 +621,32 @@
   return success();
 }

+// TODO (rohany): Avoid this duplication, but not sure where in MLIR
+// this sort of utility might go.
+// matchSimpleReduction is borrowed from the SCF->OpenMP conversion
+// pass. It checks whether the input block implements a simple reduction
+// of the input values, such as an add or max reduction.
+template <typename... OpTy>
+static bool matchSimpleReduction(Block &block) {
+  if (block.empty() || llvm::hasSingleElement(block) ||
+      std::next(block.begin(), 2) != block.end())
+    return false;
+
+  if (block.getNumArguments() != 2)
+    return false;
+
+  SmallVector<Operation *> combinerOps;
+  Value reducedVal = matchReduction({block.getArguments()[1]},
+                                    /*redPos=*/0, combinerOps);
+
+  if (!reducedVal || !isa<BlockArgument>(reducedVal) ||
+      combinerOps.size() != 1)
+    return false;
+
+  return isa<OpTy...>(combinerOps[0]) &&
+         isa<scf::ReduceReturnOp>(block.back()) &&
+         block.front().getOperands() == block.getArguments();
+}
+
 /// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
 /// operation.
 ///
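As a concrete illustration (not part of the patch), matchSimpleReduction<arith::AddFOp> is intended to match the two-operation combiner region that scf.parallel reductions carry, along these lines:

    scf.reduce(%operand) : f32 {
    ^bb0(%lhs: f32, %rhs: f32):
      %sum = arith.addf %lhs, %rhs : f32
      scf.reduce.return %sum : f32
    }

The checks above require exactly two operations in the block (the combiner and the scf.reduce.return), exactly two block arguments, and that the combiner consume exactly those arguments.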
@@ -599,6 +679,7 @@
 LogicalResult
 ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                              PatternRewriter &rewriter) const {
+  auto ctx = parallelOp.getOperation()->getContext();
   // Mark the operation as visited for recursive legality check.
   parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());
@@ -609,6 +690,71 @@
   // Create a launch operation. We start with bound one for all grid/block
   // sizes. Those will be refined later as we discover them from mappings.
   Location loc = parallelOp.getLoc();
+
+  // We introduce initial support for GPU reductions from SCF, allowing
+  // single-dimensional block and thread decompositions of reduction
+  // loops. If there is a reduction at the block level, we allocate a
+  // location for each reduction result, and block-level reductions
+  // perform atomics into that location.
+  llvm::DenseMap<Operation *, unsigned> loopReductionCounters;
+  llvm::SmallVector<gpu::AllocOp> reductionAllocs;
+  // loopReductionOperands maintains remapped operands for reduce
+  // operations, as the rewriting process for reductions does not just
+  // use a cloning map to maintain these.
+  llvm::DenseMap<Operation *, SmallVector<Value>> loopReductionOperands;
+  // TODO (rohany): Maintaining this is unfortunate.
+  // threadAllReduceOps maintains the AllReduceOp operations generated
+  // for each loop reduction, since we need to move these around later.
+  llvm::SmallVector<gpu::AllReduceOp> threadAllReduceOps;
+  if (parallelOp.getNumReductions() > 0) {
+    rewriter.setInsertionPoint(parallelOp);
+
+    // Ensure that the outer parallel loop we're lowering is mapped
+    // onto the BlockX parallel dimension.
+    ArrayAttr allVarsMapping =
+        parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
+    assert(allVarsMapping.size() == 1);
+    auto mapping = allVarsMapping[0];
+    auto annotation = dyn_cast<gpu::ParallelLoopDimMappingAttr>(mapping);
+    auto processor = annotation.getProcessor();
+    if (processor != gpu::Processor::BlockX)
+      return failure();
+
+    // Allocate a GPU buffer for each reduction.
+    for (unsigned i = 0; i < parallelOp.getNumReductions(); i++) {
+      auto resultType = parallelOp.getResults()[i].getType();
+      auto allocOp = rewriter.create<gpu::AllocOp>(
+          loc, MemRefType::get({}, resultType),
+          /*asyncToken=*/Type(),
+          /*asyncDependencies=*/ValueRange(),
+          /*dynamicSizes=*/ValueRange(),
+          /*symbolOperands=*/ValueRange());
+      reductionAllocs.push_back(allocOp);
+
+      // gpu.memset only supports 32-bit arguments, so initialize the
+      // buffer with a memref.alloca plus a store and a copy instead.
+      auto initVal = parallelOp.getInitVals()[i];
+      auto initValMemref = rewriter.create<memref::AllocaOp>(
+          loc, MemRefType::get({}, initVal.getType()));
+      rewriter.create<memref::StoreOp>(loc, initVal, initValMemref,
+                                       /*indices=*/ValueRange());
+      // TODO (rohany): How are the source / destination inferred?
+      rewriter.create<gpu::MemcpyOp>(loc, /*asyncToken=*/Type(),
+                                     /*asyncDependencies=*/ValueRange(),
+                                     /*dst=*/allocOp.getResults()[0],
+                                     /*src=*/initValMemref);
+    }
+  }
+
   Value constantOne =
       rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
   gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
@@ -622,7 +768,7 @@
   llvm::DenseMap<gpu::Processor, Value> launchBounds;
   SmallVector<Operation *, 16> worklist;
   if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
-                                 launchBounds, rewriter)))
+                                 launchBounds, rewriter, loopReductionOperands)))
     return failure();
@@ -645,7 +791,7 @@
       // Insert that now. This will also update the worklist with the loops
       // body.
       if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
-                                     worklist, launchBounds, rewriter)))
+                                     worklist, launchBounds, rewriter,
+                                     loopReductionOperands)))
         return failure();
@@ -654,6 +800,103 @@
       rewriter.setInsertionPointAfter(parent);
       leftNestingScope = true;
       seenSideeffects = false;
+    } else if (auto redop = dyn_cast<scf::ReduceOp>(op)) {
+      // Get the processor mapping of the parallel loop this reduction
+      // occurs within.
+      auto parentLoop = dyn_cast<ParallelOp>(redop->getParentOp());
+      ArrayAttr allVarsMapping =
+          parentLoop->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
+      assert(allVarsMapping.size() == 1);
+      auto mapping = allVarsMapping[0];
+      auto annotation = dyn_cast<gpu::ParallelLoopDimMappingAttr>(mapping);
+      gpu::Processor processor = annotation.getProcessor();
+
+      // Extract the body of the reduction; we'll use it to match the
+      // kind of reduction in the individual cases.
+      Block &reduction = redop.getRegion().front();
+
+      // The number of reductions we've already seen for each loop tells
+      // us which result of the loop we are reducing into.
+      auto index = loopReductionCounters[parentLoop]++;
+
+      // Block-level reductions perform atomic RMW operations into a
+      // preallocated buffer.
+      if (processor == gpu::Processor::BlockX) {
+        // Only the first thread of each block needs to contribute to
+        // the reduction.
+        auto threadId =
+            rewriter.create<gpu::ThreadIdOp>(loc, gpu::Dimension::x);
+        auto zeroIdx = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+        auto cond = rewriter.create<arith::CmpIOp>(
+            loc, arith::CmpIPredicate::eq, threadId, zeroIdx);
+        auto check =
+            rewriter.create<scf::IfOp>(loc, cond, /*withElseRegion=*/false);
+        {
+          OpBuilder::InsertionGuard guard(rewriter);
+          rewriter.setInsertionPointToStart(&check.getThenRegion().front());
+          arith::AtomicRMWKind rmwKind;
+          if (matchSimpleReduction<arith::AddFOp>(reduction)) {
+            rmwKind = arith::AtomicRMWKind::addf;
+          } else if (matchSimpleReduction<arith::MaxFOp>(reduction)) {
+            rmwKind = arith::AtomicRMWKind::maxf;
+          } else {
+            return failure();
+          }
+          // Do an atomic RMW into the memref allocation for block-level
+          // reductions.
+          rewriter.create<memref::AtomicRMWOp>(
+              loc, rmwKind, cloningMap.lookup(redop.getOperand()),
+              reductionAllocs[index].getOperation()->getResults()[0],
+              /*indices=*/ValueRange());
+        }
+      } else if (processor == gpu::Processor::ThreadX) {
+        // TODO (rohany): This feels ugly.
+        // When we see a ReduceOp inside a thread-level reduction, we
+        // need to take its operand and have the IfOp containing that
+        // operand yield it. This block adds the mapped operand of the
+        // ReduceOp to the yield of the parent if. Since we always
+        // construct an IfOp when lowering a thread-level reduction,
+        // there are no other cases to handle here.
+        auto operand = redop.getOperand();
+        auto mappedOperand = cloningMap.lookup(operand);
+        auto parent = mappedOperand.getDefiningOp()->getParentOp();
+        assert(isa<scf::IfOp>(parent));
+        auto parentIfOp = cast<scf::IfOp>(parent);
+        parentIfOp.thenYield().getResultsMutable().append(mappedOperand);
+        auto finalOperand = loopReductionOperands[parentLoop][index];
+
+        // Thread-level reductions simply use the GPU allreduce
+        // primitive, which gives each thread the value of the reduction
+        // across the entire block.
+        gpu::AllReduceOperation allReduceOperation;
+        if (matchSimpleReduction<arith::AddFOp>(reduction)) {
+          allReduceOperation = gpu::AllReduceOperation::ADD;
+        } else if (matchSimpleReduction<arith::MaxFOp>(reduction)) {
+          allReduceOperation = gpu::AllReduceOperation::MAX;
+        } else {
+          return failure();
+        }
+        {
+          // We must move the rewriter past the IfOp in order to use its
+          // results. Without this move, we would generate the
+          // AllReduceOp inside the then branch of the if.
+          OpBuilder::InsertionGuard guard(rewriter);
+          rewriter.setInsertionPointAfter(parent);
+          // We know that the allreduce operation is uniform because we
+          // explicitly hoist the reduces outside of the thread-level
+          // iteration space bounds check.
+          auto allReduce = rewriter.create<gpu::AllReduceOp>(
+              loc, finalOperand,
+              gpu::AllReduceOperationAttr::get(ctx, allReduceOperation),
+              /*uniform=*/true);
+          threadAllReduceOps.push_back(allReduce);
+          cloningMap.map(parentLoop.getResults()[index], allReduce);
+        }
+      } else {
+        return failure();
+      }
     } else {
       // Otherwise we copy it over.
       Operation *clone = rewriter.clone(*op, cloningMap);
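For orientation, the block-level branch above should emit IR along these lines, where %partial stands for the cloned reduction operand and %buf for the gpu.alloc'ed zero-dimensional staging buffer (a hand-written sketch, not pass output):

    %tid = gpu.thread_id x
    %c0 = arith.constant 0 : index
    %is_first = arith.cmpi eq, %tid, %c0 : index
    scf.if %is_first {
      %old = memref.atomic_rmw addf %partial, %buf[] : (f32, memref<f32>) -> f32
    }

The thread-level branch instead feeds the guard's result into a uniform gpu.all_reduce, roughly %sum = gpu.all_reduce add %red_operand uniform {} : (f32) -> (f32).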
@@ -674,6 +917,56 @@
     launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                         std::get<1>(bound));

+  // After handling all of the instructions inside the mapped parallel
+  // loops, we need to extract the reduction value from each GPU buffer.
+  // For each buffer, we allocate a corresponding buffer on the CPU and
+  // issue a copy.
+  if (parallelOp.getNumReductions() > 0) {
+    rewriter.setInsertionPointAfter(launchOp);
+    for (unsigned i = 0; i < parallelOp.getNumReductions(); i++) {
+      auto result = parallelOp.getResults()[i];
+      auto localReducVal = rewriter.create<memref::AllocaOp>(
+          loc, MemRefType::get({}, result.getType()));
+      rewriter.create<gpu::MemcpyOp>(
+          loc, /*asyncToken=*/Type(),
+          /*asyncDependencies=*/ValueRange(),
+          /*dst=*/localReducVal,
+          /*src=*/reductionAllocs[i].getOperation()->getResults()[0]);
+      auto load = rewriter.create<memref::LoadOp>(loc, localReducVal,
+                                                  /*indices=*/ValueRange());
+      result.replaceAllUsesWith(load);
+    }
+
+    // TODO (rohany): This feels hacky.
+    // The final step is to move the AllReduceOps produced by
+    // thread-level reductions closer to the definitions of their
+    // operands. The way that scoping affects the rewriter position
+    // causes the operations after the thread-level reduction to be
+    // inserted before the AllReduceOps. In particular, when we see a
+    // ReduceOp inside a thread-level reduction, we add an AllReduceOp
+    // after the IfOp guard generated for the thread-level loop. But
+    // then, when we pop out of the nesting scope, we set the rewriter
+    // position to be after the IfOp, causing future operations to be
+    // inserted before the AllReduceOps, which is problematic when data
+    // flows from the AllReduceOp to those instructions. This fixup
+    // phase attempts to move the AllReduceOps as close as possible to
+    // their definitions, but is not a guaranteed fix.
+    for (auto redop : threadAllReduceOps) {
+      auto operand = redop.getValue();
+      auto definingOp = operand.getDefiningOp();
+      assert(isa<scf::IfOp>(definingOp));
+      redop->moveAfter(definingOp);
+    }
+  }
+
   rewriter.eraseOp(parallelOp);
   return success();
 }
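For reference, the shape of input this patch targets is a mapped scf.parallel with an init value and an scf.reduce region, which the old getNumResults() != 0 bail-out rejected (an illustrative sketch with placeholder names throughout):

    %zero = arith.constant 0.0 : f32
    %sum = scf.parallel (%i) = (%c0) to (%n) step (%c1) init (%zero) -> f32 {
      %v = memref.load %data[%i] : memref<?xf32>
      scf.reduce(%v) : f32 {
      ^bb0(%lhs: f32, %rhs: f32):
        %r = arith.addf %lhs, %rhs : f32
        scf.reduce.return %r : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}

After the rewrite, the loop body lands inside a gpu.launch, the scf.reduce becomes either the atomic update into the staging buffer (block mapping) or a uniform gpu.all_reduce (thread mapping), and each loop result is replaced by a host-side load from the copied-back buffer.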