diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -180,9 +180,9 @@
 // SCFToGPU
 //===----------------------------------------------------------------------===//
 
-def ConvertSimpleSCFToGPU : FunctionPass<"convert-scf-to-gpu"> {
-  let summary = "Convert top-level loops to GPU kernels";
-  let constructor = "mlir::createSimpleSCFToGPUPass()";
+def ConvertAffineForToGPU : FunctionPass<"convert-affine-for-to-gpu"> {
+  let summary = "Convert top-level AffineFor Ops to GPU kernels";
+  let constructor = "mlir::createAffineForToGPUPass()";
   let options = [
     Option<"numBlockDims", "gpu-block-dims", "unsigned", /*default=*/"1u",
            "Number of GPU block dimensions for mapping">,
@@ -191,19 +191,6 @@
   ];
 }
 
-def ConvertSCFToGPU : FunctionPass<"convert-loop-op-to-gpu"> {
-  let summary = "Convert top-level scf::ForOp to GPU kernels";
-  let constructor = "mlir::createLoopToGPUPass()";
-  let options = [
-    ListOption<"numWorkGroups", "gpu-num-workgroups", "int64_t",
-               "Num workgroups in the GPU launch",
-               "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
-    ListOption<"workGroupSize", "gpu-workgroup-size", "int64_t",
-               "Workgroup Size in the GPU launch",
-               "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">
-  ];
-}
-
 def ConvertParallelLoopToGpu : Pass<"convert-parallel-loops-to-gpu"> {
   let summary = "Convert mapped scf.parallel ops to gpu launch operations";
   let constructor = "mlir::createParallelLoopToGpuPass()";
diff --git a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
--- a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
+++ b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
@@ -31,49 +31,14 @@
 /// parallelization is performed, it is under the responsibility of the caller
 /// to strip-mine the loops and to perform the dependence analysis before
 /// calling the conversion.
+
+// TODO: Consider removing this in favor of affine.for -> affine.parallel
+// detection followed by an affine.parallel -> scf.parallel -> gpu.launch
+// conversion
 LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                unsigned numBlockDims,
                                                unsigned numThreadDims);
 
-/// Convert a perfect linalg loop nest with the outermost loop identified by
-/// `forOp` into a gpu::Launch operation. Map `numBlockDims` outer loops to
-/// GPU blocks and `numThreadDims` to GPU threads. The bounds of the loops that
-/// are mapped should be independent of the induction variables of the other
-/// mapped loops.
-///
-/// No check on the size of the block or grid, or on the validity of
-/// parallelization is performed, it is under the responsibility of the caller
-/// to strip-mine the loops and to perform the dependence analysis before
-/// calling the conversion.
-LogicalResult convertLoopNestToGPULaunch(scf::ForOp forOp,
-                                         unsigned numBlockDims,
-                                         unsigned numThreadDims);
-
-/// Convert a loop operation into a GPU launch with the values provided in
-/// `numWorkGroups` as the grid size and the values provided in `workGroupSizes`
-/// as the block size. Size of `numWorkGroups` and workGroupSizes` must be less
-/// than or equal to 3. The loop operation can be an imperfectly nested
-/// computation with the following restrictions:
-/// 1) The loop nest must contain as many perfectly nested loops as the number
-/// of values passed in through `numWorkGroups`. This corresponds to the number
-/// of grid dimensions of the launch. All loops within the loop nest must be
-/// parallel.
-/// 2) The body of the innermost loop of the above perfectly nested loops, must
-/// contain statements that satisfy one of the two conditions below:
-/// a) A perfect loop nest of depth greater than or equal to the number of
-/// values passed in through `workGroupSizes`, i.e. the number of thread
-/// dimensions of the launch. Loops at depth less than or equal to size of
-/// `workGroupSizes` must be parallel. Loops nested deeper can be sequential
-/// and are retained as such in the generated GPU launch code.
-/// b) Statements that are safe to be executed by all threads within the
-/// workgroup. No checks are performed that this is indeed the case.
-/// TODO(ravishankarm) : Add checks that verify 2(b) above.
-/// The above conditions are assumed to be satisfied by the computation rooted
-/// at `forOp`.
-LogicalResult convertLoopToGPULaunch(scf::ForOp forOp,
-                                     ArrayRef<Value> numWorkGroups,
-                                     ArrayRef<Value> workGroupSizes);
-
 /// Adds the conversion pattern from `scf.parallel` to `gpu.launch` to the
 /// provided pattern list.
 void populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
diff --git a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPUPass.h b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPUPass.h
--- a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPUPass.h
+++ b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPUPass.h
@@ -19,27 +19,16 @@
 class Pass;
 
 /// Create a pass that converts loop nests into GPU kernels. It considers
-/// top-level affine.for and linalg.for operations as roots of loop nests and
-/// converts them to the gpu.launch operations if possible.
+/// top-level affine.for operations as roots of loop nests and converts them to
+/// the gpu.launch operations if possible.
 ///
 /// No check on the size of the block or grid, or on the validity of
 /// parallelization is performed, it is under the responsibility of the caller
 /// to strip-mine the loops and to perform the dependence analysis before
 /// calling the conversion.
 std::unique_ptr<OperationPass<FuncOp>>
-createSimpleSCFToGPUPass(unsigned numBlockDims, unsigned numThreadDims);
-std::unique_ptr<OperationPass<FuncOp>> createSimpleSCFToGPUPass();
-
-/// Create a pass that converts every loop operation within the body of the
-/// FuncOp into a GPU launch. The number of workgroups and workgroup size for
-/// the implementation is controlled by SSA values passed into conversion
-/// method. For testing, the values are set as constants obtained from a command
-/// line flag. See convertLoopToGPULaunch for a description of the required
-/// semantics of the converted loop operation.
-std::unique_ptr<OperationPass<FuncOp>>
-createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
-                    ArrayRef<int64_t> workGroupSize);
-std::unique_ptr<OperationPass<FuncOp>> createLoopToGPUPass();
+createAffineForToGPUPass(unsigned numBlockDims, unsigned numThreadDims);
+std::unique_ptr<OperationPass<FuncOp>> createAffineForToGPUPass();
 
 /// Creates a pass that converts scf.parallel operations into a gpu.launch
 /// operation. The mapping of loop dimensions to launch dimensions is derived
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -36,8 +36,6 @@
 using namespace mlir;
 using namespace mlir::scf;
 
-using llvm::seq;
-
 // Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) { switch (pos) { @@ -57,44 +55,29 @@ static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) { return forOp.getLowerBoundOperands(); } -static SmallVector getLowerBoundOperands(ForOp forOp) { - SmallVector bounds(1, forOp.lowerBound()); - return bounds; -} // Get the upper bound-related operands of a loop operation. static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) { return forOp.getUpperBoundOperands(); } -static SmallVector getUpperBoundOperands(ForOp forOp) { - SmallVector bounds(1, forOp.upperBound()); - return bounds; -} // Get a Value that corresponds to the loop step. If the step is an attribute, // materialize a corresponding constant using builder. static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) { return builder.create(forOp.getLoc(), forOp.getStep()); } -static Value getOrCreateStep(ForOp forOp, OpBuilder &) { return forOp.step(); } // Get a Value for the loop lower bound. If the value requires computation, // materialize the instructions using builder. static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) { return lowerAffineLowerBound(forOp, builder); } -static Value getOrEmitLowerBound(ForOp forOp, OpBuilder &) { - return forOp.lowerBound(); -} // Get a Value for the loop upper bound. If the value requires computation, // materialize the instructions using builder. static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) { return lowerAffineUpperBound(forOp, builder); } -static Value getOrEmitUpperBound(ForOp forOp, OpBuilder &) { - return forOp.upperBound(); -} // Check the structure of the loop nest: // - there are enough loops to map to numDims; @@ -102,8 +85,8 @@ // - the loop bounds can be computed above the outermost loop. // This roughly corresponds to the "matcher" part of the pattern-based // rewriting infrastructure. 
-template -static LogicalResult checkLoopNestMappableImpl(OpTy forOp, unsigned numDims) { +static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp, + unsigned numDims) { Region &limit = forOp.region(); for (unsigned i = 0, e = numDims; i < e; ++i) { Operation *nested = &forOp.getBody()->front(); @@ -122,32 +105,15 @@ if (forOp.getBody()->empty() || std::next(begin, 2) != end) return forOp.emitError("expected perfectly nested loops in the body"); - if (!(forOp = dyn_cast(nested))) + if (!(forOp = dyn_cast(nested))) return nested->emitError("expected a nested loop"); } return success(); } -template -static LogicalResult checkLoopNestMappable(OpTy forOp, unsigned numBlockDims, - unsigned numThreadDims) { - if (numBlockDims < 1 || numThreadDims < 1) { - LLVM_DEBUG(llvm::dbgs() << "nothing to map"); - return success(); - } - - if (numBlockDims > 3) { - return forOp.emitError("cannot map to more than 3 block dimensions"); - } - if (numThreadDims > 3) { - return forOp.emitError("cannot map to more than 3 thread dimensions"); - } - return checkLoopNestMappableImpl(forOp, numBlockDims + numThreadDims); -} - -template -static LogicalResult checkLoopOpMappable(OpTy forOp, unsigned numBlockDims, - unsigned numThreadDims) { +static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp, + unsigned numBlockDims, + unsigned numThreadDims) { if (numBlockDims < 1 || numThreadDims < 1) { LLVM_DEBUG(llvm::dbgs() << "nothing to map"); return success(); @@ -159,52 +125,17 @@ if (numThreadDims > 3) { return forOp.emitError("cannot map to more than 3 thread dimensions"); } - if (numBlockDims != numThreadDims) { - // TODO(ravishankarm) : This can probably be relaxed by having a one-trip - // loop for the missing dimension, but there is not reason to handle this - // case for now. - return forOp.emitError( - "mismatch in block dimensions and thread dimensions"); - } - - // Check that the forOp contains perfectly nested loops for numBlockDims - if (failed(checkLoopNestMappableImpl(forOp, numBlockDims))) { - return failure(); - } - - // Get to the innermost loop. - for (auto i : seq(0, numBlockDims - 1)) { - forOp = cast(&forOp.getBody()->front()); - (void)i; - } - - // The forOp now points to the body of the innermost loop mapped to blocks. - for (Operation &op : *forOp.getBody()) { - // If the operation is a loop, check that it is mappable to workItems. - if (auto innerLoop = dyn_cast(&op)) { - if (failed(checkLoopNestMappableImpl(innerLoop, numThreadDims))) { - return failure(); - } - continue; - } - // TODO(ravishankarm) : If it is not a loop op, it is assumed that the - // statement is executed by all threads. It might be a collective operation, - // or some non-side effect instruction. Have to decide on "allowable" - // statements and check for those here. - } - return success(); + return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims); } namespace { // Helper structure that holds common state of the loop to GPU kernel // conversion. -struct LoopToGpuConverter { - template - Optional collectBounds(OpTy forOp, unsigned numLoops); +struct AffineLoopToGpuConverter { + Optional collectBounds(AffineForOp forOp, unsigned numLoops); - template - void createLaunch(OpTy rootForOp, OpTy innermostForOp, unsigned numBlockDims, - unsigned numThreadDims); + void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp, + unsigned numBlockDims, unsigned numThreadDims); // Ranges of the loops mapped to blocks or threads. 
SmallVector dims; @@ -229,15 +160,14 @@ // This may fail if the IR for computing loop bounds cannot be constructed, for // example if an affine loop uses semi-affine maps. Return the last loop to be // mapped on success, llvm::None on failure. -template -Optional LoopToGpuConverter::collectBounds(OpTy forOp, - unsigned numLoops) { +Optional +AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) { OpBuilder builder(forOp.getOperation()); dims.reserve(numLoops); lbs.reserve(numLoops); ivs.reserve(numLoops); steps.reserve(numLoops); - OpTy currentLoop = forOp; + AffineForOp currentLoop = forOp; for (unsigned i = 0; i < numLoops; ++i) { Value lowerBound = getOrEmitLowerBound(currentLoop, builder); Value upperBound = getOrEmitUpperBound(currentLoop, builder); @@ -257,133 +187,19 @@ steps.push_back(step); if (i != numLoops - 1) - currentLoop = cast(¤tLoop.getBody()->front()); + currentLoop = cast(¤tLoop.getBody()->front()); } return currentLoop; } -/// Given `nDims` perfectly nested loops rooted as `rootForOp`, convert them o -/// be partitioned across workgroups or workitems. The values for the -/// workgroup/workitem id along each dimension is passed in with `ids`. The -/// number of workgroups/workitems along each dimension are passed in with -/// `nids`. The innermost loop is mapped to the x-dimension, followed by the -/// next innermost loop to y-dimension, followed by z-dimension. -template -static OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef ids, - ArrayRef nids) { - auto nDims = ids.size(); - assert(nDims == nids.size()); - for (auto dim : llvm::seq(0, nDims)) { - // TODO(ravishankarm): Don't always need to generate a loop here. If nids >= - // number of iterations of the original loop, this becomes a if - // condition. Though that does rely on how the workgroup/workitem sizes are - // specified to begin with. - mapLoopToProcessorIds(rootForOp, ids[dim], nids[dim]); - if (dim != nDims - 1) { - rootForOp = cast(rootForOp.getBody()->front()); - } - } - return rootForOp; -} - -/// Utility method to convert the gpu::KernelDim3 object for representing id of -/// each workgroup/workitem and number of workgroup/workitems along a dimension -/// of the launch into a container. -static void packIdAndNumId(gpu::KernelDim3 kernelIds, - gpu::KernelDim3 kernelNids, unsigned nDims, - SmallVectorImpl &ids, - SmallVectorImpl &nids) { - assert(nDims <= 3 && "invalid number of launch dimensions"); - std::array allIds = {kernelIds.z, kernelIds.y, kernelIds.x}; - std::array allNids = {kernelNids.z, kernelNids.y, kernelNids.x}; - ids.clear(); - ids.append(std::next(allIds.begin(), allIds.size() - nDims), allIds.end()); - nids.clear(); - nids.append(std::next(allNids.begin(), allNids.size() - nDims), - allNids.end()); -} - -/// Generate the body of the launch operation. -template -static LogicalResult -createLaunchBody(OpBuilder &builder, OpTy rootForOp, gpu::LaunchOp launchOp, - unsigned numBlockDims, unsigned numThreadDims) { - OpBuilder::InsertionGuard bodyInsertionGuard(builder); - builder.setInsertionPointToEnd(&launchOp.body().front()); - auto terminatorOp = builder.create(launchOp.getLoc()); - - rootForOp.getOperation()->moveBefore(terminatorOp); - SmallVector workgroupID, numWorkGroups; - packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims, - workgroupID, numWorkGroups); - - // Partition the loop for mapping to workgroups. 
- auto loopOp = createGPULaunchLoops(rootForOp, workgroupID, numWorkGroups); - - // Iterate over the body of the loopOp and get the loops to partition for - // thread blocks. - SmallVector threadRootForOps; - for (Operation &op : *loopOp.getBody()) { - if (auto threadRootForOp = dyn_cast(&op)) { - threadRootForOps.push_back(threadRootForOp); - } - } - - SmallVector workItemID, workGroupSize; - packIdAndNumId(launchOp.getThreadIds(), launchOp.getBlockSize(), - numThreadDims, workItemID, workGroupSize); - for (auto &loopOp : threadRootForOps) { - builder.setInsertionPoint(loopOp); - createGPULaunchLoops(loopOp, workItemID, workGroupSize); - } - return success(); -} - -// Convert the computation rooted at the `rootForOp`, into a GPU kernel with the -// given workgroup size and number of workgroups. -template -static LogicalResult createLaunchFromOp(OpTy rootForOp, - ArrayRef numWorkGroups, - ArrayRef workGroupSizes) { - OpBuilder builder(rootForOp.getOperation()); - if (numWorkGroups.size() > 3) { - return rootForOp.emitError("invalid ") - << numWorkGroups.size() << "-D workgroup specification"; - } - auto loc = rootForOp.getLoc(); - Value one = builder.create( - loc, builder.getIntegerAttr(builder.getIndexType(), 1)); - SmallVector numWorkGroups3D(3, one), workGroupSize3D(3, one); - for (auto numWorkGroup : enumerate(numWorkGroups)) { - numWorkGroups3D[numWorkGroup.index()] = numWorkGroup.value(); - } - for (auto workGroupSize : enumerate(workGroupSizes)) { - workGroupSize3D[workGroupSize.index()] = workGroupSize.value(); - } - - auto launchOp = builder.create( - rootForOp.getLoc(), numWorkGroups3D[0], numWorkGroups3D[1], - numWorkGroups3D[2], workGroupSize3D[0], workGroupSize3D[1], - workGroupSize3D[2]); - if (failed(createLaunchBody(builder, rootForOp, launchOp, - numWorkGroups.size(), workGroupSizes.size()))) { - return failure(); - } - - return success(); -} - // Replace the rooted at "rootForOp" with a GPU launch operation. This expects // "innermostForOp" to point to the last loop to be transformed to the kernel, // and to have (numBlockDims + numThreadDims) perfectly nested loops between // "rootForOp" and "innermostForOp". -// TODO(ravishankarm) : This method can be modified to use the -// createLaunchFromOp method, since that is a strict generalization of this -// method. -template -void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp, - unsigned numBlockDims, - unsigned numThreadDims) { +void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp, + AffineForOp innermostForOp, + unsigned numBlockDims, + unsigned numThreadDims) { OpBuilder builder(rootForOp.getOperation()); // Prepare the grid and block sizes for the launch operation. If there is // no loop mapped to a specific dimension, use constant "1" as its size. @@ -444,14 +260,13 @@ } // Generic loop to GPU kernel conversion function. 
-template <typename OpTy>
-static LogicalResult convertLoopNestToGPULaunch(OpTy forOp,
-                                                unsigned numBlockDims,
-                                                unsigned numThreadDims) {
-  if (failed(checkLoopNestMappable(forOp, numBlockDims, numThreadDims)))
+static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
+                                                       unsigned numBlockDims,
+                                                       unsigned numThreadDims) {
+  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
     return failure();
 
-  LoopToGpuConverter converter;
+  AffineLoopToGpuConverter converter;
   auto maybeInnerLoop =
       converter.collectBounds(forOp, numBlockDims + numThreadDims);
   if (!maybeInnerLoop)
@@ -461,35 +276,10 @@
   return success();
 }
 
-// Generic loop to GPU kernel conversion function when loop is imperfectly
-// nested. The workgroup size and num workgroups is provided as input
-template <typename OpTy>
-static LogicalResult convertLoopToGPULaunch(OpTy forOp,
-                                            ArrayRef<Value> numWorkGroups,
-                                            ArrayRef<Value> workGroupSize) {
-  if (failed(checkLoopOpMappable(forOp, numWorkGroups.size(),
-                                 workGroupSize.size()))) {
-    return failure();
-  }
-  return createLaunchFromOp(forOp, numWorkGroups, workGroupSize);
-}
-
 LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
-  return ::convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
-}
-
-LogicalResult mlir::convertLoopNestToGPULaunch(ForOp forOp,
-                                               unsigned numBlockDims,
-                                               unsigned numThreadDims) {
-  return ::convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
-}
-
-LogicalResult mlir::convertLoopToGPULaunch(scf::ForOp forOp,
-                                           ArrayRef<Value> numWorkGroups,
-                                           ArrayRef<Value> workGroupSizes) {
-  return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
+  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
 }
 
 namespace {
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
@@ -18,7 +18,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/CommandLine.h"
 
-#define PASS_NAME "convert-scf-to-gpu"
+#define PASS_NAME "convert-affine-for-to-gpu"
 #define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"
 
 using namespace mlir;
@@ -28,7 +28,7 @@
 // A pass that traverses top-level loops in the function and converts them to
 // GPU launch operations. Nested launches are not allowed, so this does not
 // walk the function recursively to avoid considering nested loops.
-struct ForLoopMapper : public ConvertSimpleSCFToGPUBase<ForLoopMapper> {
+struct ForLoopMapper : public ConvertAffineForToGPUBase<ForLoopMapper> {
   ForLoopMapper() = default;
   ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims) {
     this->numBlockDims = numBlockDims;
@@ -41,49 +41,6 @@
         if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                                     numThreadDims)))
           signalPassFailure();
-      } else if (auto forOp = dyn_cast<ForOp>(&op)) {
-        if (failed(
-                convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims)))
-          signalPassFailure();
-      }
-    }
-  }
-};
-
-// A pass that traverses top-level loops in the function and convertes them to
-// GPU launch operations. The top-level loops itself does not have to be
-// perfectly nested. The only requirement is that there be as many perfectly
-// nested loops as the size of `numWorkGroups`. Within these any loop nest has
-// to be perfectly nested upto depth equal to size of `workGroupSize`.
-struct ImperfectlyNestedForLoopMapper - : public ConvertSCFToGPUBase { - ImperfectlyNestedForLoopMapper() = default; - ImperfectlyNestedForLoopMapper(ArrayRef numWorkGroups, - ArrayRef workGroupSize) { - this->numWorkGroups = numWorkGroups; - this->workGroupSize = workGroupSize; - } - - void runOnFunction() override { - // Insert the num work groups and workgroup sizes as constant values. This - // pass is only used for testing. - FuncOp funcOp = getFunction(); - OpBuilder builder(funcOp.getOperation()->getRegion(0)); - SmallVector numWorkGroupsVal, workGroupSizeVal; - for (auto val : numWorkGroups) { - auto constOp = builder.create( - funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val)); - numWorkGroupsVal.push_back(constOp); - } - for (auto val : workGroupSize) { - auto constOp = builder.create( - funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val)); - workGroupSizeVal.push_back(constOp); - } - for (ForOp forOp : llvm::make_early_inc_range(funcOp.getOps())) { - if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal, - workGroupSizeVal))) { - return signalPassFailure(); } } } @@ -108,23 +65,13 @@ } // namespace std::unique_ptr> -mlir::createSimpleSCFToGPUPass(unsigned numBlockDims, unsigned numThreadDims) { +mlir::createAffineForToGPUPass(unsigned numBlockDims, unsigned numThreadDims) { return std::make_unique(numBlockDims, numThreadDims); } -std::unique_ptr> mlir::createSimpleSCFToGPUPass() { +std::unique_ptr> mlir::createAffineForToGPUPass() { return std::make_unique(); } -std::unique_ptr> -mlir::createLoopToGPUPass(ArrayRef numWorkGroups, - ArrayRef workGroupSize) { - return std::make_unique(numWorkGroups, - workGroupSize); -} -std::unique_ptr> mlir::createLoopToGPUPass() { - return std::make_unique(); -} - std::unique_ptr mlir::createParallelLoopToGpuPass() { return std::make_unique(); } diff --git a/mlir/test/Conversion/SCFToGPU/imperfect_2D.mlir b/mlir/test/Conversion/SCFToGPU/imperfect_2D.mlir deleted file mode 100644 --- a/mlir/test/Conversion/SCFToGPU/imperfect_2D.mlir +++ /dev/null @@ -1,83 +0,0 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=2,2 gpu-workgroup-size=32,4" %s | FileCheck %s - -module { - // arg2 = arg0 * transpose(arg1) ; with intermediate buffer and tile size passed as argument - // CHECK: func {{@.*}}([[ARG0:%.*]]: memref, [[ARG1:%.*]]: memref, [[ARG2:%.*]]: memref, [[ARG3:%.*]]: index, [[ARG4:%.*]]: index) - func @foo(%arg0: memref, %arg1 : memref, %arg2 : memref, %arg3 : index, %arg4 : index) { - %0 = dim %arg0, 0 : memref - %1 = dim %arg0, 1 : memref - %c0 = constant 0 : index - %c1 = constant 1 : index - // CHECK: gpu.launch blocks([[ARG5:%.*]], [[ARG6:%.*]], [[ARG7:%.*]]) in ([[ARG11:%.*]] = {{%.*}}, [[ARG12:%.*]] = {{%.*}}, [[ARG13:%.*]] = {{%.*}}) threads([[ARG8:%.*]], [[ARG9:%.*]], [[ARG10:%.*]]) in ([[ARG14:%.*]] = {{%.*}}, [[ARG15:%.*]] = {{%.*}}, [[ARG16:%.*]] = {{%.*}}) - // CHECK: [[TEMP1:%.*]] = muli [[ARG3]], [[ARG6]] : index - // CHECK: [[BLOCKLOOPYLB:%.*]] = addi {{%.*}}, [[TEMP1]] : index - // CHECK: [[BLOCKLOOPYSTEP:%.*]] = muli [[ARG3]], [[ARG12]] : index - // CHECK: scf.for [[BLOCKLOOPYIV:%.*]] = [[BLOCKLOOPYLB]] to {{%.*}} step [[BLOCKLOOPYSTEP]] - scf.for %iv1 = %c0 to %0 step %arg3 { - - // CHECK: [[TEMP2:%.*]] = muli [[ARG4]], [[ARG5]] : index - // CHECK: [[BLOCKLOOPXLB:%.*]] = addi {{%.*}}, [[TEMP2]] : index - // CHECK: [[BLOCKLOOPXSTEP:%.*]] = muli [[ARG4]], [[ARG11]] : index - // CHECK: scf.for [[BLOCKLOOPXIV:%.*]] = [[BLOCKLOOPXLB]] to {{%.*}} step [[BLOCKLOOPXSTEP]] - - scf.for 
%iv2 = %c0 to %1 step %arg4 { - - // TODO: This is effectively shared memory. Lower it to a - // shared memory. - %2 = alloc(%arg3, %arg4) : memref - - // Load transpose tile - // CHECK: [[TEMP3:%.*]] = muli [[ARG20:%.*]], [[ARG9:%.*]] : index - // CHECK: [[THREADLOOP1YLB:%.*]] = addi {{%.*}}, [[TEMP3]] : index - // CHECK: [[THREADLOOP1YSTEP:%.*]] = muli [[ARG20]], [[ARG15]] : index - // CHECK: scf.for [[THREADLOOP1YIV:%.*]] = [[THREADLOOP1YLB]] to {{%.*}} step [[THREADLOOP1YSTEP]] - scf.for %iv3 = %c0 to %arg3 step %c1 { - // CHECK: [[TEMP4:%.*]] = muli [[ARG20]], [[ARG8]] : index - // CHECK: [[THREADLOOP1XLB:%.*]] = addi {{%.*}}, [[TEMP4]] : index - // CHECK: [[THREADLOOP1XSTEP:%.*]] = muli [[ARG20]], [[ARG14]] : index - // CHECK: scf.for [[THREADLOOP1XIV:%.*]] = [[THREADLOOP1XLB]] to {{%.*}} step [[THREADLOOP1XSTEP]] - scf.for %iv4 = %c1 to %arg4 step %c1 { - // CHECK: [[INDEX2:%.*]] = addi [[BLOCKLOOPYIV]], [[THREADLOOP1YIV]] : index - %10 = addi %iv1, %iv3 : index - // CHECK: [[INDEX1:%.*]] = addi [[BLOCKLOOPXIV]], [[THREADLOOP1XIV]] : index - %11 = addi %iv2, %iv4 : index - // CHECK: [[VAL1:%.*]] = load [[ARG1]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}} : memref - %12 = load %arg1[%11, %10] : memref - // CHECK: store [[VAL1]], [[SCRATCHSPACE:%.*]]{{\[}}[[THREADLOOP1XIV]], [[THREADLOOP1YIV]]{{\]}} : memref - store %12, %2[%iv4, %iv3] : memref - } - } - - // TODO: There needs to be a sync here for correctness, but - // testing only loop partitioning for now. - - // CHECK: [[TEMP5:%.*]] = muli [[ARG20]], [[ARG9]] : index - // CHECK: [[THREADLOOP2YLB:%.*]] = addi {{%.*}}, [[TEMP5]] : index - // CHECK: [[THREADLOOP2YSTEP:%.*]] = muli [[ARG20]], [[ARG15]] : index - // CHECK: scf.for [[THREADLOOP2YIV:%.*]] = [[THREADLOOP2YLB]] to {{%.*}} step [[THREADLOOP2YSTEP]] - scf.for %iv3 = %c0 to %arg3 step %c1 { - // CHECK: [[TEMP6:%.*]] = muli [[ARG20]], [[ARG8]] : index - // CHECK: [[THREADLOOP2XLB:%.*]] = addi {{%.*}}, [[TEMP6]] : index - // CHECK: [[THREADLOOP2XSTEP:%.*]] = muli [[ARG20]], [[ARG14]] : index - // CHECK: scf.for [[THREADLOOP2XIV:%.*]] = [[THREADLOOP2XLB]] to {{%.*}} step [[THREADLOOP2XSTEP]] - scf.for %iv4 = %c1 to %arg4 step %c1 { - // CHECK: [[INDEX3:%.*]] = addi [[BLOCKLOOPYIV]], [[THREADLOOP2YIV]] : index - %13 = addi %iv1, %iv3 : index - // CHECK: [[INDEX4:%.*]] = addi [[BLOCKLOOPXIV]], [[THREADLOOP2XIV]] : index - %14 = addi %iv2, %iv4 : index - // CHECK: {{%.*}} = load [[SCRATCHSPACE]]{{\[}}[[THREADLOOP2XIV]], [[THREADLOOP2YIV]]{{\]}} : memref - %15 = load %2[%iv4, %iv3] : memref - // CHECK: {{%.*}} = load [[ARG0]]{{\[}}[[INDEX3]], [[INDEX4]]{{\]}} - %16 = load %arg0[%13, %14] : memref - %17 = mulf %15, %16 : f32 - // CHECK: store {{%.*}}, [[ARG2]]{{\[}}[[INDEX3]], [[INDEX4]]{{\]}} - store %17, %arg2[%13, %14] : memref - } - } - - dealloc %2 : memref - } - } - return - } -} diff --git a/mlir/test/Conversion/SCFToGPU/imperfect_3D.mlir b/mlir/test/Conversion/SCFToGPU/imperfect_3D.mlir deleted file mode 100644 --- a/mlir/test/Conversion/SCFToGPU/imperfect_3D.mlir +++ /dev/null @@ -1,83 +0,0 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=4,2,2 gpu-workgroup-size=32,2,2" %s | FileCheck %s - -module { - func @imperfect_3D(%arg0 : memref, %arg1 : memref, %arg2 : memref, %arg3 : memref, %t1 : index, %t2 : index, %t3 : index, %step1 : index, %step2 : index, %step3 : index) { - %0 = dim %arg0, 0 : memref - %1 = dim %arg0, 1 : memref - %2 = dim %arg0, 2 : memref - %c0 = constant 0 : index - // CHECK: gpu.launch - // CHECK: scf.for {{.*}} { - // CHECK: scf.for {{.*}} { - // 
CHECK: scf.for {{.*}} { - // CHECK: alloc - // CHECK: scf.for {{.*}} { - // CHECK: scf.for {{.*}} { - // CHECK: scf.for {{.*}} { - // CHECK: load - // CHECK: load - // CHECK: addf - // CHECK: store - // CHECK: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK: scf.for {{.*}} { - // CHECK: scf.for {{.*}} { - // CHECK: scf.for {{.*}} { - // CHECK: load - // CHECK: load - // CHECK: mulf - // CHECK: store - // CHECK: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK: dealloc - scf.for %iv1 = %c0 to %0 step %t1 { - scf.for %iv2 = %c0 to %1 step %t2 { - scf.for %iv3 = %c0 to %2 step %t3 { - %6 = alloc(%t1, %t2, %t3) : memref - %ubcmp1 = cmpi "slt", %0, %t1 : index - %ub1 = select %ubcmp1, %0, %t1 : index - %ubcmp2 = cmpi "slt", %1, %t2 : index - %ub2 = select %ubcmp2, %1, %t2 : index - %ubcmp3 = cmpi "slt", %2, %t3 : index - %ub3 = select %ubcmp3, %2, %t3 : index - scf.for %iv4 = %iv1 to %ub1 step %step1 { - scf.for %iv5 = %iv2 to %ub2 step %step2 { - scf.for %iv6 = %iv3 to %ub3 step %step3 { - %7 = load %arg0[%iv4, %iv5, %iv6] : memref - %8 = load %arg1[%iv4, %iv6, %iv5] : memref - %9 = addf %7, %8 : f32 - %10 = subi %iv4, %iv1 : index - %11 = divi_signed %10, %step1 : index - %12 = subi %iv5, %iv2 : index - %13 = divi_signed %12, %step2 : index - %14 = subi %iv6, %iv3 : index - %15 = divi_signed %14, %step3 : index - store %9, %6[%11, %13, %15] : memref - } - } - } - scf.for %iv7 = %iv1 to %ub1 step %step1 { - scf.for %iv8 = %iv2 to %ub2 step %step2 { - scf.for %iv9 = %iv3 to %ub3 step %step3 { - %16 = subi %iv7, %iv1 : index - %17 = divi_signed %16, %step1 : index - %18 = subi %iv8, %iv2 : index - %19 = divi_signed %18, %step2 : index - %20 = subi %iv9, %iv3 : index - %21 = divi_signed %20, %step3 : index - %22 = load %6[%17, %19, %21] : memref - %23 = load %arg2[%iv9, %iv8, %iv7] : memref - %24 = mulf %22, %23 : f32 - store %24, %arg3[%iv7, %iv8, %iv9] : memref - } - } - } - dealloc %6 : memref - } - } - } - return - } -} diff --git a/mlir/test/Conversion/SCFToGPU/imperfect_4D.mlir b/mlir/test/Conversion/SCFToGPU/imperfect_4D.mlir deleted file mode 100644 --- a/mlir/test/Conversion/SCFToGPU/imperfect_4D.mlir +++ /dev/null @@ -1,86 +0,0 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=4,2,2 gpu-workgroup-size=32,2,2" %s | FileCheck %s - -module { - func @imperfect_3D(%arg0 : memref, %arg1 : memref, %arg2 : memref, %arg3 : memref, %t1 : index, %t2 : index, %t3 : index, %t4 : index, %step1 : index, %step2 : index, %step3 : index, %step4 : index) { - %0 = dim %arg0, 0 : memref - %1 = dim %arg0, 1 : memref - %2 = dim %arg0, 2 : memref - %3 = dim %arg0, 3 : memref - %c0 = constant 0 : index - // CHECK: gpu.launch - // CHECK: scf.for - // CHECK: scf.for - // CHECK: scf.for - // CHECK: alloc - // CHECK: scf.for - // CHECK: scf.for - // CHECK: scf.for - // CHECK: scf.for - // CHECK: load - // CHECK: load - // CHECK: addf - // CHECK: store - // CHECK: scf.for - // CHECK: scf.for - // CHECK: scf.for - // CHECK: scf.for - // CHECK: load - // CHECK: load - // CHECK: mulf - // CHECK: store - // CHECK: dealloc - scf.for %iv1 = %c0 to %0 step %t1 { - scf.for %iv2 = %c0 to %1 step %t2 { - scf.for %iv3 = %c0 to %2 step %t3 { - %6 = alloc(%t1, %t2, %t3, %3) : memref - %ubcmp1 = cmpi "slt", %0, %t1 : index - %ub1 = select %ubcmp1, %0, %t1 : index - %ubcmp2 = cmpi "slt", %1, %t2 : index - %ub2 = select %ubcmp2, %1, %t2 : index - %ubcmp3 = cmpi "slt", %2, %t3 : index - %ub3 = select %ubcmp3, %2, %t3 : index - %ubcmp4 = cmpi "slt", %3, %t4 : index - %ub4 = select %ubcmp3, %3, %t4 : index - 
scf.for %iv5 = %iv1 to %ub1 step %step1 { - scf.for %iv6 = %iv2 to %ub2 step %step2 { - scf.for %iv7 = %iv3 to %ub3 step %step3 { - scf.for %iv8 = %c0 to %3 step %step4 { - %7 = load %arg0[%iv5, %iv6, %iv7, %iv8] : memref - %8 = load %arg1[%iv5, %iv6, %iv7, %iv8] : memref - %9 = addf %7, %8 : f32 - %10 = subi %iv5, %iv1 : index - %11 = divi_signed %10, %step1 : index - %12 = subi %iv6, %iv2 : index - %13 = divi_signed %12, %step2 : index - %14 = subi %iv7, %iv3 : index - %15 = divi_signed %14, %step3 : index - store %9, %6[%11, %13, %15, %iv8] : memref - } - } - } - } - scf.for %iv9 = %iv1 to %ub1 step %step1 { - scf.for %iv10 = %iv2 to %ub2 step %step2 { - scf.for %iv11 = %iv3 to %ub3 step %step3 { - scf.for %iv12 = %c0 to %3 step %step4 { - %18 = subi %iv9, %iv1 : index - %19 = divi_signed %18, %step1 : index - %20 = subi %iv10, %iv2 : index - %21 = divi_signed %20, %step2 : index - %22 = subi %iv11, %iv3 : index - %23 = divi_signed %22, %step3 : index - %26 = load %6[%19, %21, %23, %iv12] : memref - %27 = load %arg2[%iv9, %iv10, %iv12, %iv11] : memref - %28 = mulf %26, %27 : f32 - store %28, %arg3[%iv9, %iv10, %iv11, %iv12] : memref - } - } - } - } - dealloc %6 : memref - } - } - } - return - } -} diff --git a/mlir/test/Conversion/SCFToGPU/imperfect_linalg.mlir b/mlir/test/Conversion/SCFToGPU/imperfect_linalg.mlir deleted file mode 100644 --- a/mlir/test/Conversion/SCFToGPU/imperfect_linalg.mlir +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: mlir-opt %s -convert-loop-op-to-gpu="gpu-num-workgroups=2,16 gpu-workgroup-size=32,4" | FileCheck %s - -module { - func @fmul(%arg0: memref, %arg1: memref, %arg2: memref) { - %c1 = constant 1 : index - %c0 = constant 0 : index - %c2 = constant 2 : index - %0 = dim %arg0, 0 : memref - %1 = dim %arg0, 1 : memref - // CHECK-LABEL: gpu.launch - // CHECK: scf.for - // CHECK: scf.for - // CHECK: scf.for - // CHECK: scf.for - // CHECK: load - // CHECK: load - // CHECK: load - // CHECK: mulf - // CHECK: store - scf.for %arg3 = %c0 to %0 step %c2 { - scf.for %arg4 = %c0 to %1 step %c2 { - %4 = std.subview %arg0[%arg3, %arg4][%c2, %c2][%c1, %c1] : memref to memref - %7 = std.subview %arg1[%arg3, %arg4][%c2, %c2][%c1, %c1] : memref to memref - %10 = std.subview %arg2[%arg3, %arg4][%c2, %c2][%c1, %c1] : memref to memref - %11 = dim %4, 0 : memref - %12 = dim %4, 1 : memref - scf.for %arg5 = %c0 to %11 step %c1 { - scf.for %arg6 = %c0 to %12 step %c1 { - %13 = load %4[%arg5, %arg6] : memref - %14 = load %7[%arg5, %arg6] : memref - %15 = load %10[%arg5, %arg6] : memref - %16 = mulf %13, %14 : f32 - store %16, %10[%arg5, %arg6] : memref - } - } - } - } - return - } -} diff --git a/mlir/test/Conversion/SCFToGPU/linalg_to_gpu.mlir b/mlir/test/Conversion/SCFToGPU/linalg_to_gpu.mlir deleted file mode 100644 --- a/mlir/test/Conversion/SCFToGPU/linalg_to_gpu.mlir +++ /dev/null @@ -1,29 +0,0 @@ -// RUN: mlir-opt -convert-scf-to-gpu %s | FileCheck %s - -// CHECK-LABEL: @foo -func @foo(%arg0: memref, %arg1 : index) { - %c0 = constant 0 : index - %c42 = constant 42 : index - %c3 = constant 3 : index - // CHECK: subi %{{.*}}, %{{.*}} : index - // CHECK-NEXT: %[[range_i:.*]] = divi_signed {{.*}}, %{{.*}} : index - scf.for %i0 = %c0 to %c42 step %c3 { - // CHECK: subi %{{.*}}, %{{.*}} : index - // CHECK-NEXT: %[[range_j:.*]] = divi_signed {{.*}}, %{{.*}} : index - scf.for %i1 = %c3 to %c42 step %arg1 { - // CHECK: gpu.launch - // CHECK-SAME: blocks - // CHECK-SAME: threads - - // Replacements of loop induction variables. Take a product with the - // step and add the lower bound. 
- // CHECK: %[[prod_i:.*]] = muli %{{.*}}, %{{.*}} : index - // CHECK: addi %{{.*}}, %[[prod_i]] : index - // CHECK: %[[prod_j:.*]] = muli %{{.*}}, %{{.*}} : index - // CHECK: addi %{{.*}}, %[[prod_j]] : index - - // CHECK: gpu.terminator - } - } - return -} diff --git a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir --- a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir +++ b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt -convert-scf-to-gpu="gpu-block-dims=0 gpu-thread-dims=1" %s | FileCheck --check-prefix=CHECK-THREADS %s --dump-input-on-failure -// RUN: mlir-opt -convert-scf-to-gpu="gpu-block-dims=1 gpu-thread-dims=0" %s | FileCheck --check-prefix=CHECK-BLOCKS %s --dump-input-on-failure +// RUN: mlir-opt -convert-affine-for-to-gpu="gpu-block-dims=0 gpu-thread-dims=1" %s | FileCheck --check-prefix=CHECK-THREADS %s --dump-input-on-failure +// RUN: mlir-opt -convert-affine-for-to-gpu="gpu-block-dims=1 gpu-thread-dims=0" %s | FileCheck --check-prefix=CHECK-BLOCKS %s --dump-input-on-failure // CHECK-THREADS-LABEL: @one_d_loop // CHECK-BLOCKS-LABEL: @one_d_loop diff --git a/mlir/test/Conversion/SCFToGPU/perfect_1D_setlaunch.mlir b/mlir/test/Conversion/SCFToGPU/perfect_1D_setlaunch.mlir deleted file mode 100644 --- a/mlir/test/Conversion/SCFToGPU/perfect_1D_setlaunch.mlir +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=2 gpu-workgroup-size=32" %s | FileCheck %s - -module { - func @foo(%arg0: memref, %arg1 : memref, %arg2 : memref) { - %0 = dim %arg0, 0 : memref - %1 = dim %arg0, 1 : memref - %c0 = constant 0 : index - %c1 = constant 1 : index - // CHECK: gpu.launch - // CHECK: scf.for - // CHECK: scf.for - // CHECK: load - // CHECK: load - // CHECK: add - // CHECK: store - scf.for %iv1 = %c0 to %0 step %c1 { - scf.for %iv2 = %c0 to %1 step %c1 { - %12 = load %arg0[%iv1, %iv2] : memref - %13 = load %arg1[%iv2, %iv1] : memref - %14 = addf %12, %13 : f32 - store %12, %arg2[%iv1, %iv2] : memref - } - } - return - } -} diff --git a/mlir/test/Conversion/SCFToGPU/step_one.mlir b/mlir/test/Conversion/SCFToGPU/step_one.mlir --- a/mlir/test/Conversion/SCFToGPU/step_one.mlir +++ b/mlir/test/Conversion/SCFToGPU/step_one.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt -convert-scf-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s | FileCheck --check-prefix=CHECK-11 %s -// RUN: mlir-opt -convert-scf-to-gpu="gpu-block-dims=2 gpu-thread-dims=2" %s | FileCheck --check-prefix=CHECK-22 %s +// RUN: mlir-opt -convert-affine-for-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s | FileCheck --check-prefix=CHECK-11 %s +// RUN: mlir-opt -convert-affine-for-to-gpu="gpu-block-dims=2 gpu-thread-dims=2" %s | FileCheck --check-prefix=CHECK-22 %s // CHECK-11-LABEL: @step_1 // CHECK-22-LABEL: @step_1 diff --git a/mlir/test/Conversion/SCFToGPU/step_positive.mlir b/mlir/test/Conversion/SCFToGPU/step_positive.mlir --- a/mlir/test/Conversion/SCFToGPU/step_positive.mlir +++ b/mlir/test/Conversion/SCFToGPU/step_positive.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-scf-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s | FileCheck %s +// RUN: mlir-opt -convert-affine-for-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s | FileCheck %s // CHECK-LABEL: @step_var func @step_var(%A : memref, %B : memref) {
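For reference only (not part of the patch): a minimal sketch of the kind of input the renamed pass now targets, in the style of the surviving tests. The function name, memref shapes, and CHECK line below are hypothetical; the pass name and its gpu-block-dims/gpu-thread-dims options come from the definitions in this change. With one block and one thread dimension, the outer affine.for is expected to map to GPU blocks and the inner one to GPU threads.

// RUN: mlir-opt -convert-affine-for-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s | FileCheck %s

// CHECK-LABEL: @copy_2d
func @copy_2d(%A : memref<16x16xf32>, %B : memref<16x16xf32>) {
  // The loop nest is perfectly nested and its bounds are loop-invariant, so it
  // should be rewritten into a gpu.launch with the body moved inside.
  // CHECK: gpu.launch
  affine.for %i = 0 to 16 {
    affine.for %j = 0 to 16 {
      %v = affine.load %A[%i, %j] : memref<16x16xf32>
      affine.store %v, %B[%i, %j] : memref<16x16xf32>
    }
  }
  return
}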