diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -335,42 +335,32 @@
   let verifier = [{ return ::verify(*this); }];
 }

-def GPU_LaunchOp : GPU_Op<"launch", [IsolatedFromAbove]>,
+def GPU_LaunchOp : GPU_Op<"launch">,
     Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
-               Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
-               Variadic<AnyType>:$operands)>,
+               Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)>,
     Results<(outs)> {
   let summary = "GPU kernel launch operation";

   let description = [{
     Launch a kernel on the specified grid of thread blocks. The body of the
     kernel is defined by the single region that this operation contains. The
-    operation takes at least six operands, with first three operands being grid
-    sizes along x,y,z dimensions, the following three arguments being block
-    sizes along x,y,z dimension, and the remaining operands are arguments of the
-    kernel. When a lower-dimensional kernel is required, unused sizes must be
-    explicitly set to `1`.
+    operation takes six operands, with the first three being grid sizes along
+    the x,y,z dimensions and the following three being block sizes along the
+    x,y,z dimensions. When a lower-dimensional kernel is required, unused
+    sizes must be explicitly set to `1`.

-    The body region has at least _twelve_ arguments, grouped as follows:
+    The body region has _twelve_ arguments, grouped as follows:

    -   three arguments that contain block identifiers along x,y,z dimensions;
    -   three arguments that contain thread identifiers along x,y,z dimensions;
-   -   operands of the `gpu.launch` operation as is, including six leading
-       operands for grid and block sizes.
-
-    Operations inside the body region, and any operations in the nested regions,
-    are _not_ allowed to use values defined outside the _body_ region, as if
-    this region was a function. If necessary, values must be passed as kernel
-    arguments into the body region. Nested regions inside the kernel body are
-    allowed to use values defined in their ancestor regions as long as they
-    don't cross the kernel body region boundary.
+   -   operands of the `gpu.launch` operation as is (i.e. the six operands
+       for grid and block sizes).

    Syntax:

    ```
    operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
                               `threads` `(` ssa-id-list `)` `in` ssa-reassignment
-                              (`args` ssa-reassignment `:` type-list)?
                               region attr-dict?
    ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
    ```
@@ -379,32 +369,29 @@

    ```mlir
    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
-              threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
-              args(%arg0 = %6, %arg1 = %7) : f32, memref<?xf32, 1> {
+              threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) {
      // Block and thread identifiers, as well as block/grid sizes are
      // immediately usable inside body region.
      "some_op"(%bx, %tx) : (index, index) -> ()
-     %42 = load %arg1[%bx] : memref<?xf32, 1>
+     // Assuming %val1 is defined outside the gpu.launch region.
+     %42 = load %val1[%bx] : memref<?xf32, 1>
    }

    // Generic syntax explains how the pretty syntax maps to the IR structure.
    "gpu.launch"(%cst, %cst, %c1,  // Grid sizes.
-                %cst, %c1, %c1,   // Block sizes.
-                %arg0, %arg1)     // Actual arguments.
+                %cst, %c1, %c1)   // Block sizes.
+       {/*attributes*/}
        // All sizes and identifiers have "index" size.
-       : (index, index, index, index, index, index, f32, memref<?xf32, 1>)
-           -> () {
+       : (index, index, index, index, index, index) -> () {
      // The operation passes block and thread identifiers, followed by grid and
-     // block sizes, followed by actual arguments to the entry block of the
-     // region.
+     // block sizes.
      ^bb0(%bx : index, %by : index, %bz : index,
           %tx : index, %ty : index, %tz : index,
           %num_bx : index, %num_by : index, %num_bz : index,
-          %num_tx : index, %num_ty : index, %num_tz : index,
-          %arg0 : f32, %arg1 : memref<?xf32, 1>):
+          %num_tx : index, %num_ty : index, %num_tz : index):
      "some_op"(%bx, %tx) : (index, index) -> ()
-     %3 = "std.load"(%arg1, %bx) : (memref<?xf32, 1>, index) -> f32
+     %3 = "std.load"(%val1, %bx) : (memref<?xf32, 1>, index) -> f32
    }
    ```
@@ -422,12 +409,9 @@

  let builders = [
    OpBuilder<"Builder *builder, OperationState &result, Value gridSizeX,"
              "Value gridSizeY, Value gridSizeZ, Value blockSizeX,"
-             "Value blockSizeY, Value blockSizeZ,"
-             "ValueRange operands">
+             "Value blockSizeY, Value blockSizeZ">
  ];

- let hasCanonicalizer = 1;
-
  let extraClassDeclaration = [{
    /// Get the SSA values corresponding to kernel block identifiers.
    KernelDim3 getBlockIds();
@@ -437,26 +421,14 @@
    KernelDim3 getGridSize();
    /// Get the SSA values corresponding to kernel block size.
    KernelDim3 getBlockSize();
-   /// Get the operand values passed as kernel arguments.
-   operand_range getKernelOperandValues();
-   /// Get the operand types passed as kernel arguments.
-   operand_type_range getKernelOperandTypes();
    /// Get the SSA values passed as operands to specify the grid size.
    KernelDim3 getGridSizeOperandValues();
    /// Get the SSA values passed as operands to specify the block size.
    KernelDim3 getBlockSizeOperandValues();

-   /// Get the SSA values of the kernel arguments.
-   iterator_range<Block::args_iterator> getKernelArguments();
-
-   /// Erase the `index`-th kernel argument. Both the entry block argument and
-   /// the operand will be dropped. The block argument must not have any uses.
-   void eraseKernelArgument(unsigned index);
-
    static StringRef getBlocksKeyword() { return "blocks"; }
    static StringRef getThreadsKeyword() { return "threads"; }
-   static StringRef getArgsKeyword() { return "args"; }

    /// The number of launch configuration operands, placed at the leading
    /// positions of the operand list.
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -357,32 +357,15 @@
    workGroupSize3D[workGroupSize.index()] = workGroupSize.value();
  }

- // Get the values used within the region of the rootForOp but defined above
- // it.
- llvm::SetVector<Value> valuesToForwardSet;
- getUsedValuesDefinedAbove(rootForOp.region(), rootForOp.region(),
-                           valuesToForwardSet);
- // Also add the values used for the lb, ub, and step of the rootForOp.
- valuesToForwardSet.insert(rootForOp.getOperands().begin(),
-                           rootForOp.getOperands().end());
- auto valuesToForward = valuesToForwardSet.takeVector();
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), numWorkGroups3D[0], numWorkGroups3D[1],
      numWorkGroups3D[2], workGroupSize3D[0], workGroupSize3D[1],
-     workGroupSize3D[2], valuesToForward);
+     workGroupSize3D[2]);
  if (failed(createLaunchBody(builder, rootForOp, launchOp,
                              numWorkGroups.size(), workGroupSizes.size()))) {
    return failure();
  }

- // Replace values that are used within the region of the launchOp but are
- // defined outside. They all are replaced with kernel arguments.
- for (auto pair :
-      llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
-   Value from = std::get<0>(pair);
-   Value to = std::get<1>(pair);
-   replaceAllUsesInRegionWith(from, to, launchOp.body());
- }
  return success();
 }

@@ -411,24 +394,13 @@
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
- // launch op. Pass the values defined outside the outermost loop and used
- // inside the innermost loop and loop lower bounds as kernel data arguments.
- // Still assuming perfect nesting so there are no values other than induction
- // variables that are defined in one loop and used in deeper loops.
- llvm::SetVector<Value> valuesToForwardSet;
- getUsedValuesDefinedAbove(innermostForOp.region(), rootForOp.region(),
-                           valuesToForwardSet);
- auto valuesToForward = valuesToForwardSet.takeVector();
- auto originallyForwardedValues = valuesToForward.size();
- valuesToForward.insert(valuesToForward.end(), lbs.begin(), lbs.end());
- valuesToForward.insert(valuesToForward.end(), steps.begin(), steps.end());
+ // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
-     blockSizeY, blockSizeZ, valuesToForward);
- valuesToForward.resize(originallyForwardedValues);
+     blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
- // gpu return and move the operations from the loop body block to the gpu
+ // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
@@ -445,9 +417,8 @@
  // from 0 to N with step 1. Therefore, loop induction variables are replaced
  // with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.body().front());
- auto lbArgumentIt = std::next(launchOp.getKernelArguments().begin(),
-                               originallyForwardedValues);
- auto stepArgumentIt = std::next(lbArgumentIt, lbs.size());
+ auto lbArgumentIt = lbs.begin();
+ auto stepArgumentIt = steps.begin();
  for (auto en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
@@ -460,22 +431,10 @@
    Value ivReplacement =
        builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
-   replaceAllUsesInRegionWith(steps[en.index()], *stepArgumentIt,
-                              launchOp.body());
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

- // Remap the values defined outside the body to use kernel arguments instead.
- // The list of kernel arguments also contains the lower bounds for loops at
- // trailing positions, make sure we don't touch those.
- for (auto pair :
-      llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
-   Value from = std::get<0>(pair);
-   Value to = std::get<1>(pair);
-   replaceAllUsesInRegionWith(from, to, launchOp.body());
- }
-
  // We are done and can erase the original outermost loop.
  rootForOp.erase();
 }
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -196,11 +196,10 @@

 void LaunchOp::build(Builder *builder, OperationState &result, Value gridSizeX,
                      Value gridSizeY, Value gridSizeZ, Value blockSizeX,
-                     Value blockSizeY, Value blockSizeZ, ValueRange operands) {
+                     Value blockSizeY, Value blockSizeZ) {
  // Add grid and block sizes as op operands, followed by the data operands.
  result.addOperands(
      {gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});
- result.addOperands(operands);

  // Create a kernel body region with kNumConfigRegionAttributes + N arguments,
  // where the first kNumConfigRegionAttributes arguments have `index` type and
@@ -209,7 +208,6 @@
  Block *body = new Block();
  body->addArguments(
      std::vector<Type>(kNumConfigRegionAttributes, builder->getIndexType()));
- body->addArguments(llvm::to_vector<4>(operands.getTypes()));
  kernelRegion->push_back(body);
 }

@@ -237,14 +235,6 @@
  return KernelDim3{args[9], args[10], args[11]};
 }

-LaunchOp::operand_range LaunchOp::getKernelOperandValues() {
- return llvm::drop_begin(getOperands(), kNumConfigOperands);
-}
-
-LaunchOp::operand_type_range LaunchOp::getKernelOperandTypes() {
- return llvm::drop_begin(getOperandTypes(), kNumConfigOperands);
-}
-
 KernelDim3 LaunchOp::getGridSizeOperandValues() {
  return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};
 }
@@ -253,11 +243,6 @@
  return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};
 }

-iterator_range<Block::args_iterator> LaunchOp::getKernelArguments() {
- auto args = body().getBlocks().front().getArguments();
- return llvm::drop_begin(args, LaunchOp::kNumConfigRegionAttributes);
-}
-
 static LogicalResult verify(LaunchOp op) {
  // Kernel launch takes kNumConfigOperands leading operands for grid/block
  // sizes and transforms them into kNumConfigRegionAttributes region arguments
@@ -312,25 +297,6 @@
  printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3),
                      op.getThreadIds());

- // From now on, the first kNumConfigOperands operands corresponding to grid
- // and block sizes are irrelevant, so we can drop them.
- operands = operands.drop_front(LaunchOp::kNumConfigOperands);
-
- // Print the data argument remapping.
- if (!op.body().empty() && !operands.empty()) {
-   p << ' ' << op.getArgsKeyword() << '(';
-   Block *entryBlock = &op.body().front();
-   interleaveComma(llvm::seq<int>(0, operands.size()), p, [&](int i) {
-     p << entryBlock->getArgument(LaunchOp::kNumConfigRegionAttributes + i)
-       << " = " << operands[i];
-   });
-   p << ") ";
- }
-
- // Print the types of data arguments.
- if (!operands.empty())
-   p << ": " << operands.getTypes();
-
  p.printRegion(op.body(), /*printEntryBlockArgs=*/false);
  p.printOptionalAttrDict(op.getAttrs());
 }
@@ -368,8 +334,7 @@
 // Parses a Launch operation.
 // operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
 //                            `threads` `(` ssa-id-list `)` `in` ssa-reassignment
-//                            (`args` ssa-reassignment `:` type-list)?
-//                            region attr-dict?
+//                            region attr-dict?
 // ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
 static ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) {
  // Sizes of the grid and block.
@@ -402,103 +367,17 @@
      result.operands))
    return failure();

- // If kernel argument renaming segment is present, parse it. When present,
- // the segment should have at least one element. If this segment is present,
- // so is the trailing type list. Parse it as well and use the parsed types
- // to resolve the operands passed to the kernel arguments.
- SmallVector<Type, 4> dataTypes;
- if (!parser.parseOptionalKeyword(LaunchOp::getArgsKeyword())) {
-   llvm::SMLoc argsLoc = parser.getCurrentLocation();
-
-   regionArgs.push_back({});
-   dataOperands.push_back({});
-   if (parser.parseLParen() || parser.parseRegionArgument(regionArgs.back()) ||
-       parser.parseEqual() || parser.parseOperand(dataOperands.back()))
-     return failure();
-
-   while (!parser.parseOptionalComma()) {
-     regionArgs.push_back({});
-     dataOperands.push_back({});
-     if (parser.parseRegionArgument(regionArgs.back()) ||
-         parser.parseEqual() || parser.parseOperand(dataOperands.back()))
-       return failure();
-   }
-
-   if (parser.parseRParen() || parser.parseColonTypeList(dataTypes) ||
-       parser.resolveOperands(dataOperands, dataTypes, argsLoc,
-                              result.operands))
-     return failure();
- }
-
- // Introduce the body region and parse it. The region has
- // kNumConfigRegionAttributes leading arguments that correspond to
+ // Introduce the body region and parse it. The region has
+ // kNumConfigRegionAttributes arguments that correspond to
  // block/thread identifiers and grid/block sizes, all of the `index` type.
- // Follow the actual kernel arguments.
  Type index = parser.getBuilder().getIndexType();
- dataTypes.insert(dataTypes.begin(), LaunchOp::kNumConfigRegionAttributes,
-                  index);
+ SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
+     LaunchOp::kNumConfigRegionAttributes, index);
  Region *body = result.addRegion();
  return failure(parser.parseRegion(*body, regionArgs, dataTypes) ||
                 parser.parseOptionalAttrDict(result.attributes));
 }

-void LaunchOp::eraseKernelArgument(unsigned index) {
- Block &entryBlock = body().front();
- assert(index < entryBlock.getNumArguments() - kNumConfigRegionAttributes &&
-        "kernel argument index overflow");
- entryBlock.eraseArgument(kNumConfigRegionAttributes + index);
- getOperation()->eraseOperand(kNumConfigOperands + index);
-}
-
-namespace {
-// Clone any known constants passed as operands to the kernel into its body.
-class PropagateConstantBounds : public OpRewritePattern<LaunchOp> {
- using OpRewritePattern<LaunchOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(LaunchOp launchOp,
-                                    PatternRewriter &rewriter) const override {
-   rewriter.startRootUpdate(launchOp);
-   PatternRewriter::InsertionGuard guard(rewriter);
-   rewriter.setInsertionPointToStart(&launchOp.body().front());
-
-   // Traverse operands passed to kernel and check if some of them are known
-   // constants. If so, clone the constant operation inside the kernel region
-   // and use it instead of passing the value from the parent region. Perform
-   // the traversal in the inverse order to simplify index arithmetics when
-   // dropping arguments.
-   auto operands = launchOp.getKernelOperandValues();
-   auto kernelArgs = launchOp.getKernelArguments();
-   bool found = false;
-   for (unsigned i = operands.size(); i > 0; --i) {
-     unsigned index = i - 1;
-     Value operand = operands[index];
-     if (!isa_and_nonnull<ConstantOp>(operand.getDefiningOp()))
-       continue;
-
-     found = true;
-     Value internalConstant =
-         rewriter.clone(*operand.getDefiningOp())->getResult(0);
-     Value kernelArg = *std::next(kernelArgs.begin(), index);
-     kernelArg.replaceAllUsesWith(internalConstant);
-     launchOp.eraseKernelArgument(index);
-   }
-
-   if (!found) {
-     rewriter.cancelRootUpdate(launchOp);
-     return matchFailure();
-   }
-
-   rewriter.finalizeRootUpdate(launchOp);
-   return matchSuccess();
- }
-};
-} // end namespace
-
-void LaunchOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
-                                           MLIRContext *context) {
- results.insert<PropagateConstantBounds>(context);
-}
-
 //===----------------------------------------------------------------------===//
 // LaunchFuncOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -17,6 +17,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/RegionUtils.h"

 using namespace mlir;

@@ -100,13 +101,22 @@

 // Outline the `gpu.launch` operation body into a kernel function. Replace
 // `gpu.terminator` operations by `gpu.return` in the generated function.
-static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
+static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp,
+                                        llvm::SetVector<Value> &operands) {
  Location loc = launchOp.getLoc();
  // Create a builder with no insertion point, insertion will happen separately
  // due to symbol table manipulation.
  OpBuilder builder(launchOp.getContext());

- SmallVector<Type, 4> kernelOperandTypes(launchOp.getKernelOperandTypes());
+ // Identify uses from values defined outside of the scope of the launch
+ // operation.
+ getUsedValuesDefinedAbove(launchOp.body(), operands);
+
+ SmallVector<Type, 4> kernelOperandTypes;
+ kernelOperandTypes.reserve(operands.size());
+ for (Value operand : operands) {
+   kernelOperandTypes.push_back(operand.getType());
+ }
  FunctionType type =
      FunctionType::get(kernelOperandTypes, {}, launchOp.getContext());
  std::string kernelFuncName =
@@ -116,6 +126,11 @@
                                               builder.getUnitAttr());
  outlinedFunc.body().takeBody(launchOp.body());
  injectGpuIndexOperations(loc, outlinedFunc.body());
+ Block &entryBlock = outlinedFunc.body().front();
+ for (Value operand : operands) {
+   BlockArgument newArg = entryBlock.addArgument(operand.getType());
+   replaceAllUsesInRegionWith(operand, newArg, outlinedFunc.body());
+ }
  outlinedFunc.walk([](gpu::TerminatorOp op) {
    OpBuilder replacer(op);
    replacer.create<gpu::ReturnOp>(op.getLoc());
@@ -129,11 +144,12 @@
 // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
 // constant region arguments inlined.
 static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp,
-                                  gpu::GPUFuncOp kernelFunc) {
+                                  gpu::GPUFuncOp kernelFunc,
+                                  ValueRange operands) {
  OpBuilder builder(launchOp);
  auto launchFuncOp = builder.create<gpu::LaunchFuncOp>(
      launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
-     launchOp.getBlockSizeOperandValues(), launchOp.getKernelOperandValues());
+     launchOp.getBlockSizeOperandValues(), operands);
  inlineBeneficiaryOps(kernelFunc, launchFuncOp);
  launchOp.erase();
 }
@@ -158,7 +174,8 @@
    // Insert just after the function.
    Block::iterator insertPt(func.getOperation()->getNextNode());
    func.walk([&](gpu::LaunchOp op) {
-     gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op);
+     llvm::SetVector<Value> operands;
+     gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op, operands);

      // Create nested module and insert outlinedFunc. The module will
      // originally get the same name as the function, but may be renamed on
@@ -167,7 +184,7 @@
      symbolTable.insert(kernelModule, insertPt);

      // Potentially changes signature, pulling in constants.
-     convertToLaunchFuncOp(op, outlinedFunc);
+     convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
      modified = true;
    });
  }
diff --git a/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir b/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir
--- a/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir
+++ b/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir
@@ -8,16 +8,16 @@
  %1 = dim %arg0, 1 : memref<?x?xf32>
  %c0 = constant 0 : index
  %c1 = constant 1 : index
- // CHECK: gpu.launch blocks([[ARG5:%.*]], [[ARG6:%.*]], [[ARG7:%.*]]) in ([[ARG11:%.*]] = {{%.*}}, [[ARG12:%.*]] = {{%.*}}, [[ARG13:%.*]] = {{%.*}}) threads([[ARG8:%.*]], [[ARG9:%.*]], [[ARG10:%.*]]) in ([[ARG14:%.*]] = {{%.*}}, [[ARG15:%.*]] = {{%.*}}, [[ARG16:%.*]] = {{%.*}}) args([[ARG17:%.*]] = [[ARG3]], [[ARG18:%.*]] = [[ARG4]], [[ARG19:%.*]] = [[ARG1]], [[ARG20:%.*]] = {{%.*}}, {{%.*}} = {{%.*}}, [[ARG22:%.*]] = [[ARG0]], [[ARG23:%.*]] = [[ARG2]]
- // CHECK: [[TEMP1:%.*]] = muli [[ARG17]], [[ARG6]] : index
+ // CHECK: gpu.launch blocks([[ARG5:%.*]], [[ARG6:%.*]], [[ARG7:%.*]]) in ([[ARG11:%.*]] = {{%.*}}, [[ARG12:%.*]] = {{%.*}}, [[ARG13:%.*]] = {{%.*}}) threads([[ARG8:%.*]], [[ARG9:%.*]], [[ARG10:%.*]]) in ([[ARG14:%.*]] = {{%.*}}, [[ARG15:%.*]] = {{%.*}}, [[ARG16:%.*]] = {{%.*}})
+ // CHECK: [[TEMP1:%.*]] = muli [[ARG3]], [[ARG6]] : index
  // CHECK: [[BLOCKLOOPYLB:%.*]] = addi {{%.*}}, [[TEMP1]] : index
- // CHECK: [[BLOCKLOOPYSTEP:%.*]] = muli [[ARG17]], [[ARG12]] : index
+ // CHECK: [[BLOCKLOOPYSTEP:%.*]] = muli [[ARG3]], [[ARG12]] : index
  // CHECK: loop.for [[BLOCKLOOPYIV:%.*]] = [[BLOCKLOOPYLB]] to {{%.*}} step [[BLOCKLOOPYSTEP]]
  loop.for %iv1 = %c0 to %0 step %arg3 {
-   // CHECK: [[TEMP2:%.*]] = muli [[ARG18]], [[ARG5]] : index
+   // CHECK: [[TEMP2:%.*]] = muli [[ARG4]], [[ARG5]] : index
    // CHECK: [[BLOCKLOOPXLB:%.*]] = addi {{%.*}}, [[TEMP2]] : index
-   // CHECK: [[BLOCKLOOPXSTEP:%.*]] = muli [[ARG18]], [[ARG11]] : index
+   // CHECK: [[BLOCKLOOPXSTEP:%.*]] = muli [[ARG4]], [[ARG11]] : index
    // CHECK: loop.for [[BLOCKLOOPXIV:%.*]] = [[BLOCKLOOPXLB]] to {{%.*}} step [[BLOCKLOOPXSTEP]]
    loop.for %iv2 = %c0 to %1 step %arg4 {
@@ -27,7 +27,7 @@
      %2 = alloc(%arg3, %arg4) : memref<?x?xf32>

      // Load transpose tile
-     // CHECK: [[TEMP3:%.*]] = muli [[ARG20]], [[ARG9:%.*]] : index
+     // CHECK: [[TEMP3:%.*]] = muli [[ARG20:%.*]], [[ARG9:%.*]] : index
      // CHECK: [[THREADLOOP1YLB:%.*]] = addi {{%.*}}, [[TEMP3]] : index
      // CHECK: [[THREADLOOP1YSTEP:%.*]] = muli [[ARG20]], [[ARG15]] : index
      // CHECK: loop.for [[THREADLOOP1YIV:%.*]] = [[THREADLOOP1YLB]] to {{%.*}} step [[THREADLOOP1YSTEP]]
@@ -41,7 +41,7 @@
        %10 = addi %iv1, %iv3 : index
        // CHECK: [[INDEX1:%.*]] = addi [[BLOCKLOOPXIV]], [[THREADLOOP1XIV]] : index
        %11 = addi %iv2, %iv4 : index
-       // CHECK: [[VAL1:%.*]] = load [[ARG19]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}} : memref<?x?xf32>
+       // CHECK: [[VAL1:%.*]] = load [[ARG1]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}} : memref<?x?xf32>
        %12 = load %arg1[%11, %10] : memref<?x?xf32>
        // CHECK: store [[VAL1]], [[SCRATCHSPACE:%.*]]{{\[}}[[THREADLOOP1XIV]], [[THREADLOOP1YIV]]{{\]}} : memref<?x?xf32>
        store %12, %2[%iv4, %iv3] : memref<?x?xf32>
@@ -67,10 +67,10 @@
          %14 = addi %iv2, %iv4 : index
          // CHECK: {{%.*}} = load [[SCRATCHSPACE]]{{\[}}[[THREADLOOP2XIV]], [[THREADLOOP2YIV]]{{\]}} : memref<?x?xf32>
          %15 = load %2[%iv4, %iv3] : memref<?x?xf32>
-         // CHECK: {{%.*}} = load [[ARG22]]{{\[}}[[INDEX3]], [[INDEX4]]{{\]}}
+         // CHECK: {{%.*}} = load [[ARG0]]{{\[}}[[INDEX3]], [[INDEX4]]{{\]}}
          %16 = load %arg0[%13, %14] : memref<?x?xf32>
          %17 = mulf %15, %16 : f32
-         // CHECK: store {{%.*}}, [[ARG23]]{{\[}}[[INDEX3]], [[INDEX4]]{{\]}}
+         // CHECK: store {{%.*}}, [[ARG2]]{{\[}}[[INDEX3]], [[INDEX4]]{{\]}}
          store %17, %arg2[%13, %14] : memref<?x?xf32>
        }
      }
@@ -80,4 +80,4 @@
  }
  return
 }
-}
\ No newline at end of file
+}
diff --git a/mlir/test/Conversion/LoopsToGPU/linalg_to_gpu.mlir b/mlir/test/Conversion/LoopsToGPU/linalg_to_gpu.mlir
--- a/mlir/test/Conversion/LoopsToGPU/linalg_to_gpu.mlir
+++ b/mlir/test/Conversion/LoopsToGPU/linalg_to_gpu.mlir
@@ -14,7 +14,6 @@
  // CHECK: gpu.launch
  // CHECK-SAME: blocks
  // CHECK-SAME: threads
- // CHECK-SAME: args

  // Replacements of loop induction variables. Take a product with the
  // step and add the lower bound.
diff --git a/mlir/test/Conversion/LoopsToGPU/step_one.mlir b/mlir/test/Conversion/LoopsToGPU/step_one.mlir
--- a/mlir/test/Conversion/LoopsToGPU/step_one.mlir
+++ b/mlir/test/Conversion/LoopsToGPU/step_one.mlir
@@ -30,7 +30,6 @@
  // CHECK-11: gpu.launch
  // CHECK-11-SAME: blocks
  // CHECK-11-SAME: threads
- // CHECK-11-SAME: args

  // Remapping of the loop induction variables.
  // CHECK-11: %[[i:.*]] = addi %{{.*}}, %{{.*}} : index
@@ -57,7 +56,6 @@
  // CHECK-22: gpu.launch
  // CHECK-22-SAME: blocks
  // CHECK-22-SAME: threads
- // CHECK-22-SAME: args

  // Remapping of the loop induction variables in the last mapped loop.
  // CHECK-22: %[[i:.*]] = addi %{{.*}}, %{{.*}} : index
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
deleted file mode 100644
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: mlir-opt -pass-pipeline='func(canonicalize)' %s | FileCheck %s
-
-// CHECK-LABEL: @propagate_constant
-// CHECK-SAME: %[[arg1:.*]]: memref<?xf32>
-func @propagate_constant(%arg1: memref<?xf32>) {
- // The outer constant must be preserved because it still has uses.
- // CHECK: %[[outer_cst:.*]] = constant 1
- %c1 = constant 1 : index
-
- // The constant must be dropped from the args list, but the memref should
- // remain.
- // CHECK: gpu.launch
- // CHECK-SAME: args(%[[inner_arg:.*]] = %[[arg1]]) : memref<?xf32>
- gpu.launch blocks(%bx, %by, %bz) in (%sbx = %c1, %sby = %c1, %sbz = %c1)
-            threads(%tx, %ty, %tz) in (%stx = %c1, %sty = %c1, %stz = %c1)
-            args(%x = %c1, %y = %arg1) : index, memref<?xf32> {
-   // The constant is propagated into the kernel body and used.
-   // CHECK: %[[inner_cst:.*]] = constant 1
-   // CHECK: "foo"(%[[inner_cst]])
-   "foo"(%x) : (index) -> ()
-
-   // CHECK: "bar"(%[[inner_arg]])
-   "bar"(%y) : (memref<?xf32>) -> ()
-   gpu.terminator
- }
- return
-}
-
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt -split-input-file -verify-diagnostics %s

 func @not_enough_sizes(%sz : index) {
- // expected-error@+1 {{expected 6 or more operands}}
+ // expected-error@+1 {{expected 6 operands, but found 5}}
  "gpu.launch"(%sz, %sz, %sz, %sz, %sz) ({
    gpu.return
  }) : (index, index, index, index, index) -> ()
@@ -22,59 +22,6 @@

 // -----

-func @isolation_arg(%sz : index) {
- // expected-note@+1 {{required by region isolation constraints}}
- "gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
-   ^bb1(%bx: index, %by: index, %bz: index,
-        %tx: index, %ty: index, %tz: index,
-        %szbx: index, %szby: index, %szbz: index,
-        %sztx: index, %szty: index, %sztz: index):
-     // expected-error@+1 {{using value defined outside the region}}
-     "use"(%sz) : (index) -> ()
-     gpu.return
- }) : (index, index, index, index, index, index) -> ()
- return
-}
-
-// -----
-
-func @isolation_op(%sz : index) {
- %val = "produce"() : () -> (index)
- // expected-note@+1 {{required by region isolation constraints}}
- "gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
-   ^bb1(%bx: index, %by: index, %bz: index,
-        %tx: index, %ty: index, %tz: index,
-        %szbx: index, %szby: index, %szbz: index,
-        %sztx: index, %szty: index, %sztz: index):
-     // expected-error@+1 {{using value defined outside the region}}
-     "use"(%val) : (index) -> ()
-     gpu.return
- }) : (index, index, index, index, index, index) -> ()
- return
-}
-
-// -----
-
-func @nested_isolation(%sz : index) {
- // expected-note@+1 {{required by region isolation constraints}}
- "gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
-   ^bb1(%bx: index, %by: index, %bz: index,
-        %tx: index, %ty: index, %tz: index,
-        %szbx: index, %szby: index, %szbz: index,
-        %sztx: index, %szty: index, %sztz: index):
-     "region"() ({
-       "region"() ({
-         // expected-error@+1 {{using value defined outside the region}}
-         "use"(%sz) : (index) -> ()
-       }) : () -> ()
-     }) : () -> ()
-     gpu.return
- }) : (index, index, index, index, index, index) -> ()
- return
-}
-
-// -----
-
 func @launch_requires_gpu_return(%sz : index) {
  // @expected-note@+1 {{in 'gpu.launch' body region}}
  gpu.launch blocks(%bx, %by, %bz) in (%sbx = %sz, %sby = %sz, %sbz = %sz)
@@ -463,4 +410,4 @@
    gpu.return
  }
 }
-}
\ No newline at end of file
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -15,45 +15,11 @@

  // CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32>) {
  func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32>) {
-   // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32>
-   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
-              threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
-              args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32> {
-     // CHECK: gpu.terminator
-     gpu.terminator
-   }
-   return
- }
-
- // It is possible to use values passed into the region as arguments.
- // CHECK-LABEL: func @passing_values
- func @passing_values(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32>) {
-   // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32>
+   // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
-              threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
-              args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32> {
-     // CHECK: "use"(%{{.*}})
-     "use"(%kernel_arg0): (f32) -> ()
-     // CHECK: gpu.terminator
-     gpu.terminator
-   }
-   return
- }
-
- // It is possible to use values defined in nested regions as long as they don't
- // cross kernel launch region boundaries.
- // CHECK-LABEL: func @nested_isolation
- func @nested_isolation(%sz : index) {
-   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
-              threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
-     "region"() ({
-       // CHECK: %{{.*}} = "produce"()
-       %val = "produce"() : () -> (index)
-       "region"() ({
-         // CHECK: "use"(%{{.*}})
-         "use"(%val) : (index) -> ()
-       }) : () -> ()
-     }) : () -> ()
+              threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
+     "use"(%float) : (f32) -> ()
+     "use"(%data) : (memref<?xf32>) -> ()
      // CHECK: gpu.terminator
      gpu.terminator
    }
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
+// RUN: mlir-opt -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure

 // CHECK: module attributes {gpu.container_module}

@@ -26,11 +26,10 @@
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
                                       %grid_z = %gDimZ)
             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
-                                       %block_z = %bDimZ)
-            args(%arg0 = %0, %arg1 = %1) : f32, memref<?xf32, 1> {
-   "use"(%arg0): (f32) -> ()
+                                       %block_z = %bDimZ) {
+   "use"(%0): (f32) -> ()
    "some_op"(%bx, %block_x) : (index, index) -> ()
-   %42 = load %arg1[%tx] : memref<?xf32, 1>
+   %42 = load %1[%tx] : memref<?xf32, 1>
    gpu.terminator
  }
  return
@@ -96,9 +95,8 @@
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
-                                       %block_z = %cst)
-            args(%kernel_arg0 = %cst2, %kernel_arg1 = %arg0, %kernel_arg2 = %cst3) : index, memref<?xf32>, index {
-   "use"(%kernel_arg0, %kernel_arg1, %kernel_arg2) : (index, memref<?xf32>, index) -> ()
+                                       %block_z = %cst) {
+   "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
    gpu.terminator
  }
  return
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
@@ -10,8 +10,7 @@
  %sz = dim %dst, 0 : memref<?x?x?xf32>
  call @mcuMemHostRegisterMemRef3dFloat(%dst) : (memref<?x?x?xf32>) -> ()
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
-            threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %sy, %block_z = %sz)
-            args(%kernel_dst = %dst) : memref<?x?x?xf32> {
+            threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %sy, %block_z = %sz) {
    %t0 = muli %tz, %block_y : index
    %t1 = addi %ty, %t0 : index
    %t2 = muli %t1, %block_x : index
@@ -19,7 +18,7 @@
    %t3 = index_cast %idx : index to i32
    %val = sitofp %t3 : i32 to f32
    %sum = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32)
-   store %sum, %kernel_dst[%tz, %ty, %tx] : memref<?x?x?xf32>
+   store %sum, %dst[%tz, %ty, %tx] : memref<?x?x?xf32>
    gpu.terminator
  }
  %U = memref_cast %dst : memref<?x?x?xf32> to memref<*xf32>
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
@@ -8,8 +8,7 @@
  %sx = dim %dst, 0 : memref<?xf32>
  call @mcuMemHostRegisterMemRef1dFloat(%dst) : (memref<?xf32>) -> ()
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
-            threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one)
-            args(%kernel_dst = %dst) : memref<?xf32> {
+            threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
    %val = index_cast %tx : index to i32
    %xor = "gpu.all_reduce"(%val) ({
    ^bb(%lhs : i32, %rhs : i32):
@@ -17,7 +16,7 @@
      "gpu.yield"(%xor) : (i32) -> ()
    }) : (i32) -> (i32)
    %res = sitofp %xor : i32 to f32
-   store %res, %kernel_dst[%tx] : memref<?xf32>
+   store %res, %dst[%tx] : memref<?xf32>
    gpu.terminator
  }
  %U = memref_cast %dst : memref<?xf32> to memref<*xf32>
diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
--- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
+++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
@@ -4,9 +4,8 @@
  %cst = constant 1 : index
  %cst2 = dim %arg1, 0 : memref<?xf32>
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
-            threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst)
-            args(%kernel_arg0 = %arg0, %kernel_arg1 = %arg1) : f32, memref<?xf32> {
-   store %kernel_arg0, %kernel_arg1[%tx] : memref<?xf32>
+            threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
+   store %arg0, %arg1[%tx] : memref<?xf32>
    gpu.terminator
  }
  return
diff --git a/mlir/test/mlir-cuda-runner/shuffle.mlir b/mlir/test/mlir-cuda-runner/shuffle.mlir
--- a/mlir/test/mlir-cuda-runner/shuffle.mlir
+++ b/mlir/test/mlir-cuda-runner/shuffle.mlir
@@ -8,8 +8,7 @@
  %sx = dim %dst, 0 : memref<?xf32>
  call @mcuMemHostRegisterMemRef1dFloat(%dst) : (memref<?xf32>) -> ()
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
-            threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one)
-            args(%kernel_dst = %dst) : memref<?xf32> {
+            threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
    %t0 = index_cast %tx : index to i32
    %val = sitofp %t0 : i32 to f32
    %width = index_cast %block_x : index to i32
@@ -20,7 +19,7 @@
    %m1 = constant -1.0 : f32
    br ^bb1(%m1 : f32)
  ^bb1(%value : f32):
-   store %value, %kernel_dst[%tx] : memref<?xf32>
+   store %value, %dst[%tx] : memref<?xf32>
    gpu.terminator
  }
  %U = memref_cast %dst : memref<?xf32> to memref<*xf32>
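For illustration, here is a minimal sketch (not part of the patch; the function and value names are invented) of the new form: values defined above the `gpu.launch` are captured implicitly instead of being threaded through an `args(...)` clause.

```mlir
func @implicit_capture(%data: memref<?xf32>) {
  %c1 = constant 1 : index
  %cst = constant 4.2 : f32
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
             threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
    // %cst and %data are defined above the launch; with this patch they are
    // used directly rather than remapped through kernel arguments.
    store %cst, %data[%tx] : memref<?xf32>
    gpu.terminator
  }
  return
}
```

During `-gpu-kernel-outlining`, `getUsedValuesDefinedAbove` collects such captured values (`%cst` and `%data` here), appends a block argument for each to the outlined `gpu.func`, and passes the original values through the generated `gpu.launch_func`.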