diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -53,6 +53,7 @@
 /// dim sizes are currently not supported.
 DiagnosedSilenceableFailure mapForeachToBlocksImpl(
     RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
+    SmallVector<int64_t> mapping,
     function_ref<void(RewriterBase &, scf::ForeachThreadOp,
                       SmallVectorImpl<Value> &)>
         blockIdGenerator,
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -160,6 +160,65 @@
   let arguments = (ins PDL_Operation:$target,
                    DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
                    UnitAttr:$generate_gpu_launch);
   let results = (outs PDL_Operation:$result);
+
+  let assemblyFormat = "$target attr-dict";
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::Operation *target,
+        ::llvm::SmallVectorImpl<::mlir::Operation *> &results,
+        ::mlir::transform::TransformState &state);
+  }];
+}
+
+def MapForeach :
+  Op<Transform_Dialect, "gpu.map_foreach",
+    [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
+     TransformEachOpTrait, TransformOpInterface]> {
+  let description = [{
+    Target the gpu_launch op and rewrite the top level `scf.foreach_thread`
+    to distributed gpu.block_id values, or to global ids
+    (block_id * block_dim + thread_id) when the loop carries a `#gpu.global`
+    mapping. If the `generate_gpu_launch` attribute is set, a `gpu_launch`
+    is generated first and the top level `scf.foreach_thread` is moved
+    inside it.
+
+    The operation searches for top level `scf.foreach_thread` ops under
+    `gpu_launch` and distributes each such op to GPU blocks. The mapping is
+    one-to-one and the induction variables of `scf.foreach_thread` are
+    rewritten according to the `mapping` attribute.
+
+    Dynamic `scf.foreach_thread` trip counts are currently not supported.
+    Dynamic block dim sizes are currently not supported.
+
+    Only **bufferized** scf.foreach_thread ops are currently supported.
+    Only scf.foreach_thread ops distributed to **at most 3 dimensions** are
+    currently supported.
+
+    The operation alters the grid and block sizes of the given gpu_launch:
+    the grid size is derived from the top level `scf.foreach_thread` trip
+    counts and the block size comes from the mandatory `blockDim` argument.
+
+    #### Return modes:
+
+    This operation ignores non-gpu_launch ops (when `generate_gpu_launch` is
+    not set) and drops them in the return.
+
+    If any scf.foreach_thread with tensors is found, the transform definitely
+    fails.
+
+    If all the scf.foreach_thread operations contained within the LaunchOp
+    referred to by the `target` PDLOperation lower to GPU properly, the
+    transform succeeds. Otherwise the transform definitely fails.
+
+    The returned handle points to the same LaunchOp operand, consuming it and
+    producing a new SSA value to satisfy chaining and linearity of the IR
+    properties.
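+
+    #### Example:
+
+    A usage sketch mirroring the tests in this revision: match the payload
+    `func.func`, let the transform create the `gpu.launch` op, and distribute
+    the loops with a 4x4x4 block:
+
+    ```mlir
+    %func = transform.structured.match ops{["func.func"]} in %arg0
+    %gpu_launch = transform.gpu.map_foreach %func
+        { generate_gpu_launch, blockDim = [4, 4, 4] }
+    ```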
+  }];
+
+  let arguments = (ins PDL_Operation:$target,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
                    UnitAttr:$generate_gpu_launch);
   let results = (outs PDL_Operation:$result);
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -155,6 +155,7 @@
 DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImpl(
     RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
+    SmallVector<int64_t> mapping,
     function_ref<void(RewriterBase &, scf::ForeachThreadOp,
                       SmallVectorImpl<Value> &)>
         blockIdGenerator,
@@ -168,18 +169,6 @@
   // Step 0. Outline the compute workload region and set up the workload
   // operands.
-  SmallVector<int64_t> mapping;
-  if (!foreachThreadOp.getMapping().has_value())
-    return transformOp.emitSilenceableError() << "Mapping must be present";
-  for (auto map : *foreachThreadOp.getMapping()) {
-    if (auto blockMap = map.dyn_cast<GPUBlockMappingAttr>()) {
-      mapping.push_back((int64_t)blockMap.getBlock());
-    } else {
-      return transformOp.emitSilenceableError()
-             << "Mapping must be #gpu.block";
-    }
-  }
-
   FailureOr<SmallVector<OpFoldResult>> potentialGridDim =
       foreachThreadOp.getPermutedNumThreads(rewriter, mapping);
@@ -260,6 +249,49 @@
   }
 }
 
+/// This is a helper that is only used in
+/// rewriteTopLevelForeachThreadToGpuBlocks. It generates the GPU dialect ops
+/// computing block_id * block_dim + thread_id for each mapped dimension.
+static void generateGpuGlobalIds(RewriterBase &rewriter,
+                                 scf::ForeachThreadOp foreachOp,
+                                 SmallVectorImpl<Value> &blockOps) {
+  Location loc = foreachOp->getLoc();
+  OpBuilder::InsertionGuard guard(rewriter);
+  rewriter.setInsertionPoint(foreachOp);
+  IndexType indexType = rewriter.getIndexType();
+  SmallVector<Dimension> gpuDims{Dimension::x, Dimension::y, Dimension::z};
+  for (int64_t idx : llvm::seq<int64_t>(0, gpuDims.size())) {
+    auto bidx = rewriter.create<BlockIdOp>(loc, indexType, gpuDims[idx]);
+    auto tidx = rewriter.create<ThreadIdOp>(loc, indexType, gpuDims[idx]);
+    auto bdimx = rewriter.create<BlockDimOp>(loc, indexType, gpuDims[idx]);
+    auto bid = rewriter.create<arith::MulIOp>(loc, bdimx, bidx);
+    auto globalid = rewriter.create<arith::AddIOp>(loc, bid, tidx);
+    blockOps.push_back(globalid);
+  }
+}
+
+/// Fills `mapping` with the permutation encoded by a list of #gpu.block
+/// attributes, failing if any entry is of a different kind.
+static LogicalResult fill_blocks(mlir::ArrayAttr maps,
+                                 SmallVector<int64_t> &mapping) {
+  for (auto map : maps) {
+    if (auto blockMap = map.dyn_cast<GPUBlockMappingAttr>())
+      mapping.push_back((int64_t)blockMap.getBlock());
+    else
+      return failure();
+  }
+  return success();
+}
+
+/// Fills `mapping` with the permutation encoded by a list of #gpu.global
+/// attributes, failing if any entry is of a different kind.
+static LogicalResult fill_global(mlir::ArrayAttr maps,
+                                 SmallVector<int64_t> &mapping) {
+  for (auto map : maps) {
+    if (auto globalMap = map.dyn_cast<GPUGlobalMappingAttr>())
+      mapping.push_back((int64_t)globalMap.getGlobal());
+    else
+      return failure();
+  }
+  return success();
+}
+
 DiagnosedSilenceableFailure
 transform::MapForeachToBlocks::applyToOne(Operation *target,
                                           SmallVectorImpl<Operation *> &results,
@@ -305,9 +337,22 @@
     topLevelForeachThreadOp = cast<scf::ForeachThreadOp>(newForeachThreadOp);
   }
 
+  SmallVector<int64_t> mapping;
+  auto maps = topLevelForeachThreadOp.getMapping();
+  if (!maps.has_value() || maps->empty()) {
+    results.assign({target});
+    return emitSilenceableError() << "Mapping must be present and must be "
+                                     "#gpu.block";
+  }
+  if (failed(fill_blocks(maps.value(), mapping))) {
+    results.assign({target});
+    return emitSilenceableError() << "Mapping must be #gpu.block";
+  }
+
   SmallVector<int64_t> gridDim = extractFromI64ArrayAttr(getGridDim());
   diag = mlir::transform::gpu::mapForeachToBlocksImpl(
-      rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim,
+      rewriter, topLevelForeachThreadOp, mapping, generateGpuBlockIds, gridDim,
       transformOp);
   if (diag.succeeded()) {
     gridDim.resize(3, 1);
@@ -504,6 +549,108 @@
   return diag;
 }
 
+//===----------------------------------------------------------------------===//
+// MapForeach
+//===----------------------------------------------------------------------===//
+
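+/// Rewrites the top level scf.foreach_thread of the target gpu.launch (or of
+/// a newly created gpu.launch when `generate_gpu_launch` is set) to block ids
+/// or to global ids, maps the nested scf.foreach_thread ops to threads, and
+/// finally updates the launch with the computed grid and block sizes.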
"attribute"; + diag.attachNote(target->getLoc()) << "when applied to this payload op"; + return diag; + } + + scf::ForeachThreadOp topLevelForeachThreadOp; + DiagnosedSilenceableFailure diag = + mlir::transform::gpu::findTopLevelForeachThreadOp( + target, topLevelForeachThreadOp, transformOp); + if (!diag.succeeded()) { + results.assign({target}); + diag.attachNote(target->getLoc()) << "when applied to this payload op"; + return diag; + } + + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(topLevelForeachThreadOp); + + // Generate gpu launch here and move the foreach_thread inside + if (getGenerateGpuLaunch()) { + DiagnosedSilenceableFailure diag = + createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch); + if (!diag.succeeded()) { + results.assign({target}); + return diag; + } + rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); + Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp); + rewriter.eraseOp(topLevelForeachThreadOp); + topLevelForeachThreadOp = cast(newForeachThreadOp); + } + + SmallVector mapping; + SmallVector gridDim = extractFromI64ArrayAttr(getGridDim()); + SmallVector blockDim = extractFromI64ArrayAttr(getBlockDim()); + if (blockDim.empty()) { + results.assign({target}); + return emitSilenceableError() << getBlockDimAttrName() << " is missing"; + } + + auto maps = topLevelForeachThreadOp.getMapping(); + if (!maps.has_value() || maps->empty()) { + results.assign({target}); + return emitSilenceableError() << "Mapping must be present, which can be " + "#gpu.block or #gpu.global"; + } + bool globalIdMapping = false; + if (failed(fill_blocks(maps.value(), mapping))) { + if (failed(fill_global(maps.value(), mapping))) { + results.assign({target}); + return emitSilenceableError() + << "Mapping must be same type of #gpu.global or #gpu.block"; + } + globalIdMapping = true; + } + + diag = mlir::transform::gpu::mapForeachToBlocksImpl( + rewriter, topLevelForeachThreadOp, mapping, + globalIdMapping ? generateGpuGlobalIds : generateGpuBlockIds, gridDim, + transformOp); + if (diag.succeeded()) { + // process inner loops + diag = gpu::mapNestedForeachToThreadsImpl(rewriter, gpuLaunch, blockDim, + true, transformOp); + if (!diag.succeeded()) { + results.assign({gpuLaunch}); + return diag; + } + + gridDim.resize(3, 1); + auto calcBlockSize = [=](int64_t gridX, int64_t blockX) { + return globalIdMapping ? 
+    diag = alterGpuLaunch(rewriter, gpuLaunch,
+                          cast<TransformOpInterface>(getOperation()),
+                          calcGridDim(gridDim[0], blockDim[0]),
+                          calcGridDim(gridDim[1], blockDim[1]),
+                          calcGridDim(gridDim[2], blockDim[2]), blockDim[0],
+                          blockDim[1], blockDim[2]);
+  }
+
+  results.assign({gpuLaunch});
+  return diag;
+}
+
 //===----------------------------------------------------------------------===//
 // Transform op registration
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -162,3 +162,102 @@
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
   transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
 }
+
+
+// -----
+
+!type3d = memref<32x64x32xf32>
+
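+// Checks that a #gpu.global mapping rewrites the loop indices to
+// block_id * block_dim + thread_id and that the grid size is the trip count
+// divided by the block size (32x64x32 iterations with a 4x4x4 block yield an
+// 8x16x8 grid).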
+// CHECK-LABEL: func.func @saxpyglobal(
+// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<32x64x32xf32>
+// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<32x64x32xf32>
+func.func @saxpyglobal(%x: !type3d, %y: !type3d, %alpha : f32) -> !type3d {
+  %c32 = arith.constant 32 : index
+  %c64 = arith.constant 64 : index
+  %c4 = arith.constant 4 : index
+// CHECK:   %[[C4:.*]] = arith.constant 4 : index
+// CHECK:   %[[C8:.*]] = arith.constant 8 : index
+// CHECK:   %[[C16:.*]] = arith.constant 16 : index
+// CHECK:   gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C8]], %{{.*}} = %[[C16]], %{{.*}} = %[[C8]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C4]], %{{.*}} = %[[C4]], %{{.*}} = %[[C4]])
+// CHECK:   %[[BLKX:.*]] = gpu.block_id x
+// CHECK:   %[[TIDX:.*]] = gpu.thread_id x
+// CHECK:   %[[BDIMX:.*]] = gpu.block_dim x
+// CHECK:   %[[MULX:.*]] = arith.muli %[[BDIMX]], %[[BLKX]] : index
+// CHECK:   %[[IDX:.*]] = arith.addi %[[MULX]], %[[TIDX]] : index
+// CHECK:   %[[BLKY:.*]] = gpu.block_id y
+// CHECK:   %[[TIDY:.*]] = gpu.thread_id y
+// CHECK:   %[[BDIMY:.*]] = gpu.block_dim y
+// CHECK:   %[[MULY:.*]] = arith.muli %[[BDIMY]], %[[BLKY]] : index
+// CHECK:   %[[IDY:.*]] = arith.addi %[[MULY]], %[[TIDY]] : index
+// CHECK:   %[[BLKZ:.*]] = gpu.block_id z
+// CHECK:   %[[TIDZ:.*]] = gpu.thread_id z
+// CHECK:   %[[BDIMZ:.*]] = gpu.block_dim z
+// CHECK:   %[[MULZ:.*]] = arith.muli %[[BDIMZ]], %[[BLKZ]] : index
+// CHECK:   %[[IDZ:.*]] = arith.addi %[[MULZ]], %[[TIDZ]] : index
+// CHECK:   memref.load %[[ARGX]][%[[IDZ]], %[[IDY]], %[[IDX]]]
+// CHECK:   memref.load %[[ARGY]][%[[IDZ]], %[[IDY]], %[[IDX]]]
+  scf.foreach_thread (%i, %j, %k) in (%c32, %c64, %c32) {
+    %4 = memref.load %x[%i, %j, %k] : !type3d
+    %5 = memref.load %y[%i, %j, %k] : !type3d
+    %6 = math.fma %alpha, %4, %5 : f32
+    memref.store %6, %y[%i, %j, %k] : !type3d
+  } { mapping = [#gpu.global<z>, #gpu.global<y>, #gpu.global<x>] }
+  return %y : !type3d
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !pdl.operation):
+  %funcop = transform.structured.match ops{["func.func"]} in %arg0
+  %gpuLaunch = transform.gpu.map_foreach %funcop { generate_gpu_launch, blockDim = [4, 4, 4] }
+}
+
+// -----
+
+!type4d = memref<32x64x4x32xf32>
+
+// CHECK-LABEL: func.func @saxpy4d(
+// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<32x64x4x32xf32>
+// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<32x64x4x32xf32>
+// CHECK-SAME:    %[[ARGA:[0-9a-z]+]]: memref<32x64x4x32xf32>
+// CHECK-SAME:    %[[ARGB:[0-9a-z]+]]: memref<32x64x4x32xf32>
+func.func @saxpy4d(%x: !type4d, %y: !type4d, %a: !type4d, %b: !type4d, %alpha : f32) -> !type4d {
+  %c32 = arith.constant 32 : index
+  %c64 = arith.constant 64 : index
+  %c4 = arith.constant 4 : index
+// CHECK:   %[[C32:.*]] = arith.constant 32 : index
+// CHECK:   %[[C64:.*]] = arith.constant 64 : index
+// CHECK:   %[[C4:.*]] = arith.constant 4 : index
+// CHECK:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK:   gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C64]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1]])
+// CHECK:   %[[BLKX:.*]] = gpu.block_id x
+// CHECK:   %[[BLKY:.*]] = gpu.block_id y
+// CHECK:   %[[TIDX:.*]] = gpu.thread_id x
+// CHECK:   %[[TIDY:.*]] = gpu.thread_id y
+// CHECK:   memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
+// CHECK:   memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
+// CHECK:   gpu.barrier
+// CHECK:   memref.load %[[ARGA]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
+// CHECK:   memref.load %[[ARGB]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
+  scf.foreach_thread (%i, %j) in (%c32, %c64) {
+    scf.foreach_thread (%k, %l) in (%c4, %c32) {
+      %4 = memref.load %x[%i, %j, %k, %l] : !type4d
+      %5 = memref.load %y[%i, %j, %k, %l] : !type4d
+      %6 = math.fma %alpha, %4, %5 : f32
+      memref.store %6, %y[%i, %j, %k, %l] : !type4d
+    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
+    scf.foreach_thread (%k, %l) in (%c4, %c32) {
+      %4 = memref.load %a[%i, %j, %k, %l] : !type4d
+      %5 = memref.load %b[%i, %j, %k, %l] : !type4d
+      %6 = math.fma %alpha, %4, %5 : f32
+      memref.store %6, %b[%i, %j, %k, %l] : !type4d
+    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
+  } { mapping = [#gpu.block<x>, #gpu.block<y>] }
+
+  return %y : !type4d
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !pdl.operation):
+  %funcop = transform.structured.match ops{["func.func"]} in %arg0
+  %gpuLaunch = transform.gpu.map_foreach %funcop { generate_gpu_launch, blockDim = [32, 4, 1] }
+}
\ No newline at end of file