diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -114,6 +114,20 @@
   let assemblyFormat = "attr-dict";
 }
 
+def GPU_LinearIdOp : GPU_Op<"linear_id", [
+    Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+  let description = [{
+    Returns the linearized thread id within the workgroup (block), i.e.
+    `tid.x + tid.y * dim.x + tid.z * dim.x * dim.y`.
+
+    Example:
+    ```mlir
+    %linearId = gpu.linear_id
+    ```
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
 def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
     Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
     Arguments<(ins)>, Results<(outs Index:$result)> {
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
@@ -64,6 +64,22 @@
   }];
 }
 
+def LinearIdEnum : I64EnumAttr<"LinearId", "linear ids for loop mapping", [
+    DimX, DimY, DimZ]> {
+  let cppNamespace = "::mlir::gpu";
+}
+
+def GPULinearIdMapping : GPU_Attr<"GPULinearIdMapping", "linear", [
+    DeclareAttrInterfaceMethods<DeviceMappingAttrInterface>]> {
+  let parameters = (ins
+    EnumParameter<LinearIdEnum>:$linear_id
+  );
+  let assemblyFormat = "`<` params `>`";
+  let description = [{
+    An attribute that allows defining thread parallelism for GPU devices in
+    terms of the linearized thread id within the workgroup, delinearized on
+    the basis supplied by the mapping transform (see `linear_dims`).
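+
+    For example, the following sketch mirrors the test added in this revision
+    (the guards and delinearized ids are produced by
+    `transform.gpu.map_nested_forall_to_threads`, not by the attribute
+    itself):
+
+    ```mlir
+    scf.forall (%i, %j) in (%c10, %c2) {
+      ...
+    } {mapping = [#gpu.linear<x>, #gpu.linear<y>]}
+    ```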
+  }];
+}
+
 def BlocksEnum : I64EnumAttr<"Blocks", "threads for loop mapping", [
     DimX, DimY, DimZ]> {
   let cppNamespace = "::mlir::gpu";
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -114,6 +114,7 @@
   let arguments = (ins PDL_Operation:$target,
     DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$block_dims,
     DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
+    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$linear_dims,
     DefaultValuedAttr<BoolAttr, "true">:$sync_after_distribute);
   let results = (outs PDL_Operation:$result);
 
@@ -121,6 +122,7 @@
     $target
     `block_dims` `=` $block_dims
     (`warp_dims` `=` $warp_dims^)?
+    (`linear_dims` `=` $linear_dims^)?
     (`sync_after_distribute` `=` $sync_after_distribute^)?
     attr-dict
   }];
@@ -132,7 +134,6 @@
   }];
 }
 
-
 def MapForallToBlocks :
     Op<Transform_Dialect, "gpu.map_forall_to_blocks", [
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ ... @@
 int64_t GPUWarpMappingAttr::getMappingId() const {
   return static_cast<int64_t>(getWarp());
 }
 
+int64_t GPULinearIdMappingAttr::getMappingId() const {
+  return static_cast<int64_t>(getLinearId());
+}
+
 int64_t GPUThreadMappingAttr::getMappingId() const {
   return static_cast<int64_t>(getThread());
 }
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -118,6 +118,11 @@
   setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1ULL));
 }
 
+void LinearIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
+                                   SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));
+}
+
 void SubgroupIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                      SetIntRangeFn setResultRange) {
   setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -147,6 +147,45 @@
   }
 };
 
+struct GpuLinearIdBuilder : public GpuIdBuilder {
+  GpuLinearIdBuilder(MLIRContext *ctx) : GpuIdBuilder() {
+    mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX),
+                         GPULinearIdMappingAttr::get(ctx, LinearId::DimY),
+                         GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)};
+    idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
+                   ArrayRef<int64_t> mappingDims) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(forallOp);
+      Location loc = forallOp.getLoc();
+      Value linearIdOp = rewriter.create<LinearIdOp>(loc);
+      SmallVector<int64_t> reverseBlockDims(llvm::reverse(mappingDims));
+      LLVM_DEBUG(llvm::interleaveComma(reverseBlockDims,
+                                       DBGS() << "--delinearization basis: ");
+                 llvm::dbgs() << "\n");
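+      // Worked example (numbers taken from the test in this revision, for
+      // illustration only): with mappingDims = [11, 3, 4], the reversed
+      // basis is [4, 3, 11] and computeStrides returns [33, 11, 1]; a linear
+      // id of 47 thus delinearizes to
+      //   (47 floordiv 33, (47 mod 33) floordiv 11, 47 mod 11) = (1, 1, 3).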
+      SmallVector<int64_t> strides = computeStrides(reverseBlockDims);
+      LLVM_DEBUG(llvm::interleaveComma(strides,
+                                       DBGS() << "--delinearization strides: ");
+                 llvm::dbgs() << "\n");
+
+      AffineExpr d0;
+      bindDims(rewriter.getContext(), d0);
+      SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+      LLVM_DEBUG(llvm::interleaveComma(delinearizingExprs,
+                                       DBGS() << "--delinearization exprs: ");
+                 llvm::dbgs() << "\n");
+
+      SmallVector<Value> ids;
+      for (AffineExpr e : delinearizingExprs)
+        ids.push_back(makeComposedAffineApply(rewriter, loc, e, linearIdOp));
+      LLVM_DEBUG(llvm::interleaveComma(ids, DBGS() << "--ids: ");
+                 llvm::dbgs() << "\n");
+
+      return ids;
+    };
+  }
+};
+
 } // namespace
 
 static DiagnosedSilenceableFailure
@@ -732,6 +771,34 @@
     rewriter.replaceAllUsesWith(subgroupIdOp, warpId.get());
   });
 
+  SmallVector<int64_t> linearDims{getLinearDims()};
+  if (!linearDims.empty()) {
+    if (linearDims.size() != 3)
+      return transformOp.emitDefiniteFailure("requires size-3 linear mapping");
+  }
+
+  LLVM_DEBUG(
+      llvm::interleaveComma(
+          linearDims, DBGS() << "mapNestedForallToThreadsImpl linearDims: ");
+      llvm::dbgs() << "\n");
+  GpuLinearIdBuilder gpuLinearIdBuilder(ctx);
+  diag = mlir::transform::gpu::mapNestedForallToThreadsImpl(
+      rewriter, transformOp, target, linearDims, getSyncAfterDistribute(),
+      gpuLinearIdBuilder);
+  if (!diag.succeeded())
+    return diag;
+
+  // Perform a late replacement of the LinearIdOp, taking into account the
+  // blockDims.
+  target->walk([&](LinearIdOp linearIdOp) {
+    OpBuilder::InsertionGuard g(rewriter);
+    rewriter.setInsertionPoint(linearIdOp);
+    auto linearThreadId =
+        getStaticLinearThreadId(rewriter, linearIdOp.getLoc(), blockDimsOfr);
+    LDBG("----linearThreadId: " << linearThreadId);
+    rewriter.replaceAllUsesWith(linearIdOp, linearThreadId.get());
+  });
+
   results.push_back(gpuLaunch.getOperation());
   return diag;
 }
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -239,16 +239,22 @@
 // CHECK-DAG: #[[MAPWY:.*]] = affine_map<(d0, d1, d2) -> (((d0 + d1 * 12 + d2 * 132) floordiv 32) floordiv 4)>
 // CHECK-DAG: #[[MAPWX:.*]] = affine_map<(d0, d1, d2) -> ((((d0 + d1 * 12 + d2 * 132) floordiv 32) mod 4) floordiv 2)>
 
+// CHECK-DAG: #[[MAPLX:.*]] = affine_map<(d0, d1, d2) -> (d2 * 4 + (d0 + d1 * 12) floordiv 33)>
+// CHECK-DAG: #[[MAPLY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 33) floordiv 11)>
+// CHECK-DAG: #[[MAPLZ:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) mod 11)>
+
 // CHECK-LABEL: func.func @map_multi_level(
 func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
   %one = arith.constant 1 : index
-  %c12 = arith.constant 12 : index
+  %c10 = arith.constant 10 : index
   %c9 = arith.constant 9 : index
   %c7 = arith.constant 7 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-  // CHECK: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index
 
   // check that both the thread level and the warp level got distributed.
   // CHECK-NOT: #gpu.thread
@@ -277,6 +283,21 @@
       %8 = arith.addf %alpha, %7 : f32
       memref.store %8, %t[%i] : !type1d
     }  {mapping = [#gpu.warp<x>] }
+
+    // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[MAPLX]](%[[TIDX]], %[[TIDY]], %[[TIDZ]])
+    // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[MAPLY]](%[[TIDX]], %[[TIDY]])
+    // CHECK-DAG: %[[LIDZ:.*]] = affine.apply #[[MAPLZ]](%[[TIDX]], %[[TIDY]])
+    // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[LIDX]], %[[C10]] : index
+    // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[LIDY]], %[[C2]] : index
+    // CHECK: %[[CONDXY:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
+    // CHECK-DAG: %[[CMPZ:.*]] = arith.cmpi ult, %[[LIDZ]], %[[C1]] : index
+    // CHECK: %[[CONDXYZ:.*]] = arith.andi %[[CONDXY]], %[[CMPZ]] : i1
+    // CHECK: scf.if %[[CONDXYZ]]
+    scf.forall (%i, %j) in (%c10, %c2) {
+      %7 = memref.load %t[%i] : !type1d
+      %8 = arith.addf %alpha, %7 : f32
+      memref.store %8, %t[%j] : !type1d
+    } {mapping = [#gpu.linear<x>, #gpu.linear<y>] }
     gpu.terminator
   }
   return %y : !type
@@ -286,5 +307,5 @@
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   transform.gpu.map_nested_forall_to_threads %funcop
-    block_dims = [12, 11, 1] warp_dims = [2, 2, 1]
+    block_dims = [12, 11, 1] warp_dims = [2, 2, 1] linear_dims = [11, 3, 4]
 }