diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -108,6 +108,35 @@ def ROCDL_MbcntLoOp : ROCDL_MbcntOp<"lo">; def ROCDL_MbcntHiOp : ROCDL_MbcntOp<"hi">; +def ROCDL_DsSwizzleOp : +ROCDL_Op<"ds_swizzle">, +Results<(outs I32:$res)>, +Arguments<(ins I32:$src, + I32:$offset)> +{ + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_ds_swizzle, {$src, $offset}); + }]; + let assemblyFormat = [{ + $src `,` $offset attr-dict `:` `(` type($src) `,` type($offset) `)` `->` type($res) + }]; +} + +def ROCDL_DsBpermuteOp : +ROCDL_Op<"ds_bpermute">, +Results<(outs I32:$res)>, +Arguments<(ins I32:$index, + I32:$src)> +{ + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_ds_bpermute, {$index, $src}); + }]; + let assemblyFormat = [{ + $index `,` $src attr-dict `:` `(` type($index) `,` type($src) `)` `->` type($res) + }]; +} + + //===----------------------------------------------------------------------===// // Thread index and Block index diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -64,6 +64,18 @@ return canBeBare; } +Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, + const unsigned indexBitwidth) { + auto int32Type = IntegerType::get(rewriter.getContext(), 32); + Value zero = rewriter.createOrFold(loc, 0, 32); + Value minus1 = rewriter.createOrFold(loc, -1, 32); + Value mbcntLo = rewriter.create(loc, int32Type, + ValueRange{minus1, zero}); + Value laneId = rewriter.create(loc, int32Type, + ValueRange{minus1, mbcntLo}); + return laneId; +} + namespace { struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { using 
ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -98,6 +110,86 @@ } }; +struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + /// Lowers a shuffle to the corresponding ROCDL op. + + /// + /// Convert the `width` argument into an activeMask (a bitmask which specifies + /// which threads participate in the shuffle) and a maskAndClamp (specifying + /// the highest lane which participates in the shuffle). + /// + /// + /// DS Bpermute: + /// let shflMode = [xor, up, down, idx] + /// let warpSize = 32, step = [1, 2, 4, 8, 16, ... , WarpSize]. + /// 1. curLaneId = using mbcnt.lo + mbcnt.hi + /// 2. warpSizeOrZeroIfOutside = (curLaneId + warpSize) & -warpSize + /// 3. dstLane = shflMode(curLaneId, step) + /// 4. isActiveSrcLane = dstLane < warpSizeOrZeroIfOutside + /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId + /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2. + /// 7. bpermute(dwordAlignedDstLane, shfl_value). + /// + LogicalResult + matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op->getLoc(); + // TODO: Add support for non 32-bit shuffle values. + if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32) + return failure(); + const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); + Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth); + + // TODO: Change warp from const to width. + auto int32Type = IntegerType::get(rewriter.getContext(), 32); + const int kWarpSize = 32; + Value warpSize = adaptor.getWidth(); + Value zero = rewriter.create(loc, int32Type, 0); + Value negWarpSize = + rewriter.create(loc, int32Type, zero, warpSize); + Value add = + rewriter.create(loc, int32Type, srcLaneId, warpSize); + Value warpSizeOrZeroIfOutside = + rewriter.create(loc, int32Type, add, negWarpSize); + Value dstLane; + // TODO: Add support for gpu::ShuffleMode::UP and gpu::ShuffleMode::DOWN. 
+ // TODO: Use ds_swizzle for XOR when step/offsets are constants for better + // perf. + switch (op.getMode()) { + case gpu::ShuffleMode::XOR: + dstLane = rewriter.create(loc, int32Type, srcLaneId, + adaptor.getOffset()); + break; + case gpu::ShuffleMode::IDX: + dstLane = adaptor.getOffset(); + break; + default: + return failure(); + } + Value isActiveSrcLane = rewriter.create( + loc, LLVM::ICmpPredicate::slt, dstLane, warpSizeOrZeroIfOutside); + Value selectDstLane = rewriter.create(loc, isActiveSrcLane, + dstLane, srcLaneId); + Value two = rewriter.create(loc, int32Type, 2); + Value dwordAlignedDstLane = + rewriter.create(loc, int32Type, selectDstLane, two); + Value initShflValue = adaptor.getValue(); + if (adaptor.getValue().getType().isF32()) { + initShflValue = + rewriter.create(loc, int32Type, initShflValue); + } + Value shflValue = rewriter.create( + loc, int32Type, dwordAlignedDstLane, initShflValue); + if (adaptor.getValue().getType().isF32()) { + shflValue = rewriter.create( + loc, adaptor.getValue().getType(), shflValue); + } + rewriter.replaceOp(op, {shflValue, isActiveSrcLane}); + return success(); + } +}; + /// Import the GPU Ops to ROCDL Patterns. #include "GPUToROCDL.cpp.inc" @@ -278,7 +370,7 @@ patterns.add(converter, /*addressSpace=*/4); } - patterns.add(converter); + patterns.add(converter); populateOpPatterns(converter, patterns, "__ocml_fabs_f32", "__ocml_fabs_f64"); diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -524,3 +524,92 @@ } } +// ----- + +gpu.module @test_module { + // CHECK-LABEL: func @gpu_all_reduce_op() + gpu.func @gpu_all_reduce_op() { + %arg0 = arith.constant 1.0 : f32 + // TODO: Check full IR expansion once lowering has settled. 
+ // CHECK: llvm.add + // CHECK: llvm.and + // CHECK: llvm.xor + // CHECK: llvm.icmp "slt" + // CHECK: llvm.select + // CHECK: llvm.shl + // CHECK: rocdl.ds_bpermute {{.*}} + // CHECK: rocdl.barrier + // CHECK: llvm.bitcast + // CHECK: llvm.fadd + %result = gpu.all_reduce add %arg0 uniform {} : (f32) -> (f32) + + gpu.return + } +} + + +// ----- + +gpu.module @test_module { + // CHECK-LABEL: func @gpu_all_reduce_region() + gpu.func @gpu_all_reduce_region() { + %arg0 = arith.constant 1 : i32 + // TODO: Check full IR expansion once lowering has settled. + // CHECK: llvm.add + // CHECK: llvm.and + // CHECK: llvm.xor + // CHECK: llvm.icmp "slt" + // CHECK: llvm.select + // CHECK: llvm.shl + // CHECK: rocdl.ds_bpermute {{.*}} + // CHECK: rocdl.barrier + %result = gpu.all_reduce %arg0 uniform { + ^bb(%lhs : i32, %rhs : i32): + %xor = arith.xori %lhs, %rhs : i32 + "gpu.yield"(%xor) : (i32) -> () + } : (i32) -> (i32) + gpu.return + } +} + +// ----- + +gpu.module @test_module { + // CHECK-LABEL: func @gpu_shuffle() + func.func @gpu_shuffle() -> (f32, f32) { + // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32 + %arg0 = arith.constant 1.0 : f32 + // CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32 + %arg1 = arith.constant 4 : i32 + // CHECK: %[[#WIDTH:]] = llvm.mlir.constant(23 : i32) : i32 + %arg2 = arith.constant 23 : i32 + // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi + // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32 + // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32 + // CHECK: %[[#WARP_OR_ZERO:]] = llvm.and %[[#ADD]], %[[#NEG_WIDTH]] : i32 + // CHECK: %[[#XOR:]] = llvm.xor %[[#LANE_ID]], %{{.*}} : i32 + // CHECK: %[[#CMP:]] = llvm.icmp "slt" %[[#XOR]], %[[#WARP_OR_ZERO]] : i32 + // CHECK: %[[#DST_LANE:]] = llvm.select %[[#CMP]], %[[#XOR]], %{{.*}} : i1, i32 + // CHECK: %[[#TWO:]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: %[[#ALIGNED_DST_LANE:]] = 
llvm.shl %[[#DST_LANE]], %[[#TWO]] : i32 + // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32 + // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32 + // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32 + %shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : f32 + // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi + // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32 + // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32 + // CHECK: %[[#WARP_OR_ZERO:]] = llvm.and %[[#ADD]], %[[#NEG_WIDTH]] : i32 + // CHECK: %[[#CMP:]] = llvm.icmp "slt" %[[#OFFSET]], %[[#WARP_OR_ZERO]] : i32 + // CHECK: %[[#DST_LANE:]] = llvm.select %[[#CMP]], %[[#OFFSET]], %{{.*}} : i1, i32 + // CHECK: %[[#TWO:]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: %[[#ALIGNED_DST_LANE:]] = llvm.shl %[[#DST_LANE]], %[[#TWO]] : i32 + // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32 + // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32 + // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32 + %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 + func.return %shfl, %shfli : f32, f32 + } +} \ No newline at end of file diff --git a/mlir/test/Conversion/GPUToROCDL/invalid.mlir b/mlir/test/Conversion/GPUToROCDL/invalid.mlir deleted file mode 100644 --- a/mlir/test/Conversion/GPUToROCDL/invalid.mlir +++ /dev/null @@ -1,17 +0,0 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file --verify-diagnostics - - -// Demonstrate the need to register the cf and memref dialect as dependent. 
-// CHECK-LABEL: @dependentDialect -gpu.module @module { - gpu.func @dependentDialect() { - %arg0 = arith.constant 1 : i32 - // expected-error@+1 {{failed to legalize operation 'gpu.shuffle' that was explicitly marked illega}} - %result = gpu.all_reduce %arg0 uniform { - ^bb(%lhs : i32, %rhs : i32): - %xor = arith.xori %lhs, %rhs : i32 - "gpu.yield"(%xor) : (i32) -> () - } : (i32) -> (i32) - gpu.return - } -} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -66,6 +66,22 @@ llvm.return %3 : i32 } +llvm.func @rocdl.swizzle(%src : i32) -> i32 { + // CHECK-LABEL: rocdl.swizzle + // CHECK: call i32 @llvm.amdgcn.ds.swizzle + %offset = llvm.mlir.constant(100 : i32) : i32 + %0 = rocdl.ds_swizzle %src, %offset : (i32, i32) -> i32 + llvm.return %0 : i32 +} + +llvm.func @rocdl.bpermute(%src : i32) -> i32 { + // CHECK-LABEL: rocdl.bpermute + // CHECK: call i32 @llvm.amdgcn.ds.bpermute + %index = llvm.mlir.constant(10 : i32) : i32 + %0 = rocdl.ds_bpermute %index, %src : (i32, i32) -> i32 + llvm.return %0 : i32 +} + llvm.func @rocdl.barrier() { // CHECK: fence syncscope("workgroup") release // CHECK-NEXT: call void @llvm.amdgcn.s.barrier()