diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
--- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
@@ -136,6 +136,48 @@
   let hasVerifier = 1;
 }
 
+// Raw buffer atomic compare-and-swap
+def AMDGPU_RawBufferAtomicCmpswapOp :
+    AMDGPU_Op<"raw_buffer_atomic_cmpswap", [
+      AttrSizedOperandSegments,
+      AllTypesMatch<["src", "cmp", "value"]>,
+      AllElementTypesMatch<["value", "memref"]>]>,
+    Arguments<(ins AnyTypeOf<[I32, I64, F32, F64]>:$src,
+                   AnyType:$cmp,
+                   Arg<AnyMemRef, "Buffer to operate on", [MemRead, MemWrite]>:$memref,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)>,
+    Results<(outs AnyType:$value)> {
+
+  let summary = "Raw Buffer Atomic compare-and-swap";
+  let description = [{
+    The `amdgpu.raw_buffer_atomic_cmpswap` op is a wrapper around the
+    buffer-based atomic compare-and-swap instruction available on AMD GPUs.
+
+    The index into the buffer is computed as for `memref.store` with the addition
+    of `indexOffset` (which is used to aid in emitting vectorized code) and,
+    if present, `sgprOffset` (which is added after bounds checks and includes
+    any non-zero offset on the memref type).
+
+    All indexing components are given in terms of the memref's element size, not
+    the byte lengths required by the intrinsic.
+
+    Out of bounds atomic operations are ignored in hardware.
+
+    See `amdgpu.raw_buffer_load` for a description of how the underlying
+    instruction is constructed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $src `,` $cmp `->` $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)? `:`
+      type($value) `->` type($memref) `,` type($indices)
+  }];
+  let hasCanonicalizer = 1;
+  let hasVerifier = 1;
+}
+
 // Raw buffer atomic floating point add
 def AMDGPU_RawBufferAtomicFaddOp :
     AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -252,6 +252,25 @@
   let hasCustomAssemblyFormat = 1;
 }
 
+def ROCDL_RawBufferAtomicCmpSwap :
+    ROCDL_Op<"raw.buffer.atomic.cmpswap", [AllTypesMatch<["res", "src", "cmp"]>]>,
+    Results<(outs LLVM_Type:$res)>,
+    Arguments<(ins LLVM_Type:$src,
+                   LLVM_Type:$cmp,
+                   LLVM_Type:$rsrc,
+                   I32:$offset,
+                   I32:$soffset,
+                   I32:$aux)> {
+  string llvmBuilder = [{
+    $res = createIntrinsicCall(builder,
+        llvm::Intrinsic::amdgcn_raw_buffer_atomic_cmpswap, {$src, $cmp, $rsrc,
+        $offset, $soffset, $aux}, {$_resultType});
+  }];
+  let assemblyFormat = [{
+    attr-dict `(` operands `)` `:` type($res) `,` type($rsrc)
+  }];
+}
+
 //===---------------------------------------------------------------------===//
 // MI-100 and MI-200 buffer atomic floating point add intrinsic
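As a usage sketch only (none of the following is part of the patch; the function and value names are illustrative), the new wrapper op composes with the upstream arith, memref, and scf dialects to build the usual compare-and-swap retry loop. Because the op returns the value that was observed in memory, a caller can tell whether its swap won; the comparison below is done on the bit patterns, matching what the underlying integer CAS actually compares:

// Illustrative sketch: atomically add %inc to buf[%i] with a CAS retry loop.
func.func @cas_add_f32(%inc : f32, %buf : memref<64xf32>, %i : index) {
  %i32 = arith.index_cast %i : index to i32
  %init = memref.load %buf[%i] : memref<64xf32>
  %true = arith.constant true
  %final = scf.while (%expected = %init) : (f32) -> f32 {
    %new = arith.addf %expected, %inc : f32
    // %seen is whatever the hardware found in memory at swap time.
    %seen = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %new, %expected -> %buf[%i32] : f32 -> memref<64xf32>, i32
    // Compare bit patterns rather than using cmpf, so NaN payloads cannot
    // cause an infinite retry loop.
    %seenBits = arith.bitcast %seen : f32 to i32
    %wantBits = arith.bitcast %expected : f32 to i32
    %ok = arith.cmpi eq, %seenBits, %wantBits : i32
    %retry = arith.xori %ok, %true : i1
    scf.condition(%retry) %seen : f32
  } do {
  ^bb0(%next : f32):
    scf.yield %next : f32
  }
  func.return
}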
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -62,6 +62,14 @@
     else
       wantedDataType = gpuOp.getODSResults(0)[0].getType();
 
+    Value atomicCmpData = Value();
+    // Operand index 1 of a load is the indices; trying to read them can crash.
+    if (storeData) {
+      Value maybeCmpData = adaptor.getODSOperands(1)[0];
+      if (maybeCmpData != memref)
+        atomicCmpData = maybeCmpData;
+    }
+
     Type llvmWantedDataType = this->typeConverter->convertType(wantedDataType);
 
     Type i32 = rewriter.getI32Type();
@@ -73,8 +81,16 @@
     // If we want to load a vector with total size <= 32
     // bits, use a scalar load and bitcast it. Similarly, if bitsize(T) < 32
    // and the total load size is >= 32, use a vector load of N / (bitsize(T) /
-    // 32) x i32 and bitcast.
+    // 32) x i32 and bitcast. Also, the CAS intrinsic requires integer operands,
+    // so bitcast any floats to integers.
     Type llvmBufferValType = llvmWantedDataType;
+    if (atomicCmpData) {
+      if (wantedDataType.isa<VectorType>())
+        return gpuOp.emitOpError("vector compare-and-swap does not exist");
+      if (auto floatType = wantedDataType.dyn_cast<FloatType>())
+        llvmBufferValType = this->getTypeConverter()->convertType(
+            rewriter.getIntegerType(floatType.getWidth()));
+    }
     if (auto dataVector = wantedDataType.dyn_cast<VectorType>()) {
       uint32_t elemBits = dataVector.getElementTypeBitWidth();
       uint32_t totalBits = elemBits * dataVector.getNumElements();
@@ -109,6 +125,16 @@
       }
     }
 
+    if (atomicCmpData) {
+      if (llvmBufferValType != llvmWantedDataType) {
+        Value castForCmp = rewriter.create<LLVM::BitcastOp>(
+            loc, llvmBufferValType, atomicCmpData);
+        args.push_back(castForCmp);
+      } else {
+        args.push_back(atomicCmpData);
+      }
+    }
+
     // Construct buffer descriptor from memref, attributes
     int64_t offset = 0;
     SmallVector<int64_t, 5> strides;
@@ -529,6 +555,8 @@
       RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawBufferLoadOp>,
       RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawBufferStoreOp>,
       RawBufferOpLowering<RawBufferAtomicFaddOp, ROCDL::RawBufferAtomicFAddOp>,
+      RawBufferOpLowering<RawBufferAtomicCmpswapOp,
+                          ROCDL::RawBufferAtomicCmpSwap>,
       MFMAOpLowering>(converter, chipset);
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -90,6 +90,10 @@
   return verifyRawBufferOp(*this);
 }
 
+LogicalResult RawBufferAtomicCmpswapOp::verify() {
+  return verifyRawBufferOp(*this);
+}
+
 static std::optional<uint32_t> getConstantUint32(Value v) {
   APInt cst;
   if (!v.getType().isInteger(32))
@@ -136,12 +140,11 @@
 }
 
 namespace {
-struct RemoveStaticallyOobBufferLoads final
-    : public OpRewritePattern<RawBufferLoadOp> {
-  using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;
+template <typename OpType>
+struct RemoveStaticallyOobBufferLoads final : public OpRewritePattern<OpType> {
+  using OpRewritePattern<OpType>::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(RawBufferLoadOp op,
-                                PatternRewriter &rw) const override {
+  LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override {
     if (!staticallyOutOfBounds(op))
       return failure();
     Type loadType = op.getResult().getType();
@@ -167,7 +170,7 @@
 
 void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                   MLIRContext *context) {
-  results.add<RemoveStaticallyOobBufferLoads>(context);
+  results.add<RemoveStaticallyOobBufferLoads<RawBufferLoadOp>>(context);
 }
 
 void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
@@ -195,6 +198,12 @@
   results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFaddOp>>(context);
 }
 
+void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
+    RewritePatternSet &results, MLIRContext *context) {
+  results.add<RemoveStaticallyOobBufferLoads<RawBufferAtomicCmpswapOp>>(
+      context);
+}
+
 //===----------------------------------------------------------------------===//
 // MFMAOp
 //===----------------------------------------------------------------------===//
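For intuition about the canonicalizer entry above: RemoveStaticallyOobBufferLoads, now templated so it also covers the new op, fires when the buffer access is provably out of bounds at compile time. A sketch of the intended rewrite, assuming the pattern's pre-existing behavior of replacing the result of a statically out-of-bounds access with a zero constant (the constants and names below are illustrative, not taken from the patch):

// Byte offset 64 * 4 = 256 equals numRecords for memref<64xf32>, so with
// boundsCheck = true the hardware ignores the operation entirely.
%c64 = arith.constant 64 : i32
%old = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%c64] : f32 -> memref<64xf32>, i32
// ... which the pattern rewrites to ...
%old = arith.constant 0.000000e+00 : f32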
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -197,6 +197,35 @@
   func.return
 }
 
+// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_f32
+// CHECK-SAME: (%[[src:.*]]: f32, %[[cmp:.*]]: f32, {{.*}})
+func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : memref<64xf32>, %idx: i32) -> f32 {
+  // CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
+  // CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
+  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
+  // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
+  // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
+  // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
+  // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32>
+  // CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
+  // CHECK: return %[[dstCast]]
+  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : f32 -> memref<64xf32>, i32
+  func.return %dst : f32
+}
+
+// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_i64
+// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
+func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
+  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
+  // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
+  // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
+  // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
+  // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i64, vector<4xi32>
+  // CHECK: return %[[dst]]
+  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : i64 -> memref<64xi64>, i32
+  func.return %dst : i64
+}
+
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
   // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier"
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -74,6 +74,13 @@
   func.return
 }
 
+// CHECK-LABEL: func @raw_buffer_atomic_cmpswap_f32
+func.func @raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) {
+  // CHECK: amdgpu.raw_buffer_atomic_cmpswap {indexOffset = 1 : i32} %{{.*}}, %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32
+  amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true, indexOffset = 1 : i32} %src, %cmp -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32
+  func.return
+}
+
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
   // CHECK: amdgpu.lds_barrier
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -262,9 +262,11 @@
   // CHECK-LABEL: rocdl.raw.buffer.i32
   // CHECK: rocdl.raw.buffer.atomic.smax %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32
   // CHECK: rocdl.raw.buffer.atomic.umin %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32
+  // CHECK: %{{.*}} = rocdl.raw.buffer.atomic.cmpswap(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32>
   rocdl.raw.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32
   rocdl.raw.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32
+  %val = rocdl.raw.buffer.atomic.cmpswap(%vdata1, %vdata1, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
   llvm.return
 }
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -294,6 +294,18 @@
   llvm.return
 }
 
+llvm.func @rocdl.raw.buffer.atomic.cmpswap(%rsrc : vector<4xi32>,
+                                           %offset : i32, %soffset : i32,
+                                           %src : i32, %cmp : i32) -> i32 {
+  %aux = llvm.mlir.constant(0 : i32) : i32
+  // CHECK-LABEL: rocdl.raw.buffer.atomic.cmpswap
+  // CHECK: [[val:%.+]] = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}})
+  // CHECK: ret i32 [[val]]
+
+  %val = rocdl.raw.buffer.atomic.cmpswap(%src, %cmp, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
+  llvm.return %val : i32
+}
+
 // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" }
 // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"