diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td --- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td @@ -175,6 +175,120 @@ let hasVerifier = 1; } +// Raw buffer atomic floating point max +def AMDGPU_RawBufferAtomicFmaxOp : + AMDGPU_Op<"raw_buffer_atomic_fmax", [AllElementTypesMatch<["value", "memref"]>, + AttrSizedOperandSegments]>, + Arguments<(ins F32:$value, + Arg:$memref, + Variadic:$indices, + DefaultValuedAttr:$boundsCheck, + OptionalAttr:$indexOffset, + Optional:$sgprOffset)> { + + let summary = "Raw Buffer Floating-point Atomic Max (non-GFX9)"; + let description = [{ + The `amdgpu.raw_buffer_atomic_fmax` op is a wrapper around the + buffer-based atomic floating point max available on AMD GPUs (except GFX9). + + The index into the buffer is computed as for `memref.store` with the addition + of `indexOffset` (which is used to aid in emitting vectorized code) and, + if present `sgprOffset` (which is added after bounds checks and includes + any non-zero offset on the memref type). + + All indexing components are given in terms of the memref's element size, not + the byte lengths required by the intrinsic. + + Out of bounds atomic operations are ignored in hardware. + + See `amdgpu.raw_buffer_load` for a description of how the underlying + instruction is constructed. + }]; + let assemblyFormat = [{ + attr-dict $value `->` $memref `[` $indices `]` + (`sgprOffset` $sgprOffset^)? 
`:` + type($value) `->` type($memref) `,` type($indices) + }]; + let hasCanonicalizer = 1; + let hasVerifier = 1; +} + +// Raw buffer atomic signed integer max +def AMDGPU_RawBufferAtomicSmaxOp : + AMDGPU_Op<"raw_buffer_atomic_smax", [ + AttrSizedOperandSegments]>, + Arguments<(ins I32:$value, + Arg:$memref, + Variadic:$indices, + DefaultValuedAttr:$boundsCheck, + OptionalAttr:$indexOffset, + Optional:$sgprOffset)> { + + let summary = "Raw Buffer Signed Integer Atomic Max"; + let description = [{ + The `amdgpu.raw_buffer_atomic_smax` op is a wrapper around the + buffer-based atomic signed integer max available on AMD GPUs. + + The index into the buffer is computed as for `memref.store` with the addition + of `indexOffset` (which is used to aid in emitting vectorized code) and, + if present `sgprOffset` (which is added after bounds checks and includes + any non-zero offset on the memref type). + + All indexing components are given in terms of the memref's element size, not + the byte lengths required by the intrinsic. + + Out of bounds atomic operations are ignored in hardware. + + See `amdgpu.raw_buffer_load` for a description of how the underlying + instruction is constructed. + }]; + let assemblyFormat = [{ + attr-dict $value `->` $memref `[` $indices `]` + (`sgprOffset` $sgprOffset^)? `:` + type($value) `->` type($memref) `,` type($indices) + }]; + let hasCanonicalizer = 1; + let hasVerifier = 1; +} + +// Raw buffer atomic unsigned integer min +def AMDGPU_RawBufferAtomicUminOp : + AMDGPU_Op<"raw_buffer_atomic_umin", [ + AttrSizedOperandSegments]>, + Arguments<(ins I32:$value, + Arg:$memref, + Variadic:$indices, + DefaultValuedAttr:$boundsCheck, + OptionalAttr:$indexOffset, + Optional:$sgprOffset)> { + + let summary = "Raw Buffer Unsigned Integer Atomic Min"; + let description = [{ + The `amdgpu.raw_buffer_atomic_umin` op is a wrapper around the + buffer-based atomic unsigned integer min available on AMD GPUs. 
+ + The index into the buffer is computed as for `memref.store` with the addition + of `indexOffset` (which is used to aid in emitting vectorized code) and, + if present `sgprOffset` (which is added after bounds checks and includes + any non-zero offset on the memref type). + + All indexing components are given in terms of the memref's element size, not + the byte lengths required by the intrinsic. + + Out of bounds atomic operations are ignored in hardware. + + See `amdgpu.raw_buffer_load` for a description of how the underlying + instruction is constructed. + }]; + let assemblyFormat = [{ + attr-dict $value `->` $memref `[` $indices `]` + (`sgprOffset` $sgprOffset^)? `:` + type($value) `->` type($memref) `,` type($indices) + }]; + let hasCanonicalizer = 1; + let hasVerifier = 1; +} + def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> { let summary = "Barrier that includes a wait for LDS memory operations."; let description = [{ diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -271,4 +271,61 @@ let hasCustomAssemblyFormat = 1; } +//===---------------------------------------------------------------------===// +// Buffer atomic floating point max intrinsic. GFX9 does not support fp32. + +def ROCDL_RawBufferAtomicFMaxOp : + ROCDL_Op<"raw.buffer.atomic.fmax">, + Arguments<(ins LLVM_Type:$vdata, + LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)>{ + string llvmBuilder = [{ + auto vdataType = moduleTranslation.convertType(op.getVdata().getType()); + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_atomic_fmax, {$vdata, $rsrc, + $offset, $soffset, $aux}, {vdataType}); + }]; + let hasCustomAssemblyFormat = 1; +} + +//===---------------------------------------------------------------------===// +// Buffer atomic signed integer max intrinsic. 
+ +def ROCDL_RawBufferAtomicSMaxOp : + ROCDL_Op<"raw.buffer.atomic.smax">, + Arguments<(ins LLVM_Type:$vdata, + LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)>{ + string llvmBuilder = [{ + auto vdataType = moduleTranslation.convertType(op.getVdata().getType()); + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_atomic_smax, {$vdata, $rsrc, + $offset, $soffset, $aux}, {vdataType}); + }]; + let hasCustomAssemblyFormat = 1; +} + +//===---------------------------------------------------------------------===// +// Buffer atomic unsigned integer min intrinsic. + +def ROCDL_RawBufferAtomicUMinOp : + ROCDL_Op<"raw.buffer.atomic.umin">, + Arguments<(ins LLVM_Type:$vdata, + LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)>{ + string llvmBuilder = [{ + auto vdataType = moduleTranslation.convertType(op.getVdata().getType()); + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_atomic_umin, {$vdata, $rsrc, + $offset, $soffset, $aux}, {vdataType}); + }]; + let hasCustomAssemblyFormat = 1; +} + #endif // ROCDLIR_OPS diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -527,6 +527,9 @@ RawBufferOpLowering, RawBufferOpLowering, RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, MFMAOpLowering>(converter, chipset); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -78,6 +78,18 @@ return verifyRawBufferOp(*this); } +LogicalResult RawBufferAtomicFmaxOp::verify() { + return verifyRawBufferOp(*this); +} + +LogicalResult RawBufferAtomicSmaxOp::verify() { + return verifyRawBufferOp(*this); +} + +LogicalResult 
RawBufferAtomicUminOp::verify() { + return verifyRawBufferOp(*this); +} + static std::optional getConstantUint32(Value v) { APInt cst; if (!v.getType().isInteger(32)) @@ -168,6 +180,21 @@ results.add>(context); } +void RawBufferAtomicFmaxOp::getCanonicalizationPatterns( + RewritePatternSet &results, MLIRContext *context) { + results.add>(context); +} + +void RawBufferAtomicSmaxOp::getCanonicalizationPatterns( + RewritePatternSet &results, MLIRContext *context) { + results.add>(context); +} + +void RawBufferAtomicUminOp::getCanonicalizationPatterns( + RewritePatternSet &results, MLIRContext *context) { + results.add>(context); +} + //===----------------------------------------------------------------------===// // MFMAOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp --- a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp @@ -154,6 +154,78 @@ p << " " << getOperands() << " : " << getVdata().getType(); } +// ::= +// `llvm.amdgcn.raw.buffer.atomic.fmax.* %vdata, %rsrc, %offset, +// %soffset, %aux : result_type` +ParseResult RawBufferAtomicFMaxOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 5) || parser.parseColonType(type)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + + if (parser.resolveOperands(ops, {type, i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands)) + return failure(); + return success(); +} + +void RawBufferAtomicFMaxOp::print(mlir::OpAsmPrinter &p) { + p << " " << getOperands() << " : " << getVdata().getType(); +} + +// ::= +// `llvm.amdgcn.raw.buffer.atomic.smax.* %vdata, %rsrc, %offset, +// %soffset, %aux : result_type` +ParseResult RawBufferAtomicSMaxOp::parse(OpAsmParser 
&parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 5) || parser.parseColonType(type)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + + if (parser.resolveOperands(ops, {type, i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands)) + return failure(); + return success(); +} + +void RawBufferAtomicSMaxOp::print(mlir::OpAsmPrinter &p) { + p << " " << getOperands() << " : " << getVdata().getType(); +} + +// ::= +// `llvm.amdgcn.raw.buffer.atomic.umin.* %vdata, %rsrc, %offset, +// %soffset, %aux : result_type` +ParseResult RawBufferAtomicUMinOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 5) || parser.parseColonType(type)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + + if (parser.resolveOperands(ops, {type, i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands)) + return failure(); + return success(); +} + +void RawBufferAtomicUMinOp::print(mlir::OpAsmPrinter &p) { + p << " " << getOperands() << " : " << getVdata().getType(); +} + //===----------------------------------------------------------------------===// // ROCDLDialect initialization, type parsing, and registration. 
//===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -133,6 +133,39 @@ func.return } +// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32 +func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: rocdl.raw.buffer.atomic.fmax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32 + amdgpu.raw_buffer_atomic_fmax {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32 + func.return +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32 +func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: rocdl.raw.buffer.atomic.smax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + amdgpu.raw_buffer_atomic_smax {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32 + func.return +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32 +func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: 
rocdl.raw.buffer.atomic.umin %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + amdgpu.raw_buffer_atomic_umin {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32 + func.return +} + // CHECK-LABEL: func @lds_barrier func.func @lds_barrier() { // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier" diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -225,11 +225,11 @@ llvm.return } -llvm.func @rocdl.raw.buffer(%rsrc : vector<4xi32>, +llvm.func @rocdl.raw.buffer.f32(%rsrc : vector<4xi32>, %offset : i32, %soffset : i32, %aux : i32, %vdata1 : f32, %vdata2 : vector<2xf32>, %vdata4 : vector<4xf32>) { - // CHECK-LABEL: rocdl.raw.buffer + // CHECK-LABEL: rocdl.raw.buffer.f32 // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<2xf32> // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<4xf32> @@ -249,10 +249,25 @@ rocdl.raw.buffer.store %vdata4, %rsrc, %offset, %offset, %aux : vector<4xf32> rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + rocdl.raw.buffer.atomic.fmax %vdata1, %rsrc, %offset, %soffset, %aux : f32 llvm.return } + +llvm.func @rocdl.raw.buffer.i32(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %aux : i32, %vdata1 : i32, + %vdata2 : vector<2xi32>, %vdata4 : vector<4xi32>) { + // CHECK-LABEL: rocdl.raw.buffer.i32 + // CHECK: rocdl.raw.buffer.atomic.smax %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32 + // CHECK: rocdl.raw.buffer.atomic.umin %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32 + + rocdl.raw.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32 + rocdl.raw.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32 + llvm.return +} + // ----- // expected-error@below {{attribute 
attached to unexpected op}} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -266,14 +266,30 @@ llvm.return } -llvm.func @rocdl.raw.buffer.atomic(%rsrc : vector<4xi32>, +llvm.func @rocdl.raw.buffer.atomic.f32(%rsrc : vector<4xi32>, %offset : i32, %soffset : i32, %vdata1 : f32) { %aux = llvm.mlir.constant(0 : i32) : i32 - // CHECK-LABEL: rocdl.raw.buffer.atomic + // CHECK-LABEL: rocdl.raw.buffer.atomic.f32 // CHECK: call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + rocdl.raw.buffer.atomic.fmax %vdata1, %rsrc, %offset, %soffset, %aux : f32 + + llvm.return +} + +llvm.func @rocdl.raw.buffer.atomic.i32(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %vdata1 : i32) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.buffer.atomic.i32 + // CHECK: call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + rocdl.raw.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32 + rocdl.raw.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32 llvm.return }