diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td @@ -55,17 +55,34 @@ def LLVM_AnyPointer : Type($_self)">, "LLVM pointer type", "::mlir::LLVM::LLVMPointerType">; +def LLVM_OpaquePointer : Type< + And<[LLVM_AnyPointer.predicate, + CPred<"::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).isOpaque()">]>, + "LLVM opaque pointer", "::mlir::LLVM::LLVMPointerType">; + // Type constraint accepting LLVM pointer type with an additional constraint // on the element type. class LLVM_PointerTo : Type< And<[LLVM_AnyPointer.predicate, - Or<[CPred<"::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).isOpaque()">, + Or<[LLVM_OpaquePointer.predicate, SubstLeaves< "$_self", "::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).getElementType()", pointee.predicate>]>]>, "LLVM pointer to " # pointee.summary, "::mlir::LLVM::LLVMPointerType">; +// Opaque pointer in a given address space. +class LLVM_OpaquePointerInAddressSpace : Type< + And<[LLVM_OpaquePointer.predicate, + CPred< + "::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).getAddressSpace() == " + # addressSpace>]>, + "Opaque LLVM pointer in address space " # addressSpace, + "::mlir::LLVM::LLVMPointerType"> { + let builderCall = "$_builder.getType<::mlir::LLVM::LLVMPointerType>(" + # addressSpace # ")"; +} + // Type constraints accepting LLVM pointer type to integer of a specific width. 
class LLVM_IntPtrBase : Type< And<[LLVM_PointerTo>.predicate, diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -58,6 +58,14 @@ LLVM_IntrOpBase; +class ROCDL_IntrOp overloadedResults, + list overloadedOperands, list traits, int numResults, + int requiresAccessGroup = 0, int requiresAliasAnalysis = 0> : + LLVM_IntrOpBase; + //===----------------------------------------------------------------------===// // ROCDL special register op definitions //===----------------------------------------------------------------------===// @@ -220,7 +228,96 @@ def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">; def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">; +//===---------------------------------------------------------------------===// +// Operations on raw buffer resources (stride of 0, bounds checks either off or in +// raw buffer mode). 
+//===---------------------------------------------------------------------===// + +def ROCDLBufferRsrc : LLVM_OpaquePointerInAddressSpace<8>; + +def ROCDL_MakeBufferRsrcOp : + ROCDL_IntrOp<"make.buffer.rsrc", [], [0], [Pure], 1>, + Arguments<(ins LLVM_AnyPointer:$base, + I16:$stride, + I32:$numRecords, + I32:$flags)> { + let results = (outs ROCDLBufferRsrc:$res); + let assemblyFormat = "operands attr-dict `:` type($base) `to` type($res)"; +} + +def ROCDL_RawPtrBufferLoadOp : + ROCDL_IntrOp<"raw.ptr.buffer.load", [0], [], [], 1, 0, 1> { + dag args = (ins Arg:$rsrc, + I32:$offset, + I32:$soffset, + I32:$aux); + let arguments = !con(args, aliasAttrs); + let assemblyFormat = "operands attr-dict `:` type($res)"; + let extraClassDefinition = [{ + ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { + return {getRsrc()}; + } + }]; +} + +def ROCDL_RawPtrBufferStoreOp : + ROCDL_IntrOp<"raw.ptr.buffer.store", [], [0], [], 0, 0, 1> { + dag args = (ins LLVM_Type:$vdata, + Arg:$rsrc, + I32:$offset, + I32:$soffset, + I32:$aux); + let arguments = !con(args, aliasAttrs); + let assemblyFormat = "operands attr-dict `:` type($vdata)"; + let extraClassDefinition = [{ + ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { + return {getRsrc()}; + } + }]; + +} + +def ROCDL_RawPtrBufferAtomicCmpSwap : + ROCDL_IntrOp<"raw.ptr.buffer.atomic.cmpswap", + [0], [], [AllTypesMatch<["res", "src", "cmp"]>], 1, 0, 1> { + dag args = (ins LLVM_Type:$src, + LLVM_Type:$cmp, + Arg:$rsrc, + I32:$offset, + I32:$soffset, + I32:$aux); + let arguments = !con(args, aliasAttrs); + let assemblyFormat = "operands attr-dict `:` type($res)"; + let extraClassDefinition = [{ + ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { + return {getRsrc()}; + } + }]; +} + +class ROCDL_RawPtrBufferAtomicNoRet : + ROCDL_IntrOp<"raw.ptr.buffer.atomic."
# op, [], [0], [], 0, 0, 1> { + dag args = (ins LLVM_Type:$vdata, + Arg:$rsrc, + I32:$offset, + I32:$soffset, + I32:$aux); + let arguments = !con(args, aliasAttrs); + let assemblyFormat = "operands attr-dict `:` type($vdata)"; + let extraClassDefinition = [{ + ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { + return {getRsrc()}; + } + }]; +} + +def ROCDL_RawPtrBufferAtomicFmaxOp : ROCDL_RawPtrBufferAtomicNoRet<"fmax">; +def ROCDL_RawPtrBufferAtomicSmaxOp : ROCDL_RawPtrBufferAtomicNoRet<"smax">; +def ROCDL_RawPtrBufferAtomicUminOp : ROCDL_RawPtrBufferAtomicNoRet<"umin">; +// Note: not supported on all architectures +def ROCDL_RawPtrBufferAtomicFaddOp : ROCDL_RawPtrBufferAtomicNoRet<"fadd">; +/// LEGACY BUFFER OPERATIONS. DO NOT USE IN NEW CODE. KEPT FOR IR COMPATIBILITY. //===---------------------------------------------------------------------===// // Vector buffer load/store intrinsics diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -57,7 +57,7 @@ MemRefType memrefType = cast(unconvertedMemref.getType()); if (chipset.majorVersion < 9) - return gpuOp.emitOpError("Raw buffer ops require GCN or higher"); + return gpuOp.emitOpError("raw buffer ops require GCN or higher"); Value storeData = adaptor.getODSOperands(0)[0]; if (storeData == memref) // no write component to this op @@ -80,6 +80,7 @@ Type i32 = rewriter.getI32Type(); Type llvmI32 = this->typeConverter->convertType(i32); + Type llvmI16 = this->typeConverter->convertType(rewriter.getI16Type()); int64_t elementByteWidth = memrefType.getElementTypeBitWidth() / 8; Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth); @@ -147,41 +148,13 @@ if (failed(getStridesAndOffset(memrefType, strides, offset))) return gpuOp.emitOpError("Can't lower non-stride-offset memrefs"); - // Resource 
descriptor - // bits 0-47: base address - // bits 48-61: stride (0 for raw buffers) - // bit 62: texture cache coherency (always 0) - // bit 63: enable swizzles (always off for raw buffers) - // bits 64-95 (word 2): Number of records, units of stride - // bits 96-127 (word 3): See below - - Type llvm4xI32 = this->typeConverter->convertType(VectorType::get(4, i32)); MemRefDescriptor memrefDescriptor(memref); - Type llvmI64 = this->typeConverter->convertType(rewriter.getI64Type()); - Value c32I64 = rewriter.create( loc, llvmI64, rewriter.getI64IntegerAttr(32)); - - Value resource = rewriter.create(loc, llvm4xI32); Value ptr = memrefDescriptor.alignedPtr(rewriter, loc); - Value ptrAsInt = rewriter.create(loc, llvmI64, ptr); - Value lowHalf = rewriter.create(loc, llvmI32, ptrAsInt); - resource = rewriter.create( - loc, llvm4xI32, resource, lowHalf, - this->createIndexAttrConstant(rewriter, loc, this->getIndexType(), 0)); - - // Bits 48-63 are used both for the stride of the buffer and (on gfx10) for - // enabling swizzling. Prevent the high bits of pointers from accidentally - // setting those flags. - Value highHalfShifted = rewriter.create( - loc, llvmI32, rewriter.create(loc, ptrAsInt, c32I64)); - Value highHalfTruncated = rewriter.create( - loc, llvmI32, highHalfShifted, - createI32Constant(rewriter, loc, 0x0000ffff)); - resource = rewriter.create( - loc, llvm4xI32, resource, highHalfTruncated, - this->createIndexAttrConstant(rewriter, loc, this->getIndexType(), 1)); - + // The stride value is always 0 for raw buffers. This also disables + // swizzling.
+ Value stride = rewriter.createOrFold( + loc, llvmI16, rewriter.getI16IntegerAttr(0)); Value numRecords; if (memrefType.hasStaticShape()) { numRecords = createI32Constant( @@ -200,11 +173,8 @@ } numRecords = rewriter.create(loc, llvmI32, maxIndex); } - resource = rewriter.create( - loc, llvm4xI32, resource, numRecords, - this->createIndexAttrConstant(rewriter, loc, this->getIndexType(), 2)); - // Final word: + // Flag word: // bits 0-11: dst sel, ignored by these intrinsics // bits 12-14: data format (ignored, must be nonzero, 7=float) // bits 15-18: data format (ignored, must be nonzero, 4=32bit) @@ -218,16 +188,16 @@ // bits 28-29: Out of bounds select (0 = structured, 1 = check index, 2 = // none, 3 = either swizzles or testing against offset field) RDNA only // bits 30-31: Type (must be 0) - uint32_t word3 = (7 << 12) | (4 << 15); + uint32_t flags = (7 << 12) | (4 << 15); if (chipset.majorVersion >= 10) { - word3 |= (1 << 24); + flags |= (1 << 24); uint32_t oob = adaptor.getBoundsCheck() ? 
3 : 2; - word3 |= (oob << 28); + flags |= (oob << 28); } - Value word3Const = createI32Constant(rewriter, loc, word3); - resource = rewriter.create( - loc, llvm4xI32, resource, word3Const, - this->createIndexAttrConstant(rewriter, loc, this->getIndexType(), 3)); + Value flagsConst = createI32Constant(rewriter, loc, flags); + Type rsrcType = LLVM::LLVMPointerType::get(rewriter.getContext(), 8); + Value resource = rewriter.createOrFold( + loc, rsrcType, ptr, stride, numRecords, flagsConst); args.push_back(resource); // Indexing (voffset) @@ -668,16 +638,20 @@ RewritePatternSet &patterns, Chipset chipset) { patterns.add(converter); - patterns.add< - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - MFMAOpLowering, WMMAOpLowering>(converter, chipset); + patterns + .add, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + MFMAOpLowering, WMMAOpLowering>(converter, chipset); } std::unique_ptr mlir::createConvertAMDGPUToROCDLPass() { diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -4,19 +4,12 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32 func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref) -> i32 { - // CHECK: %[[ptr:.*]] = llvm.ptrtoint - // CHECK: %[[lowHalf:.*]] = llvm.trunc %[[ptr]] : i64 to i32 - // CHECK: %[[resource_1:.*]] = llvm.insertelement %[[lowHalf]] - // CHECK: %[[highHalfI64:.*]] = llvm.lshr %[[ptr]] - // CHECK: %[[highHalfI32:.*]] = llvm.trunc %[[highHalfI64]] : i64 to i32 - // CHECK: %[[highHalf:.*]] = llvm.and %[[highHalfI32]], %{{.*}} : i32 - // CHECK: %[[resource_2:.*]] = llvm.insertelement %[[highHalf]], %[[resource_1]] + // CHECK: 
%[[stride:.*]] = llvm.mlir.constant(0 : i16) // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32) - // CHECK: %[[resource_3:.*]] = llvm.insertelement %[[numRecords]], %[[resource_2]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // RDNA: %[[word3:.*]] = llvm.mlir.constant(822243328 : i32) - // CHECK: %[[resource:.*]] = llvm.insertelement %[[word3]], %[[resource_3]] - // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8> + // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 // CHECK: return %[[ret]] %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[] : memref -> i32 func.return %0 : i32 @@ -24,19 +17,12 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32 func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 { - // CHECK: %[[ptr:.*]] = llvm.ptrtoint - // CHECK: %[[lowHalf:.*]] = llvm.trunc %[[ptr]] : i64 to i32 - // CHECK: %[[resource_1:.*]] = llvm.insertelement %[[lowHalf]] - // CHECK: %[[highHalfI64:.*]] = llvm.lshr %[[ptr]] - // CHECK: %[[highHalfI32:.*]] = llvm.trunc %[[highHalfI64]] : i64 to i32 - // CHECK: %[[highHalf:.*]] = llvm.and %[[highHalfI32]], %{{.*}} : i32 - // CHECK: %[[resource_2:.*]] = llvm.insertelement %[[highHalf]], %[[resource_1]] + // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16) // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) - // CHECK: %[[resource_3:.*]] = llvm.insertelement %[[numRecords]], %[[resource_2]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // RDNA: %[[word3:.*]] = llvm.mlir.constant(822243328 : i32) - // CHECK: %[[resource:.*]] = llvm.insertelement %[[word3]], %[[resource_3]] - // CHECK: %[[ret:.*]] = 
rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8> + // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 // CHECK: return %[[ret]] %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32>, i32 -> i32 func.return %0 : i32 @@ -44,18 +30,18 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_oob_off func.func @gpu_gcn_raw_buffer_load_i32_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 { - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // RDNA: %[[word3:.*]] = llvm.mlir.constant(553807872 : i32) - // RDNA: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] - // RDNA: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 - // RDNA: return %[[ret]] + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // RDNA: %[[flags:.*]] = llvm.mlir.constant(553807872 : i32) + // RDNA: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]] + // RDNA: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // RDNA: return %[[ret]] %0 = amdgpu.raw_buffer_load {boundsCheck = false} %buf[%idx] : memref<64xi32>, i32 -> i32 func.return %0 : i32 } // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi32 func.func @gpu_gcn_raw_buffer_load_2xi32(%buf: memref<64xi32>, %idx: i32) -> vector<2xi32> { - // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi32> + // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi32> // CHECK: return %[[ret]] %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32>, i32 -> vector<2xi32> func.return %0 : vector<2xi32> @@ -64,8 +50,8 
@@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i8 func.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i8 + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} + // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8 // CHECK: return %[[ret]] %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> i8 func.return %0 : i8 @@ -74,8 +60,8 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi8 func.func @gpu_gcn_raw_buffer_load_2xi8(%buf: memref<64xi8>, %idx: i32) -> vector<2xi8> { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16 + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} + // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i16 // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : i16 to vector<2xi8> // CHECK: return %[[ret]] %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> vector<2xi8> @@ -84,7 +70,7 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_16xi8 func.func @gpu_gcn_raw_buffer_load_16xi8(%buf: memref<64xi8>, %idx: i32) -> vector<16xi8> { - // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> + // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : vector<4xi32> to vector<16xi8> // CHECK: return %[[ret]] %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> vector<16xi8> @@ -94,8 
+80,8 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ func.func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ(%buf: memref<64xf8E5M2FNUZ>, %idx: i32) -> f8E5M2FNUZ { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i8 + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} + // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8 // CHECK: %[[ret:.*]] = builtin.unrealized_conversion_cast %[[loaded]] : i8 to f8E5M2FNUZ // CHECK: return %[[ret]] %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xf8E5M2FNUZ>, i32 -> f8E5M2FNUZ @@ -105,8 +91,8 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %idx: i32) -> vector<4xf8E4M3FNUZ> { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} + // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 // CHECK: %[[cast:.*]] = llvm.bitcast %[[loaded]] : i32 to vector<4xi8> // CHECK: %[[ret:.*]] = builtin.unrealized_conversion_cast %[[cast]] : vector<4xi8> to vector<4xf8E4M3FNUZ> // CHECK: return %[[ret]] @@ -117,11 +103,9 @@ // Since the lowering logic is shared with loads, only bitcasts need to be rechecked // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_scalar_i32 func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref) { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : 
i32) - // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] - // CHECK: rocdl.raw.buffer.store %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]] + // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[] : i32 -> memref func.return } @@ -129,10 +113,9 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32 func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] - // CHECK: rocdl.raw.buffer.store %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32 func.return } @@ -140,7 +123,7 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_2xi8 func.func @gpu_gcn_raw_buffer_store_2xi8(%value: vector<2xi8>, %buf: memref<64xi8>, %idx: i32) { // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<2xi8> to i16 - // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16 + // CHECK: rocdl.raw.ptr.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16 amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : vector<2xi8> -> memref<64xi8>, i32 func.return } @@ -148,7 +131,7 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_16xi8 func.func 
@gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64xi8>, %idx: i32) { // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<16xi8> to vector<4xi32> - // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> + // CHECK: rocdl.raw.ptr.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : vector<16xi8> -> memref<64xi8>, i32 func.return } @@ -157,10 +140,9 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32 func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] - // CHECK: rocdl.raw.buffer.atomic.fadd %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32 + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32 amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32 func.return } @@ -168,10 +150,9 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32 func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] - // CHECK: rocdl.raw.buffer.atomic.fmax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32 + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc 
%{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32 amdgpu.raw_buffer_atomic_fmax {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32 func.return } @@ -179,10 +160,9 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32 func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] - // CHECK: rocdl.raw.buffer.atomic.smax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 amdgpu.raw_buffer_atomic_smax {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32 func.return } @@ -190,10 +170,9 @@ // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32 func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] - // CHECK: rocdl.raw.buffer.atomic.umin %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 amdgpu.raw_buffer_atomic_umin {boundsCheck = true} %value -> %buf[%idx] : i32 -> 
memref<64xi32>, i32 func.return } @@ -204,10 +183,9 @@ // CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32 // CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32 // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] - // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32> + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 // CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32 // CHECK: return %[[dstCast]] %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : f32 -> memref<64xf32>, i32 @@ -218,10 +196,9 @@ // CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}}) func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32) - // CHECK: llvm.insertelement{{.*}}%[[numRecords]] - // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] - // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i64, vector<4xi32> + // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64 // CHECK: return %[[dst]] %dst = 
amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : i64 -> memref<64xi64>, i32 func.return %dst : i64 diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -203,6 +203,66 @@ llvm.return } +llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr, + %stride : i16, + %numRecords : i32, + %flags : i32) -> !llvm.ptr<8> { + // CHECK-LABEL: rocdl.make.buffer.rsrc + // CHECK: %{{.*}} = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr to <8> + %rsrc = rocdl.make.buffer.rsrc %ptr, %stride, %numRecords, %flags : !llvm.ptr to !llvm.ptr<8> + llvm.return %rsrc : !llvm.ptr<8> +} + +llvm.func @rocdl.raw.ptr.buffer.f32(%rsrc : !llvm.ptr<8>, + %offset : i32, %soffset : i32, + %aux : i32, %vdata1 : f32, + %vdata2 : vector<2xf32>, %vdata4 : vector<4xf32>) { + // CHECK-LABEL: rocdl.raw.ptr.buffer.f32 + // CHECK: %{{.*}} = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}} %{{.*}}, %{{.*}} : f32 + // CHECK: %{{.*}} = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}} %{{.*}}, %{{.*}} : vector<2xf32> + // CHECK: %{{.*}} = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}} %{{.*}}, %{{.*}} : vector<4xf32> + + // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 + // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xf32> + // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xf32> + + // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 + // CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 + + %r1 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : f32 + %r2 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xf32> + %r4 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xf32> + + rocdl.raw.ptr.buffer.store %vdata1, 
%rsrc, %offset, %soffset, %aux : f32 + rocdl.raw.ptr.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xf32> + rocdl.raw.ptr.buffer.store %vdata4, %rsrc, %offset, %soffset, %aux : vector<4xf32> + + rocdl.raw.ptr.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + rocdl.raw.ptr.buffer.atomic.fmax %vdata1, %rsrc, %offset, %soffset, %aux : f32 + + llvm.return +} + + +llvm.func @rocdl.raw.ptr.buffer.i32(%rsrc : !llvm.ptr<8>, + %offset : i32, %soffset : i32, + %aux : i32, %vdata1 : i32, + %vdata2 : vector<2xi32>, %vdata4 : vector<4xi32>) { + // CHECK-LABEL: rocdl.raw.ptr.buffer.i32 + // CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}}, %{{.*}}, %{{.*}} %{{.*}}, %{{.*}} : i32 + // CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}}, %{{.*}}, %{{.*}} %{{.*}}, %{{.*}} : i32 + // CHECK: %{{.*}} = rocdl.raw.ptr.buffer.atomic.cmpswap %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32 + + rocdl.raw.ptr.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32 + rocdl.raw.ptr.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32 + %val = rocdl.raw.ptr.buffer.atomic.cmpswap %vdata1, %vdata1, %rsrc, %offset, %soffset, %aux : i32 + llvm.return +} + +// ----- + +// Tests for deprecated buffer ops.
+ llvm.func @rocdl.mubuf(%rsrc : vector<4xi32>, %vindex : i32, %offset : i32, %glc : i1, %slc : i1, %vdata1 : vector<1xf32>, diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -284,6 +284,82 @@ llvm.return %r0 : vector<8xf32> } +llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr, + %stride : i16, + %numRecords : i32, + %flags : i32) -> !llvm.ptr<8> { + // CHECK-LABEL: rocdl.make.buffer.rsrc + // CHECK: %[[rsrc:.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %{{.*}}, i16 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret ptr addrspace(8) %[[rsrc]] + %rsrc = rocdl.make.buffer.rsrc %ptr, %stride, %numRecords, %flags : !llvm.ptr to !llvm.ptr<8> + llvm.return %rsrc : !llvm.ptr<8> +} + +llvm.func @rocdl.raw.ptr.buffer(%rsrc : !llvm.ptr<8>, + %offset : i32, %soffset : i32, + %vdata1 : i32, + %vdata2 : vector<2xi32>, + %vdata4 : vector<4xi32>) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.ptr.buffer + // CHECK: call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + %r1 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : i32 + %r2 = rocdl.raw.ptr.buffer.load %rsrc, %offset, 
%soffset, %aux : vector<2xi32> + %r4 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xi32> + + rocdl.raw.ptr.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : i32 + rocdl.raw.ptr.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xi32> + rocdl.raw.ptr.buffer.store %vdata4, %rsrc, %offset, %soffset, %aux : vector<4xi32> + + llvm.return +} + +llvm.func @rocdl.raw.ptr.buffer.atomic.f32(%rsrc : !llvm.ptr<8>, + %offset : i32, %soffset : i32, + %vdata1 : f32) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.ptr.buffer.atomic.f32 + // CHECK: call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + rocdl.raw.ptr.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + rocdl.raw.ptr.buffer.atomic.fmax %vdata1, %rsrc, %offset, %soffset, %aux : f32 + + llvm.return +} + +llvm.func @rocdl.raw.ptr.buffer.atomic.i32(%rsrc : !llvm.ptr<8>, + %offset : i32, %soffset : i32, + %vdata1 : i32) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.ptr.buffer.atomic.i32 + // CHECK: call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smax.i32(i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umin.i32(i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + rocdl.raw.ptr.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32 + rocdl.raw.ptr.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32 + + llvm.return +} + +llvm.func @rocdl.raw.ptr.buffer.atomic.cmpswap(%rsrc : !llvm.ptr<8>, + %offset : i32, %soffset : i32, + %src : i32, %cmp : i32) -> i32 { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.ptr.buffer.atomic.cmpswap + // CHECK: 
[[val:%.+]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 %{{.*}}, i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: ret i32 [[val]] + + %val = rocdl.raw.ptr.buffer.atomic.cmpswap %src, %cmp, %rsrc, %offset, %soffset, %aux : i32 + llvm.return %val : i32 +} llvm.func @rocdl.mubuf(%rsrc : vector<4xi32>, %vindex : i32, %offset : i32, %vdata1 : vector<1xf32>,