diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td --- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td @@ -52,14 +52,17 @@ available on AMD GPUs, including extensions in newer GPUs. The index into the buffer is computed as for `memref.load` with the additon - of `indexOffset` and `sgprOffset` (which is added after bounds checks and - includes any offset present on the memref type if it's non-zero). + of `indexOffset` and `sgprOffset` (which **may or may not** be considered + in bounds checks and includes any offset present on the memref type if it's + non-zero). All indices and offsets are in units of the memref's data type and are converted to bytes during lowering. When a load is out of bounds, the instruction returns zero. - Vector instructions bounds check each component's address. + Partially out-of-bounds loads have chipset-dependent behavior: whether reading + 2 elements starting at index 7 of a `memref<8xf32>` returns the last element + in the first vector component depends on the architecture. The memref struct is converted into a buffer resource (a V#) and the arguments are translated to intrinsic arguments as follows: @@ -71,7 +74,7 @@ - The offset enable bit is 1, the index enable bit is 0. - The thread ID addition bit is off - If `boundsCheck` is false and the target chipset is RDNA, OOB_SELECT is set - to 2 to disable bounds checks, otherwise it is 0 + to 2 to disable bounds checks, otherwise it is 3 - The cache coherency bits are off }]; let assemblyFormat = [{ @@ -108,8 +111,9 @@ All index components are in terms of the elements of the memref, not bytes, and are scaled up appropriately. - Out of bounds stores are ignored in hardware, including the out of bounds - components of vector writes. + Out of bounds stores are ignored in hardware. 
+ Whether a vector write that includes some in-bounds and some out-of-bounds + components is partially completed is chipset-dependent. See `amdgpu.raw_buffer_load` for a description of how the underlying instruction is constructed. diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -166,13 +166,13 @@ // bit 24: Reserved to 1 (RDNA) or 0 (CDNA) // bits 25-26: Reserved (0) // bit 27: Buffer is non-volatile (CDNA only) - // bits 28-29: Out of bounds select (0 = structured, 1 = raw, 2 = none, 3 = - // swizzles) RDNA only + // bits 28-29: Out of bounds select (0 = structured, 1 = check index, 2 = + // none, 3 = either swizzles or testing against offset field) RDNA only // bits 30-31: Type (must be 0) uint32_t word3 = (7 << 12) | (4 << 15); if (chipset.majorVersion == 10) { word3 |= (1 << 24); - uint32_t oob = adaptor.getBoundsCheck() ? 1 : 2; + uint32_t oob = adaptor.getBoundsCheck() ? 3 : 2; word3 |= (oob << 28); } Value word3Const = createI32Constant(rewriter, loc, word3); diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -6,7 +6,7 @@ // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) // CHECK: llvm.insertelement{{.*}}%[[numRecords]] // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) - // RDNA: %[[word3:.*]] = llvm.mlir.constant(285372416 : i32) + // RDNA: %[[word3:.*]] = llvm.mlir.constant(822243328 : i32) // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 // CHECK: return %[[ret]]