diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
--- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
@@ -87,7 +87,7 @@
   let assemblyFormat = [{
     attr-dict $memref `[` $indices `]`
       (`sgprOffset` $sgprOffset^)? `:`
-      type($memref) `,` type($indices) `->` type($value)
+      type($memref) (`,` type($indices)^)? `->` type($value)
   }];
   let hasCanonicalizer = 1;
   let hasVerifier = 1;
@@ -130,7 +130,7 @@
   let assemblyFormat = [{
     attr-dict $value `->` $memref `[` $indices `]`
       (`sgprOffset` $sgprOffset^)? `:`
-      type($value) `->` type($memref) `,` type($indices)
+      type($value) `->` type($memref) (`,` type($indices)^)?
   }];
   let hasCanonicalizer = 1;
   let hasVerifier = 1;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -199,7 +199,7 @@
     args.push_back(resource);
 
     // Indexing (voffset)
-    Value voffset;
+    Value voffset = createI32Constant(rewriter, loc, 0);
     for (auto pair : llvm::enumerate(adaptor.getIndices())) {
       size_t i = pair.index();
       Value index = pair.value();
@@ -212,8 +212,7 @@
             createI32Constant(rewriter, loc, strides[i] * elementByteWidth);
       }
       index = rewriter.create<LLVM::MulOp>(loc, index, strideOp);
-      voffset =
-          voffset ? rewriter.create<LLVM::AddOp>(loc, voffset, index) : index;
+      voffset = rewriter.create<LLVM::AddOp>(loc, voffset, index);
     }
     if (adaptor.getIndexOffset()) {
       int32_t indexOffset = *gpuOp.getIndexOffset() * elementByteWidth;
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -1,6 +1,26 @@
 // RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s
 // RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefix=RDNA
 
+// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32
+func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
+  // CHECK: %[[ptr:.*]] = llvm.ptrtoint
+  // CHECK: %[[lowHalf:.*]] = llvm.trunc %[[ptr]] : i64 to i32
+  // CHECK: %[[resource_1:.*]] = llvm.insertelement %[[lowHalf]]
+  // CHECK: %[[highHalfI64:.*]] = llvm.lshr %[[ptr]]
+  // CHECK: %[[highHalfI32:.*]] = llvm.trunc %[[highHalfI64]] : i64 to i32
+  // CHECK: %[[highHalf:.*]] = llvm.and %[[highHalfI32]], %{{.*}} : i32
+  // CHECK: %[[resource_2:.*]] = llvm.insertelement %[[highHalf]], %[[resource_1]]
+  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
+  // CHECK: %[[resource_3:.*]] = llvm.insertelement %[[numRecords]], %[[resource_2]]
+  // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA: %[[word3:.*]] = llvm.mlir.constant(822243328 : i32)
+  // CHECK: %[[resource:.*]] = llvm.insertelement %[[word3]], %[[resource_3]]
+  // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+  // CHECK: return %[[ret]]
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[] : memref<i32> -> i32
+  func.return %0 : i32
+}
+
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32
 func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
   // CHECK: %[[ptr:.*]] = llvm.ptrtoint
@@ -94,6 +114,17 @@
 }
 
 // Since the lowering logic is shared with loads, only bitcasts need to be rechecked
+// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_scalar_i32
+func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
+  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
+  // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
+  // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
+  // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
+  // CHECK: rocdl.raw.buffer.store %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+  amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[] : i32 -> memref<i32>
+  func.return
+}
+
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32
 func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -18,6 +18,13 @@
   func.return %0 : f32
 }
 
+// CHECK-LABEL: func @raw_buffer_load_scalar
+func.func @raw_buffer_load_scalar(%src : memref<f32>) -> f32 {
+  // CHECK: amdgpu.raw_buffer_load {indexOffset = 1 : i32} %{{.*}}[] : memref<f32> -> f32
+  %0 = amdgpu.raw_buffer_load {indexOffset = 1 : i32} %src[] : memref<f32> -> f32
+  func.return %0 : f32
+}
+
 // CHECK-LABEL: func @raw_buffer_load_4xf32_from_rank_4
 func.func @raw_buffer_load_4xf32_from_rank_4(%src : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) -> vector<4xf32> {
   // CHECK: amdgpu.raw_buffer_load {indexOffset = 1 : i32} %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> vector<4xf32>
@@ -46,6 +53,13 @@
   func.return
 }
 
+// CHECK-LABEL: func @raw_buffer_store_scalar
+func.func @raw_buffer_store_scalar(%value : f32, %dst : memref<f32>) {
+  // CHECK: amdgpu.raw_buffer_store {indexOffset = 1 : i32} %{{.*}} -> %{{.*}}[] : f32 -> memref<f32>
+  amdgpu.raw_buffer_store {indexOffset = 1 : i32} %value -> %dst[] : f32 -> memref<f32>
+  func.return
+}
+
 // CHECK-LABEL: func @raw_buffer_atomic_fadd_f32_to_rank_1
 func.func @raw_buffer_atomic_fadd_f32_to_rank_1(%value : f32, %dst : memref<128xf32>, %offset : i32, %idx0 : i32) {
   // CHECK: amdgpu.raw_buffer_atomic_fadd {indexOffset = 1 : i32} %{{.*}} -> %{{.*}}[{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128xf32>, i32