diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/TypeUtilities.h"
@@ -76,10 +77,32 @@
     return emitOpError() << "expected " << dstMemref.getRank()
                          << " destination indices, got "
                          << getDstIndices().size();
+  // The copy must total exactly 4, 8 or 16 bytes. Validate on the exact bit
+  // count so that sub-byte element types whose total size is not a whole
+  // number of bytes are not accepted through truncating division.
+  unsigned dstWidth = dstMemref.getElementTypeBitWidth();
+  int64_t dstElements = getDstElements().getZExtValue();
+  int64_t sizeInBits = static_cast<int64_t>(dstWidth) * dstElements;
+  int64_t sizeInBytes = sizeInBits / 8;
+  if (sizeInBits != 32 && sizeInBits != 64 && sizeInBits != 128) {
+    InFlightDiagnostic diag = emitOpError();
+    diag << "Requested copy elements is " << dstElements << " with width "
+         << dstWidth << ". But copy elements could be one of ";
+    // List only element counts that fill a supported copy size exactly, and
+    // always terminate the sentence regardless of which sizes apply.
+    bool needsSeparator = false;
+    for (unsigned copyBits : {32u, 64u, 128u}) {
+      if (dstWidth == 0 || copyBits % dstWidth != 0)
+        continue;
+      if (needsSeparator)
+        diag << ", ";
+      diag << (copyBits / dstWidth);
+      needsSeparator = true;
+    }
+    diag << ".";
+    return diag;
+  }
   if (getBypassL1().has_value()) {
-    int64_t dstElements = getDstElements().getZExtValue();
-    int64_t sizeInBytes =
-        (dstMemref.getElementTypeBitWidth() * dstElements) / 8;
     int64_t req = 16 * 8 / dstMemref.getElementTypeBitWidth();
     if (getBypassL1().value() && sizeInBytes != 16) {
       return emitOpError() << "bypassL1 does not satify alignment for "
diff --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir
--- a/mlir/test/Dialect/NVGPU/invalid.mlir
+++ b/mlir/test/Dialect/NVGPU/invalid.mlir
@@ -194,3 +194,30 @@
   %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements {bypassL1} : memref<128x128xf32> to memref<3x16x128xf32, 3>
   return
 }
+
+// -----
+
+func.func @async_cp_size_invalid_f32(
+  %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index) {
+    // expected-error @+1 {{Requested copy elements is 3 with width 32. But copy elements could be one of 1, 2, 4.}}
+    %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf32> to memref<3x16x128xf32, 3>
+  return
+}
+
+// -----
+
+func.func @async_cp_size_invalid_f16(
+  %src: memref<128x128xf16>, %dst: memref<3x16x128xf16, 3>, %i : index) {
+    // expected-error @+1 {{Requested copy elements is 3 with width 16. But copy elements could be one of 2, 4, 8.}}
+    %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf16> to memref<3x16x128xf16, 3>
+  return
+}
+
+// -----
+
+func.func @async_cp_size_invalid_f64(
+  %src: memref<128x128xf64>, %dst: memref<3x16x128xf64, 3>, %i : index) {
+    // expected-error @+1 {{Requested copy elements is 3 with width 64. But copy elements could be one of 1, 2.}}
+    %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf64> to memref<3x16x128xf64, 3>
+  return
+}
diff --git a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
--- a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
+++ b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
@@ -74,7 +74,7 @@
   // CHECK: [[xorBits:%.+]] = arith.shli [[src_bits]], [[c1]]
   // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
-  %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
+  %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 4
       : memref<128x128xf32> to memref<64x16xf32, 3>
   %1 = nvgpu.device_async_create_group %0
   nvgpu.device_async_wait %1 { numGroups = 1 : i32}
@@ -130,7 +130,7 @@
   // CHECK: [[xorBits:%.+]] = arith.shli [[src_bits]], [[c2]]
   // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]]
-  %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 8
+  %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 4
       : memref<128x128xf32> to memref<16x64xf32, 3>
   %3 = nvgpu.device_async_create_group %0
   nvgpu.device_async_wait %1 { numGroups = 1 : i32}
@@ -175,7 +175,7 @@
   // CHECK: [[xorBits:%.+]] = arith.shrui [[src_bits]], [[c1]]
   // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
-  %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
+  %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 2
       : memref<32x32xf64> to memref<32x4xf64, 3>
   %1 = nvgpu.device_async_create_group %0
   nvgpu.device_async_wait %1 { numGroups = 1 : i32}