diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -426,6 +426,10 @@
            "Bitwidth of the index type, 0 to use size of machine word">,
     Option<"hasRedux", "has-redux", "bool", /*default=*/"false",
            "Target gpu supports redux">,
+    Option<"useBarePtrCallConv", "use-bare-ptr-memref-call-conv", "bool",
+           /*default=*/"false",
+           "Replace memref arguments in GPU functions with bare pointers. "
+           "All memrefs must have static shape.">,
     Option<"useOpaquePointers", "use-opaque-pointers", "bool",
            /*default=*/"true", "Generate LLVM IR using opaque pointers "
            "instead of typed pointers">,
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -231,6 +231,7 @@
     if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
       options.overrideIndexBitwidth(indexBitwidth);
     options.useOpaquePointers = useOpaquePointers;
+    options.useBarePtrCallConv = useBarePtrCallConv;
 
     // Apply in-dialect lowering. In-dialect lowering will replace
     // ops which need to be lowered further, which is not supported by a
diff --git a/mlir/test/Conversion/GPUToNVVM/memref.mlir b/mlir/test/Conversion/GPUToNVVM/memref.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/GPUToNVVM/memref.mlir
@@ -0,0 +1,13 @@
+// RUN: mlir-opt %s -convert-gpu-to-nvvm | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' \
+// RUN:   | FileCheck %s --check-prefix=BARE
+
+gpu.module @memref_conversions {
+  // CHECK: llvm.func @kern
+  // CHECK-SAME: (%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64)
+  // BARE: llvm.func @kern
+  // BARE-SAME: (%{{.*}}: !llvm.ptr)
+  gpu.func @kern(%arg0: memref<8xf32>) kernel {
+    gpu.return
+  }
+}
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -0,0 +1,64 @@
+// Tests the memref bare pointer lowering convention on both the host side and
+// the kernel side; this only works for statically shaped memrefs.
+// Similar to wmma-matmul-f32, but with the memref bare pointer lowering convention.
+// This test also uses gpu.memcpy operations (instead of gpu.host_register).
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin{chip=sm_70}))' \
+// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm="use-bare-pointers-for-host=1 use-bare-pointers-for-kernels=1" \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+func.func @main() {
+  %h0 = memref.alloc() : memref<16x16xf16>
+  %h_out = memref.alloc() : memref<16x16xf32>
+
+  %f1 = arith.constant 1.0e+00 : f16
+  %f0 = arith.constant 0.0e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %c32 = arith.constant 32 : index
+  %c1 = arith.constant 1 : index
+
+  // Initialize the input matrix with ones.
+  scf.for %arg0 = %c0 to %c16 step %c1 {
+    scf.for %arg1 = %c0 to %c16 step %c1 {
+      memref.store %f1, %h0[%arg0, %arg1] : memref<16x16xf16>
+    }
+  }
+  // Initialize the accumulator matrix with zeros.
+  scf.for %arg0 = %c0 to %c16 step %c1 {
+    scf.for %arg1 = %c0 to %c16 step %c1 {
+      memref.store %f0, %h_out[%arg0, %arg1] : memref<16x16xf32>
+    }
+  }
+
+  %token = gpu.wait async
+  %0, %t0 = gpu.alloc async [%token] () : memref<16x16xf16>
+  %out, %t1 = gpu.alloc async [%token] () : memref<16x16xf32>
+  // Copy from host to device.
+  %x = gpu.memcpy async [%token] %0, %h0 : memref<16x16xf16>, memref<16x16xf16>
+  %y = gpu.memcpy async [%token] %out, %h_out : memref<16x16xf32>, memref<16x16xf32>
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c32, %block_y = %c1, %block_z = %c1) {
+    %A = gpu.subgroup_mma_load_matrix %0[%c0, %c0] {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">
+    %B = gpu.subgroup_mma_load_matrix %0[%c0, %c0] {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
+    %C = gpu.subgroup_mma_load_matrix %out[%c0, %c0] {leadDimension = 16 : index} : memref<16x16xf32> -> !gpu.mma_matrix<16x16xf32, "COp">
+
+    %R = gpu.subgroup_mma_compute %A, %B, %C : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
+
+    gpu.subgroup_mma_store_matrix %R, %out[%c0, %c0] {leadDimension = 16 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<16x16xf32>
+    // Memref print utilities use unranked memrefs, which cannot be used with
+    // the bare pointer lowering, so we simply test for successful execution.
+    gpu.printf "Success\n"
+    // CHECK: Success
+    gpu.terminator
+  }
+  %z = gpu.dealloc async [%token] %0 : memref<16x16xf16>
+  %w = gpu.dealloc async [%token] %out : memref<16x16xf32>
+  gpu.wait [%token]
+  return
+}
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -8,6 +8,17 @@
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
 
+// Same as above, but with the memref bare pointer lowering convention.
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm="use-bare-pointers-for-kernels=1" \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
 func.func @main() {
   %data = memref.alloc() : memref<2x6xi32>
   %sum = memref.alloc() : memref<2xi32>
@@ -67,4 +78,3 @@
 }
 
 func.func private @printMemrefI32(memref<*xi32>)
-
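For pipelines that assemble this lowering in C++ rather than through the pass option, the same convention can be requested on the type converter directly. A minimal sketch, assuming the existing `LowerToLLVMOptions`/`LLVMTypeConverter` APIs from `mlir/Conversion/LLVMCommon`; the helper function name is illustrative and not part of this patch:

```cpp
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/IR/MLIRContext.h"

// Hypothetical helper: configure a type converter with the bare-pointer
// calling convention, mirroring what convert-gpu-to-nvvm does internally
// when use-bare-ptr-memref-call-conv=1 is set (see the .cpp hunk above).
static void buildBarePtrLowering(mlir::MLIRContext *ctx) {
  mlir::LowerToLLVMOptions options(ctx);
  // Statically shaped memref arguments now lower to a single pointer
  // instead of the full memref descriptor.
  options.useBarePtrCallConv = true;
  mlir::LLVMTypeConverter converter(ctx, options);
  // `converter` would then be handed to the populate*ConversionPatterns
  // helpers when assembling a custom lowering pipeline.
}
```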