diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -32,6 +32,7 @@
 option(MLIR_RUN_CUDA_SM80_TESTS "Run CUDA A100 tests.")
 option(MLIR_RUN_CUDA_SM80_LT_TESTS "Run CUDA A100 structured sparsity tests.")
 option(MLIR_RUN_CUDA_SM90_TESTS "Run CUDA H100 tests.")
+option(MLIR_RUN_CUDA_SM90a_TESTS "Run CUDA H100 sm_90a tests.")
 option(MLIR_RUN_ARM_SVE_TESTS "Run Arm SVE tests.")
 option(MLIR_RUN_ARM_SME_TESTS "Run Arm SME tests.")
@@ -73,6 +74,7 @@
   MLIR_RUN_CUDA_SM80_TESTS
   MLIR_RUN_CUDA_SM80_LT_TESTS
   MLIR_RUN_CUDA_SM90_TESTS
+  MLIR_RUN_CUDA_SM90a_TESTS
   )
 configure_lit_site_cfg(
diff --git a/mlir/test/Integration/GPU/CUDA/sm90a/gemm_m128n128k128_f32f16f16.mlir b/mlir/test/Integration/GPU/CUDA/sm90a/gemm_m128n128k128_f32f16f16.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/sm90a/gemm_m128n128k128_f32f16f16.mlir
@@ -0,0 +1,262 @@
+// RUN: mlir-opt %s --convert-nvgpu-to-nvvm -gpu-kernel-outlining \
+// RUN: -convert-scf-to-cf -convert-nvvm-to-llvm \
+// RUN: -convert-vector-to-llvm \
+// RUN: -convert-math-to-llvm \
+// RUN: -expand-strided-metadata \
+// RUN: -lower-affine \
+// RUN: -convert-index-to-llvm=index-bitwidth=32 \
+// RUN: -convert-arith-to-llvm \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -convert-func-to-llvm \
+// RUN: -canonicalize \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm{use-opaque-pointers=1},lower-affine,convert-scf-to-cf,convert-vector-to-llvm,convert-math-to-llvm,expand-strided-metadata,lower-affine,convert-index-to-llvm{index-bitwidth=32},convert-arith-to-llvm,reconcile-unrealized-casts,gpu-to-cubin{chip=sm_90 features=+ptx80 dump-ptx}))' \
+// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
+
+// CHECK-PTX: mbarrier.init.shared.b64
+// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
+// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
+// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
+// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
+// CHECK-PTX: mbarrier.try_wait.parity.shared.b64
+// CHECK-PTX: wgmma.fence.sync.aligned;
+// CHECK-PTX: wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16
+// CHECK-PTX: wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16
+// CHECK-PTX: wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16
+// CHECK-PTX: wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16
+// CHECK-PTX: wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16
+// CHECK-PTX: wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16
+// CHECK-PTX: wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16
+// CHECK-PTX: wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16
+// CHECK-PTX: wgmma.commit_group.sync.aligned
+// CHECK-PTX: wgmma.wait_group.sync.aligned 1
+
+
+// # GEMM Sequential
+// ------------------------------------------
+// for(i...128)
+//   for(j...128)
+//     for(k...128)
+//       D = A * B
+//
+// # Parallel H100 GEMM (1 CTA, 1 warpgroup = 128 threads)
+// ------------------------------------------
+// wgmma.mma_async m64n128k16
+// wgmma.mma_async m64n128k16
+// wgmma.mma_async m64n128k16
+// wgmma.mma_async m64n128k16
+// wgmma.mma_async m64n128k16
+// wgmma.mma_async m64n128k16
+// wgmma.mma_async m64n128k16
+// wgmma.mma_async m64n128k16
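+//
+// The eight wgmma.mma_async ops map onto the 128x128 output as two chains of
+// four: each chain produces one 64x128 f32 accumulator (the two halves of the
+// output along M), and each op in a chain consumes one k16 slice of the
+// operands, which is why the PTX checks above expect exactly eight
+// wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16 instructions. With a
+// single 128-thread warpgroup, each thread holds 64x128 / 128 = 64 f32 values
+// of its accumulator, i.e. one !accMatrix struct per chain (defined below).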
+
+!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!tokenType = !nvgpu.mbarrier.token
+
+// Host and shared-memory matrix types, plus the TMA descriptor types.
+!lhs = memref<128x64xf16>
+!rhs = memref<64x128xf16>
+
+!shmemlhs = memref<128x64xf16, 3>
+!shmemrhs = memref<64x128xf16, 3>
+
+!lhsTensorMap = !nvgpu.tensormap.descriptor
+!rhsTensorMap = !nvgpu.tensormap.descriptor
+
+!accMatrix = !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
+                           f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
+                           f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
+                           f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+
+module @mymod {
+memref.global "private" @bufferLhsGlobal : !shmemlhs
+memref.global "private" @bufferRhsGlobal : !shmemrhs
+
+func.func @main() {
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %f2 = arith.constant 2.0 : f16
+  %f3 = arith.constant 3.0 : f16
+
+  // Step 1. Allocate the matrices on the host for the sake of simplicity.
+  %lhs = memref.alloc() : !lhs
+  %rhs = memref.alloc() : !rhs
+
+  // Step 2. Initialize the input matrices (LHS with 2.0, RHS with 3.0).
+  scf.for %i = %c0 to %c64 step %c1 {
+    scf.for %j = %c0 to %c128 step %c1 {
+      memref.store %f3, %rhs[%i, %j] : !rhs
+      memref.store %f2, %lhs[%j, %i] : !lhs
+    }
+  }
+
+  // Step 3. Copy the data to the device.
+  %0 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%0] () : !lhs
+  %memref_1, %asyncToken_2 = gpu.alloc async [%0] () : !rhs
+  %1 = gpu.memcpy async [%0] %memref, %lhs : !lhs, !lhs
+  %2 = gpu.memcpy async [%0] %memref_1, %rhs : !rhs, !rhs
+
+  // Step 4. Create two TMA descriptors for the input matrices.
+  %lhs_unranked = memref.cast %memref : !lhs to memref<*xf16>
+  %rhs_unranked = memref.cast %memref_1 : !rhs to memref<*xf16>
+
+  %lhsTensorMap = nvgpu.tma.create.descriptor %lhs_unranked box[%c128, %c64] : memref<*xf16> -> !lhsTensorMap
+  %rhsTensorMap = nvgpu.tma.create.descriptor %rhs_unranked box[%c128, %c64] : memref<*xf16> -> !rhsTensorMap
+
+  // Step 5. Launch the kernel: 1 CTA with a single 128-thread warpgroup.
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c128, %block_y = %c1, %block_z = %c1) {
+    %num_threads = gpu.block_dim x
+    %ic0 = arith.constant 0 : index
+    %ic7 = arith.constant 7 : index
+    %ic1 = arith.constant 1 : index
+    %ic2 = arith.constant 2 : index
+    %ic45 = arith.constant 45 : index
+    %ic63 = arith.constant 63 : index
+    %ic15 = arith.constant 15 : index
+    %ic127 = arith.constant 127 : index
+    %tidx = gpu.thread_id x
+
+    // Step 1. Get the shared memory pointers. Dynamic shared memory would also
+    // work; static buffers are used here for simplicity.
+    %lhsShmem = memref.get_global @bufferLhsGlobal : !shmemlhs
+    %rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs
+
+    // Step 2. Create a barrier. It is an i64 value in shared memory.
+    %barrier = nvgpu.mbarrier.create -> !barrierType
+
+    // Step 3. Initialize the barrier with the number of participating threads.
+    nvgpu.mbarrier.init %barrier, %num_threads : !barrierType
+    gpu.barrier
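+
+    // The two TMA loads in Step 3.1 below deposit both operand tiles into
+    // shared memory. The transaction count the barrier expects is their total
+    // size in bytes: (128*64 + 64*128) f16 elements * 2 bytes each = 32768
+    // bytes, which is exactly what %txcount computes from the memref shapes.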
+    // Step 3.1. Only threadIdx.x == 0 issues the TMA requests and arrives on
+    // the barrier with the expected transaction count; the other threads
+    // arrive with a count of zero.
+    %cnd = arith.cmpi eq, %tidx, %ic0 : index
+    scf.if %cnd {
+      %x1 = memref.dim %lhsShmem, %c0 : !shmemlhs
+      %x2 = memref.dim %lhsShmem, %c1 : !shmemlhs
+      %x3 = memref.dim %rhsShmem, %c0 : !shmemrhs
+      %x4 = memref.dim %rhsShmem, %c1 : !shmemrhs
+      %x5 = arith.muli %x1, %x2 : index
+      %x6 = arith.muli %x3, %x4 : index
+      %x7 = arith.addi %x5, %x6 : index
+      %txcount = arith.muli %x7, %ic2 : index
+      gpu.printf "[GPU] TMA SIZE %d\n" %txcount : index
+
+      %lhs0 = memref.load %lhsShmem[%ic0, %ic0] : !shmemlhs
+      %rhs0 = memref.load %rhsShmem[%ic0, %ic0] : !shmemrhs
+      %lhs032 = arith.extf %lhs0 : f16 to f32
+      %rhs032 = arith.extf %rhs0 : f16 to f32
+      gpu.printf "[GPU] Before TMA Shmem lhs[0][0] \t %f\n" %lhs032 : f32
+      gpu.printf "[GPU] Before TMA Shmem rhs[0][0] \t %f\n" %rhs032 : f32
+      nvgpu.tma.async.load %lhsTensorMap[%ic0, %ic0], %barrier to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
+      nvgpu.tma.async.load %rhsTensorMap[%ic0, %ic0], %barrier to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
+      nvgpu.mbarrier.arrive.expect_tx %barrier, %txcount : !barrierType
+      scf.yield
+    } else {
+      nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !barrierType
+      scf.yield
+    }
+
+    // Wait until the TMA transfers have completed.
+    %phase = arith.constant 0 : index
+    %ticks = arith.constant 10000000 : index
+    nvgpu.mbarrier.try_wait.parity %barrier, %phase, %ticks : !barrierType
+
+    %descA = nvgpu.wgmma.generate.descriptor %lhsShmem, %lhsTensorMap : !shmemlhs, !lhsTensorMap
+    %descB = nvgpu.wgmma.generate.descriptor %rhsShmem, %rhsTensorMap : !shmemrhs, !rhsTensorMap
+
+    // Step 4. Sanity check of the TMA loads.
+    scf.if %cnd {
+      %lhs0 = memref.load %lhsShmem[%ic7, %ic7] : !shmemlhs
+      %rhs0 = memref.load %rhsShmem[%ic7, %ic1] : !shmemrhs
+      %lhs032 = arith.extf %lhs0 : f16 to f32
+      %rhs032 = arith.extf %rhs0 : f16 to f32
+      gpu.printf "[GPU] TMA Loaded shmem lhs[7][7] \t %f\n" %lhs032 : f32
+      gpu.printf "[GPU] TMA Loaded shmem rhs[7][1] \t %f\n" %rhs032 : f32
+      gpu.printf "WGMMA DescA : 0x%llx\n" %descA : i64
+      gpu.printf "WGMMA DescB : 0x%llx\n" %descB : i64
+    }
+
+    // Step 6. GEMM 128x128x128
+    %d2 = arith.constant 2 : i64
+    %d4 = arith.constant 4 : i64
+    %d6 = arith.constant 6 : i64
+    %d8 = arith.constant 8 : i64
+    %d512 = arith.constant 512 : i64
+    %d514 = arith.constant 514 : i64
+    %d516 = arith.constant 516 : i64
+    %d518 = arith.constant 518 : i64
+
+    %scaleD = arith.constant 1 : i32 // D = A*B (nothing is accumulated into the initial D)
+    nvvm.wgmma.fence.aligned
+    // First accumulator chain: rows 0..63 of the output.
+    %wgmma_result_00 = llvm.mlir.undef : !accMatrix
+    %wgmma_result_01 = nvvm.wgmma.mma_async %descA, %descB,
+        #nvvm.shape<m = 64, n = 128, k = 16>,
+        D [%wgmma_result_00, #nvvm.wgmma_scale_out],
+        A [, #nvvm.wgmma_scale_in, ],
+        B [, #nvvm.wgmma_scale_in, ] : !accMatrix -> !accMatrix
+    %descA1 = arith.addi %descA, %d2 : i64
+    %descB1 = arith.addi %descB, %d2 : i64
+    %wgmma_result_02 = nvvm.wgmma.mma_async %descA1, %descB1,
+        #nvvm.shape<m = 64, n = 128, k = 16>,
+        D [%wgmma_result_01, #nvvm.wgmma_scale_out],
+        A [, #nvvm.wgmma_scale_in, ],
+        B [, #nvvm.wgmma_scale_in, ] : !accMatrix -> !accMatrix
+    %descA2 = arith.addi %descA, %d4 : i64
+    %descB2 = arith.addi %descB, %d4 : i64
+    %wgmma_result_03 = nvvm.wgmma.mma_async %descA2, %descB2,
+        #nvvm.shape<m = 64, n = 128, k = 16>,
+        D [%wgmma_result_02, #nvvm.wgmma_scale_out],
+        A [, #nvvm.wgmma_scale_in, ],
+        B [, #nvvm.wgmma_scale_in, ] : !accMatrix -> !accMatrix
+    %descA3 = arith.addi %descA, %d6 : i64
+    %descB3 = arith.addi %descB, %d6 : i64
+    %wgmma_result_04 = nvvm.wgmma.mma_async %descA3, %descB3,
+        #nvvm.shape<m = 64, n = 128, k = 16>,
+        D [%wgmma_result_03, #nvvm.wgmma_scale_out],
+        A [, #nvvm.wgmma_scale_in, ],
+        B [, #nvvm.wgmma_scale_in, ] : !accMatrix -> !accMatrix
+
+    // Second accumulator chain: rows 64..127 of the output.
+    %wgmma_result_10 = llvm.mlir.undef : !accMatrix
+    %descA4 = arith.addi %descA, %d512 : i64
+    %wgmma_result_11 = nvvm.wgmma.mma_async %descA4, %descB,
+        #nvvm.shape<m = 64, n = 128, k = 16>,
+        D [%wgmma_result_10, #nvvm.wgmma_scale_out],
+        A [, #nvvm.wgmma_scale_in, ],
+        B [, #nvvm.wgmma_scale_in, ] : !accMatrix -> !accMatrix
+    %descA5 = arith.addi %descA, %d514 : i64
+    %descB5 = arith.addi %descB, %d2 : i64
+    %wgmma_result_12 = nvvm.wgmma.mma_async %descA5, %descB5,
+        #nvvm.shape<m = 64, n = 128, k = 16>,
+        D [%wgmma_result_11, #nvvm.wgmma_scale_out],
+        A [, #nvvm.wgmma_scale_in, ],
+        B [, #nvvm.wgmma_scale_in, ] : !accMatrix -> !accMatrix
+    %descA6 = arith.addi %descA, %d516 : i64
+    %descB6 = arith.addi %descB, %d4 : i64
+    %wgmma_result_13 = nvvm.wgmma.mma_async %descA6, %descB6,
+        #nvvm.shape<m = 64, n = 128, k = 16>,
+        D [%wgmma_result_12, #nvvm.wgmma_scale_out],
+        A [, #nvvm.wgmma_scale_in, ],
+        B [, #nvvm.wgmma_scale_in, ] : !accMatrix -> !accMatrix
+    %descA7 = arith.addi %descA, %d518 : i64
+    %descB7 = arith.addi %descB, %d6 : i64
+    %wgmma_result_14 = nvvm.wgmma.mma_async %descA7, %descB7,
+        #nvvm.shape<m = 64, n = 128, k = 16>,
+        D [%wgmma_result_13, #nvvm.wgmma_scale_out],
+        A [, #nvvm.wgmma_scale_in, ],
+        B [, #nvvm.wgmma_scale_in, ] : !accMatrix -> !accMatrix
+
+    nvvm.wgmma.commit.group.sync.aligned
+    nvvm.wgmma.wait.group.sync.aligned 1
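+
+    // With the operands initialized above (LHS all 2.0, RHS all 3.0) and four
+    // k16 steps per accumulator chain, every accumulator element is expected
+    // to be 2.0 * 3.0 * 64 = 384.0 (assuming the initial, undefined
+    // accumulator contributes as zero), which is what the Step 7 printout
+    // below should show.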
+
+    // Step 7. Sanity check of the WGMMA results: print a few accumulator
+    // values from both result chains.
+    %cnd125 = arith.cmpi uge, %tidx, %ic63 : index
+    %r0 = llvm.extractvalue %wgmma_result_04[0] : !accMatrix
+    %r1 = llvm.extractvalue %wgmma_result_04[1] : !accMatrix
+    %r2 = llvm.extractvalue %wgmma_result_04[2] : !accMatrix
+    %r3 = llvm.extractvalue %wgmma_result_14[0] : !accMatrix
+    %r4 = llvm.extractvalue %wgmma_result_14[1] : !accMatrix
+    %r5 = llvm.extractvalue %wgmma_result_14[2] : !accMatrix
+    gpu.printf "[GPU][thread=%3d] WGMMA [0]=%4.1f [1]=%4.1f [2]=%4.1f [3]=%4.1f [4]=%4.1f [5]=%4.1f\n" %tidx, %r0, %r1, %r2, %r3, %r4, %r5 : index, f32, f32, f32, f32, f32, f32
+    gpu.terminator
+  }
+  return
+}
+}
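To cross-check the values the kernel prints in Step 7, the short host-side reference below recomputes the same product (an illustrative sketch only, not part of the patch; the NumPy usage and names are assumptions): a 128x64 LHS of 2.0s times a 64x128 RHS of 3.0s gives 384.0 for every output element.

# Reference for the values printed in Step 7 (illustrative sketch).
import numpy as np

M, N, K = 128, 128, 64                                  # shapes of !lhs (128x64) and !rhs (64x128)
lhs = np.full((M, K), 2.0, dtype=np.float16)            # matches the %f2 stores
rhs = np.full((K, N), 3.0, dtype=np.float16)            # matches the %f3 stores
ref = lhs.astype(np.float32) @ rhs.astype(np.float32)   # f32 accumulation, like wgmma
assert np.all(ref == 2.0 * 3.0 * K)                     # every element is 384.0
print(ref[0, 0])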
diff --git a/mlir/test/Integration/GPU/CUDA/sm90a/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/sm90a/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/sm90a/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.enable_cuda_runner or not config.mlir_run_cuda_sm90a_tests:
+    config.unsupported = True
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -49,6 +49,7 @@
 config.mlir_run_cuda_sm80_tests = @MLIR_RUN_CUDA_SM80_TESTS@
 config.mlir_run_cuda_sm80_lt_tests = @MLIR_RUN_CUDA_SM80_LT_TESTS@
 config.mlir_run_cuda_sm90_tests = @MLIR_RUN_CUDA_SM90_TESTS@
+config.mlir_run_cuda_sm90a_tests = @MLIR_RUN_CUDA_SM90a_TESTS@
 config.mlir_include_integration_tests = @MLIR_INCLUDE_INTEGRATION_TESTS@
 config.arm_emulator_executable = "@ARM_EMULATOR_EXECUTABLE@"
 config.arm_emulator_options = "@ARM_EMULATOR_OPTIONS@"
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -52,6 +52,7 @@
         "@MLIR_RUN_CUDA_SM80_TESTS@": "0",
         "@MLIR_RUN_CUDA_SM80_LT_TESTS@": "0",
         "@MLIR_RUN_CUDA_SM90_TESTS@": "0",
+        "@MLIR_RUN_CUDA_SM90a_TESTS@": "0",
         "@MLIR_INCLUDE_INTEGRATION_TESTS@": "0",
         "@SHLIBDIR@": package_path("//llvm:BUILD"),
     },
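Taken together, the configuration changes gate the new sm90a directory the same way the existing SM90 tests are gated: CMake exports MLIR_RUN_CUDA_SM90a_TESTS into lit.site.cfg.py (the Bazel overlay pins it to "0"), and lit.local.cfg marks the directory unsupported unless both knobs are on. A minimal sketch of that combined condition (illustrative only; the function name is made up):

# Mirrors mlir/test/Integration/GPU/CUDA/sm90a/lit.local.cfg:
# the directory is unsupported unless both flags are set.
def sm90a_dir_enabled(enable_cuda_runner: bool, run_sm90a_tests: bool) -> bool:
    return not (not enable_cuda_runner or not run_sm90a_tests)

assert sm90a_dir_enabled(True, True)
assert not sm90a_dir_enabled(True, False)
assert not sm90a_dir_enabled(False, True)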