This is an archive of the discontinued LLVM Phabricator instance.

[MLIR] Add sm_90a integration test with `wgmma`
Needs Review · Public

Authored by guraypp on Aug 17 2023, 12:20 AM.

Details

Summary

This work first introduces a dedicated sm_90a integration test.

It also introduces a new test that uses sm_90a instructions such as TMA load and warpgroup matrix multiplication (aka wgmma).

Depends on D157382

Diff Detail

Event Timeline

guraypp created this revision.Aug 17 2023, 12:20 AM
Herald added a project: Restricted Project. Aug 17 2023, 12:20 AM
guraypp requested review of this revision.Aug 17 2023, 12:20 AM
manishucsd added inline comments.Aug 24 2023, 6:40 PM
mlir/test/Integration/GPU/CUDA/sm90a/gemm_m128n128k128_f32f16f16.mlir
185

Can you please explain or better draw out these desc updates?

I see four k-groups and two wgmma.mma_async.m64n128k16

247

Can you please explain how nvvm.wgmma.commit.group.sync.aligned and nvvm.wgmma.wait.group.sync.aligned 1 works here? Use the CTA Shape, wgmma math shape in your explanation as concrete example?

Also, are you able to make a functional test pass with random input distribution for operands?

guraypp added inline comments.Aug 25 2023, 5:16 AM
mlir/test/Integration/GPU/CUDA/sm90a/gemm_m128n128k128_f32f16f16.mlir
185

Right, there are 4 x wgmma.m64n128k16; together they complete 64x128x64.
The test name (gemm_m128n128k128_f32f16f16) is misleading; it should be gemm_m128n128k64_f32f16f16.
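The tile arithmetic behind this answer can be sketched as follows; the shapes are taken from the thread (one wgmma.mma_async.m64n128k16 instruction per k-group, four k-groups completing a 64x128x64 tile):

```python
# Sketch of the k-group decomposition discussed above.
# WGMMA_* is the shape of one wgmma.mma_async.m64n128k16 instruction;
# TILE_* is the per-warpgroup tile the four issues complete (64x128x64).
WGMMA_M, WGMMA_N, WGMMA_K = 64, 128, 16
TILE_M, TILE_N, TILE_K = 64, 128, 64

k_groups = TILE_K // WGMMA_K  # wgmma issues needed along K
print(k_groups)  # 4 x wgmma.mma_async.m64n128k16, matching the review comment
```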

247

Can you please explain how nvvm.wgmma.commit.group.sync.aligned and nvvm.wgmma.wait.group.sync.aligned 1 works here?

Regarding nvvm.wgmma.commit.group.sync.aligned: it commits the group of 8 wgmma operations, and nvvm.wgmma.wait.group.sync.aligned 1 waits for their completion.

Interestingly, when analyzing the PTX generated by CUTLASS, I see a trailing wgmma.wait_group.sync.aligned 0. The instruction sequence appears as follows:

wgmma.fence.sync.aligned;
8 x wgmma.mma_async;
wgmma.commit_group.sync.aligned;
wgmma.wait_group.sync.aligned 1;
...
wgmma.wait_group.sync.aligned 0;  <--- Do we need this?
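My reading of the commit/wait semantics in this sequence, as a small model (this is an illustration of the bookkeeping, not the hardware implementation): commit_group moves all outstanding async wgmma ops into a new group, and wait_group N blocks until at most N committed groups remain pending.

```python
# Toy model of wgmma commit/wait group bookkeeping (assumed semantics).
pending_groups = []  # committed groups of async wgmma ops, oldest first
outstanding = []     # wgmma.mma_async ops issued but not yet committed

def wgmma_mma_async(op):
    outstanding.append(op)

def commit_group():
    # Move all outstanding async ops into one newly committed group.
    global outstanding
    pending_groups.append(list(outstanding))
    outstanding = []

def wait_group(n):
    # Block until at most n committed groups are still pending
    # (modeled here by retiring the oldest groups).
    while len(pending_groups) > n:
        pending_groups.pop(0)

for i in range(8):
    wgmma_mma_async(f"wgmma_{i}")
commit_group()

wait_group(1)
after_wait_1 = len(pending_groups)  # 1: the committed group may still be in flight
wait_group(0)
after_wait_0 = len(pending_groups)  # 0: all 8 wgmma ops are guaranteed complete
```

Under this model, wait_group 1 alone does not guarantee the single committed group has finished, which would explain why CUTLASS emits the final wait_group 0 before the results are consumed.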

Also, are you able to make a functional test pass with random input distribution for operands?

Yes, but I need a more rigorous test. Right now there's a bug in the TMA path upstream: I create the TMA descriptor using cuTensorMapEncodeTiled, and I do something wrong there. Once I resolve that issue, I plan to create another test that uses random input data and verifies the output.

guraypp added inline comments.Aug 25 2023, 7:07 AM
mlir/test/Integration/GPU/CUDA/sm90a/gemm_m128n128k128_f32f16f16.mlir
247

Right now, there's a bug in the TMA in upstream.

I correct myself: it isn't a bug. TMA expects 128b alignment for 2D+ transfers. To cover the full tile, I need to execute 3 x `cp.async.bulk.tensor`:

  • 1 x [128][64] for matrix-A
  • 2 x [64][64] for matrix-B
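A quick arithmetic check of this split, assuming f16 operands (2 bytes/element) as suggested by the test name; the byte-alignment interpretation at the end is my assumption, not stated in the thread:

```python
# Box shapes of the three cp.async.bulk.tensor transfers listed above.
a_boxes = [(128, 64)]            # 1 x [128][64] for matrix A
b_boxes = [(64, 64), (64, 64)]   # 2 x [64][64] for matrix B

def elems(boxes):
    return sum(rows * cols for rows, cols in boxes)

assert elems(a_boxes) == 128 * 64   # A tile covered by one transfer
assert elems(b_boxes) == 128 * 64   # B tile covered by two transfers

# Assumption: with f16, an innermost box extent of 64 elements is
# 64 * 2 = 128 bytes, which would line up with the 128b alignment
# constraint mentioned above.
inner_bytes = 64 * 2
```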