This is an archive of the discontinued LLVM Phabricator instance.

mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir
75	nit: can we rename %10 to %is_thread_0 for legibility?
88	Can we test with another thread here too? E.g. use %is_thread_0_or_thread_42 and CHECK-DAG that both threads see the same value.
98	nit: missing newline at end of file

This revision is now accepted and ready to land. (Jul 21 2023, 12:04 AM)

guraypp added inline comments. (Jul 21 2023, 12:34 AM)

mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir
88	Yes, we can. I will do it in a better way: I will implement the elect instruction, which will select the fastest available thread.

rebase

Harbormaster completed remote builds in B247607: Diff 543452. (Jul 24 2023, 4:01 AM)

@mehdi_amini @kerrmudgeon related to https://reviews.llvm.org/D155463: it seems we are having difficulty getting this e2e example to run all the way through when using -test-lower-to-nvvm instead of the ginormous command line.

If you are presently interested in this line of work and have cycles, we'd happily take some help :)

Text PTX not the runtime

Harbormaster completed remote builds in B248852: Diff 545163. (Jul 28 2023, 8:40 AM)

nicolasvasilache accepted this revision. (Jul 28 2023, 1:53 PM)

Closed by commit rGca74ad88295f: [mlir] Nvidia Hopper TMA load integration test (authored by guraypp, committed by nicolasvasilache). · Explain Why (Jul 28 2023, 2:06 PM)

This revision was automatically updated to reflect the committed changes.

nicolasvasilache added a commit: rGca74ad88295f: [mlir] Nvidia Hopper TMA load integration test.

Revision Contents

Path

Size

mlir/

test/

CMakeLists.txt

2 lines

Integration/

GPU/

CUDA/

sm90/

lit.local.cfg

2 lines

tmaload.mlir

97 lines

lit.site.cfg.py.in

1 line

utils/

bazel/

llvm-project-overlay/

mlir/

test/

BUILD.bazel

7 lines

Diff 543452

mlir/test/CMakeLists.txt

Show All 25 Lines	set(ARM_EMULATOR_LLI_EXECUTABLE "" CACHE STRING
"If arch-specific Arm integration tests run emulated, use this Arm native lli.")		"If arch-specific Arm integration tests run emulated, use this Arm native lli.")
set(ARM_EMULATOR_UTILS_LIB_DIR "" CACHE STRING		set(ARM_EMULATOR_UTILS_LIB_DIR "" CACHE STRING
"If arch-specific Arm integration tests run emulated, find Arm native utility libraries in this directory.")		"If arch-specific Arm integration tests run emulated, find Arm native utility libraries in this directory.")
option(MLIR_RUN_AMX_TESTS "Run AMX tests.")		option(MLIR_RUN_AMX_TESTS "Run AMX tests.")
option(MLIR_RUN_X86VECTOR_TESTS "Run X86Vector tests.")		option(MLIR_RUN_X86VECTOR_TESTS "Run X86Vector tests.")
option(MLIR_RUN_CUDA_TENSOR_CORE_TESTS "Run CUDA Tensor core WMMA tests.")		option(MLIR_RUN_CUDA_TENSOR_CORE_TESTS "Run CUDA Tensor core WMMA tests.")
option(MLIR_RUN_CUDA_SM80_TESTS "Run CUDA A100 tests.")		option(MLIR_RUN_CUDA_SM80_TESTS "Run CUDA A100 tests.")
option(MLIR_RUN_CUDA_SM80_LT_TESTS "Run CUDA A100 structured sparsity tests.")		option(MLIR_RUN_CUDA_SM80_LT_TESTS "Run CUDA A100 structured sparsity tests.")
		option(MLIR_RUN_CUDA_SM90_TESTS "Run CUDA H100 tests.")
option(MLIR_RUN_ARM_SVE_TESTS "Run Arm SVE tests.")		option(MLIR_RUN_ARM_SVE_TESTS "Run Arm SVE tests.")
option(MLIR_RUN_ARM_SME_TESTS "Run Arm SME tests.")		option(MLIR_RUN_ARM_SME_TESTS "Run Arm SME tests.")


# The native target may not be enabled when cross compiling, raise an error.		# The native target may not be enabled when cross compiling, raise an error.
if(NOT MLIR_ENABLE_EXECUTION_ENGINE)		if(NOT MLIR_ENABLE_EXECUTION_ENGINE)
message(FATAL_ERROR "MLIR_INCLUDE_INTEGRATION_TESTS requires a native target")		message(FATAL_ERROR "MLIR_INCLUDE_INTEGRATION_TESTS requires a native target")
endif()		endif()
Show All 24 Lines	llvm_canonicalize_cmake_booleans(
MLIR_INCLUDE_INTEGRATION_TESTS		MLIR_INCLUDE_INTEGRATION_TESTS
MLIR_RUN_AMX_TESTS		MLIR_RUN_AMX_TESTS
MLIR_RUN_CUDA_TENSOR_CORE_TESTS		MLIR_RUN_CUDA_TENSOR_CORE_TESTS
MLIR_RUN_X86VECTOR_TESTS		MLIR_RUN_X86VECTOR_TESTS
MLIR_RUN_ARM_SVE_TESTS		MLIR_RUN_ARM_SVE_TESTS
MLIR_RUN_ARM_SME_TESTS		MLIR_RUN_ARM_SME_TESTS
MLIR_RUN_CUDA_SM80_TESTS		MLIR_RUN_CUDA_SM80_TESTS
MLIR_RUN_CUDA_SM80_LT_TESTS		MLIR_RUN_CUDA_SM80_LT_TESTS
		MLIR_RUN_CUDA_SM90_TESTS
)		)

configure_lit_site_cfg(		configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in		${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py		${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
MAIN_CONFIG		MAIN_CONFIG
${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py		${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
)		)
▲ Show 20 Lines • Show All 110 Lines • Show Last 20 Lines

mlir/test/Integration/GPU/CUDA/sm90/lit.local.cfg

This file was added.

				if not config.enable_cuda_runner or not config.mlir_run_cuda_sm90_tests:
				config.unsupported = True

mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir

This file was added.

				// RUN: mlir-opt %s --convert-nvgpu-to-nvvm -gpu-kernel-outlining \
				// RUN: -convert-scf-to-cf -convert-nvvm-to-llvm \
				// RUN: -convert-vector-to-llvm \
				gurayppAuthorUnsubmitted Done Reply Inline Actions The next step will be using `-test-lower-to-nvvm` pass. guraypp: The next step will be using `-test-lower-to-nvvm` pass.
				// RUN: -convert-math-to-llvm \
				// RUN: -expand-strided-metadata \
				// RUN: -lower-affine \
				// RUN: -convert-index-to-llvm=index-bitwidth=32 \
				// RUN: -convert-arith-to-llvm \
				// RUN: -finalize-memref-to-llvm \
				// RUN: -convert-func-to-llvm \
				// RUN: -canonicalize \
				// RUN: \| mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm{use-opaque-pointers=1},lower-affine,convert-scf-to-cf,convert-vector-to-llvm,convert-math-to-llvm,expand-strided-metadata,lower-affine,convert-index-to-llvm{index-bitwidth=32},convert-arith-to-llvm,reconcile-unrealized-casts,gpu-to-cubin{chip=sm_90 features=+ptx80 dump-ptx}))' \
				// RUN: \| mlir-opt -convert-index-to-llvm=index-bitwidth=32 \
				// RUN: -gpu-to-llvm \
				// RUN: -convert-func-to-llvm \
				// RUN: -cse -canonicalize \
				// RUN: -reconcile-unrealized-casts \
				// RUN: \| mlir-cpu-runner \
				// RUN: --shared-libs=%mlir_cuda_runtime \
				// RUN: --shared-libs=%mlir_runner_utils \
				// RUN: --entry-point-result=void \
				// RUN: \| FileCheck %s

				// CHECK: [GPU] TMA BEFORE lhs[45][7] 0.000000
				// CHECK: [GPU] TMA BEFORE rhs[7][0] 0.000000
				// CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
				// CHECK: [GPU] TMA LOADED rhs[7][0] 3.000000

				module @mymod {
				memref.global "private" @bufferLhsGlobal : memref<64x8xf32, 3>
				memref.global "private" @bufferRhsGlobal : memref<8x128xf32, 3>
				func.func @main() {
				%c10000000 = arith.constant 10000000 : index
				%c6144 = arith.constant 6144 : index
				%c45 = arith.constant 45 : index
				%c7 = arith.constant 7 : index
				%c64 = arith.constant 64 : index
				%c1 = arith.constant 1 : index
				%c0 = arith.constant 0 : index
				%c8 = arith.constant 8 : index
				%c128 = arith.constant 128 : index
				%cst = arith.constant 3.000000e+00 : f32
				%alloc = memref.alloc() : memref<64x8xf32>
				%alloc_0 = memref.alloc() : memref<8x128xf32>
				scf.for %arg0 = %c0 to %c8 step %c1 {
				scf.for %arg1 = %c0 to %c128 step %c1 {
				memref.store %cst, %alloc_0[%arg0, %arg1] : memref<8x128xf32>
				}
				}
				scf.for %arg0 = %c0 to %c64 step %c1 {
				scf.for %arg1 = %c0 to %c8 step %c1 {
				%5 = arith.index_cast %arg1 : index to i64
				%6 = arith.uitofp %5 : i64 to f32
				memref.store %6, %alloc[%arg0, %arg1] : memref<64x8xf32>
				}
				}
				%0 = gpu.wait async
				%memref, %asyncToken = gpu.alloc async [%0] () : memref<64x8xf32>
				%memref_1, %asyncToken_2 = gpu.alloc async [%0] () : memref<8x128xf32>
				%1 = gpu.memcpy async [%0] %memref, %alloc : memref<64x8xf32>, memref<64x8xf32>
				%2 = gpu.memcpy async [%0] %memref_1, %alloc_0 : memref<8x128xf32>, memref<8x128xf32>
				%cast = memref.cast %memref : memref<64x8xf32> to memref<*xf32>
				%cast_3 = memref.cast %memref_1 : memref<8x128xf32> to memref<*xf32>
				%3 = nvgpu.tma.create.descriptor %cast box[%c64, %c8] : memref<*xf32> -> <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
				%4 = nvgpu.tma.create.descriptor %cast_3 box[%c8, %c128] : memref<*xf32> -> <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
				gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
				%5 = gpu.block_dim x
				%6 = gpu.thread_id x
				%7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
				%8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
				%9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
				nvgpu.mbarrier.init %9, %5 : <memorySpace = #gpu.address_space<workgroup>>
				gpu.barrier
				%10 = arith.cmpi eq, %6, %c0 : index
				scf.if %10 {
				nicolasvasilacheUnsubmitted Not Done Reply Inline Actions nit: can we rename %10 into %is_thread_0 for legibility ? nicolasvasilache: nit: can we rename %10 into %is_thread_0 for legibility ?
				nvgpu.mbarrier.arrive.expect_tx %9, %c6144 : <memorySpace = #gpu.address_space<workgroup>>
				%11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
				%12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
				gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
				gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
				nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
				nvgpu.tma.async.load %4[%c0, %c0], %9 to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
				} else {
				nvgpu.mbarrier.arrive.expect_tx %9, %c0 : <memorySpace = #gpu.address_space<workgroup>>
				}
				nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
				scf.if %10 {
				%11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
				nicolasvasilacheUnsubmitted Not Done Reply Inline Actions can we test with another thread here too ? e.g. %is_thread_0_or_thread_42 and CHECK-DAG that both see the same value? nicolasvasilache: can we test with another thread here too ? e.g. %is_thread_0_or_thread_42 and CHECK-DAG that…
				gurayppAuthorUnsubmitted Done Reply Inline Actions Yes we can. I will do it in a better way. I will implement elect instruction that will select fastest available thread. guraypp: Yes we can. I will do it in a better way. I will implement elect instruction that will select…
				%12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
				gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
				gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
				}
				gpu.terminator
				}
				return
				}
				}
				No newline at end of file
				nicolasvasilacheUnsubmitted Not Done Reply Inline Actions nit: nl nicolasvasilache: nit: nl

mlir/test/lit.site.cfg.py.in

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	if config.mlir_run_arm_sve_tests:			if config.mlir_run_arm_sve_tests:
	config.available_features.add("mlir_arm_sve_tests")			config.available_features.add("mlir_arm_sve_tests")
	config.mlir_run_arm_sme_tests = @MLIR_RUN_ARM_SME_TESTS@			config.mlir_run_arm_sme_tests = @MLIR_RUN_ARM_SME_TESTS@
	config.mlir_run_x86vector_tests = @MLIR_RUN_X86VECTOR_TESTS@			config.mlir_run_x86vector_tests = @MLIR_RUN_X86VECTOR_TESTS@
	config.mlir_run_riscv_vector_tests = "@MLIR_RUN_RISCV_VECTOR_TESTS@"			config.mlir_run_riscv_vector_tests = "@MLIR_RUN_RISCV_VECTOR_TESTS@"
	config.mlir_run_cuda_tensor_core_tests = @MLIR_RUN_CUDA_TENSOR_CORE_TESTS@			config.mlir_run_cuda_tensor_core_tests = @MLIR_RUN_CUDA_TENSOR_CORE_TESTS@
	config.mlir_run_cuda_sm80_tests = @MLIR_RUN_CUDA_SM80_TESTS@			config.mlir_run_cuda_sm80_tests = @MLIR_RUN_CUDA_SM80_TESTS@
	config.mlir_run_cuda_sm80_lt_tests = @MLIR_RUN_CUDA_SM80_LT_TESTS@			config.mlir_run_cuda_sm80_lt_tests = @MLIR_RUN_CUDA_SM80_LT_TESTS@
				config.mlir_run_cuda_sm90_tests = @MLIR_RUN_CUDA_SM90_TESTS@
	config.mlir_include_integration_tests = @MLIR_INCLUDE_INTEGRATION_TESTS@			config.mlir_include_integration_tests = @MLIR_INCLUDE_INTEGRATION_TESTS@
	config.arm_emulator_executable = "@ARM_EMULATOR_EXECUTABLE@"			config.arm_emulator_executable = "@ARM_EMULATOR_EXECUTABLE@"
	config.arm_emulator_options = "@ARM_EMULATOR_OPTIONS@"			config.arm_emulator_options = "@ARM_EMULATOR_OPTIONS@"
	config.arm_emulator_mlir_cpu_runner_executable = "@ARM_EMULATOR_MLIR_CPU_RUNNER_EXECUTABLE@"			config.arm_emulator_mlir_cpu_runner_executable = "@ARM_EMULATOR_MLIR_CPU_RUNNER_EXECUTABLE@"
	config.arm_emulator_lli_executable = "@ARM_EMULATOR_LLI_EXECUTABLE@"			config.arm_emulator_lli_executable = "@ARM_EMULATOR_LLI_EXECUTABLE@"
	config.arm_emulator_utils_lib_dir = "@ARM_EMULATOR_UTILS_LIB_DIR@"			config.arm_emulator_utils_lib_dir = "@ARM_EMULATOR_UTILS_LIB_DIR@"
	config.riscv_vector_emulator_executable = "@RISCV_VECTOR_EMULATOR_EXECUTABLE@"			config.riscv_vector_emulator_executable = "@RISCV_VECTOR_EMULATOR_EXECUTABLE@"
	config.riscv_vector_emulator_options = "@RISCV_VECTOR_EMULATOR_OPTIONS@"			config.riscv_vector_emulator_options = "@RISCV_VECTOR_EMULATOR_OPTIONS@"
	config.riscv_emulator_lli_executable = "@RISCV_EMULATOR_LLI_EXECUTABLE@"			config.riscv_emulator_lli_executable = "@RISCV_EMULATOR_LLI_EXECUTABLE@"
	config.riscv_emulator_utils_lib_dir = "@RISCV_EMULATOR_UTILS_LIB_DIR@"			config.riscv_emulator_utils_lib_dir = "@RISCV_EMULATOR_UTILS_LIB_DIR@"

	import lit.llvm			import lit.llvm
	lit.llvm.initialize(lit_config, config)			lit.llvm.initialize(lit_config, config)

	# Let the main config do the real work.			# Let the main config do the real work.
	lit_config.load_config(config, "@MLIR_SOURCE_DIR@/test/lit.cfg.py")			lit_config.load_config(config, "@MLIR_SOURCE_DIR@/test/lit.cfg.py")

utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel

Show First 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	substitutions = {
"@MLIR_ENABLE_BINDINGS_PYTHON@": "0",		"@MLIR_ENABLE_BINDINGS_PYTHON@": "0",
"@MLIR_RUN_AMX_TESTS@": "0",		"@MLIR_RUN_AMX_TESTS@": "0",
"@MLIR_RUN_ARM_SVE_TESTS@": "0",		"@MLIR_RUN_ARM_SVE_TESTS@": "0",
"@MLIR_RUN_ARM_SME_TESTS@": "0",		"@MLIR_RUN_ARM_SME_TESTS@": "0",
"@MLIR_RUN_X86VECTOR_TESTS@": "0",		"@MLIR_RUN_X86VECTOR_TESTS@": "0",
"@MLIR_RUN_CUDA_TENSOR_CORE_TESTS@": "0",		"@MLIR_RUN_CUDA_TENSOR_CORE_TESTS@": "0",
"@MLIR_RUN_CUDA_SM80_TESTS@": "0",		"@MLIR_RUN_CUDA_SM80_TESTS@": "0",
"@MLIR_RUN_CUDA_SM80_LT_TESTS@": "0",		"@MLIR_RUN_CUDA_SM80_LT_TESTS@": "0",
		"@MLIR_RUN_CUDA_SM90_TESTS@": "0",
"@MLIR_INCLUDE_INTEGRATION_TESTS@": "0",		"@MLIR_INCLUDE_INTEGRATION_TESTS@": "0",
"@SHLIBDIR@": package_path("//llvm:BUILD"),		"@SHLIBDIR@": package_path("//llvm:BUILD"),
},		},
template = "lit.site.cfg.py.in",		template = "lit.site.cfg.py.in",
)		)

# Common data used by most lit tests.		# Common data used by most lit tests.
filegroup(		filegroup(
▲ Show 20 Lines • Show All 550 Lines • ▼ Show 20 Lines	deps = [
"//mlir:GPUToGPURuntimeTransforms",		"//mlir:GPUToGPURuntimeTransforms",
"//mlir:GPUToNVVMTransforms",		"//mlir:GPUToNVVMTransforms",
"//mlir:GPUTransforms",		"//mlir:GPUTransforms",
"//mlir:IR",		"//mlir:IR",
"//mlir:IndexDialect",		"//mlir:IndexDialect",
"//mlir:IndexToLLVM",		"//mlir:IndexToLLVM",
"//mlir:MathToLLVM",		"//mlir:MathToLLVM",
"//mlir:MemRefDialect",		"//mlir:MemRefDialect",
"//mlir:MemRefTransforms",
"//mlir:MemRefToLLVM",		"//mlir:MemRefToLLVM",
		"//mlir:MemRefTransforms",
"//mlir:NVGPUToNVVM",		"//mlir:NVGPUToNVVM",
"//mlir:NVVMToLLVMIRTranslation",		"//mlir:NVVMToLLVMIRTranslation",
"//mlir:Pass",		"//mlir:Pass",
"//mlir:ReconcileUnrealizedCasts",
"//mlir:ROCDLToLLVMIRTranslation",		"//mlir:ROCDLToLLVMIRTranslation",
		"//mlir:ReconcileUnrealizedCasts",
"//mlir:SCFDialect",		"//mlir:SCFDialect",
"//mlir:SCFToControlFlow",		"//mlir:SCFToControlFlow",
"//mlir:SPIRVDialect",		"//mlir:SPIRVDialect",
"//mlir:ToLLVMIRTranslation",		"//mlir:ToLLVMIRTranslation",
"//mlir:Transforms",
"//mlir:TransformUtils",		"//mlir:TransformUtils",
		"//mlir:Transforms",
"//mlir:VectorDialect",		"//mlir:VectorDialect",
"//mlir:VectorToLLVM",		"//mlir:VectorToLLVM",
"//mlir:VectorToSCF",		"//mlir:VectorToSCF",
],		],
)		)

cc_library(		cc_library(
name = "TestLinalg",		name = "TestLinalg",
▲ Show 20 Lines • Show All 360 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Nvidia Hopper TMA load integration test (Closed, Public)

Details

Diff Detail

Event Timeline