Index: mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
===================================================================
--- mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -10,85 +10,77 @@
 // Test case to check the working of Tensor cores on Nvidia GPUs. The kernel has already
 // been outlined to prevent crashing due to introduction of an empty basic block by --gpu-
 // kernel-outling.
-module attributes {gpu.container_module} {
-  func @main() {
-    %0 = memref.alloc() : memref<16x16xf16>
-    %22 = memref.alloc() : memref<16x16xf16>
-    %1 = memref.alloc() : memref<16x16xf32>
+func @main() {
+  %0 = memref.alloc() : memref<16x16xf16>
+  %22 = memref.alloc() : memref<16x16xf16>
+  %1 = memref.alloc() : memref<16x16xf32>
 
-    %f1 = constant 1.0e+00 : f16
-    %f0 = constant 0.0e+00 : f16
-    %c0 = constant 0 : index
-    %c16 = constant 16 : index
-    %c32 = constant 32 : index
-    %c1 = constant 1 : index
+  %f1 = constant 1.0e+00 : f16
+  %f0 = constant 0.0e+00 : f16
+  %c0 = constant 0 : index
+  %c16 = constant 16 : index
+  %c32 = constant 32 : index
+  %c1 = constant 1 : index
 
-    // Intialize the Input matrix with ones.
-    scf.for %arg0 = %c0 to %c16 step %c1 {
-      scf.for %arg1 = %c0 to %c16 step %c1 {
-        memref.store %f1, %0[%arg0, %arg1] : memref<16x16xf16>
-      }
-    }
-    // Intialize the accumulator matrix with zeros.
-    scf.for %arg0 = %c0 to %c16 step %c1 {
-      scf.for %arg1 = %c0 to %c16 step %c1 {
-        memref.store %f0, %22[%arg0, %arg1] : memref<16x16xf16>
-      }
-    }
+  // Initialize the input matrix with ones.
+  scf.for %arg0 = %c0 to %c16 step %c1 {
+    scf.for %arg1 = %c0 to %c16 step %c1 {
+      memref.store %f1, %0[%arg0, %arg1] : memref<16x16xf16>
+    }
+  }
+  // Initialize the accumulator matrix with zeros.
+  scf.for %arg0 = %c0 to %c16 step %c1 {
+    scf.for %arg1 = %c0 to %c16 step %c1 {
+      memref.store %f0, %22[%arg0, %arg1] : memref<16x16xf16>
+    }
+  }
 
-    %2 = memref.cast %0 : memref<16x16xf16> to memref<*xf16>
-    %33 = memref.cast %22 : memref<16x16xf16> to memref<*xf16>
-    %3 = memref.cast %1 : memref<16x16xf32> to memref<*xf32>
-    gpu.host_register %2 : memref<*xf16>
-    gpu.host_register %33 : memref<*xf16>
-
-    gpu.launch_func @main_kernel::@main_kernel blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) args(%0 : memref<16x16xf16>, %22 : memref<16x16xf16>)
+  %2 = memref.cast %0 : memref<16x16xf16> to memref<*xf16>
+  %33 = memref.cast %22 : memref<16x16xf16> to memref<*xf16>
+  %3 = memref.cast %1 : memref<16x16xf32> to memref<*xf32>
+  gpu.host_register %2 : memref<*xf16>
+  gpu.host_register %33 : memref<*xf16>
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c32, %block_y = %c1, %block_z = %c1) {
+    %A = gpu.subgroup_mma_load_matrix %0[%c0, %c0] {operand = "AOp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">
+    %B = gpu.subgroup_mma_load_matrix %0[%c0, %c0] {operand = "BOp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
+    %C = gpu.subgroup_mma_load_matrix %22[%c0, %c0] {operand = "COp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp">
+
+    %R = gpu.subgroup_mma_compute %A, %B, %C : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp">, !gpu.mma_matrix<16x16xf16, "COp"> -> !gpu.mma_matrix<16x16xf16, "DOp">
+
+    gpu.subgroup_mma_store_matrix %R, %0[%c0, %c0] {leadDimension = 16 : index}: !gpu.mma_matrix<16x16xf16, "DOp">, memref<16x16xf16>
+    gpu.terminator
+  }
 
-    // Convert the results from f16 to f32 for printing.
-    scf.for %arg0 = %c0 to %c16 step %c1 {
-      scf.for %arg1 = %c0 to %c16 step %c1 {
-        %6 = memref.load %0[%arg0, %arg1] : memref<16x16xf16>
-        %7 = fpext %6 : f16 to f32
-        memref.store %7, %1[%arg0, %arg1] : memref<16x16xf32>
-      }
-    }
+  // Convert the results from f16 to f32 for printing.
+  scf.for %arg0 = %c0 to %c16 step %c1 {
+    scf.for %arg1 = %c0 to %c16 step %c1 {
+      %6 = memref.load %0[%arg0, %arg1] : memref<16x16xf16>
+      %7 = fpext %6 : f16 to f32
+      memref.store %7, %1[%arg0, %arg1] : memref<16x16xf32>
+    }
+  }
 
-    // Print the memref after computation.
-    call @print_memref_f32(%3) : (memref<*xf32>) -> ()
-    // CHECK: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
-    return
-  }
-
-  gpu.module @main_kernel {
-    gpu.func @main_kernel(%arg0: memref<16x16xf16>, %arg22 : memref<16x16xf16>) kernel {
-      %c0 = constant 0 : index
-
-      %0 = gpu.subgroup_mma_load_matrix %arg0[%c0, %c0] {operand = "AOp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">
-      %1 = gpu.subgroup_mma_load_matrix %arg0[%c0, %c0] {operand = "BOp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
-      %2 = gpu.subgroup_mma_load_matrix %arg22[%c0, %c0] {operand = "COp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp">
-
-      %3 = gpu.subgroup_mma_compute %0, %1, %2 : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp">
-
-      gpu.subgroup_mma_store_matrix %3, %arg0[%c0, %c0] {leadDimension = 16 : index}: !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16>
-
-      gpu.return
-    }
-  }
-
-  func private @print_memref_f32(memref<*xf32>)
-}
+  // Print the memref after computation.
+  call @print_memref_f32(%3) : (memref<*xf32>) -> ()
+  // CHECK: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
+  return
+}
+
+func private @print_memref_f32(memref<*xf32>)
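Note for readers of this patch: both files receive the same two changes. First, the hand-outlined gpu.module/gpu.func kernel is gone; the kernel body now sits inline in a gpu.launch region, so the module wrapper carrying the gpu.container_module attribute and the explicit gpu.launch_func call are dropped, and outlining is left to the --gpu-kernel-outlining pass mentioned in the old comment. Second, gpu.subgroup_mma_compute now spells out all three operand types and yields a distinct "DOp" matrix instead of reusing "COp". Side by side, with both lines copied verbatim from the f16 hunk above:

  // Old form (removed): the "COp" operand type is implicit and the result reuses "COp".
  %3 = gpu.subgroup_mma_compute %0, %1, %2 : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp">

  // New form (added): all three operand types are listed and the result is a "DOp" matrix.
  %R = gpu.subgroup_mma_compute %A, %B, %C : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp">, !gpu.mma_matrix<16x16xf16, "COp"> -> !gpu.mma_matrix<16x16xf16, "DOp">

The f32 test below makes the identical change, only with an f32 accumulator and result.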
Index: mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
===================================================================
--- mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -7,79 +7,68 @@
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
-// Test case to check the working of Tensor cores on Nvidia GPUs. The kernel has already
-// been outlined to prevent crashing due to introduction of an empty basic block by --gpu-
-// kernel-outling.
-module attributes {gpu.container_module} {
-  func @main() {
-    %0 = memref.alloc() : memref<16x16xf16>
-    %22 = memref.alloc() : memref<16x16xf32>
-    %1 = memref.alloc() : memref<16x16xf32>
+func @main() {
+  %0 = memref.alloc() : memref<16x16xf16>
+  %22 = memref.alloc() : memref<16x16xf32>
+  %1 = memref.alloc() : memref<16x16xf32>
 
-    %f1 = constant 1.0e+00 : f16
-    %f0 = constant 0.0e+00 : f32
-    %c0 = constant 0 : index
-    %c16 = constant 16 : index
-    %c32 = constant 32 : index
-    %c1 = constant 1 : index
+  %f1 = constant 1.0e+00 : f16
+  %f0 = constant 0.0e+00 : f32
+  %c0 = constant 0 : index
+  %c16 = constant 16 : index
+  %c32 = constant 32 : index
+  %c1 = constant 1 : index
 
-    // Intialize the Input matrix with ones.
-    scf.for %arg0 = %c0 to %c16 step %c1 {
-      scf.for %arg1 = %c0 to %c16 step %c1 {
-        memref.store %f1, %0[%arg0, %arg1] : memref<16x16xf16>
-      }
-    }
-    // Intialize the accumulator matrix with zeros.
-    scf.for %arg0 = %c0 to %c16 step %c1 {
-      scf.for %arg1 = %c0 to %c16 step %c1 {
-        memref.store %f0, %22[%arg0, %arg1] : memref<16x16xf32>
-      }
-    }
+  // Initialize the input matrix with ones.
+  scf.for %arg0 = %c0 to %c16 step %c1 {
+    scf.for %arg1 = %c0 to %c16 step %c1 {
+      memref.store %f1, %0[%arg0, %arg1] : memref<16x16xf16>
+    }
+  }
+  // Initialize the accumulator matrix with zeros.
+  scf.for %arg0 = %c0 to %c16 step %c1 {
+    scf.for %arg1 = %c0 to %c16 step %c1 {
+      memref.store %f0, %22[%arg0, %arg1] : memref<16x16xf32>
+    }
+  }
 
-    %2 = memref.cast %0 : memref<16x16xf16> to memref<*xf16>
-    %33 = memref.cast %22 : memref<16x16xf32> to memref<*xf32>
-    %3 = memref.cast %1 : memref<16x16xf32> to memref<*xf32>
-    gpu.host_register %2 : memref<*xf16>
-    gpu.host_register %33 : memref<*xf32>
-
-    gpu.launch_func @main_kernel::@main_kernel blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) args(%0 : memref<16x16xf16>, %22 : memref<16x16xf32>)
+  %2 = memref.cast %0 : memref<16x16xf16> to memref<*xf16>
+  %33 = memref.cast %22 : memref<16x16xf32> to memref<*xf32>
+  %3 = memref.cast %1 : memref<16x16xf32> to memref<*xf32>
+  gpu.host_register %2 : memref<*xf16>
+  gpu.host_register %33 : memref<*xf32>
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c32, %block_y = %c1, %block_z = %c1) {
+    %A = gpu.subgroup_mma_load_matrix %0[%c0, %c0] {operand = "AOp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">
+    %B = gpu.subgroup_mma_load_matrix %0[%c0, %c0] {operand = "BOp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
+    %C = gpu.subgroup_mma_load_matrix %22[%c0, %c0] {operand = "COp", leadDimension = 16 : index} : memref<16x16xf32> -> !gpu.mma_matrix<16x16xf32, "COp">
+
+    %R = gpu.subgroup_mma_compute %A, %B, %C : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp">, !gpu.mma_matrix<16x16xf32, "COp"> -> !gpu.mma_matrix<16x16xf32, "DOp">
+
+    gpu.subgroup_mma_store_matrix %R, %22[%c0, %c0] {leadDimension = 16 : index}: !gpu.mma_matrix<16x16xf32, "DOp">, memref<16x16xf32>
+    gpu.terminator
+  }
 
-    // Print the memref after computation.
-    call @print_memref_f32(%33) : (memref<*xf32>) -> ()
-    // CHECK: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
-    // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
-    return
-  }
-
-  gpu.module @main_kernel {
-    gpu.func @main_kernel(%arg0: memref<16x16xf16>, %arg22 : memref<16x16xf32>) kernel {
-      %c0 = constant 0 : index
-
-      %0 = gpu.subgroup_mma_load_matrix %arg0[%c0, %c0] {operand = "AOp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">
-      %1 = gpu.subgroup_mma_load_matrix %arg0[%c0, %c0] {operand = "BOp", leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
-      %2 = gpu.subgroup_mma_load_matrix %arg22[%c0, %c0] {operand = "COp", leadDimension = 16 : index} : memref<16x16xf32> -> !gpu.mma_matrix<16x16xf32, "COp">
-
-      %3 = gpu.subgroup_mma_compute %0, %1, %2 : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
-
-      gpu.subgroup_mma_store_matrix %3, %arg22[%c0, %c0] {leadDimension = 16 : index}: !gpu.mma_matrix<16x16xf32, "COp">, memref<16x16xf32>
-
-      gpu.return
-    }
-  }
-
-  func private @print_memref_f32(memref<*xf32>)
-}
+  // Print the memref after computation.
+  call @print_memref_f32(%33) : (memref<*xf32>) -> ()
+  // CHECK: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
+  // CHECK-NEXT: [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
+  return
+}
+
+func private @print_memref_f32(memref<*xf32>)
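A quick sanity check of the CHECK values in both tests: the A and B tiles are loaded from the same all-ones f16 buffer (so B = A) and the accumulator C is zeroed, so every element of the warp's result is

  D[i][j] = sum_{k=0..15} A[i][k] * B[k][j] + C[i][j] = 16 * (1 * 1) + 0 = 16

which matches the 16x16 matrix of 16s that FileCheck expects in each file; the f16 test merely widens the result with fpext before calling @print_memref_f32, while the f32 test prints its f32 accumulator directly.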