Index: mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp =================================================================== --- mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -364,10 +364,13 @@ static void emitCpAsyncOpZfillAsm(Location loc, Value dstPtr, Value srcPtr, Value dstBytes, Value srcElements, mlir::MemRefType elementType, + UnitAttr bypassL1, ConversionPatternRewriter &rewriter) { auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(), LLVM::AsmDialect::AD_ATT); - const char *asmStr = "cp.async.cg.shared.global [$0], [$1], $2, $3;\n"; + const char *asmStrByPassL1 = + "cp.async.cg.shared.global [$0], [$1], $2, $3;\n"; + const char *asmStr = "cp.async.ca.shared.global [$0], [$1], $2, $3;\n"; const char *asmConstraints = "r,l,n,r"; Value c3I32 = rewriter.create( @@ -385,7 +388,7 @@ rewriter.create( loc, LLVM::LLVMVoidType::get(rewriter.getContext()), /*operands=*/asmVals, - /*asm_string=*/asmStr, + /*asm_string=*/bypassL1 == UnitAttr() ? asmStr : asmStrByPassL1, /*constraints=*/asmConstraints, /*has_side_effects=*/true, /*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr, /*operand_attrs=*/ArrayAttr()); @@ -619,7 +622,8 @@ rewriter.create( loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(sizeInBytes)), - adaptor.getSrcElements(), srcMemrefType, rewriter); + adaptor.getSrcElements(), srcMemrefType, bypassL1, + rewriter); // When the optional SrcElements argument is *not* present, the regular // CpAsyncOp is generated. CopyAsyncOp reads bytes from source (global Index: mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir =================================================================== --- mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -299,13 +299,17 @@ // CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index, %[[SRCELEMENTS:[a-zA-Z0-9_]+]]: index) func.func @async_cp_zfill( %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) { - - // CHECK-DAG: lvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %[[DSTPTR:.*]], %[[SRCPTR:.*]], %[[DSTBYTES:.*]], %[[SRCBYTES:.*]] : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> !llvm.void + // CHECK: %[[C16:.+]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: lvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %[[DSTPTR:.*]], %[[SRCPTR:.*]], %[[C16]], %[[SRCBYTES:.*]] : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> !llvm.void %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4, %srcElements {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3> + + // CHECK: %[[C4:.+]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: lvm.inline_asm has_side_effects asm_dialect = att "cp.async.ca.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %[[DSTPTR:.*]], %[[SRCPTR:.*]], %[[C4]], %[[SRCBYTES:.*]] : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> !llvm.void + %1 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3> // CHECK: nvvm.cp.async.commit.group - %1 = nvgpu.device_async_create_group %0 + %2 = nvgpu.device_async_create_group %0, %1 // CHECK: nvvm.cp.async.wait.group 1 - nvgpu.device_async_wait %1 { numGroups = 1 : i32 } + nvgpu.device_async_wait %2 { numGroups = 1 : i32 } return }