Index: llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -186,6 +186,14 @@
       llvm_unreachable("Custom values not supported");
     }
 
+    /// Do a memory copy of \p MemSize bytes from \p SrcPtr to \p DstPtr. This
+    /// is necessary for outgoing stack-passed byval arguments.
+    void
+    copyArgumentMemory(const ArgInfo &Arg, Register DstPtr, Register SrcPtr,
+                       const MachinePointerInfo &DstPtrInfo, Align DstAlign,
+                       const MachinePointerInfo &SrcPtrInfo, Align SrcAlign,
+                       uint64_t MemSize, CCValAssign &VA) const;
+
     /// Extend a register to the location type given in VA, capped at extending
     /// to at most MaxSize bits. If MaxSizeBits is 0 then no maximum is set.
     Register extendRegister(Register ValReg, CCValAssign &VA,
Index: llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1810,6 +1810,27 @@
   MachineInstrBuilder buildVecReduceUMin(const DstOp &Dst, const SrcOp &Src) {
     return buildInstr(TargetOpcode::G_VECREDUCE_UMIN, {Dst}, {Src});
   }
+
+  /// Build and insert G_MEMCPY or G_MEMMOVE
+  MachineInstrBuilder buildMemTransferInst(unsigned Opcode, const SrcOp &DstPtr,
+                                           const SrcOp &SrcPtr,
+                                           const SrcOp &Size,
+                                           MachineMemOperand &DstMMO,
+                                           MachineMemOperand &SrcMMO) {
+    auto MIB = buildInstr(
+        Opcode, {}, {DstPtr, SrcPtr, Size, SrcOp(INT64_C(0) /*isTailCall*/)});
+    MIB.addMemOperand(&DstMMO);
+    MIB.addMemOperand(&SrcMMO);
+    return MIB;
+  }
+
+  MachineInstrBuilder buildMemCpy(const SrcOp &DstPtr, const SrcOp &SrcPtr,
+                                  const SrcOp &Size, MachineMemOperand &DstMMO,
+                                  MachineMemOperand &SrcMMO) {
+    return buildMemTransferInst(TargetOpcode::G_MEMCPY, DstPtr, SrcPtr, Size,
+                                DstMMO, SrcMMO);
+  }
+
   virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef<DstOp> DstOps,
                                          ArrayRef<SrcOp> SrcOps,
                                          Optional<unsigned> Flags = None);
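A usage note on the new builder hooks (illustration only, not part of the patch): G_MEMCPY takes the destination pointer, source pointer, and size as value operands, a trailing is-tail-call immediate, and one MachineMemOperand per side, destination first. A minimal sketch, assuming a MachineIRBuilder `B` positioned at a valid insertion point and pointer vregs `Dst` and `Src`; the helper name and the unknown-location MMOs are invented for the example:

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

using namespace llvm;

// Hypothetical helper: emit a G_MEMCPY of NumBytes from Src to Dst.
// Illustrates the API added above; not a utility that exists in tree.
static void emitSimpleMemCpy(MachineIRBuilder &B, Register Dst, Register Src,
                             uint64_t NumBytes, Align Alignment) {
  MachineFunction &MF = B.getMF();
  // One memory operand per side: a store on the destination and a load on
  // the source. buildMemCpy expects the destination MMO first.
  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOStore, NumBytes, Alignment);
  MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, NumBytes, Alignment);
  // The size is a value operand, so materialize a constant as wide as the
  // destination pointer, mirroring what copyArgumentMemory does below.
  const LLT PtrTy = B.getMRI()->getType(Dst);
  auto Size = B.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), NumBytes);
  B.buildMemCpy(Dst, Src, Size, *StoreMMO, *LoadMMO);
}

Carrying separate MMOs for the two sides lets whatever later lowers the G_MEMCPY keep distinct alignment and aliasing facts for the loads and the stores it expands into.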
Index: llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -647,17 +647,43 @@
     }
 
     if (VA.isMemLoc() && Flags.isByVal()) {
-      // FIXME: We should be inserting a memcpy from the source pointer to the
-      // result for outgoing byval parameters.
-      if (!Handler.isIncomingArgumentHandler())
-        continue;
-
-      MachinePointerInfo MPO;
-      Register StackAddr = Handler.getStackAddress(
-          Flags.getByValSize(), VA.getLocMemOffset(), MPO, Flags);
       assert(Args[i].Regs.size() == 1 && "didn't expect split byval pointer");
-      MIRBuilder.buildCopy(Args[i].Regs[0], StackAddr);
+
+      if (Handler.isIncomingArgumentHandler()) {
+        // We just need to copy the frame index value to the pointer.
+        MachinePointerInfo MPO;
+        Register StackAddr = Handler.getStackAddress(
+            Flags.getByValSize(), VA.getLocMemOffset(), MPO, Flags);
+        MIRBuilder.buildCopy(Args[i].Regs[0], StackAddr);
+      } else {
+        // For outgoing byval arguments, insert the implicit copy byval
+        // implies, such that writes in the callee do not modify the caller's
+        // value.
+        uint64_t MemSize = Flags.getByValSize();
+        int64_t Offset = VA.getLocMemOffset();
+
+        MachinePointerInfo DstMPO;
+        Register StackAddr =
+            Handler.getStackAddress(MemSize, Offset, DstMPO, Flags);
+
+        const LLT PtrTy = MRI.getType(StackAddr);
+
+        // FIXME: We do not have access to the original IR value here to
+        // preserve the aliasing information.
+        MachinePointerInfo SrcMPO(PtrTy.getAddressSpace());
+
+        Align DstAlign = std::max(Flags.getNonZeroByValAlign(),
+                                  inferAlignFromPtrInfo(MF, DstMPO));
+
+        // TODO: Theoretically the source value could have a higher alignment,
+        // but we don't have that here
+        Align SrcAlign = Flags.getNonZeroByValAlign();
+
+        Handler.copyArgumentMemory(Args[i], StackAddr, Args[i].Regs[0],
+                                   DstMPO, DstAlign, SrcMPO, SrcAlign,
+                                   MemSize, VA);
+      }
       continue;
     }
 
@@ -963,6 +989,29 @@
   return true;
 }
 
+void CallLowering::ValueHandler::copyArgumentMemory(
+    const ArgInfo &Arg, Register DstPtr, Register SrcPtr,
+    const MachinePointerInfo &DstPtrInfo, Align DstAlign,
+    const MachinePointerInfo &SrcPtrInfo, Align SrcAlign, uint64_t MemSize,
+    CCValAssign &VA) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  MachineMemOperand *SrcMMO = MF.getMachineMemOperand(
+      SrcPtrInfo,
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable, MemSize,
+      SrcAlign);
+
+  MachineMemOperand *DstMMO = MF.getMachineMemOperand(
+      DstPtrInfo,
+      MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable,
+      MemSize, DstAlign);
+
+  const LLT PtrTy = MRI.getType(DstPtr);
+  const LLT SizeTy = LLT::scalar(PtrTy.getSizeInBits());
+
+  auto SizeConst = MIRBuilder.buildConstant(SizeTy, MemSize);
+  MIRBuilder.buildMemCpy(DstPtr, SrcPtr, SizeConst, *DstMMO, *SrcMMO);
+}
+
 Register CallLowering::ValueHandler::extendRegister(Register ValReg,
                                                     CCValAssign &VA,
                                                     unsigned MaxSizeBits) {
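Because copyArgumentMemory is an ordinary ValueHandler method, a target with a hand-rolled outgoing-argument path could reuse it rather than assembling the memory operands itself. A hedged sketch (the wrapper and its parameter list are hypothetical; every input is assumed to come from the surrounding assignment logic exactly as in the generic code above):

#include "llvm/CodeGen/GlobalISel/CallLowering.h"

using namespace llvm;

// Hypothetical wrapper: emit the implicit byval copy for one outgoing
// argument via the new hook; mirrors the generic call site above.
static void emitByValCopy(CallLowering::ValueHandler &Handler,
                          const CallLowering::ArgInfo &Arg, Register StackAddr,
                          Register SrcPtr, const MachinePointerInfo &DstMPO,
                          const MachinePointerInfo &SrcMPO, Align DstAlign,
                          Align SrcAlign, uint64_t MemSize, CCValAssign &VA) {
  // StackAddr comes from getStackAddress(); SrcPtr holds the caller's
  // pointer, whose alignment is conservatively the byval alignment.
  Handler.copyArgumentMemory(Arg, StackAddr, SrcPtr, DstMPO, DstAlign, SrcMPO,
                             SrcAlign, MemSize, VA);
}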
Index: llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+declare void @byval_i32(i32* byval(i32) %ptr)
+
+define void @call_byval_i32(i32* %incoming) {
+; CHECK-LABEL: call_byval_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32 // =32
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    str w8, [sp]
+; CHECK-NEXT:    bl byval_i32
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32 // =32
+; CHECK-NEXT:    ret
+  call void @byval_i32(i32* byval(i32) %incoming)
+  ret void
+}
+
+declare void @byval_a64i32([64 x i32]* byval([64 x i32]) %ptr)
+
+define void @call_byval_a64i32([64 x i32]* %incoming) {
+; CHECK-LABEL: call_byval_a64i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #288 // =288
+; CHECK-NEXT:    stp x29, x30, [sp, #256] // 16-byte Folded Spill
+; CHECK-NEXT:    str x28, [sp, #272] // 8-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #256 // =256
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w28, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    ldr q0, [x0, #16]
+; CHECK-NEXT:    str q0, [sp, #16]
+; CHECK-NEXT:    ldr q0, [x0, #32]
+; CHECK-NEXT:    str q0, [sp, #32]
+; CHECK-NEXT:    ldr q0, [x0, #48]
+; CHECK-NEXT:    str q0, [sp, #48]
+; CHECK-NEXT:    ldr q0, [x0, #64]
+; CHECK-NEXT:    str q0, [sp, #64]
+; CHECK-NEXT:    ldr q0, [x0, #80]
+; CHECK-NEXT:    str q0, [sp, #80]
+; CHECK-NEXT:    ldr q0, [x0, #96]
+; CHECK-NEXT:    str q0, [sp, #96]
+; CHECK-NEXT:    ldr q0, [x0, #112]
+; CHECK-NEXT:    str q0, [sp, #112]
+; CHECK-NEXT:    ldr q0, [x0, #128]
+; CHECK-NEXT:    str q0, [sp, #128]
+; CHECK-NEXT:    ldr q0, [x0, #144]
+; CHECK-NEXT:    str q0, [sp, #144]
+; CHECK-NEXT:    ldr q0, [x0, #160]
+; CHECK-NEXT:    str q0, [sp, #160]
+; CHECK-NEXT:    ldr q0, [x0, #176]
+; CHECK-NEXT:    str q0, [sp, #176]
+; CHECK-NEXT:    ldr q0, [x0, #192]
+; CHECK-NEXT:    str q0, [sp, #192]
+; CHECK-NEXT:    ldr q0, [x0, #208]
+; CHECK-NEXT:    str q0, [sp, #208]
+; CHECK-NEXT:    ldr q0, [x0, #224]
+; CHECK-NEXT:    str q0, [sp, #224]
+; CHECK-NEXT:    ldr q0, [x0, #240]
+; CHECK-NEXT:    str q0, [sp, #240]
+; CHECK-NEXT:    bl byval_a64i32
+; CHECK-NEXT:    ldr x28, [sp, #272] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp, #256] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #288 // =288
+; CHECK-NEXT:    ret
  call void @byval_a64i32([64 x i32]* byval([64 x i32]) %incoming)
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -3912,8 +3912,13 @@
   ; CHECK:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32)
   ; CHECK:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
-  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+  ; CHECK:   [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+  ; CHECK:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C6]](s32)
+  ; CHECK:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+  ; CHECK:   G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store 8 into stack, align 4, addrspace 5), (dereferenceable load 8, align 4, addrspace 5)
+  ; CHECK:   [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
   ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
   ; CHECK:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
   ; CHECK:   $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4)
@@ -3934,6 +3939,62 @@
   ret void
 }
 
+declare void @void_func_byval_a3i32_byval_i8_align32([3 x i32] addrspace(5)* byval([3 x i32]) %arg0, i8 addrspace(5)* byval(i8) align 32 %arg1, i32 %arg2) #0
+
+define void @call_byval_3ai32_byval_i8_align32([3 x i32] addrspace(5)* %incoming0, i8 addrspace(5)* align 32 %incoming1) #0 {
+  ; CHECK-LABEL: name: call_byval_3ai32_byval_i8_align32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; CHECK:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK:   [[COPY8:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK:   [[COPY9:%[0-9]+]]:_(p5) = COPY $vgpr1
+  ; CHECK:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 999
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_func_byval_a3i32_byval_i8_align32
+  ; CHECK:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK:   [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   [[COPY19:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C1]](s32)
+  ; CHECK:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+  ; CHECK:   G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C2]](s32), 0 :: (dereferenceable store 12 into stack, align 4, addrspace 5), (dereferenceable load 12, align 4, addrspace 5)
+  ; CHECK:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+  ; CHECK:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C3]](s32)
+  ; CHECK:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; CHECK:   G_MEMCPY [[PTR_ADD1]](p5), [[COPY9]](p5), [[C4]](s32), 0 :: (dereferenceable store 1 into stack + 32, align 32, addrspace 5), (dereferenceable load 1, align 32, addrspace 5)
+  ; CHECK:   $vgpr0 = COPY [[C]](s32)
+  ; CHECK:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+  ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
+  ; CHECK:   $sgpr6_sgpr7 = COPY [[COPY12]](p4)
+  ; CHECK:   $sgpr8_sgpr9 = COPY [[COPY13]](p4)
+  ; CHECK:   $sgpr10_sgpr11 = COPY [[COPY14]](s64)
+  ; CHECK:   $sgpr12 = COPY [[COPY15]](s32)
+  ; CHECK:   $sgpr13 = COPY [[COPY16]](s32)
+  ; CHECK:   $sgpr14 = COPY [[COPY17]](s32)
+  ; CHECK:   $vgpr31 = COPY [[COPY18]](s32)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @void_func_byval_a3i32_byval_i8_align32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; CHECK:   ADJCALLSTACKDOWN 0, 36, implicit-def $scc
+  ; CHECK:   [[COPY21:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
+  ; CHECK:   S_SETPC_B64_return [[COPY21]]
+  call void @void_func_byval_a3i32_byval_i8_align32([3 x i32] addrspace(5)* byval([3 x i32]) %incoming0, i8 addrspace(5)* byval(i8) align 32 %incoming1, i32 999)
+  ret void
+}
+
 define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 {
   ; CHECK-LABEL: name: test_call_external_void_func_v2i8
   ; CHECK: bb.1 (%ir-block.0):