Index: llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -22,6 +22,7 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MachineValueType.h"
 #include <cstdint>
@@ -37,7 +38,6 @@
 struct MachinePointerInfo;
 class MachineRegisterInfo;
 class TargetLowering;
-class Value;
 
 class CallLowering {
   const TargetLowering *TLI;
@@ -64,10 +64,17 @@
     // if the argument was an incoming arg.
     SmallVector<Register, 2> OrigRegs;
 
+    /// Optionally track the original IR value for the argument. This may not
+    /// be meaningful in all contexts. This should only be used for forwarding
+    /// the aliasing information through to the MachinePointerInfo for memory
+    /// arguments.
+    const Value *OrigValue = nullptr;
+
     ArgInfo(ArrayRef<Register> Regs, Type *Ty,
             ArrayRef<ISD::ArgFlagsTy> Flags = ArrayRef<ISD::ArgFlagsTy>(),
-            bool IsFixed = true)
-        : BaseArgInfo(Ty, Flags, IsFixed), Regs(Regs.begin(), Regs.end()) {
+            bool IsFixed = true, const Value *OrigValue = nullptr)
+        : BaseArgInfo(Ty, Flags, IsFixed), Regs(Regs.begin(), Regs.end()),
+          OrigValue(OrigValue) {
       if (!Regs.empty() && Flags.empty())
         this->Flags.push_back(ISD::ArgFlagsTy());
       // FIXME: We should have just one way of saying "no register".
@@ -76,6 +83,11 @@
              "only void types should have no register");
     }
 
+    ArgInfo(ArrayRef<Register> Regs, const Value &OrigValue,
+            ArrayRef<ISD::ArgFlagsTy> Flags = ArrayRef<ISD::ArgFlagsTy>(),
+            bool IsFixed = true)
+      : ArgInfo(Regs, OrigValue.getType(), Flags, IsFixed, &OrigValue) {}
+
     ArgInfo() : BaseArgInfo() {}
   };
 
Index: llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -112,7 +112,7 @@
   unsigned i = 0;
   unsigned NumFixedArgs = CB.getFunctionType()->getNumParams();
   for (auto &Arg : CB.args()) {
-    ArgInfo OrigArg{ArgRegs[i], Arg->getType(), getAttributesForArgIdx(CB, i),
+    ArgInfo OrigArg{ArgRegs[i], *Arg.get(), getAttributesForArgIdx(CB, i),
                     i < NumFixedArgs};
     setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB);
 
@@ -204,7 +204,8 @@
     // No splitting to do, but we want to replace the original type (e.g. [1 x
     // double] -> double).
     SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
-                           OrigArg.Flags[0], OrigArg.IsFixed);
+                           OrigArg.Flags[0], OrigArg.IsFixed,
+                           OrigArg.OrigValue);
     return;
   }
 
@@ -667,18 +668,19 @@
       Register StackAddr =
          Handler.getStackAddress(MemSize, Offset, DstMPO, Flags);
 
-      const LLT PtrTy = MRI.getType(StackAddr);
-
-      // FIXME: We do not have access to the original IR value here to
-      // preserve the aliasing information.
-      MachinePointerInfo SrcMPO(PtrTy.getAddressSpace());
+      MachinePointerInfo SrcMPO(Args[i].OrigValue);
+      if (!Args[i].OrigValue) {
+        // We still need to accurately track the stack address space if we
+        // don't know the underlying value.
+        const LLT PtrTy = MRI.getType(StackAddr);
+        SrcMPO = MachinePointerInfo(PtrTy.getAddressSpace());
+      }
 
       Align DstAlign = std::max(Flags.getNonZeroByValAlign(),
                                 inferAlignFromPtrInfo(MF, DstMPO));
 
-      // TODO: Theoretically the source value could have a higher alignment,
-      // but we don't have that here
-      Align SrcAlign = Flags.getNonZeroByValAlign();
+      Align SrcAlign = std::max(Flags.getNonZeroByValAlign(),
+                                inferAlignFromPtrInfo(MF, SrcMPO));
 
       Handler.copyArgumentMemory(Args[i], StackAddr, Args[i].Regs[0], DstMPO,
                                  DstAlign, SrcMPO, SrcAlign,
Index: llvm/lib/CodeGen/GlobalISel/Utils.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -548,6 +548,11 @@
                        MPO.Offset);
   }
 
+  if (const Value *V = MPO.V.dyn_cast<const Value *>()) {
+    const Module *M = MF.getFunction().getParent();
+    return V->getPointerAlignment(M->getDataLayout());
+  }
+
   return Align(1);
 }
Index: llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -450,7 +450,7 @@
     if (DL.getTypeStoreSize(Arg.getType()).isZero())
       continue;
 
-    ArgInfo OrigArg{VRegs[i], Arg.getType()};
+    ArgInfo OrigArg{VRegs[i], Arg};
     setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F);
 
     splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv());
Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -654,7 +654,7 @@
       }
     }
 
-    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+    ArgInfo OrigArg(VRegs[Idx], Arg);
     const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
     setArgFlags(OrigArg, OrigArgIdx, DL, F);
 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -3916,7 +3916,7 @@
   ; CHECK:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C6]](s32)
   ; CHECK:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-  ; CHECK:   G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store 8 into stack, align 4, addrspace 5), (dereferenceable load 8, align 4, addrspace 5)
+  ; CHECK:   G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store 8 into stack, align 4, addrspace 5), (dereferenceable load 8 from %ir.val, align 4, addrspace 5)
   ; CHECK:   [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
   ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -3971,11 +3971,11 @@
   ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C1]](s32)
   ; CHECK:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-  ; CHECK:   G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C2]](s32), 0 :: (dereferenceable store 12 into stack, align 4, addrspace 5), (dereferenceable load 12, align 4, addrspace 5)
+  ; CHECK:   G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C2]](s32), 0 :: (dereferenceable store 12 into stack, align 4, addrspace 5), (dereferenceable load 12 from %ir.incoming0, align 4, addrspace 5)
   ; CHECK:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
   ; CHECK:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C3]](s32)
   ; CHECK:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; CHECK:   G_MEMCPY [[PTR_ADD1]](p5), [[COPY9]](p5), [[C4]](s32), 0 :: (dereferenceable store 1 into stack + 32, align 32, addrspace 5), (dereferenceable load 1, align 32, addrspace 5)
+  ; CHECK:   G_MEMCPY [[PTR_ADD1]](p5), [[COPY9]](p5), [[C4]](s32), 0 :: (dereferenceable store 1 into stack + 32, align 32, addrspace 5), (dereferenceable load 1 from %ir.incoming1, align 32, addrspace 5)
   ; CHECK:   $vgpr0 = COPY [[C]](s32)
   ; CHECK:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
@@ -3995,6 +3995,57 @@
   ret void
 }
 
+declare void @void_func_byval_a4i64_align4([4 x i64] addrspace(5)* byval([4 x i64]) align 4 %arg0) #0
+
+; Make sure we are aware of the higher alignment of the incoming value
+; than is implied by the outgoing byval alignment in the memory operand.
+define void @call_byval_a4i64_align4_higher_source_align([4 x i64] addrspace(5)* align 256 %incoming_high_align) #0 {
+  ; CHECK-LABEL: name: call_byval_a4i64_align4_higher_source_align
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; CHECK:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK:   [[COPY8:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK:   [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_func_byval_a4i64_align4
+  ; CHECK:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   [[COPY18:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY18]], [[C]](s32)
+  ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+  ; CHECK:   G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C1]](s32), 0 :: (dereferenceable store 32 into stack, align 4, addrspace 5), (dereferenceable load 32 from %ir.incoming_high_align, align 256, addrspace 5)
+  ; CHECK:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
+  ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
+  ; CHECK:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
+  ; CHECK:   $sgpr12 = COPY [[COPY14]](s32)
+  ; CHECK:   $sgpr13 = COPY [[COPY15]](s32)
+  ; CHECK:   $sgpr14 = COPY [[COPY16]](s32)
+  ; CHECK:   $vgpr31 = COPY [[COPY17]](s32)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @void_func_byval_a4i64_align4, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; CHECK:   ADJCALLSTACKDOWN 0, 32, implicit-def $scc
+  ; CHECK:   [[COPY20:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+  ; CHECK:   S_SETPC_B64_return [[COPY20]]
+  call void @void_func_byval_a4i64_align4([4 x i64] addrspace(5)* byval([4 x i64]) align 4 %incoming_high_align)
+  ret void
+}
+
 define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 {
   ; CHECK-LABEL: name: test_call_external_void_func_v2i8
   ; CHECK: bb.1 (%ir-block.0):