Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -287,7 +287,6 @@ SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, - SDValue StackPtr, SDValue ArgVal, int64_t Offset) const; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4003,13 +4003,12 @@ SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, - SDValue StackPtr, SDValue ArgVal, int64_t Offset) const { MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); - SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset); + SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4, MachineMemOperand::MODereferenceable); return Store; Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -265,6 +265,7 @@ void passSpecialInputs( CallLoweringInfo &CLI, + CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, SmallVectorImpl<SDValue> &MemOpChains, Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2181,6 +2181,7 @@ // from the explicit user arguments present in the IR. 
void SITargetLowering::passSpecialInputs( CallLoweringInfo &CLI, + CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, SmallVectorImpl<SDValue> &MemOpChains, @@ -2253,9 +2254,9 @@ if (OutgoingArg->isRegister()) { RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); } else { - SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr, - InputReg, - OutgoingArg->getStackOffset()); + unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4); + SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, + SpecialArgOffset); MemOpChains.push_back(ArgStore); } } @@ -2401,8 +2402,6 @@ } // The first 4 bytes are reserved for the callee's emergency stack slot. - const unsigned CalleeUsableStackOffset = 4; - if (IsTailCall) { IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); @@ -2441,6 +2440,10 @@ SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); + + // The first 4 bytes are reserved for the callee's emergency stack slot. + CCInfo.AllocateStack(4, 4); + CCInfo.AnalyzeCallOperands(Outs, AssignFn); // Get a count of how many bytes are to be pushed on the stack. @@ -2488,10 +2491,6 @@ } } - // Stack pointer relative accesses are done by changing the offset SGPR. This - // is just the VGPR offset component. 
- SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32); - SmallVector<SDValue, 8> MemOpChains; MVT PtrVT = MVT::i32; @@ -2533,9 +2532,10 @@ MachinePointerInfo DstInfo; unsigned LocMemOffset = VA.getLocMemOffset(); + int32_t Offset = LocMemOffset; - SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset); + SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT); if (IsTailCall) { ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; @@ -2545,8 +2545,7 @@ Offset = Offset + FPDiff; int FI = MFI.CreateFixedObject(OpSize, Offset, true); - DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT), - StackPtr); + DstAddr = DAG.getFrameIndex(FI, PtrVT); DstInfo = MachinePointerInfo::getFixedStack(MF, FI); // Make sure any stack arguments overlapping with where we're storing @@ -2581,7 +2580,8 @@ } // Copy special input registers after user input arguments. - passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr); + passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain, + DAG.getConstant(0, SDLoc(), MVT::i32)); if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); Index: test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/byval-frame-setup.ll +++ test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -110,7 +110,7 @@ ; GCN: s_sub_u32 s32, s32, 0xc00{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @call_void_func_byval_struct_func() #0 { +define void @call_void_func_byval_struct_func() #1 { entry: %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) @@ -163,7 +163,7 @@ ; GCN: s_swappc_b64 ; GCN-NOT: s_sub_u32 s32 ; GCN: s_endpgm -define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 { +define amdgpu_kernel void @call_void_func_byval_struct_kernel() #1 { entry: %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) %arg1 = alloca 
%struct.ByValStruct, align 4, addrspace(5) @@ -181,6 +181,146 @@ ret void } +; GCN-LABEL: {{^}}void_func_byval_struct_align8: +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-NOT: s32 +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5 offset:8{{$}} +; GCN-NOT: s32 + +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-NOT: s32 +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:24{{$}} +; GCN-NOT: s32 +define void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 { +entry: + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 + %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8 + %add = add nsw i32 %tmp, 1 + store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 + %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 8 + %add3 = add nsw i32 %tmp1, 2 + store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 8 + store volatile i32 9, i32 addrspace(1)* null, align 4 + ret void +} + +; Make sure the byval alignment is respected in the call frame setup +; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_kernel: +; GCN: s_mov_b32 s33, s7 +; GCN: s_add_u32 s32, s33, 0xc00{{$}} + +; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 +; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8 +; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24 + +; GCN-NOT: s_add_u32 s32, s32, 0x800 + +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12 +; GCN-DAG: buffer_load_dword 
[[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 +; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 + +; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12 +; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20 + +; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24 +; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28 +; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32 +; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36 + +; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36 + + +; GCN: s_swappc_b64 +; GCN-NOT: s_sub_u32 s32 +; GCN: s_endpgm +define amdgpu_kernel void @call_void_func_byval_struct_align8_kernel() #1 { +entry: + %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5) + %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5) + %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) + %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 + store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 + store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8 + call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* 
byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1) + call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) + call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) + ret void +} + +; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func: +; GCN: s_mov_b32 s5, s32 +; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} +; GCN-DAG: v_writelane_b32 + +; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 + +; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24 + +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12 +; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16 +; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20 + +; GCN-NOT: s_add_u32 s32, s32, 0x800 + +; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12 +; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20 + +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 +; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 +; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 + +; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36 + +; GCN: s_swappc_b64 +; GCN-NOT: v_readlane_b32 s32 +; GCN: v_readlane_b32 +; GCN-NOT: v_readlane_b32 s32 + +; GCN-NOT: s_sub_u32 s32, s32, 0x800 + +; GCN: s_sub_u32 s32, 
s32, 0xc00{{$}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @call_void_func_byval_struct_align8_func() #0 { +entry: + %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5) + %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5) + %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) + %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 + store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 + store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8 + call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1) + call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) + call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) + ret void +} + ; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim: define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 { entry: Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -290,7 +290,7 @@ ; GCN: s_mov_b32 s33, s7 ; GCN: s_mov_b32 s32, s33 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: s_mov_b32 s4, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { @@ -308,7 +308,7 @@ ; GCN-LABEL: 
{{^}}func_call_too_many_args_use_workitem_id_x: ; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN: buffer_store_dword v1, off, s[0:3], s32 offset: ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { store volatile i32 %arg0, i32 addrspace(1)* undef @@ -330,7 +330,7 @@ ; GCN: s_add_u32 s32, s32, 0x400{{$}} ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}} ; GCN: s_swappc_b64 @@ -428,7 +428,7 @@ ; GCN-NOT: s32 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} @@ -453,7 +453,7 @@ ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} @@ -539,11 +539,10 @@ ret void } -; frame[0] = kernel emergency stack slot -; frame[1] = callee emergency stack slot -; frame[2] = ID X -; frame[3] = ID Y -; frame[4] = ID Z +; frame[0] = callee emergency stack slot +; frame[1] = ID X +; frame[2] = ID Y +; frame[3] = ID Z ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: ; GCN: enable_vgpr_workitem_id = 2 @@ -551,9 +550,9 @@ ; GCN: s_mov_b32 s33, s7 ; GCN: s_mov_b32 s32, s33 -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword v1, off, 
s[0:3], s32 offset:12 -; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { call void @too_many_args_use_workitem_id_xyz( @@ -635,10 +634,9 @@ ret void } -; frame[0] = kernel emergency stack slot -; frame[1] = callee emergency stack slot -; frame[2] = ID Y -; frame[3] = ID Z +; frame[0] = callee emergency stack slot +; frame[1] = ID Y +; frame[2] = ID Z ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: ; GCN: enable_vgpr_workitem_id = 2 @@ -647,8 +645,8 @@ ; GCN: s_mov_b32 s32, s33 ; GCN-DAG: v_mov_b32_e32 v31, v0 -; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12 +; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { call void @too_many_args_use_workitem_id_x_stack_yz(