Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -587,6 +587,8 @@
   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
   Info.PrivateSegmentSize = FrameInfo.getStackSize();
+  if (MFI->isStackRealigned())
+    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
 
   Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -13,6 +13,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -487,9 +488,43 @@
   }
 }
 
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
+  MachineFunction *MF = MBB.getParent();
+
+  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+  LivePhysRegs LiveRegs(TRI);
+  LiveRegs.addLiveIns(MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+
+  return AMDGPU::NoRegister;
+}
+
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
-  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
   if (FuncInfo->isEntryFunction()) {
     emitEntryFunctionPrologue(MF, MBB);
     return;
@@ -498,6 +533,7 @@
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
 
   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -505,8 +541,36 @@
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;
 
+  // XXX - Is this the right predicate?
+
   bool NeedFP = hasFP(MF);
-  if (NeedFP) {
+  uint32_t NumBytes = MFI.getStackSize();
+  uint32_t RoundedSize = NumBytes;
+  const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+
+  if (NeedsRealignment) {
+    assert(NeedFP);
+    const unsigned Alignment = MFI.getMaxAlignment();
+    const unsigned ZeroLowBits = countTrailingZeros(Alignment);
+    assert(ZeroLowBits > 1);
+
+    RoundedSize += Alignment;
+
+    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+    assert(ScratchSPReg != AMDGPU::NoRegister);
+
+    // s_add_u32 tmp_reg, s32, NumBytes
+    // s_and_b32 s32, tmp_reg, 0b111...0000
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+      .addReg(StackPtrReg)
+      .addImm((Alignment - 1) * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+      .addReg(ScratchSPReg, RegState::Kill)
+      .addImm(-Alignment * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    FuncInfo->setIsStackRealigned(true);
+  } else if (NeedFP) {
     // If we need a base pointer, set it up here. It's whatever the value of
     // the stack pointer is at this point. Any variable size objects will be
     // allocated after this, so we can still use the base pointer to reference
@@ -516,11 +580,10 @@
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  uint32_t NumBytes = MFI.getStackSize();
-  if (NumBytes != 0 && hasSP(MF)) {
+  if (RoundedSize != 0 && hasSP(MF)) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
       .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
+      .addImm(RoundedSize * ST.getWavefrontSize())
       .setMIFlag(MachineInstr::FrameSetup);
   }
@@ -566,10 +629,12 @@
   // it's really whether we need SP to be accurate or not.
   if (NumBytes != 0 && hasSP(MF)) {
+    uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+      NumBytes + MFI.getMaxAlignment() : NumBytes;
+
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
       .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameDestroy);
+      .addImm(RoundedSize * ST.getWavefrontSize());
   }
 }
@@ -759,7 +824,8 @@
 }
 
 bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
   // All stack operations are relative to the frame offset SGPR.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return MFI.hasCalls() || MFI.hasVarSizedObjects();
+  return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
 }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -142,6 +142,7 @@
   bool HasSpilledSGPRs = false;
   bool HasSpilledVGPRs = false;
   bool HasNonSpillStackObjects = false;
+  bool IsStackRealigned = false;
 
   unsigned NumSpilledSGPRs = 0;
   unsigned NumSpilledVGPRs = 0;
@@ -495,6 +496,14 @@
     HasNonSpillStackObjects = StackObject;
   }
 
+  bool isStackRealigned() const {
+    return IsStackRealigned;
+  }
+
+  void setIsStackRealigned(bool Realigned = true) {
+    IsStackRealigned = Realigned;
+  }
+
   unsigned getNumSpilledSGPRs() const {
     return NumSpilledSGPRs;
   }
Index: test/CodeGen/AMDGPU/stack-realign.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/stack-realign.ll
@@ -0,0 +1,125 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that we properly realign the stack. While 4-byte access is all
+; that is ever needed, some transformations rely on the known bits from the alignment of the pointer (e.g.
+
+; 128 byte object
+; 4 byte emergency stack slot
+; = 144 bytes with padding between them
+
+; GCN-LABEL: {{^}}needs_align16_default_stack_align:
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN-NOT: s32
+
+; GCN: ; ScratchSize: 144
+define void @needs_align16_default_stack_align(i32 %idx) #0 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}needs_align16_stack_align4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
+; GCN: s_and_b32 s5, s6, 0xfffffc00
+; GCN: s_add_u32 s32, s32, 0x2800{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x2800
+
+; GCN: ; ScratchSize: 160
+define void @needs_align16_stack_align4(i32 %idx) #2 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}needs_align32:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
+; GCN: s_and_b32 s5, s6, 0xfffff800
+; GCN: s_add_u32 s32, s32, 0x3000{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x3000
+
+; GCN: ; ScratchSize: 192
+define void @needs_align32(i32 %idx) #0 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 32
+  ret void
+}
+
+; GCN-LABEL: {{^}}force_realign4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
+; GCN: s_and_b32 s5, s6, 0xffffff00
+; GCN: s_add_u32 s32, s32, 0xd00{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: s_sub_u32 s32, s32, 0xd00
+
+; GCN: ; ScratchSize: 52
+define void @force_realign4(i32 %idx) #1 {
+  %alloca.align16 = alloca [8 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile i32 3, i32 addrspace(5)* %gep0, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align16_from_8:
+; GCN: s_add_u32 s32, s8, 0x400{{$}}
+; GCN-NOT: s32
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
+  %alloca = alloca i32, align 4, addrspace(5)
+  store volatile i32 2, i32 addrspace(5)* %alloca
+  call void @needs_align16_default_stack_align(i32 1)
+  ret void
+}
+
+; The call sequence should keep the stack on call aligned to 4
+; GCN-LABEL: {{^}}kernel_call_align16_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_5() {
+  %alloca0 = alloca i8, align 1, addrspace(5)
+  store volatile i8 2, i8 addrspace(5)* %alloca0
+
+  call void @needs_align16_default_stack_align(i32 1)
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align4_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align4_from_5() {
+  %alloca0 = alloca i8, align 1, addrspace(5)
+  store volatile i8 2, i8 addrspace(5)* %alloca0
+
+  call void @needs_align16_stack_align4(i32 1)
+  ret void
+}
+
+attributes #0 = { noinline nounwind }
+attributes #1 = { noinline nounwind "stackrealign" }
+attributes #2 = { noinline nounwind alignstack=4 }
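
Note (reviewer aid, not part of the patch): the s_add_u32/s_and_b32 pair emitted under NeedsRealignment is the usual power-of-two align-up, scaled by the wavefront size because the SP/FP SGPRs track scratch for the whole wave (per-thread bytes times ST.getWavefrontSize()). A minimal C++ sketch of the same arithmetic; the helper name and parameters below are illustrative only and do not exist in the tree:

  #include <cassert>
  #include <cstdint>

  // Mirrors the two FrameSetup instructions emitted in the prologue:
  //   s_add_u32 tmp, s32, (Alignment - 1) * WavefrontSize
  //   s_and_b32 fp,  tmp, -(Alignment * WavefrontSize)
  // SP is assumed to already be a multiple of WavefrontSize.
  static uint32_t alignWaveScaledSP(uint32_t SP, uint32_t Alignment,
                                    uint32_t WavefrontSize) {
    assert((Alignment & (Alignment - 1)) == 0 && "power-of-two alignment");
    uint32_t Tmp = SP + (Alignment - 1) * WavefrontSize; // s_add_u32
    return Tmp & ~(Alignment * WavefrontSize - 1);       // s_and_b32; mask equals -(Alignment * WavefrontSize)
  }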