Index: lib/Target/AArch64/AArch64FrameLowering.h
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.h
+++ lib/Target/AArch64/AArch64FrameLowering.h
@@ -37,6 +37,8 @@
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
+  bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const override;
   int resolveFrameIndexReference(const MachineFunction &MF, int FI,
Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -250,6 +250,63 @@
   }
 }
 
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+  MachineFunction *MF = MBB->getParent();
+
+  // If MBB is an entry block, use X9 as the scratch register
+  if (&MF->front() == MBB)
+    return AArch64::X9;
+
+  RegScavenger RS;
+  RS.enterBasicBlock(MBB);
+
+  // Prefer X9 since it was historically used for the prologue scratch reg.
+  if (!RS.isRegUsed(AArch64::X9))
+    return AArch64::X9;
+
+  // Find a free non callee-save reg.
+  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF);
+  BitVector CalleeSaveRegs(RegInfo->getNumRegs());
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    CalleeSaveRegs.set(CSRegs[i]);
+
+  BitVector Available = RS.getRegsAvailable(&AArch64::GPR64RegClass);
+  for (int AvailReg = Available.find_first(); AvailReg != -1;
+       AvailReg = Available.find_next(AvailReg))
+    if (!CalleeSaveRegs.test(AvailReg))
+      return AvailReg;
+
+  return AArch64::NoRegister;
+}
+
+bool AArch64FrameLowering::canUseAsPrologue(
+    const MachineBasicBlock &MBB) const {
+  const MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  // Don't need a scratch register if we're not going to re-align the stack.
+  if (!RegInfo->needsStackRealignment(*MF))
+    return true;
+  // Otherwise, we can use any block as long as it has a scratch register
+  // available.
+  return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -331,8 +388,8 @@
   const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
   unsigned scratchSPReg = AArch64::SP;
   if (NumBytes && NeedsRealignment) {
-    // Use the first callee-saved register as a scratch register.
-    scratchSPReg = AArch64::X9;
+    scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
+    assert(scratchSPReg != AArch64::NoRegister);
   }
 
   // If we're a leaf function, try using the red zone.
@@ -926,19 +983,14 @@
   if (RegInfo->hasBasePointer(MF))
     BasePointerReg = RegInfo->getBaseRegister();
 
-  unsigned StackAlignReg = AArch64::NoRegister;
-  if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF))
-    StackAlignReg = AArch64::X9;
-
   bool ExtraCSSpill = false;
   const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
   // Figure out which callee-saved registers to save/restore.
   for (unsigned i = 0; CSRegs[i]; ++i) {
     const unsigned Reg = CSRegs[i];
 
-    // Add the stack re-align scratch register and base pointer register to
-    // SavedRegs set only if they are callee-save.
-    if (Reg == BasePointerReg || Reg == StackAlignReg)
+    // Add the base pointer register to SavedRegs if it is callee-save.
+    if (Reg == BasePointerReg)
       SavedRegs.set(Reg);
 
     bool RegUsed = SavedRegs.test(Reg);
Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll
===================================================================
--- test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -630,3 +630,92 @@
 end:
   ret void
 }
+
+; Re-aligned stack pointer. See bug 26642. Avoid clobbering live
+; values in the prologue when re-aligning the stack pointer.
+; CHECK-LABEL: stack_realign:
+; ENABLE-DAG: lsl w[[LSL1:[0-9]+]], w0, w1
+; ENABLE-DAG: lsl w[[LSL2:[0-9]+]], w1, w0
+; DISABLE-NOT: lsl w[[LSL1:[0-9]+]], w0, w1
+; DISABLE-NOT: lsl w[[LSL2:[0-9]+]], w1, w0
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; ENABLE-NOT: sub x[[LSL1]], sp, #16
+; ENABLE-NOT: sub x[[LSL2]], sp, #16
+; DISABLE: sub x{{[0-9]+}}, sp, #16
+; DISABLE-DAG: lsl w[[LSL1:[0-9]+]], w0, w1
+; DISABLE-DAG: lsl w[[LSL2:[0-9]+]], w1, w0
+; CHECK-DAG: str w[[LSL1]],
+; CHECK-DAG: str w[[LSL2]],
+
+define i32 @stack_realign(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2) {
+  %tmp = alloca i32, align 32
+  %shl1 = shl i32 %a, %b
+  %shl2 = shl i32 %b, %a
+  %tmp2 = icmp slt i32 %a, %b
+  br i1 %tmp2, label %true, label %false
+
+true:
+  store i32 %a, i32* %tmp, align 4
+  %tmp4 = load i32, i32* %tmp
+  br label %false
+
+false:
+  %tmp.0 = phi i32 [ %tmp4, %true ], [ %a, %0 ]
+  store i32 %shl1, i32* %ptr1
+  store i32 %shl2, i32* %ptr2
+  ret i32 %tmp.0
+}
+
+; Re-aligned stack pointer with all caller-save regs live. See bug
+; 26642. In this case we currently avoid shrink wrapping because
+; ensuring we have a scratch register to re-align the stack pointer is
+; too complicated. Output should be the same for both enabled and
+; disabled shrink wrapping.
+; CHECK-LABEL: stack_realign2:
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #-{{[0-9]+}}]!
+; CHECK: add x29, sp, #{{[0-9]+}}
+; CHECK: lsl {{w[0-9]+}}, w0, w1
+
+define void @stack_realign2(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2, i32* %ptr3, i32* %ptr4, i32* %ptr5, i32* %ptr6) {
+  %tmp = alloca i32, align 32
+  %tmp1 = shl i32 %a, %b
+  %tmp2 = shl i32 %b, %a
+  %tmp3 = lshr i32 %a, %b
+  %tmp4 = lshr i32 %b, %a
+  %tmp5 = add i32 %b, %a
+  %tmp6 = sub i32 %b, %a
+  %tmp7 = add i32 %tmp1, %tmp2
+  %tmp8 = sub i32 %tmp2, %tmp3
+  %tmp9 = add i32 %tmp3, %tmp4
+  %tmp10 = add i32 %tmp4, %tmp5
+  %cmp = icmp slt i32 %a, %b
+  br i1 %cmp, label %true, label %false
+
+true:
+  store i32 %a, i32* %tmp, align 4
+  call void asm sideeffect "nop", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28}"() nounwind
+  br label %false
+
+false:
+  store i32 %tmp1, i32* %ptr1, align 4
+  store i32 %tmp2, i32* %ptr2, align 4
+  store i32 %tmp3, i32* %ptr3, align 4
+  store i32 %tmp4, i32* %ptr4, align 4
+  store i32 %tmp5, i32* %ptr5, align 4
+  store i32 %tmp6, i32* %ptr6, align 4
+  %idx1 = getelementptr inbounds i32, i32* %ptr1, i64 1
+  store i32 %a, i32* %idx1, align 4
+  %idx2 = getelementptr inbounds i32, i32* %ptr1, i64 2
+  store i32 %b, i32* %idx2, align 4
+  %idx3 = getelementptr inbounds i32, i32* %ptr1, i64 3
+  store i32 %tmp7, i32* %idx3, align 4
+  %idx4 = getelementptr inbounds i32, i32* %ptr1, i64 4
+  store i32 %tmp8, i32* %idx4, align 4
+  %idx5 = getelementptr inbounds i32, i32* %ptr1, i64 5
+  store i32 %tmp9, i32* %idx5, align 4
+  %idx6 = getelementptr inbounds i32, i32* %ptr1, i64 6
+  store i32 %tmp10, i32* %idx6, align 4
+
+  ret void
+}