ARM/Thumb1: use a BX-based return sequence on ARMv4T.

Summary of the changes carried by this patch:

  * ARMISelLowering.cpp: while lowering a return, record RVLocs.size() in
    ARMFunctionInfo via setReturnRegsCount().
  * ARMMachineFunctionInfo.h: add the ReturnRegsCount field (initialised to
    0) together with getReturnRegsCount()/setReturnRegsCount().
  * Thumb1FrameLowering.cpp: the special epilogue that pops the saved LR
    into r3 is now emitted not only when ArgRegsSaveSize is non-zero but
    also when LR is among the callee-saved registers and the subtarget has
    v4T ops without v5T ops. When getReturnRegsCount() is greater than 3,
    r3 is first copied to r12, the saved LR is moved from r3 into lr, r3 is
    restored from r12, and the existing tBX_RET is kept. The callee-saved
    restore code no longer rewrites "pop {..., lr}" into "pop {..., pc}" on
    such subtargets.
  * test/CodeGen/ARM/thumb1_return_sequence.ll: new FileCheck test for
    thumbv4t and thumbv5t.

NOTE(review): the vector constants in the two "store <4 x i32>" lines of the
clobberframe/clobbervariadicframe tests were reconstructed; the CHECK lines
do not depend on their values. Confirm against the upstream test if exact
fidelity matters.

Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -2110,6 +2110,10 @@
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   bool isLittleEndian = Subtarget->isLittle();
 
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  AFI->setReturnRegsCount(RVLocs.size());
+
   // Copy the result values into the output registers.
   for (unsigned i = 0, realRVLocIdx = 0;
        i != RVLocs.size();
Index: lib/Target/ARM/ARMMachineFunctionInfo.h
===================================================================
--- lib/Target/ARM/ARMMachineFunctionInfo.h
+++ lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -48,6 +48,9 @@
   ///
   unsigned ArgRegsSaveSize;
 
+  /// ReturnRegsCount - Number of registers used up in the return.
+  unsigned ReturnRegsCount;
+
   /// HasStackFrame - True if this function has a stack frame. Set by
   /// processFunctionBeforeCalleeSavedScan().
   bool HasStackFrame;
@@ -127,7 +130,8 @@
   ARMFunctionInfo() :
     isThumb(false),
     hasThumb2(false),
-    ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false),
+    ArgRegsSaveSize(0), ReturnRegsCount(0), HasStackFrame(false),
+    RestoreSPFromFP(false),
     LRSpilledForFarJump(false),
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
     GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
@@ -151,6 +155,9 @@
   }
   void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; }
 
+  unsigned getReturnRegsCount() const { return ReturnRegsCount; }
+  void setReturnRegsCount(unsigned s) { ReturnRegsCount = s; }
+
   bool hasStackFrame() const { return HasStackFrame; }
   void setHasStackFrame(bool s) { HasStackFrame = s; }
 
Index: lib/Target/ARM/Thumb1FrameLowering.cpp
===================================================================
--- lib/Target/ARM/Thumb1FrameLowering.cpp
+++ lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -382,28 +382,65 @@
     }
   }
 
-  if (ArgRegsSaveSize) {
-    // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
-    // to LR, and we can't pop the value directly to the PC since
-    // we need to update the SP after popping the value. Therefore, we
-    // pop the old LR into R3 as a temporary.
+  bool IsV4PopReturn = false;
+  for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo())
+    if (CSI.getReg() == ARM::LR)
+      IsV4PopReturn = true;
+  IsV4PopReturn &= STI.hasV4TOps() && !STI.hasV5TOps();
 
+  // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+  // to LR, and we can't pop the value directly to the PC since
+  // we need to update the SP after popping the value. So instead
+  // we have to emit:
+  //   POP {r3}
+  //   ADD sp, #offset
+  //   BX r3
+  // If this would clobber a return value, then generate this sequence instead:
+  //   MOV ip, r3
+  //   POP {r3}
+  //   ADD sp, #offset
+  //   MOV lr, r3
+  //   MOV r3, ip
+  //   BX lr
+  if (ArgRegsSaveSize || IsV4PopReturn) {
     // Get the last instruction, tBX_RET
     MBBI = MBB.getLastNonDebugInstr();
     assert (MBBI->getOpcode() == ARM::tBX_RET);
-    // Epilogue for vararg functions: pop LR to R3 and branch off it.
-    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
-      .addReg(ARM::R3, RegState::Define);
+    DebugLoc dl = MBBI->getDebugLoc();
 
-    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+    if (AFI->getReturnRegsCount() <= 3) {
+      // Epilogue: pop saved LR to R3 and branch off it.
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+        .addReg(ARM::R3, RegState::Define);
 
-    MachineInstrBuilder MIB =
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
-      .addReg(ARM::R3, RegState::Kill);
-    AddDefaultPred(MIB);
-    MIB.copyImplicitOps(&*MBBI);
-    // erase the old tBX_RET instruction
-    MBB.erase(MBBI);
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX))
+        .addReg(ARM::R3, RegState::Kill);
+      AddDefaultPred(MIB);
+      MIB.copyImplicitOps(&*MBBI);
+      // erase the old tBX_RET instruction
+      MBB.erase(MBBI);
+    } else {
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+        .addReg(ARM::R12, RegState::Define)
+        .addReg(ARM::R3, RegState::Kill));
+
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+        .addReg(ARM::R3, RegState::Define);
+
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+        .addReg(ARM::LR, RegState::Define)
+        .addReg(ARM::R3, RegState::Kill));
+
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+        .addReg(ARM::R3, RegState::Define)
+        .addReg(ARM::R12, RegState::Kill));
+      // Keep the tBX_RET instruction
+    }
   }
 }
 
@@ -470,6 +507,9 @@
       // Special epilogue for vararg functions. See emitEpilogue
       if (isVarArg)
         continue;
+      // ARMv4T requires BX, see emitEpilogue
+      if (STI.hasV4TOps() && !STI.hasV5TOps())
+        continue;
       Reg = ARM::PC;
       (*MIB).setDesc(TII.get(ARM::tPOP_RET));
       MIB.copyImplicitOps(&*MI);
Index: test/CodeGen/ARM/thumb1_return_sequence.ll
===================================================================
--- test/CodeGen/ARM/thumb1_return_sequence.ll
+++ test/CodeGen/ARM/thumb1_return_sequence.ll
@@ -0,0 +1,205 @@
+; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-V4T
+; RUN: llc -mtriple=thumbv5t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-V5T
+
+; CHECK-V4T-LABEL: clobberframe
+; CHECK-V5T-LABEL: clobberframe
+define <4 x i32> @clobberframe() #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T: push {r4, r5, r7, lr}
+; CHECK-V4T: sub sp,
+; CHECK-V5T: push {r4, r5, r7, lr}
+
+  %b = alloca <4 x i32>, align 16
+  %a = alloca <4 x i32>, align 16
+  store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %b, align 16
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a, align 16
+  %0 = load <4 x i32>* %a, align 16
+  ret <4 x i32> %0
+
+; Epilogue
+; --------
+; CHECK-V4T: add sp,
+; CHECK-V4T-NEXT: pop {r4, r5, r7}
+; CHECK-V4T-NEXT: mov r12, r3
+; CHECK-V4T-NEXT: pop {r3}
+; CHECK-V4T-NEXT: mov lr, r3
+; CHECK-V4T-NEXT: mov r3, r12
+; CHECK-V4T: bx lr
+; CHECK-V5T: pop {r4, r5, r7, pc}
+}
+
+; CHECK-V4T-LABEL: clobbervariadicframe
+; CHECK-V5T-LABEL: clobbervariadicframe
+define <4 x i32> @clobbervariadicframe(i32 %i, ...) #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T: sub sp,
+; CHECK-V4T: push {r4, r5, r7, lr}
+; CHECK-V5T: sub sp,
+; CHECK-V5T: push {r4, r5, r7, lr}
+
+  %b = alloca <4 x i32>, align 16
+  %a = alloca <4 x i32>, align 16
+  store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %b, align 16
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a, align 16
+  %0 = load <4 x i32>* %a, align 16
+  ret <4 x i32> %0
+
+; Epilogue
+; --------
+; CHECK-V4T: pop {r4, r5, r7}
+; CHECK-V4T-NEXT: mov r12, r3
+; CHECK-V4T-NEXT: pop {r3}
+; CHECK-V4T-NEXT: add sp,
+; CHECK-V4T-NEXT: mov lr, r3
+; CHECK-V4T-NEXT: mov r3, r12
+; CHECK-V4T: bx lr
+; CHECK-V5T: add sp,
+; CHECK-V5T-NEXT: pop {r4, r5, r7}
+; CHECK-V5T-NEXT: mov r12, r3
+; CHECK-V5T-NEXT: pop {r3}
+; CHECK-V5T-NEXT: add sp,
+; CHECK-V5T-NEXT: mov lr, r3
+; CHECK-V5T-NEXT: mov r3, r12
+; CHECK-V5T-NEXT: bx lr
+}
+
+; CHECK-V4T-LABEL: simpleframe
+; CHECK-V5T-LABEL: simpleframe
+define i32 @simpleframe() #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T: push {r4, lr}
+; CHECK-V5T: push {r4, lr}
+
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  %d = alloca i32, align 4
+  store i32 1, i32* %a, align 4
+  store i32 2, i32* %b, align 4
+  store i32 3, i32* %c, align 4
+  store i32 4, i32* %d, align 4
+  %0 = load i32* %a, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %a, align 4
+  %1 = load i32* %b, align 4
+  %inc1 = add nsw i32 %1, 1
+  store i32 %inc1, i32* %b, align 4
+  %2 = load i32* %c, align 4
+  %inc2 = add nsw i32 %2, 1
+  store i32 %inc2, i32* %c, align 4
+  %3 = load i32* %d, align 4
+  %inc3 = add nsw i32 %3, 1
+  store i32 %inc3, i32* %d, align 4
+  %4 = load i32* %a, align 4
+  %5 = load i32* %b, align 4
+  %add = add nsw i32 %4, %5
+  %6 = load i32* %c, align 4
+  %add4 = add nsw i32 %add, %6
+  %7 = load i32* %d, align 4
+  %add5 = add nsw i32 %add4, %7
+  ret i32 %add5
+
+; Epilogue
+; --------
+; CHECK-V4T: pop {r4}
+; CHECK-V4T: pop {r3}
+; CHECK-V4T: bx r3
+; CHECK-V5T: pop {r4, pc}
+}
+
+; CHECK-V4T-LABEL: simplevariadicframe
+; CHECK-V5T-LABEL: simplevariadicframe
+define i32 @simplevariadicframe(i32 %i, ...) #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T: sub sp,
+; CHECK-V4T: push {r4, r5, r7, lr}
+; CHECK-V4T: sub sp,
+; CHECK-V5T: sub sp,
+; CHECK-V5T: push {r4, r5, r7, lr}
+; CHECK-V5T: sub sp,
+
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  %d = alloca i32, align 4
+  store i32 1, i32* %a, align 4
+  store i32 2, i32* %b, align 4
+  store i32 3, i32* %c, align 4
+  store i32 4, i32* %d, align 4
+  %0 = load i32* %a, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %a, align 4
+  %1 = load i32* %b, align 4
+  %inc1 = add nsw i32 %1, 1
+  store i32 %inc1, i32* %b, align 4
+  %2 = load i32* %c, align 4
+  %inc2 = add nsw i32 %2, 1
+  store i32 %inc2, i32* %c, align 4
+  %3 = load i32* %d, align 4
+  %inc3 = add nsw i32 %3, 1
+  store i32 %inc3, i32* %d, align 4
+  %4 = load i32* %a, align 4
+  %5 = load i32* %b, align 4
+  %add = add nsw i32 %4, %5
+  %6 = load i32* %c, align 4
+  %add4 = add nsw i32 %add, %6
+  %7 = load i32* %d, align 4
+  %add5 = add nsw i32 %add4, %7
+  %add6 = add nsw i32 %add5, %i
+  ret i32 %add6
+
+; Epilogue
+; --------
+; CHECK-V4T: add sp,
+; CHECK-V4T-NEXT: pop {r4, r5, r7}
+; CHECK-V4T-NEXT: pop {r3}
+; CHECK-V4T-NEXT: add sp,
+; CHECK-V4T-NEXT: bx r3
+; CHECK-V5T: add sp,
+; CHECK-V5T-NEXT: pop {r4, r5, r7}
+; CHECK-V5T-NEXT: pop {r3}
+; CHECK-V5T-NEXT: add sp,
+; CHECK-V5T-NEXT: bx r3
+}
+
+; CHECK-V4T-LABEL: noframe
+; CHECK-V5T-LABEL: noframe
+define i32 @noframe() #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T-NOT: push
+; CHECK-V5T-NOT: push
+  ret i32 0;
+; Epilogue
+; --------
+; CHECK-V4T-NOT: pop
+; CHECK-V5T-NOT: pop
+; CHECK-V4T: bx lr
+; CHECK-V5T: bx lr
+}
+
+; CHECK-V4T-LABEL: novariadicframe
+; CHECK-V5T-LABEL: novariadicframe
+define i32 @novariadicframe(i32 %i) #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T-NOT: push
+; CHECK-V5T-NOT: push
+  ret i32 %i;
+; Epilogue
+; --------
+; CHECK-V4T-NOT: pop
+; CHECK-V5T-NOT: pop
+; CHECK-V4T: bx lr
+; CHECK-V5T: bx lr
+}