Index: lib/Target/ARM/ARMExpandPseudoInsts.cpp =================================================================== --- lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -887,6 +887,9 @@ unsigned MaxAlign = MFI->getMaxAlignment(); assert (!AFI->isThumb1OnlyFunction()); // Emit bic r6, r6, MaxAlign + assert(MaxAlign <= 256 && "The BIC instruction cannot encode " + "immediates larger than 256 with all lower " + "bits set."); unsigned bicOpc = AFI->isThumbFunction() ? ARM::t2BICri : ARM::BICri; AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), Index: lib/Target/ARM/ARMFrameLowering.cpp =================================================================== --- lib/Target/ARM/ARMFrameLowering.cpp +++ lib/Target/ARM/ARMFrameLowering.cpp @@ -211,6 +211,68 @@ }; } +/// Emit an instruction sequence that will align the address in +/// register Reg by zero-ing out the lower bits. For versions of the +/// architecture that support Neon, this must be done in a single +/// instruction, since skipAlignedDPRCS2Spills assumes it is done in a +/// single instruction. That function only gets called when optimizing +/// spilling of D registers on a core with the Neon instruction set +/// present. 
+static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI, + const TargetInstrInfo &TII, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, const unsigned Reg, + const unsigned Alignment, + const bool MustBeSingleInstruction) { + const ARMSubtarget &AST = MF.getTarget().getSubtarget(); + const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops(); + const unsigned AlignMask = Alignment - 1; + const unsigned nrBitsToZero = countTrailingZeros(Alignment); + assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported"); + if (!AFI->isThumbFunction()) { + // if the BFC instruction is available, use that to zero the lower + // bits: + // bfc Reg, #0, log2(Alignment) + // otherwise use BIC, if the mask to zero the required number of bits + // can be encoded in the bic immediate field + // bic Reg, Reg, Alignment-1 + // otherwise, emit + // lsr Reg, Reg, log2(Alignment) + // lsl Reg, Reg, log2(Alignment) + if (CanUseBFC) { + AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg) + .addReg(Reg, RegState::Kill) + .addImm(~AlignMask)); + } else if (AlignMask <= 255) { + AddDefaultCC( + AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg) + .addReg(Reg, RegState::Kill) + .addImm(AlignMask))); + } else { + assert(!MustBeSingleInstruction && + "Shouldn't call emitAligningInstructions demanding a single " + "instruction to be emitted for large stack alignment for a target " + "without BFC."); + AddDefaultCC(AddDefaultPred( + BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg) + .addReg(Reg, RegState::Kill) + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, nrBitsToZero)))); + AddDefaultCC(AddDefaultPred( + BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg) + .addReg(Reg, RegState::Kill) + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, nrBitsToZero)))); + } + } else { + // Since this is only reached for Thumb-2 targets, the BFC instruction + // should always be available. 
+ assert(CanUseBFC); + AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg) + .addReg(Reg, RegState::Kill) + .addImm(~AlignMask)); + } +} + void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -568,28 +630,24 @@ // realigned. if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) { unsigned MaxAlign = MFI->getMaxAlignment(); - assert (!AFI->isThumb1OnlyFunction()); + assert(!AFI->isThumb1OnlyFunction()); if (!AFI->isThumbFunction()) { - // Emit bic sp, sp, MaxAlign - AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, - TII.get(ARM::BICri), ARM::SP) - .addReg(ARM::SP, RegState::Kill) - .addImm(MaxAlign-1))); + emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign, + false); } else { - // We cannot use sp as source/dest register here, thus we're emitting the - // following sequence: + // We cannot use sp as source/dest register here, thus we're using r4 to + // perform the calculations. We're emitting the following sequence: // mov r4, sp - // bic r4, r4, MaxAlign + // -- use emitAligningInstructions to produce best sequence to zero + // -- out lower bits in r4 // mov sp, r4 // FIXME: It will be better just to find spare register here. AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4) - .addReg(ARM::SP, RegState::Kill)); - AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, - TII.get(ARM::t2BICri), ARM::R4) - .addReg(ARM::R4, RegState::Kill) - .addImm(MaxAlign-1))); + .addReg(ARM::SP, RegState::Kill)); + emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign, + false); AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) - .addReg(ARM::R4, RegState::Kill)); + .addReg(ARM::R4, RegState::Kill)); } AFI->setShouldRestoreSPFromFP(true); @@ -1084,15 +1142,16 @@ // The immediate is <= 64, so it doesn't need any special encoding. unsigned Opc = isThumb ? 
ARM::t2SUBri : ARM::SUBri; AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4) - .addReg(ARM::SP) - .addImm(8 * NumAlignedDPRCS2Regs))); + .addReg(ARM::SP) + .addImm(8 * NumAlignedDPRCS2Regs))); - // bic r4, r4, #align-1 - Opc = isThumb ? ARM::t2BICri : ARM::BICri; unsigned MaxAlign = MF.getFrameInfo()->getMaxAlignment(); - AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4) - .addReg(ARM::R4, RegState::Kill) - .addImm(MaxAlign - 1))); + // We must set parameter MustBeSingleInstruction to true, since + // skipAlignedDPRCS2Spills expects exactly 3 instructions to perform + // stack alignment. Luckily, this can always be done since all ARM + // architecture versions that support Neon also support the BFC + // instruction. + emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ARM::R4, MaxAlign, true); // mov sp, r4 // The stack pointer must be adjusted before spilling anything, otherwise Index: test/CodeGen/ARM/alloc-no-stack-realign.ll =================================================================== --- test/CodeGen/ARM/alloc-no-stack-realign.ll +++ test/CodeGen/ARM/alloc-no-stack-realign.ll @@ -41,7 +41,7 @@ define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp { entry: ; REALIGN-LABEL: test2 -; REALIGN: bic sp, sp, #63 +; REALIGN: bfc sp, #0, #6 ; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]] ; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -71,7 +71,7 @@ ; CHECK-IOS-LABEL: check_vfp_fold: ; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr} ; CHECK-IOS: sub.w r4, sp, #16 -; CHECK-IOS: bic r4, r4, #15 +; CHECK-IOS: bfc r4, #0, #4 ; CHECK-IOS: mov sp, r4 ; CHECK-IOS: vst1.64 {d8, d9}, [r4:128] ; ... 
Index: test/CodeGen/ARM/interrupt-attr.ll =================================================================== --- test/CodeGen/ARM/interrupt-attr.ll +++ test/CodeGen/ARM/interrupt-attr.ll @@ -15,7 +15,7 @@ ; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr} ; CHECK-A: add r11, sp, #20 ; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}} -; CHECK-A: bic sp, sp, #7 +; CHECK-A: bfc sp, #0, #3 ; CHECK-A: bl bar ; CHECK-A: sub sp, r11, #20 ; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr} @@ -25,7 +25,7 @@ ; CHECK-A-THUMB: push.w {r0, r1, r2, r3, r4, r7, r12, lr} ; CHECK-A-THUMB: add r7, sp, #20 ; CHECK-A-THUMB: mov r4, sp -; CHECK-A-THUMB: bic r4, r4, #7 +; CHECK-A-THUMB: bfc r4, #0, #3 ; CHECK-A-THUMB: bl bar ; CHECK-A-THUMB: sub.w r4, r7, #20 ; CHECK-A-THUMB: mov sp, r4 @@ -38,7 +38,7 @@ ; CHECK-M: push.w {r4, r10, r11, lr} ; CHECK-M: add.w r11, sp, #8 ; CHECK-M: mov r4, sp -; CHECK-M: bic r4, r4, #7 +; CHECK-M: bfc r4, #0, #3 ; CHECK-M: mov sp, r4 ; CHECK-M: bl _bar ; CHECK-M: sub.w r4, r11, #8 @@ -56,7 +56,7 @@ ; 32 to get past r0, r1, ..., r7 ; CHECK-A: add r11, sp, #32 ; CHECK-A: sub sp, sp, #{{[0-9]+}} -; CHECK-A: bic sp, sp, #7 +; CHECK-A: bfc sp, #0, #3 ; [...] ; 32 must match above ; CHECK-A: sub sp, r11, #32 @@ -75,7 +75,7 @@ ; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} ; CHECK-A: add r11, sp, #44 ; CHECK-A: sub sp, sp, #{{[0-9]+}} -; CHECK-A: bic sp, sp, #7 +; CHECK-A: bfc sp, #0, #3 ; [...] ; CHECK-A: sub sp, r11, #44 ; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} @@ -91,7 +91,7 @@ ; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr} ; CHECK-A: add r11, sp, #20 ; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}} -; CHECK-A: bic sp, sp, #7 +; CHECK-A: bfc sp, #0, #3 ; [...] 
; CHECK-A: sub sp, r11, #20 ; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr} @@ -106,7 +106,7 @@ ; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr} ; CHECK-A: add r11, sp, #20 ; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}} -; CHECK-A: bic sp, sp, #7 +; CHECK-A: bfc sp, #0, #3 ; [...] ; CHECK-A: sub sp, r11, #20 ; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr} Index: test/CodeGen/ARM/spill-q.ll =================================================================== --- test/CodeGen/ARM/spill-q.ll +++ test/CodeGen/ARM/spill-q.ll @@ -11,7 +11,7 @@ define void @aaa(%quuz* %this, i8* %block) { ; CHECK-LABEL: aaa: -; CHECK: bic {{.*}}, #15 +; CHECK: bfc {{.*}}, #0, #4 ; CHECK: vst1.64 {{.*}}sp:128 ; CHECK: vld1.64 {{.*}}sp:128 entry: Index: test/CodeGen/ARM/stack-alignment.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/stack-alignment.ll @@ -0,0 +1,164 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=armv4t | FileCheck %s -check-prefix=CHECK-v4A32 +; RUN: llc -verify-machineinstrs < %s -mtriple=armv7a | FileCheck %s -check-prefix=CHECK-v7A32 +; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7a | FileCheck %s -check-prefix=CHECK-THUMB2 +; FIXME: There are no tests for Thumb1 since dynamic stack alignment is not supported for +; Thumb1. 
+ +define i32 @f_bic_can_be_used_align() nounwind { +entry: +; CHECK-LABEL: f_bic_can_be_used_align: +; CHECK-v7A32: bfc sp, #0, #8 +; CHECK-v4A32: bic sp, sp, #255 +; CHECK-THUMB2: mov r4, sp +; CHECK-THUMB2-NEXT: bfc r4, #0, #8 +; CHECK-THUMB2-NEXT: mov sp, r4 + %x = alloca i32, align 256 + store volatile i32 0, i32* %x, align 256 + ret i32 0 +} + +define i32 @f_too_large_for_bic_align() nounwind { +entry: +; CHECK-LABEL: f_too_large_for_bic_align: +; CHECK-v7A32: bfc sp, #0, #9 +; CHECK-v4A32: lsr sp, sp, #9 +; CHECK-v4A32: lsl sp, sp, #9 +; CHECK-THUMB2: mov r4, sp +; CHECK-THUMB2-NEXT: bfc r4, #0, #9 +; CHECK-THUMB2-NEXT: mov sp, r4 + %x = alloca i32, align 512 + store volatile i32 0, i32* %x, align 512 + ret i32 0 +} + +define i8* @f_alignedDPRCS2Spills(double* %d) #0 { +entry: +; CHECK-LABEL: f_alignedDPRCS2Spills: +; CHECK-v7A32: bfc sp, #0, #12 +; CHECK-v4A32: lsr sp, sp, #12 +; CHECK-v4A32: lsl sp, sp, #12 +; CHECK-THUMB2: bfc r4, #0, #12 +; CHECK-THUMB2-NEXT: mov sp, r4 + %a = alloca i8, align 4096 + %0 = load double* %d, align 4 + %arrayidx1 = getelementptr inbounds double* %d, i32 1 + %1 = load double* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds double* %d, i32 2 + %2 = load double* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds double* %d, i32 3 + %3 = load double* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds double* %d, i32 4 + %4 = load double* %arrayidx4, align 4 + %arrayidx5 = getelementptr inbounds double* %d, i32 5 + %5 = load double* %arrayidx5, align 4 + %arrayidx6 = getelementptr inbounds double* %d, i32 6 + %6 = load double* %arrayidx6, align 4 + %arrayidx7 = getelementptr inbounds double* %d, i32 7 + %7 = load double* %arrayidx7, align 4 + %arrayidx8 = getelementptr inbounds double* %d, i32 8 + %8 = load double* %arrayidx8, align 4 + %arrayidx9 = getelementptr inbounds double* %d, i32 9 + %9 = load double* %arrayidx9, align 4 + %arrayidx10 = getelementptr inbounds double* %d, i32 10 + %10 = load 
double* %arrayidx10, align 4 + %arrayidx11 = getelementptr inbounds double* %d, i32 11 + %11 = load double* %arrayidx11, align 4 + %arrayidx12 = getelementptr inbounds double* %d, i32 12 + %12 = load double* %arrayidx12, align 4 + %arrayidx13 = getelementptr inbounds double* %d, i32 13 + %13 = load double* %arrayidx13, align 4 + %arrayidx14 = getelementptr inbounds double* %d, i32 14 + %14 = load double* %arrayidx14, align 4 + %arrayidx15 = getelementptr inbounds double* %d, i32 15 + %15 = load double* %arrayidx15, align 4 + %arrayidx16 = getelementptr inbounds double* %d, i32 16 + %16 = load double* %arrayidx16, align 4 + %arrayidx17 = getelementptr inbounds double* %d, i32 17 + %17 = load double* %arrayidx17, align 4 + %arrayidx18 = getelementptr inbounds double* %d, i32 18 + %18 = load double* %arrayidx18, align 4 + %arrayidx19 = getelementptr inbounds double* %d, i32 19 + %19 = load double* %arrayidx19, align 4 + %arrayidx20 = getelementptr inbounds double* %d, i32 20 + %20 = load double* %arrayidx20, align 4 + %arrayidx21 = getelementptr inbounds double* %d, i32 21 + %21 = load double* %arrayidx21, align 4 + %arrayidx22 = getelementptr inbounds double* %d, i32 22 + %22 = load double* %arrayidx22, align 4 + %arrayidx23 = getelementptr inbounds double* %d, i32 23 + %23 = load double* %arrayidx23, align 4 + %arrayidx24 = getelementptr inbounds double* %d, i32 24 + %24 = load double* %arrayidx24, align 4 + %arrayidx25 = getelementptr inbounds double* %d, i32 25 + %25 = load double* %arrayidx25, align 4 + %arrayidx26 = getelementptr inbounds double* %d, i32 26 + %26 = load double* %arrayidx26, align 4 + %arrayidx27 = getelementptr inbounds double* %d, i32 27 + %27 = load double* %arrayidx27, align 4 + %arrayidx28 = getelementptr inbounds double* %d, i32 28 + %28 = load double* %arrayidx28, align 4 + %arrayidx29 = getelementptr inbounds double* %d, i32 29 + %29 = load double* %arrayidx29, align 4 + %div = fdiv double %29, %28 + %div30 = fdiv double %div, %27 + 
%div31 = fdiv double %div30, %26 + %div32 = fdiv double %div31, %25 + %div33 = fdiv double %div32, %24 + %div34 = fdiv double %div33, %23 + %div35 = fdiv double %div34, %22 + %div36 = fdiv double %div35, %21 + %div37 = fdiv double %div36, %20 + %div38 = fdiv double %div37, %19 + %div39 = fdiv double %div38, %18 + %div40 = fdiv double %div39, %17 + %div41 = fdiv double %div40, %16 + %div42 = fdiv double %div41, %15 + %div43 = fdiv double %div42, %14 + %div44 = fdiv double %div43, %13 + %div45 = fdiv double %div44, %12 + %div46 = fdiv double %div45, %11 + %div47 = fdiv double %div46, %10 + %div48 = fdiv double %div47, %9 + %div49 = fdiv double %div48, %8 + %div50 = fdiv double %div49, %7 + %div51 = fdiv double %div50, %6 + %div52 = fdiv double %div51, %5 + %div53 = fdiv double %div52, %4 + %div54 = fdiv double %div53, %3 + %div55 = fdiv double %div54, %2 + %div56 = fdiv double %div55, %1 + %div57 = fdiv double %div56, %0 + %div58 = fdiv double %0, %1 + %div59 = fdiv double %div58, %2 + %div60 = fdiv double %div59, %3 + %div61 = fdiv double %div60, %4 + %div62 = fdiv double %div61, %5 + %div63 = fdiv double %div62, %6 + %div64 = fdiv double %div63, %7 + %div65 = fdiv double %div64, %8 + %div66 = fdiv double %div65, %9 + %div67 = fdiv double %div66, %10 + %div68 = fdiv double %div67, %11 + %div69 = fdiv double %div68, %12 + %div70 = fdiv double %div69, %13 + %div71 = fdiv double %div70, %14 + %div72 = fdiv double %div71, %15 + %div73 = fdiv double %div72, %16 + %div74 = fdiv double %div73, %17 + %div75 = fdiv double %div74, %18 + %div76 = fdiv double %div75, %19 + %div77 = fdiv double %div76, %20 + %div78 = fdiv double %div77, %21 + %div79 = fdiv double %div78, %22 + %div80 = fdiv double %div79, %23 + %div81 = fdiv double %div80, %24 + %div82 = fdiv double %div81, %25 + %div83 = fdiv double %div82, %26 + %div84 = fdiv double %div83, %27 + %div85 = fdiv double %div84, %28 + %div86 = fdiv double %div85, %29 + %mul = fmul double %div57, %div86 + %conv = fptosi double %mul 
to i32 + %add.ptr = getelementptr inbounds i8* %a, i32 %conv + ret i8* %add.ptr +} Index: test/CodeGen/Thumb2/aligned-spill.ll =================================================================== --- test/CodeGen/Thumb2/aligned-spill.ll +++ test/CodeGen/Thumb2/aligned-spill.ll @@ -9,7 +9,7 @@ ; ; The caller-saved r4 is used as a scratch register for stack realignment. ; CHECK: push {r4, r7, lr} -; CHECK: bic r4, r4, #7 +; CHECK: bfc r4, #0, #3 ; CHECK: mov sp, r4 define void @f(double* nocapture %p) nounwind ssp { entry: @@ -23,7 +23,7 @@ ; NEON: f ; NEON: push {r4, r7, lr} ; NEON: sub.w r4, sp, #64 -; NEON: bic r4, r4, #15 +; NEON: bfc r4, #0, #4 ; Stack pointer must be updated before the spills. ; NEON: mov sp, r4 ; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]! @@ -54,7 +54,7 @@ ; NEON: f7 ; NEON: push {r4, r7, lr} ; NEON: sub.w r4, sp, #56 -; NEON: bic r4, r4, #15 +; NEON: bfc r4, #0, #4 ; Stack pointer must be updated before the spills. ; NEON: mov sp, r4 ; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]! @@ -81,7 +81,7 @@ ; NEON: push {r4, r7, lr} ; NEON: vpush {d12, d13, d14, d15} ; NEON: sub.w r4, sp, #24 -; NEON: bic r4, r4, #15 +; NEON: bfc r4, #0, #4 ; Stack pointer must be updated before the spills. ; NEON: mov sp, r4 ; NEON: vst1.64 {d8, d9}, [r4:128] Index: test/CodeGen/Thumb2/thumb2-spill-q.ll =================================================================== --- test/CodeGen/Thumb2/thumb2-spill-q.ll +++ test/CodeGen/Thumb2/thumb2-spill-q.ll @@ -11,7 +11,7 @@ define void @aaa(%quuz* %this, i8* %block) { ; CHECK-LABEL: aaa: -; CHECK: bic r4, r4, #15 +; CHECK: bfc r4, #0, #4 ; CHECK: vst1.64 {{.*}}[{{.*}}:128] ; CHECK: vld1.64 {{.*}}[{{.*}}:128] entry: