Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -73,6 +73,7 @@
 class IntrinsicInst;
 struct KnownBits;
 class LegacyDivergenceAnalysis;
+class LoopInfo;
 class LLVMContext;
 class MachineBasicBlock;
 class MachineFunction;
@@ -2526,8 +2527,8 @@
   /// instruction during instruction selection. After calling the function
   /// \p Ops contains the Uses to sink ordered by dominance (dominating users
   /// come first).
-  virtual bool shouldSinkOperands(Instruction *I,
-                                  SmallVectorImpl<Use *> &Ops) const {
+  virtual bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops,
+                                  const LoopInfo *LI) const {
     return false;
   }
Index: llvm/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -271,7 +271,7 @@
   const TargetRegisterInfo *TRI;
   const TargetTransformInfo *TTI = nullptr;
   const TargetLibraryInfo *TLInfo;
-  const LoopInfo *LI;
+  LoopInfo *LI;
   std::unique_ptr<BlockFrequencyInfo> BFI;
   std::unique_ptr<BranchProbabilityInfo> BPI;
   ProfileSummaryInfo *PSI;
@@ -878,7 +878,7 @@
     assert(SinglePred == BB && "Single predecessor not the same as predecessor");
     // Merge DestBB into SinglePred/BB and delete it.
-    MergeBlockIntoPredecessor(DestBB);
+    MergeBlockIntoPredecessor(DestBB, nullptr, LI);
     // Note: BB(=SinglePred) will not be deleted on this path.
     // DestBB(=its single successor) is the one that was deleted.
     LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
@@ -915,6 +915,7 @@
   // The PHIs are now updated, change everything that refers to BB to use
   // DestBB and remove BB.
+  LI->removeBlock(BB);
   BB->replaceAllUsesWith(DestBB);
   BB->eraseFromParent();
   ++NumBlocksElim;
@@ -6503,7 +6504,7 @@
   // If the operands of I can be folded into a target instruction together with
   // I, duplicate and sink them.
   SmallVector<Use *, 4> OpsToSink;
-  if (!TLI->shouldSinkOperands(I, OpsToSink))
+  if (!TLI->shouldSinkOperands(I, OpsToSink, LI))
     return false;

   // OpsToSink can contain multiple uses in a use chain (e.g.
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -470,8 +470,8 @@
   bool isZExtFree(EVT VT1, EVT VT2) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;

-  bool shouldSinkOperands(Instruction *I,
-                          SmallVectorImpl<Use *> &Ops) const override;
+  bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops,
+                          const LoopInfo *LI) const override;

   bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9435,8 +9435,9 @@
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
-bool AArch64TargetLowering::shouldSinkOperands(
-    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+bool AArch64TargetLowering::shouldSinkOperands(Instruction *I,
+                                               SmallVectorImpl<Use *> &Ops,
+                                               const LoopInfo *LI) const {
   if (!I->getType()->isVectorTy())
     return false;
Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -388,8 +388,8 @@
   bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
   bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;
-  bool shouldSinkOperands(Instruction *I,
-                          SmallVectorImpl<Use *> &Ops) const override;
+  bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops,
+                          const LoopInfo *LI) const override;
   Type* shouldConvertSplatType(ShuffleVectorInst* SVI) const override;

   bool isFNegFree(EVT VT) const override;
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15878,7 +15878,9 @@
 /// the operands can be folded into a target instruction, e.g.
 /// sext/zext can be folded into vsubl.
 bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
-                                           SmallVectorImpl<Use *> &Ops) const {
+                                           SmallVectorImpl<Use *> &Ops,
+                                           const LoopInfo *LI) const {
+  const DataLayout &DL = I->getModule()->getDataLayout();
   if (!I->getType()->isVectorTy())
     return false;

@@ -15959,6 +15961,8 @@
           m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), m_Undef(),
           m_ZeroMask())))
       continue;
+    if (Shuffle->getParent() == I->getParent())
+      continue;
     if (!IsSinker(I, OpIdx.index()))
       continue;

@@ -15975,6 +15979,45 @@
       Ops.push_back(&Op->getOperandUse(0));
     Ops.push_back(&OpIdx.value());
   }
+
+  if (Ops.empty())
+    return false;
+
+  // If we are sinking into a loop, make sure that the register pressure for
+  // R regs will not be adversely affected. Otherwise we might as well use a
+  // Q reg and avoid the spill. We get a rough estimate of register pressure
+  // by counting the PHI recurrences and the loop-invariant values needed in
+  // the loop. We don't include constants yet because of the number of false
+  // positives. If we are over the R limit (~14) but still under the Q limit
+  // (~6*4, to give some freedom for extra registers in the loop), we don't
+  // sink.
+  if (auto *L = LI->getLoopFor(I->getParent())) {
+    int RRegLimit = 14;
+    int QRegLimit = 24;
+
+    auto AdjustLimits = [&](Type *Ty) {
+      if (Ty->isVectorTy() || Ty->isFloatingPointTy())
+        QRegLimit -= (DL.getTypeSizeInBits(Ty) + 31) / 32;
+      else if (Ty->isIntOrPtrTy())
+        RRegLimit -= (DL.getTypeSizeInBits(Ty) + 31) / 32;
+      return QRegLimit <= 0; // Early exit if we are already over the limit.
+    };
+
+    for (auto *BB : L->getBlocks()) {
+      for (Instruction &Inst : *BB) {
+        if (isa<PHINode>(Inst) && Inst.getParent() == L->getHeader()) {
+          if (AdjustLimits(Inst.getType()))
+            return true;
+          continue;
+        }
+
+        for (Value *Op : Inst.operands())
+          if (L->isLoopInvariant(Op) && !isa<Constant>(Op))
+            if (AdjustLimits(Op->getType()))
+              return true;
+      }
+    }
+    return RRegLimit > 0 || QRegLimit <= 0;
+  }
   return true;
 }
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -1179,8 +1179,8 @@
   bool isZExtFree(EVT VT1, EVT VT2) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;

-  bool shouldSinkOperands(Instruction *I,
-                          SmallVectorImpl<Use *> &Ops) const override;
+  bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops,
+                          const LoopInfo *LI) const override;

   /// Return true if folding a vector load into ExtVal (a sign, zero, or any
   /// extend node) is profitable.
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30662,7 +30662,8 @@
 }

 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
-                                           SmallVectorImpl<Use *> &Ops) const {
+                                           SmallVectorImpl<Use *> &Ops,
+                                           const LoopInfo *LI) const {
   // A uniform shift amount in a vector shift or funnel shift may be much
   // cheaper than a generic variable vector shift, so make that pattern visible
   // to SDAG by sinking the shuffle instruction next to the shift.
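The ARM decision above reduces to simple budget arithmetic, which can be modelled in isolation. The following is a minimal standalone sketch for illustration, not part of the patch: ValueDesc and shouldSinkIntoLoop are hypothetical names, a plain struct stands in for the llvm::Type and DataLayout queries, and the budgets mirror the ones used above (roughly 14 R registers and 24 32-bit words of Q-register capacity).

// Standalone model of the R/Q budget check in the ARM hunk above.
// ValueDesc and shouldSinkIntoLoop are hypothetical; SizeInBits plays the
// role of DataLayout::getTypeSizeInBits, and the two flags the role of
// Type::isVectorTy()/isFloatingPointTy() and Type::isIntOrPtrTy().
#include <cstdio>
#include <vector>

struct ValueDesc {
  unsigned SizeInBits;
  bool VectorOrFP;
  bool IntOrPtr;
};

// True when sinking is acceptable: either the R budget survives, or the Q
// budget is exhausted anyway, in which case a Q register costs nothing extra.
bool shouldSinkIntoLoop(const std::vector<ValueDesc> &LiveValues) {
  int RRegLimit = 14; // rough number of usable 32-bit R registers
  int QRegLimit = 24; // ~6 Q registers, counted in 32-bit words
  for (const ValueDesc &V : LiveValues) {
    int Words = (V.SizeInBits + 31) / 32; // 32-bit words the value occupies
    if (V.VectorOrFP)
      QRegLimit -= Words;
    else if (V.IntOrPtr)
      RRegLimit -= Words;
    if (QRegLimit <= 0) // already over the Q budget: sinking cannot hurt
      return true;
  }
  return RRegLimit > 0 || QRegLimit <= 0;
}

int main() {
  // Four live i32 recurrences plus two <4 x i32> loop invariants.
  std::vector<ValueDesc> Vals(4, {32, false, true});
  Vals.insert(Vals.end(), 2, {128, true, false});
  std::printf("%s\n", shouldSinkIntoLoop(Vals) ? "sink" : "do not sink");
}

With these inputs the model leaves RRegLimit at 10 and QRegLimit at 16, so sinking proceeds; by contrast, fifteen live i32 values drive RRegLimit below zero while Q headroom remains, and the model refuses to sink, matching the intent of the comment above.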
Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -617,13 +617,13 @@ ; CHECK-NEXT: bic r10, r11, #3 ; CHECK-NEXT: sub.w r0, r10, #4 ; CHECK-NEXT: add.w r8, r1, r0, lsr #2 -; CHECK-NEXT: ldr r1, [sp, #112] -; CHECK-NEXT: lsl.w r0, r11, #1 -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: ldr r0, [sp, #112] +; CHECK-NEXT: lsl.w r1, r11, #1 +; CHECK-NEXT: vdup.32 q4, r0 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: lsls r4, r0, #1 ; CHECK-NEXT: adr r0, .LCPI10_0 -; CHECK-NEXT: vdup.32 q4, r1 ; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: lsls r4, r1, #1 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vshl.i32 q6, q4, #2 ; CHECK-NEXT: movs r1, #0 Index: llvm/test/CodeGen/Thumb2/mve-sinkbalance.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-sinkbalance.ll +++ llvm/test/CodeGen/Thumb2/mve-sinkbalance.ll @@ -155,36 +155,36 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: sbfx r11, r2, #30, #1 -; CHECK-NEXT: lsl.w r8, r2, #1 -; CHECK-NEXT: asrs r7, r2, #31 -; CHECK-NEXT: mov.w r10, #100 -; CHECK-NEXT: mov r5, r6 -; CHECK-NEXT: mov r9, r6 +; CHECK-NEXT: ldr r7, [sp, #44] +; CHECK-NEXT: ldrd r10, r12, [sp, #36] +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: sbfx r9, r2, #30, #1 +; CHECK-NEXT: vdup.32 q0, r7 +; CHECK-NEXT: lsls r6, r2, #1 +; CHECK-NEXT: asr.w r11, r2, #31 +; CHECK-NEXT: movs r4, #100 +; CHECK-NEXT: mov r5, r8 +; CHECK-NEXT: mov r7, r8 ; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: .LBB2_1: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldr r4, [sp, #36] -; CHECK-NEXT: ldr.w r12, [sp, #44] -; CHECK-NEXT: vadd.i32 q1, q0, r3 -; CHECK-NEXT: vaddlva.s32 r10, r9, q0 -; CHECK-NEXT: vadd.i32 q1, q1, r4 -; CHECK-NEXT: ldr r4, [sp, #40] -; CHECK-NEXT: vaddlva.s32 r8, r11, q0 -; CHECK-NEXT: vaddlva.s32 r2, r7, q0 -; CHECK-NEXT: vadd.i32 q1, q1, r4 -; CHECK-NEXT: vaddlva.s32 r6, r5, q0 -; CHECK-NEXT: vadd.i32 q1, q1, r12 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vadd.i32 q2, q1, r3 +; CHECK-NEXT: vaddlva.s32 r4, r7, q1 +; CHECK-NEXT: vadd.i32 q2, q2, r10 +; CHECK-NEXT: vaddlva.s32 r6, r9, q1 +; CHECK-NEXT: vadd.i32 q2, q2, r12 +; CHECK-NEXT: vaddlva.s32 r2, r11, q1 +; CHECK-NEXT: vadd.i32 q2, q2, q0 +; CHECK-NEXT: vaddlva.s32 r8, r5, q1 +; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: adds.w r0, r8, r2 -; CHECK-NEXT: adc.w r1, r11, r7 -; CHECK-NEXT: adds.w r0, r0, r10 -; CHECK-NEXT: adc.w r1, r1, r9 -; CHECK-NEXT: adds r0, r0, r6 +; CHECK-NEXT: adds r0, r6, r2 +; CHECK-NEXT: adc.w r1, r9, r11 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r7 +; CHECK-NEXT: adds.w r0, r0, r8 ; CHECK-NEXT: adcs r1, r5 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: @@ -240,41 +240,49 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: sbfx r11, r2, #30, #1 -; CHECK-NEXT: lsl.w r12, r2, #1 -; CHECK-NEXT: mov.w r10, #100 -; CHECK-NEXT: asrs r7, r2, #31 
-; CHECK-NEXT: mov r5, r4 -; CHECK-NEXT: mov r9, r4 +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: ldr r7, [sp, #72] +; CHECK-NEXT: sbfx r9, r2, #30, #1 +; CHECK-NEXT: ldr r5, [sp, #64] +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vdup.32 q0, r7 +; CHECK-NEXT: ldr r7, [sp, #68] +; CHECK-NEXT: ldrd r8, r10, [sp, #56] +; CHECK-NEXT: vdup.32 q2, r5 +; CHECK-NEXT: vdup.32 q1, r7 +; CHECK-NEXT: lsls r6, r2, #1 +; CHECK-NEXT: asr.w r11, r2, #31 +; CHECK-NEXT: movs r4, #100 +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: mov r7, r12 ; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: .LBB3_1: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldr r6, [sp, #36] -; CHECK-NEXT: ldr.w r8, [sp, #52] -; CHECK-NEXT: vadd.i32 q1, q0, r3 -; CHECK-NEXT: vaddlva.s32 r10, r9, q0 -; CHECK-NEXT: vadd.i32 q1, q1, r6 -; CHECK-NEXT: ldr r6, [sp, #40] -; CHECK-NEXT: vaddlva.s32 r12, r11, q0 -; CHECK-NEXT: vaddlva.s32 r2, r7, q0 -; CHECK-NEXT: vadd.i32 q1, q1, r6 -; CHECK-NEXT: ldr r6, [sp, #44] -; CHECK-NEXT: vaddlva.s32 r4, r5, q0 -; CHECK-NEXT: vadd.i32 q1, q1, r6 -; CHECK-NEXT: ldr r6, [sp, #48] -; CHECK-NEXT: vadd.i32 q1, q1, r6 -; CHECK-NEXT: vadd.i32 q1, q1, r8 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vadd.i32 q4, q3, r3 +; CHECK-NEXT: vaddlva.s32 r4, r7, q3 +; CHECK-NEXT: vadd.i32 q4, q4, r8 +; CHECK-NEXT: vaddlva.s32 r6, r9, q3 +; CHECK-NEXT: vadd.i32 q4, q4, r10 +; CHECK-NEXT: vaddlva.s32 r2, r11, q3 +; CHECK-NEXT: vadd.i32 q4, q4, q2 +; CHECK-NEXT: vaddlva.s32 r12, r5, q3 +; CHECK-NEXT: vadd.i32 q4, q4, q1 +; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: adds.w r0, r12, r2 -; CHECK-NEXT: adc.w r1, r11, r7 -; CHECK-NEXT: adds.w r0, r0, r10 -; CHECK-NEXT: adc.w r1, r1, r9 +; CHECK-NEXT: adds r0, r6, r2 +; CHECK-NEXT: adc.w r1, r9, r11 ; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r7 +; CHECK-NEXT: adds.w r0, r0, r12 ; CHECK-NEXT: adcs r1, r5 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %conv = sext i32 %n to i64 @@ -335,46 +343,54 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: sbfx r3, r2, #30, #1 +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: ldrd r5, r7, [sp, #108] +; CHECK-NEXT: sbfx r9, r2, #30, #1 +; CHECK-NEXT: ldrd r8, r10, [sp, #88] +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vdup.32 q0, r7 +; CHECK-NEXT: ldr r7, [sp, #104] +; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: ldr r5, [sp, #96] +; CHECK-NEXT: vdup.32 q2, r7 +; CHECK-NEXT: ldr r7, [sp, #100] +; CHECK-NEXT: vdup.32 q4, r5 ; CHECK-NEXT: lsls r6, r2, #1 -; CHECK-NEXT: mov.w r10, #100 -; CHECK-NEXT: asrs r7, r2, #31 -; CHECK-NEXT: mov r9, r4 -; CHECK-NEXT: mov r11, r4 +; CHECK-NEXT: vdup.32 q3, r7 +; CHECK-NEXT: asr.w r11, r2, #31 +; CHECK-NEXT: movs r4, #100 +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: mov r7, r12 ; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: .LBB4_1: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldr r5, [sp, #36] -; CHECK-NEXT: ldr.w r12, [sp, #60] 
-; CHECK-NEXT: vadd.i32 q1, q0, r8
-; CHECK-NEXT: vaddlva.s32 r10, r11, q0
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #40]
-; CHECK-NEXT: vaddlva.s32 r6, r3, q0
-; CHECK-NEXT: vaddlva.s32 r2, r7, q0
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #44]
-; CHECK-NEXT: vaddlva.s32 r4, r9, q0
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #48]
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #52]
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #56]
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: vadd.i32 q1, q1, r12
-; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vldrw.u32 q5, [r0]
+; CHECK-NEXT: vadd.i32 q6, q5, r3
+; CHECK-NEXT: vaddlva.s32 r4, r7, q5
+; CHECK-NEXT: vadd.i32 q6, q6, r8
+; CHECK-NEXT: vaddlva.s32 r6, r9, q5
+; CHECK-NEXT: vadd.i32 q6, q6, r10
+; CHECK-NEXT: vaddlva.s32 r2, r11, q5
+; CHECK-NEXT: vadd.i32 q6, q6, q4
+; CHECK-NEXT: vaddlva.s32 r12, r5, q5
+; CHECK-NEXT: vadd.i32 q6, q6, q3
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q1
+; CHECK-NEXT: vadd.i32 q6, q6, q0
+; CHECK-NEXT: vstrw.32 q6, [r1]
 ; CHECK-NEXT: le lr, .LBB4_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: adds r0, r6, r2
-; CHECK-NEXT: adc.w r1, r3, r7
-; CHECK-NEXT: adds.w r0, r0, r10
-; CHECK-NEXT: adc.w r1, r1, r11
+; CHECK-NEXT: adc.w r1, r9, r11
 ; CHECK-NEXT: adds r0, r0, r4
-; CHECK-NEXT: adc.w r1, r1, r9
+; CHECK-NEXT: adcs r1, r7
+; CHECK-NEXT: adds.w r0, r0, r12
+; CHECK-NEXT: adcs r1, r5
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %conv = sext i32 %n to i64
@@ -441,50 +457,58 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: mov r12, r3
-; CHECK-NEXT: sbfx r3, r2, #30, #1
-; CHECK-NEXT: lsls r6, r2, #1
-; CHECK-NEXT: mov.w r10, #100
-; CHECK-NEXT: asrs r7, r2, #31
-; CHECK-NEXT: mov r9, r4
-; CHECK-NEXT: mov r11, r4
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: ldrd r4, r5, [sp, #116]
+; CHECK-NEXT: ldrd r6, r7, [sp, #104]
+; CHECK-NEXT: sbfx r11, r2, #30, #1
+; CHECK-NEXT: vdup.32 q0, r5
+; CHECK-NEXT: ldr r5, [sp, #112]
+; CHECK-NEXT: vdup.32 q4, r6
+; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: vdup.32 q2, r5
+; CHECK-NEXT: vdup.32 q3, r7
+; CHECK-NEXT: vdup.32 q1, r4
+; CHECK-NEXT: lsl.w r10, r2, #1
+; CHECK-NEXT: mov.w r12, #100
+; CHECK-NEXT: asrs r5, r2, #31
+; CHECK-NEXT: mov r7, r6
+; CHECK-NEXT: mov r9, r6
 ; CHECK-NEXT: dls lr, r2
 ; CHECK-NEXT: .LBB5_1: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: ldr r5, [sp, #36]
-; CHECK-NEXT: ldr.w r8, [sp, #68]
-; CHECK-NEXT: vadd.i32 q1, q0, r12
-; CHECK-NEXT: vaddlva.s32 r10, r11, q0
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #40]
-; CHECK-NEXT: vaddlva.s32 r6, r3, q0
-; CHECK-NEXT: vaddlva.s32 r2, r7, q0
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #44]
-; CHECK-NEXT: vaddlva.s32 r4, r9, q0
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #48]
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #52]
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #56]
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #60]
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: ldr r5, [sp, #64]
-; CHECK-NEXT: vadd.i32 q1, q1, r5
-; CHECK-NEXT: vadd.i32 q1, q1, r8
-; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vldrw.u32 q5, [r0]
+; CHECK-NEXT: ldr r4, [sp, #88]
+; CHECK-NEXT: ldr.w r8, [sp, #100]
+; CHECK-NEXT: vadd.i32 q6, q5, r3
+; CHECK-NEXT: vaddlva.s32 r12, r9, q5
+; CHECK-NEXT: vadd.i32 q6, q6, r4
+; CHECK-NEXT: ldr r4, [sp, #92]
+; CHECK-NEXT: vaddlva.s32 r10, r11, q5
+; CHECK-NEXT: vaddlva.s32 r2, r5, q5
+; CHECK-NEXT: vadd.i32 q6, q6, r4
+; CHECK-NEXT: ldr r4, [sp, #96]
+; CHECK-NEXT: vaddlva.s32 r6, r7, q5
+; CHECK-NEXT: vadd.i32 q6, q6, r4
+; CHECK-NEXT: vadd.i32 q6, q6, r8
+; CHECK-NEXT: vadd.i32 q6, q6, q4
+; CHECK-NEXT: vadd.i32 q6, q6, q3
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q1
+; CHECK-NEXT: vadd.i32 q6, q6, q0
+; CHECK-NEXT: vstrw.32 q6, [r1]
 ; CHECK-NEXT: le lr, .LBB5_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT: adds r0, r6, r2
-; CHECK-NEXT: adc.w r1, r3, r7
-; CHECK-NEXT: adds.w r0, r0, r10
-; CHECK-NEXT: adc.w r1, r1, r11
-; CHECK-NEXT: adds r0, r0, r4
+; CHECK-NEXT: adds.w r0, r10, r2
+; CHECK-NEXT: adc.w r1, r11, r5
+; CHECK-NEXT: adds.w r0, r0, r12
 ; CHECK-NEXT: adc.w r1, r1, r9
+; CHECK-NEXT: adds r0, r0, r6
+; CHECK-NEXT: adcs r1, r7
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %conv = sext i32 %n to i64
@@ -557,70 +581,66 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #12
-; CHECK-NEXT: sub sp, #12
-; CHECK-NEXT: ldr r4, [sp, #48]
-; CHECK-NEXT: mov r10, r2
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: asr.w lr, r0, #2
-; CHECK-NEXT: mov r7, r2
-; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: ldr r6, [sp, #40]
 ; CHECK-NEXT: mov.w r8, #0
-; CHECK-NEXT: mov r11, r2
+; CHECK-NEXT: asr.w lr, r0, #2
+; CHECK-NEXT: mov r7, r8
+; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: mov r5, r8
+; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: mov r11, r8
 ; CHECK-NEXT: mov.w r12, #0
-; CHECK-NEXT: mov r9, r2
+; CHECK-NEXT: mov r9, r8
 ; CHECK-NEXT: wls lr, lr, .LBB6_4
 ; CHECK-NEXT: @ %bb.1: @ %while.body.lr.ph
-; CHECK-NEXT: add.w r4, r3, r0, lsl #2
-; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: add.w r12, r4, r0, lsl #2
-; CHECK-NEXT: vidup.u32 q0, r6, #1
-; CHECK-NEXT: add.w r0, r12, r0, lsl #2
-; CHECK-NEXT: str.w r10, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: vmul.i32 q0, q0, r1
+; CHECK-NEXT: add.w r6, r3, r0, lsl #2
+; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: add.w r12, r6, r0, lsl #2
+; CHECK-NEXT: vidup.u32 q0, r4, #1
+; CHECK-NEXT: lsls r5, r1, #2
+; CHECK-NEXT: add.w r9, r12, r0, lsl #2
+; CHECK-NEXT: vdup.32 q1, r5
 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
-; CHECK-NEXT: lsls r1, r1, #2
-; CHECK-NEXT: mov r9, r2
-; CHECK-NEXT: mov r11, r2
-; CHECK-NEXT: mov.w r8, #0
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: mov r7, r2
+; CHECK-NEXT: vmul.i32 q0, q0, r1
+; CHECK-NEXT: mov r1, r8
+; CHECK-NEXT: mov r11, r8
 ; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: mov r5, r8
+; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: mov r7, r8
+; CHECK-NEXT: movs r0, #0
 ; CHECK-NEXT: .LBB6_2: @ %while.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q0, r1
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vldrw.u32 q2, [r1, q0, uxtw #2] -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: vrmlalvha.s32 r2, r9, q0, q2 -; CHECK-NEXT: vldrw.u32 q0, [r12], #16 -; CHECK-NEXT: vrmlalvha.s32 r8, r11, q0, q2 -; CHECK-NEXT: vldrw.u32 q0, [r4], #16 -; CHECK-NEXT: vrmlalvha.s32 r6, r5, q0, q2 -; CHECK-NEXT: vldrw.u32 q0, [r3], #16 -; CHECK-NEXT: vrmlalvha.s32 r10, r7, q0, q2 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vldrw.u32 q2, [r2, q0, uxtw #2] +; CHECK-NEXT: vldrw.u32 q3, [r9], #16 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vrmlalvha.s32 r8, r1, q3, q2 +; CHECK-NEXT: vldrw.u32 q3, [r12], #16 +; CHECK-NEXT: vrmlalvha.s32 r10, r11, q3, q2 +; CHECK-NEXT: vldrw.u32 q3, [r6], #16 +; CHECK-NEXT: vrmlalvha.s32 r4, r5, q3, q2 +; CHECK-NEXT: vldrw.u32 q3, [r3], #16 +; CHECK-NEXT: vrmlalvha.s32 r0, r7, q3, q2 ; CHECK-NEXT: le lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %while.end.loopexit +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: ldr r4, [sp, #48] -; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: ldr r6, [sp, #40] +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: .LBB6_4: @ %while.end ; CHECK-NEXT: add.w r0, r1, r1, lsl #1 -; CHECK-NEXT: asrl r2, r7, #23 -; CHECK-NEXT: asrl r6, r5, #23 -; CHECK-NEXT: asrl r8, r11, #23 +; CHECK-NEXT: asrl r8, r7, #23 +; CHECK-NEXT: asrl r4, r5, #23 +; CHECK-NEXT: asrl r10, r11, #23 ; CHECK-NEXT: asrl r12, r9, #23 -; CHECK-NEXT: str r2, [r4] -; CHECK-NEXT: str.w r6, [r4, r1, lsl #2] -; CHECK-NEXT: str.w r8, [r4, r1, lsl #3] -; CHECK-NEXT: str.w r12, [r4, r0, lsl #2] -; CHECK-NEXT: add sp, #12 +; CHECK-NEXT: str.w r8, [r6] +; CHECK-NEXT: str.w r4, [r6, r1, lsl #2] +; CHECK-NEXT: str.w r10, [r6, r1, lsl #3] +; CHECK-NEXT: str.w r12, [r6, r0, lsl #2] +; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %shr = ashr i32 %cols, 2
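For out-of-tree targets tracking this change, the only interface update is the extra LoopInfo parameter on the shouldSinkOperands override, which can be accepted and ignored. Below is a minimal sketch of the new shape, with hypothetical mock types standing in for llvm::Instruction, llvm::Use and llvm::LoopInfo rather than the real headers; the real signature uses SmallVectorImpl<Use *> and const LoopInfo *, as in the TargetLowering.h hunk at the top of this patch.

// Mock stand-ins for the LLVM types; MyTargetLowering and isInsideLoop are
// hypothetical names for illustration only.
#include <vector>

struct Instruction {};
struct Use {};
struct LoopInfo {
  // Hypothetical query; the ARM override above uses LI->getLoopFor().
  bool isInsideLoop(const Instruction *) const { return false; }
};

struct TargetLoweringBase {
  virtual ~TargetLoweringBase() = default;
  // Mirrors the patched default: decline to sink and ignore LI.
  virtual bool shouldSinkOperands(Instruction *I, std::vector<Use *> &Ops,
                                  const LoopInfo *LI) const {
    return false;
  }
};

// Hypothetical target that now gates sinking on loop placement, in the
// spirit of the ARM register-pressure heuristic above.
struct MyTargetLowering : TargetLoweringBase {
  bool shouldSinkOperands(Instruction *I, std::vector<Use *> &Ops,
                          const LoopInfo *LI) const override {
    if (LI && LI->isInsideLoop(I))
      return false; // be conservative inside loops (illustrative only)
    // A real target would pattern-match I and push foldable uses into Ops
    // before returning true; this mock matches nothing.
    return false;
  }
};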