Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -35,12 +35,14 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 
 using namespace llvm;
 
@@ -56,8 +58,12 @@ class MVETailPredication : public LoopPass {
   SmallVector<IntrinsicInst *, 4> MaskedInsts;
   Loop *L = nullptr;
+  LoopInfo *LI = nullptr;
+  const DataLayout *DL;
+  DominatorTree *DT = nullptr;
   ScalarEvolution *SE = nullptr;
   TargetTransformInfo *TTI = nullptr;
+  TargetLibraryInfo *TLI = nullptr;
 
 public:
   static char ID;
 
@@ -69,6 +75,8 @@
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<TargetPassConfig>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.setPreservesCFG();
   }
@@ -97,6 +105,11 @@
                            DenseMap<Instruction *, Instruction *> &NewPredicates,
                            VectorType *VecTy, Value *NumElements);
+
+  /// Rematerialize the iteration count in exit blocks, which enables
+  /// ARMLowOverheadLoops to better optimise away loop update statements inside
+  /// hardware-loops.
+  void RematerializeIterCount(Loop *L);
 };
 
 } // end namespace
 
@@ -120,6 +133,26 @@
   return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
 }
 
+void MVETailPredication::RematerializeIterCount(Loop *L) {
+  SmallVector<WeakTrackingVH, 16> DeadInsts;
+  SCEVExpander Rewriter(*SE, *DL, "mvetp");
+  ReplaceExitVal ReplaceExitValue = AlwaysRepl;
+
+  if (!L->isRecursivelyLCSSAForm(*DT, *LI)) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Loop is not in LCSSA form\n");
+    if (!formLCSSA(*L, *DT, LI, SE)) {
+      LLVM_DEBUG(dbgs() << "ARM TP: couldn't bring loop into LCSSA form, "
+                        << "can't rematerialise iteration count in exit "
+                        << "blocks\n");
+      return;
+    } else
+      LLVM_DEBUG(dbgs() << "ARM TP: Loop transformed to LCSSA form\n");
+  }
+  rewriteLoopExitValues(L, LI, TLI, SE, Rewriter, DT, ReplaceExitValue,
+                        DeadInsts);
+
+}
+
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || DisableTailPredication)
     return false;
 
@@ -128,8 +161,13 @@
   auto &TPC = getAnalysis<TargetPassConfig>();
   auto &TM = TPC.getTM<TargetMachine>();
   auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  TLI = TLIP ?
&TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; + DL = &L->getHeader()->getModule()->getDataLayout(); this->L = L; // The MVE and LOB extensions are combined to enable tail-predication, but @@ -185,7 +223,13 @@ LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" << *Decrement << "\n"); - return TryConvert(Setup->getArgOperand(0)); + + if (TryConvert(Setup->getArgOperand(0))) { + RematerializeIterCount(L); + return true; + } + + return false; } bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: adds r4, r3, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 @@ -16,35 +16,36 @@ ; CHECK-NEXT: sub.w r12, r4, #4 ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: add.w lr, r4, r12, lsr #2 -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: lsr.w r4, r12, #2 +; CHECK-NEXT: sub.w r12, r3, r4, lsl #2 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: and r3, r12, #15 +; CHECK-NEXT: and r5, r4, #15 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vdup.32 q3, r3 +; CHECK-NEXT: vdup.32 q3, r5 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r2], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vcmp.i32 eq, q3, zr +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vpsel q1, q2, q1 ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r0], 
#16 ; CHECK-NEXT: vmul.i32 q1, q1, q2 -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: subs r3, r4, #4 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r4 +; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -97,42 +98,43 @@ define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, ; CHECK-LABEL: vpsel_mul_reduce_add_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: ldr r5, [sp, #20] -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: ldr.w r12, [sp, #20] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adds r4, r5, #3 +; CHECK-NEXT: add.w r5, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: sub.w r12, r4, #4 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, r12, lsr #2 -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: bic r5, r5, #3 +; CHECK-NEXT: subs r4, r5, #4 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: add.w lr, r5, r4, lsr #2 +; CHECK-NEXT: lsrs r4, r4, #2 +; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r5 -; CHECK-NEXT: mov r4, r5 -; CHECK-NEXT: and r5, r12, #15 +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: and r6, r5, #15 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 -; CHECK-NEXT: vdup.32 q3, r5 +; CHECK-NEXT: vdup.32 q3, r6 ; CHECK-NEXT: vsub.i32 
q1, q2, q1 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vcmp.i32 eq, q3, zr +; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: vpsel q1, q1, q2 ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vmul.i32 q1, q1, q2 -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: subs r5, r4, #4 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block @@ -140,11 +142,11 @@ ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 @@ -203,19 +205,24 @@ ; CHECK-LABEL: and_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: ldr r5, [sp, #16] -; CHECK-NEXT: cbz r5, .LBB2_4 +; CHECK-NEXT: ldr.w r12, [sp, #16] +; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: beq .LBB2_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: add.w r4, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: dlstp.32 lr, r5 +; CHECK-NEXT: bic r4, r4, #3 +; CHECK-NEXT: subs r5, r4, #4 +; CHECK-NEXT: lsrs r4, r5, #2 +; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: mov r12, r5 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vsub.i32 q1, q2, q1 -; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: vcmp.i32 eq, q1, zr ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 @@ -224,7 +231,7 @@ ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: letp lr, 
.LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vctp.32 r4 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -285,36 +292,37 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { ; CHECK-LABEL: or_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: ldr r5, [sp, #20] -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: ldr.w r12, [sp, #20] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB3_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adds r4, r5, #3 +; CHECK-NEXT: add.w r4, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: sub.w r12, r4, #4 +; CHECK-NEXT: subs r5, r4, #4 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, r12, lsr #2 +; CHECK-NEXT: add.w lr, r4, r5, lsr #2 +; CHECK-NEXT: lsrs r4, r5, #2 +; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r5 +; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: mov r12, r5 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vsub.i32 q1, q2, q1 ; CHECK-NEXT: vcmp.i32 eq, q1, zr -; CHECK-NEXT: vmrs r4, p0 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: orrs r4, r5 -; CHECK-NEXT: sub.w r5, r12, #4 -; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload +; CHECK-NEXT: vmrs r6, p0 +; CHECK-NEXT: orrs r5, r6 +; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 
@@ -322,15 +330,15 @@ ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vctp.32 r4 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .LBB3_4: ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -226,11 +226,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: lsr.w r3, r12, #2 +; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -8,32 +8,28 @@ ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: lsr.w r3, r12, #2 +; CHECK-NEXT: sub.w r12, r2, r3, lsl #2 ; CHECK-NEXT: movs r3, #0 -; 
CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q2, [r2] +; CHECK-NEXT: adds r4, r1, r3 ; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: vldrb.u32 q2, [r4] ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp7 = icmp eq i32 %N, 0 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph @@ -84,12 +80,15 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: lsr.w r3, r12, #2 +; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrh.s32 q2, [r1], #8 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 @@ -148,32 +147,28 @@ ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: lsr.w r3, r12, #2 +; CHECK-NEXT: sub.w r12, r2, r3, lsl #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: adds r2, 
r1, r3 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q2, [r2] +; CHECK-NEXT: adds r4, r1, r3 ; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: vldrb.u32 q2, [r4] ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp7 = icmp eq i32 %N, 0 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph @@ -224,12 +219,15 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: lsr.w r3, r12, #2 +; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrh.u32 q2, [r1], #8 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 @@ -289,12 +287,15 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: lsr.w r3, r12, #2 +; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 @@ -941,8 +942,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_int: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: 
push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: ldr.w r12, [sp, #32] +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: ldr.w r12, [sp, #36] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB9_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck @@ -966,7 +967,7 @@ ; CHECK-NEXT: beq .LBB9_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and r10, r12, #3 +; CHECK-NEXT: and r11, r12, #3 ; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: bhs .LBB9_6 ; CHECK-NEXT: @ %bb.3: @@ -986,40 +987,43 @@ ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r7, r12, #3 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: movs r6, #4 ; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: add.w lr, lr, r7, lsr #2 +; CHECK-NEXT: lsrs r7, r7, #2 +; CHECK-NEXT: add.w r12, r6, r7, lsl #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r5, [r0, r4] -; CHECK-NEXT: add.w r9, r0, r4 -; CHECK-NEXT: ldr r6, [r1, r4] +; CHECK-NEXT: ldr r6, [r0, r4] +; CHECK-NEXT: add.w r10, r0, r4 +; CHECK-NEXT: ldr r7, [r1, r4] +; CHECK-NEXT: add.w r9, r9, #4 +; CHECK-NEXT: mla r6, r7, r6, r2 ; CHECK-NEXT: adds r7, r1, r4 -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: mla r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r3, r4] -; CHECK-NEXT: ldr.w r8, [r9, #4] -; CHECK-NEXT: ldr r6, [r7, #4] -; CHECK-NEXT: mla r8, r6, r8, r2 -; CHECK-NEXT: adds r6, r3, r4 +; CHECK-NEXT: str r6, [r3, r4] +; CHECK-NEXT: ldr.w r8, [r10, #4] +; CHECK-NEXT: ldr r5, [r7, #4] +; CHECK-NEXT: mla r8, r5, r8, r2 +; CHECK-NEXT: adds r5, r3, r4 ; CHECK-NEXT: adds r4, #16 -; CHECK-NEXT: str.w r8, [r6, #4] -; CHECK-NEXT: ldr.w r8, [r9, #8] -; CHECK-NEXT: ldr r5, [r7, #8] -; CHECK-NEXT: mla r5, r5, r8, r2 -; CHECK-NEXT: str r5, [r6, #8] -; CHECK-NEXT: ldr.w r5, [r9, #12] +; CHECK-NEXT: str.w r8, [r5, #4] +; 
CHECK-NEXT: ldr.w r8, [r10, #8] +; CHECK-NEXT: ldr r6, [r7, #8] +; CHECK-NEXT: mla r6, r6, r8, r2 +; CHECK-NEXT: str r6, [r5, #8] +; CHECK-NEXT: ldr.w r6, [r10, #12] ; CHECK-NEXT: ldr r7, [r7, #12] -; CHECK-NEXT: mla r5, r7, r5, r2 -; CHECK-NEXT: str r5, [r6, #12] +; CHECK-NEXT: mla r6, r7, r6, r2 +; CHECK-NEXT: str r6, [r5, #12] ; CHECK-NEXT: le lr, .LBB9_7 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r10, .LBB9_11 +; CHECK-NEXT: wls lr, r11, .LBB9_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader ; CHECK-NEXT: mvn r7, #3 -; CHECK-NEXT: mov lr, r10 +; CHECK-NEXT: mov lr, r11 ; CHECK-NEXT: add.w r7, r7, r12, lsl #2 ; CHECK-NEXT: add r0, r7 ; CHECK-NEXT: add r1, r7 @@ -1032,7 +1036,7 @@ ; CHECK-NEXT: str r7, [r3, #4]! ; CHECK-NEXT: le lr, .LBB9_10 ; CHECK-NEXT: .LBB9_11: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -44,7 +44,10 @@ ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 ; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i1> [ [[TMP1]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14_LCSSA:%.*]] = phi <4 x i32> [ [[TMP14]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[DOTLCSSA]], <4 x i32> [[TMP14_LCSSA]], <4 x i32> [[VEC_PHI_LCSSA]] ; CHECK-NEXT: [[TMP18:%.*]] = call i32 
@llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) ; CHECK-NEXT: store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4 ; CHECK-NEXT: [[INC10_US]] = add nuw i32 [[I_025_US]], 1 @@ -154,7 +157,10 @@ ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 ; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i1> [ [[TMP1]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12_LCSSA:%.*]] = phi <4 x i32> [ [[TMP12]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[DOTLCSSA]], <4 x i32> [[TMP12_LCSSA]], <4 x i32> [[VEC_PHI_LCSSA]] ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) ; CHECK-NEXT: store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[INC9_US]] = add nuw i32 [[I_024_US]], 1 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -9,21 +9,24 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r3, r2, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: lsr.w r3, r12, #2 +; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: vmul.i32 q1, 
q2, q1 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vmul.i32 q0, q2, q0 +; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -75,14 +78,17 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: lsrs r1, r1, #2 +; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block @@ -135,14 +141,17 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: lsrs r1, r1, #2 +; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -3,6 +3,12 @@ ; CHECK-LABEL: vec_mul_reduce_add +; CHECK: vector.ph: +; CHECK: call void 
@llvm.set.loop.iterations.i32 +; CHECK: [[UF:%[^ ]+]] = shl i32 %{{.*}}, 2 +; CHECK: [[REMAT_ITER:%[^ ]+]] = sub i32 %N, [[UF]] +; CHECK: br label %vector.body + ; CHECK: vector.body: ; CHECK-NOT: phi i32 [ 0, %vector.ph ] ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ] @@ -12,7 +18,7 @@ ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], ; CHECK: middle.block: -; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]]) +; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER]]) ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]], ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])