diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -35,12 +35,14 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"

 using namespace llvm;

@@ -56,8 +58,13 @@
 class MVETailPredication : public LoopPass {
   SmallVector<IntrinsicInst*, 4> MaskedInsts;
   Loop *L = nullptr;
+  LoopInfo *LI = nullptr;
+  const DataLayout *DL;
+  DominatorTree *DT = nullptr;
   ScalarEvolution *SE = nullptr;
   TargetTransformInfo *TTI = nullptr;
+  TargetLibraryInfo *TLI = nullptr;
+  bool ClonedVCTPInExitBlock = false;

 public:
   static char ID;
@@ -69,6 +76,8 @@
     AU.addRequired<ScalarEvolutionWrapperPass>();
     AU.addRequired<TargetPassConfig>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.setPreservesCFG();
   }
@@ -97,6 +106,11 @@
                            DenseMap<Instruction*, Instruction*> &NewPredicates,
                            VectorType *VecTy, Value *NumElements);
+
+  /// Rematerialize the iteration count in exit blocks, which enables
+  /// ARMLowOverheadLoops to better optimise away loop update statements inside
+  /// hardware-loops.
+  void RematerializeIterCount();
 };

 } // end namespace

@@ -120,6 +134,16 @@
   return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
 }

+void MVETailPredication::RematerializeIterCount() {
+  SmallVector<WeakTrackingVH, 16> DeadInsts;
+  SCEVExpander Rewriter(*SE, *DL, "mvetp");
+  ReplaceExitVal ReplaceExitValue = AlwaysRepl;
+
+  formLCSSARecursively(*L, *DT, LI, SE);
+  rewriteLoopExitValues(L, LI, TLI, SE, Rewriter, DT, ReplaceExitValue,
+                        DeadInsts);
+}
+
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || DisableTailPredication)
     return false;
@@ -128,8 +152,13 @@
   auto &TPC = getAnalysis<TargetPassConfig>();
   auto &TM = TPC.getTM<TargetMachine>();
   auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
+  DL = &L->getHeader()->getModule()->getDataLayout();
   this->L = L;

   // The MVE and LOB extensions are combined to enable tail-predication, but
@@ -185,7 +214,14 @@
   LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
                     << *Decrement << "\n");
-  return TryConvert(Setup->getArgOperand(0));
+
+  if (TryConvert(Setup->getArgOperand(0))) {
+    if (ClonedVCTPInExitBlock)
+      RematerializeIterCount();
+    return true;
+  }
+
+  return false;
 }

 bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
@@ -407,14 +443,16 @@
 // in the block. This means that the VPR doesn't have to be live into the
 // exit block which should make it easier to convert this loop into a proper
 // tail predicated loop.
-static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
+static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
                     SetVector<Instruction*> &MaybeDead, Loop *L) {
   BasicBlock *Exit = L->getUniqueExitBlock();
   if (!Exit) {
     LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
-    return;
+    return false;
   }

+  bool ClonedVCTPInExitBlock = false;
+
   for (auto &Pair : NewPredicates) {
     Instruction *OldPred = Pair.first;
     Instruction *NewPred = Pair.second;
@@ -425,6 +463,7 @@
         PredClone->insertBefore(&I);
         I.replaceAllUsesWith(PredClone);
         MaybeDead.insert(&I);
+        ClonedVCTPInExitBlock = true;
         LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
                    dbgs() << "ARM TP: with: "; PredClone->dump());
         break;
@@ -455,6 +494,8 @@

   for (auto I : L->blocks())
     DeleteDeadPHIs(I);
+
+  return ClonedVCTPInExitBlock;
 }

 void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
@@ -538,7 +579,7 @@
   }

   // Now clean up.
-  Cleanup(NewPredicates, Predicates, L);
+  ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L);
   return true;
 }
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -8,7 +8,7 @@
 ; CHECK-NEXT:    itt eq
 ; CHECK-NEXT:    moveq r0, #0
 ; CHECK-NEXT:    bxeq lr
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    adds r4, r3, #3
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
@@ -16,35 +16,36 @@
 ; CHECK-NEXT:    sub.w r12, r4, #4
 ; CHECK-NEXT:    movs r4, #1
 ; CHECK-NEXT:    add.w lr, r4, r12, lsr #2
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    lsr.w r4, r12, #2
+; CHECK-NEXT:    sub.w r12, r3, r4, lsl #2
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    mov r4, r3
-; CHECK-NEXT:    and r3, r12, #15
+; CHECK-NEXT:    and r5, r4, #15
 ; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    vdup.32 q3, r3
+; CHECK-NEXT:    vdup.32 q3, r5
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r2], #16
 ; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
 ; CHECK-NEXT:    vcmp.i32 eq, q3, zr
+; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vpsel q1, q2, q1
 ; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
 ; CHECK-NEXT:    vmul.i32 q1, q1, q2
-; CHECK-NEXT:    add.w r12, r12, #4
-; CHECK-NEXT:    subs r3, r4, #4
+; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %middle.block
-; CHECK-NEXT:    vctp.32 r4
+; CHECK-NEXT:    vctp.32 r12
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -97,42 +98,43 @@
 define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
 ; CHECK-LABEL: vpsel_mul_reduce_add_2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    ldr r5, [sp, #20]
-; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    ldr.w r12, [sp, #20]
+; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    beq .LBB1_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    adds r4, r5, #3
+; CHECK-NEXT:    add.w r5, r12, #3
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    bic r4, r4, #3
-; CHECK-NEXT:    sub.w r12, r4, #4
-; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, r12, lsr #2
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    bic r5, r5, #3
+; CHECK-NEXT:    subs r4, r5, #4
+; CHECK-NEXT:    movs r5, #1
+; CHECK-NEXT:    add.w lr, r5, r4, lsr #2
+; CHECK-NEXT:    lsrs r4, r4, #2
+; CHECK-NEXT:    sub.w r4, r12, r4, lsl #2
+; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r5
-; CHECK-NEXT:    mov r4, r5
-; CHECK-NEXT:    and r5, r12, #15
+; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    and r6, r5, #15
 ; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
-; CHECK-NEXT:    vdup.32 q3, r5
+; CHECK-NEXT:    vdup.32 q3, r6
 ; CHECK-NEXT:    vsub.i32 q1, q2, q1
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
 ; CHECK-NEXT:    vcmp.i32 eq, q3, zr
+; CHECK-NEXT:    adds r5, #4
 ; CHECK-NEXT:    vpsel q1, q1, q2
 ; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
 ; CHECK-NEXT:    vmul.i32 q1, q1, q2
-; CHECK-NEXT:    add.w r12, r12, #4
-; CHECK-NEXT:    subs r5, r4, #4
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB1_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
@@ -140,11 +142,11 @@
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
                                             i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -203,19 +205,23 @@
 ; CHECK-LABEL: and_mul_reduce_add:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    ldr r5, [sp, #16]
-; CHECK-NEXT:    cbz r5, .LBB2_4
+; CHECK-NEXT:    ldr.w r12, [sp, #16]
+; CHECK-NEXT:    cmp.w r12, #0
+; CHECK-NEXT:    beq .LBB2_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    add.w r4, r12, #3
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    dlstp.32 lr, r5
+; CHECK-NEXT:    bic r4, r4, #3
+; CHECK-NEXT:    subs r5, r4, #4
+; CHECK-NEXT:    lsrs r4, r5, #2
+; CHECK-NEXT:    sub.w r4, r12, r4, lsl #2
+; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
-; CHECK-NEXT:    mov r12, r5
 ; CHECK-NEXT:    vsub.i32 q1, q2, q1
-; CHECK-NEXT:    subs r5, #4
 ; CHECK-NEXT:    vcmp.i32 eq, q1, zr
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
@@ -224,7 +230,7 @@
 ; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    letp lr, .LBB2_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    vctp.32 r4
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -285,36 +291,37 @@
 define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
 ; CHECK-LABEL: or_mul_reduce_add:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    ldr r5, [sp, #20]
-; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    ldr.w r12, [sp, #20]
+; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    beq .LBB3_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    adds r4, r5, #3
+; CHECK-NEXT:    add.w r4, r12, #3
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r4, r4, #3
-; CHECK-NEXT:    sub.w r12, r4, #4
+; CHECK-NEXT:    subs r5, r4, #4
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, r12, lsr #2
+; CHECK-NEXT:    add.w lr, r4, r5, lsr #2
+; CHECK-NEXT:    lsrs r4, r5, #2
+; CHECK-NEXT:    sub.w r4, r12, r4, lsl #2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r5
+; CHECK-NEXT:    vctp.32 r12
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    mov r12, r5
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
 ; CHECK-NEXT:    vsub.i32 q1, q2, q1
 ; CHECK-NEXT:    vcmp.i32 eq, q1, zr
-; CHECK-NEXT:    vmrs r4, p0
-; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs r5, p0
-; CHECK-NEXT:    orrs r4, r5
-; CHECK-NEXT:    sub.w r5, r12, #4
-; CHECK-NEXT:    vmsr p0, r4
+; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    vmrs r6, p0
+; CHECK-NEXT:    orrs r5, r6
+; CHECK-NEXT:    vmsr p0, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
@@ -322,15 +329,15 @@
 ; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB3_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    vctp.32 r4
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB3_4:
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -224,11 +224,12 @@
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    lsr.w r3, r12, #2
+; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    mov r3, r2
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -9,12 +9,15 @@
 ; CHECK-NEXT:    moveq r0, #0
 ; CHECK-NEXT:    bxeq lr
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    adds r3, r2, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    bic r3, r3, #3
+; CHECK-NEXT:    sub.w r12, r3, #4
+; CHECK-NEXT:    lsr.w r3, r12, #2
+; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vldrb.u32 q2, [r1], #4
 ; CHECK-NEXT:    vmla.u32 q0, q2, r0
@@ -74,12 +77,15 @@
 ; CHECK-NEXT:    moveq r0, #0
 ; CHECK-NEXT:    bxeq lr
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    adds r3, r2, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    bic r3, r3, #3
+; CHECK-NEXT:    sub.w r12, r3, #4
+; CHECK-NEXT:    lsr.w r3, r12, #2
+; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vldrh.s32 q2, [r1], #8
 ; CHECK-NEXT:    vmla.u32 q0, q2, r0
@@ -139,12 +145,15 @@
 ; CHECK-NEXT:    moveq r0, #0
 ; CHECK-NEXT:    bxeq lr
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    adds r3, r2, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    bic r3, r3, #3
+; CHECK-NEXT:    sub.w r12, r3, #4
+; CHECK-NEXT:    lsr.w r3, r12, #2
+; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vldrb.u32 q2, [r1], #4
 ; CHECK-NEXT:    vmla.u32 q0, q2, r0
@@ -204,12 +213,15 @@
 ; CHECK-NEXT:    moveq r0, #0
 ; CHECK-NEXT:    bxeq lr
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    adds r3, r2, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    bic r3, r3, #3
+; CHECK-NEXT:    sub.w r12, r3, #4
+; CHECK-NEXT:    lsr.w r3, r12, #2
+; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vldrh.u32 q2, [r1], #8
 ; CHECK-NEXT:    vmla.u32 q0, q2, r0
@@ -269,12 +281,15 @@
 ; CHECK-NEXT:    moveq r0, #0
 ; CHECK-NEXT:    bxeq lr
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    adds r3, r2, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    bic r3, r3, #3
+; CHECK-NEXT:    sub.w r12, r3, #4
+; CHECK-NEXT:    lsr.w r3, r12, #2
+; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
 ; CHECK-NEXT:    vmla.u32 q0, q2, r0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -9,21 +9,24 @@
 ; CHECK-NEXT:    moveq r0, #0
 ; CHECK-NEXT:    bxeq lr
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    adds r3, r2, #3
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    bic r3, r3, #3
+; CHECK-NEXT:    sub.w r12, r3, #4
+; CHECK-NEXT:    lsr.w r3, r12, #2
+; CHECK-NEXT:    sub.w r3, r2, r3, lsl #2
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    vmul.i32 q0, q2, q0
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %middle.block
 ; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -75,14 +78,17 @@
 ; CHECK-NEXT:    moveq r0, #0
 ; CHECK-NEXT:    bxeq lr
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    adds r1, r2, #3
+; CHECK-NEXT:    bic r1, r1, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    subs r1, #4
+; CHECK-NEXT:    lsrs r1, r1, #2
+; CHECK-NEXT:    sub.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov r1, r2
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
-; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB1_1
 ; CHECK-NEXT:  @ %bb.2: @ %middle.block
@@ -135,14 +141,17 @@
 ; CHECK-NEXT:    moveq r0, #0
 ; CHECK-NEXT:    bxeq lr
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    adds r1, r2, #3
+; CHECK-NEXT:    bic r1, r1, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    subs r1, #4
+; CHECK-NEXT:    lsrs r1, r1, #2
+; CHECK-NEXT:    sub.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov r1, r2
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
-; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB2_1
 ; CHECK-NEXT:  @ %bb.2: @ %middle.block
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -3,6 +3,12 @@

 ; CHECK-LABEL: vec_mul_reduce_add

+; CHECK: vector.ph:
+; CHECK:   call void @llvm.set.loop.iterations.i32
+; CHECK:   [[UF:%[^ ]+]] = shl i32 %{{.*}}, 2
+; CHECK:   [[REMAT_ITER:%[^ ]+]] = sub i32 %N, [[UF]]
+; CHECK:   br label %vector.body
+
 ; CHECK: vector.body:
 ; CHECK-NOT:   phi i32 [ 0, %vector.ph ]
 ; CHECK:   [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
@@ -12,7 +18,7 @@
 ; CHECK:   call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],

 ; CHECK: middle.block:
-; CHECK:   [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]])
+; CHECK:   [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER]])
 ; CHECK:   [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
 ; CHECK:   call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
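Note on the new checks in vector-reduce-mve-tail.ll: once the iteration count is rematerialised, the vctp cloned into middle.block reads a value computed in vector.ph instead of the element counter that was live out of vector.body. A minimal LLVM IR sketch of the shape those CHECK lines match is shown below; value names such as %unroll.elts and %remat.iter are illustrative placeholders rather than the names the pass actually emits, and the fragment is not a complete module:

  vector.ph:
    call void @llvm.set.loop.iterations.i32(i32 %iters)
    %unroll.elts = shl i32 %tmp, 2            ; elements covered by the hardware-loop iterations
    %remat.iter = sub i32 %N, %unroll.elts    ; iteration count recomputed outside the loop
    br label %vector.body

  middle.block:
    ; the cloned vctp uses the rematerialised count, so the loop-carried
    ; element counter no longer needs to be live out of vector.body
    %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %remat.iter)
    %sel = select <4 x i1> %mask, <4 x i32> %acc, <4 x i32> %acc.prev
    %res = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %sel)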