diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -79,14 +79,9 @@ class MVETailPredication : public LoopPass { SmallVector MaskedInsts; Loop *L = nullptr; - LoopInfo *LI = nullptr; - const DataLayout *DL; - DominatorTree *DT = nullptr; ScalarEvolution *SE = nullptr; TargetTransformInfo *TTI = nullptr; const ARMSubtarget *ST = nullptr; - TargetLibraryInfo *TLI = nullptr; - bool ClonedVCTPInExitBlock = false; public: static char ID; @@ -98,8 +93,6 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); - AU.addRequired(); AU.addPreserved(); AU.setPreservesCFG(); } @@ -123,8 +116,7 @@ /// Insert the intrinsic to represent the effect of tail predication. void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, - FixedVectorType *VecTy, - DenseMap &NewPredicates); + FixedVectorType *VecTy); /// Rematerialize the iteration count in exit blocks, which enables /// ARMLowOverheadLoops to better optimise away loop update statements inside @@ -153,16 +145,6 @@ return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load; } -void MVETailPredication::RematerializeIterCount() { - SmallVector DeadInsts; - SCEVExpander Rewriter(*SE, *DL, "mvetp"); - ReplaceExitVal ReplaceExitValue = AlwaysRepl; - - formLCSSARecursively(*L, *DT, LI, SE); - rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, ReplaceExitValue, - DeadInsts); -} - bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { if (skipLoop(L) || DisableTailPredication) return false; @@ -172,13 +154,8 @@ auto &TPC = getAnalysis(); auto &TM = TPC.getTM(); ST = &TM.getSubtarget(F); - DT = &getAnalysis().getDomTree(); - LI = &getAnalysis().getLoopInfo(); TTI = &getAnalysis().getTTI(F); SE = &getAnalysis().getSE(); - auto *TLIP = getAnalysisIfAvailable(); - TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; - DL = &L->getHeader()->getModule()->getDataLayout(); this->L = L; // The MVE and LOB extensions are combined to enable tail-predication, but @@ -232,7 +209,6 @@ if (!Decrement) return false; - ClonedVCTPInExitBlock = false; LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" << *Decrement << "\n"); @@ -241,8 +217,6 @@ return false; } - if (ClonedVCTPInExitBlock) - RematerializeIterCount(); return true; } @@ -319,32 +293,11 @@ // in the block. This means that the VPR doesn't have to be live into the // exit block which should make it easier to convert this loop into a proper // tail predicated loop. -static bool Cleanup(DenseMap &NewPredicates, - SetVector &MaybeDead, Loop *L) { +static void Cleanup(SetVector &MaybeDead, Loop *L) { BasicBlock *Exit = L->getUniqueExitBlock(); if (!Exit) { LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n"); - return false; - } - - bool ClonedVCTPInExitBlock = false; - - for (auto &Pair : NewPredicates) { - Instruction *OldPred = Pair.first; - Instruction *NewPred = Pair.second; - - for (auto &I : *Exit) { - if (I.isSameOperationAs(OldPred)) { - Instruction *PredClone = NewPred->clone(); - PredClone->insertBefore(&I); - I.replaceAllUsesWith(PredClone); - MaybeDead.insert(&I); - ClonedVCTPInExitBlock = true; - LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump(); - dbgs() << "ARM TP: with: "; PredClone->dump()); - break; - } - } + return; } // Drop references and add operands to check for dead. @@ -369,8 +322,6 @@ for (auto I : L->blocks()) DeleteDeadPHIs(I); - - return ClonedVCTPInExitBlock; } // The active lane intrinsic has this form: @@ -549,8 +500,7 @@ } void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, - Value *TripCount, FixedVectorType *VecTy, - DenseMap &NewPredicates) { + Value *TripCount, FixedVectorType *VecTy) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); @@ -591,7 +541,6 @@ Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); Value *VCTPCall = Builder.CreateCall(VCTP, Processed); ActiveLaneMask->replaceAllUsesWith(VCTPCall); - NewPredicates[ActiveLaneMask] = cast(VCTPCall); // Add the incoming value to the new phi. // TODO: This add likely already exists in the loop. @@ -609,9 +558,7 @@ } LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); - SetVector Predicates; - DenseMap NewPredicates; // Walk through the masked intrinsics and try to find whether the predicate // operand is generated by intrinsic @llvm.get.active.lane.mask(). @@ -636,11 +583,10 @@ return false; } LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n"); - InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, NewPredicates); + InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy); } - // Now clean up. - ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L); + Cleanup(Predicates, L); return true; } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -15,8 +15,7 @@ ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], ; CHECK: middle.block: -; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER:%.*]]) -; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]], +; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]], ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { @@ -64,16 +63,12 @@ br i1 %11, label %vector.body, label %middle.block middle.block: ; preds = %vector.body -; TODO: check that the intrinsic is also emitted here by the loop vectoriser -; %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 - %12 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) - - %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi - %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13) + %12 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %vec.phi + %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry - %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ] + %res.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ] ret i32 %res.0.lcssa }