Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -352,14 +352,14 @@
 
 // The active lane intrinsic has this form:
 //
-//    @llvm.get.active.lane.mask(IV, BTC)
+//    @llvm.get.active.lane.mask(IV, TC)
 //
 // Here we perform checks that this intrinsic behaves as expected,
 // which means:
 //
-// 1) The element count, which is calculated with BTC + 1, cannot overflow.
-// 2) The element count needs to be sufficiently large that the decrement of
-//    element counter doesn't overflow, which means that we need to prove:
+// 1) Check that the TripCount (TC) belongs to this loop (originally).
+// 2) The element count (TC) needs to be sufficiently large that the decrement
+//    of element counter doesn't overflow, which means that we need to prove:
 //        ceil(ElementCount / VectorWidth) >= TripCount
 //    by rounding up ElementCount up:
 //        ((ElementCount + (VectorWidth - 1)) / VectorWidth
@@ -373,29 +373,8 @@
       EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
       EnableTailPredication == TailPredication::ForceEnabled;
 
-  // 1) Test whether entry to the loop is protected by a conditional
-  // BTC + 1 < 0. In other words, if the scalar trip count overflows,
-  // becomes negative, we shouldn't enter the loop and creating
-  // tripcount expression BTC + 1 is not safe. So, check that BTC
-  // isn't max. This is evaluated in unsigned, because the semantics
-  // of @get.active.lane.mask is a ULE comparison.
-  auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
-  auto *BTC = SE->getSCEV(BackedgeTakenCount);
-  auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L);
-
-  if (isa<SCEVCouldNotCompute>(MaxBTC)) {
-    LLVM_DEBUG(dbgs() << "ARM TP: Can't compute SCEV BTC expression: ";
-               BTC->dump());
-    return false;
-  }
-
-  APInt MaxInt = APInt(BTC->getType()->getScalarSizeInBits(), ~0);
-  if (cast<SCEVConstant>(MaxBTC)->getAPInt().eq(MaxInt) &&
-      !ForceTailPredication) {
-    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be int max: ";
-               BTC->dump());
-    return false;
-  }
+  // 1) TODO: Check that the TripCount (TC) belongs to this loop (originally).
+  auto *TCUse = ActiveLaneMask->getOperand(1);
 
   // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
   //
@@ -412,12 +391,12 @@
   //
   //      upperbound(TC) <= UINT_MAX - VectorWidth
   //
-  auto *TC = SE->getSCEV(TripCount);
+  auto *TCDef = SE->getSCEV(TripCount);
   unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
   int VectorWidth = VecTy->getNumElements();
   auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
   uint64_t MaxMinusVW = Diff.getZExtValue();
-  uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
+  uint64_t UpperboundTC = SE->getSignedRange(TCDef).getUpper().getZExtValue();
 
   if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
     LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
@@ -434,7 +413,7 @@
   //
   // where Ceil = ElementCount + (VW-1) / VW. If Ceil and TC are runtime
   // values (and not constants), we have to compensate for the lowerbound value
-  // range to be off by 1. The reason is that BTC lives in the preheader in
+  // range to be off by 1. The reason is that the TC lives in the preheader in
   // this form:
   //
   //     %trip.count.minus = add nsw nuw i32 %N, -1
@@ -449,9 +428,7 @@
   // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
   // we first add 0 to TC such that we can do the <= comparison on both sets.
   //
-  auto *One = SE->getOne(TripCount->getType());
-  // ElementCount = BTC + 1
-  auto *ElementCount = SE->getAddExpr(BTC, One);
+  auto *ElementCount = SE->getSCEV(TCUse);
   // Tmp = ElementCount + (VW-1)
   auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount,
       SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
@@ -460,7 +437,7 @@
       SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
 
   ConstantRange RangeCeil = SE->getSignedRange(Ceil) ;
-  ConstantRange RangeTC = SE->getSignedRange(TC) ;
+  ConstantRange RangeTC = SE->getSignedRange(TCDef);
   if (!RangeTC.isSingleElement()) {
     auto ZeroRange =
         ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
@@ -504,38 +481,6 @@
   return false;
 }
 
-// Materialize NumElements in the preheader block.
-static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
-  // First, check the preheader if it not already exist:
-  //
-  // preheader:
-  //    %BTC = add i32 %N, -1
-  //    ..
-  // vector.body:
-  //
-  // if %BTC already exists. We don't need to emit %NumElems = %BTC + 1,
-  // but instead can just return %N.
-  for (auto &I : *Preheader) {
-    if (I.getOpcode() != Instruction::Add || &I != BTC)
-      continue;
-    ConstantInt *MinusOne = nullptr;
-    if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
-      continue;
-    if (MinusOne->getSExtValue() == -1) {
-      LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
-      return I.getOperand(0);
-    }
-  }
-
-  // But we do need to materialise BTC if it is not already there,
-  // e.g. if it is a constant.
-  IRBuilder<> Builder(Preheader->getTerminator());
-  Value *NumElements = Builder.CreateAdd(BTC,
-      ConstantInt::get(BTC->getType(), 1), "num.elements");
-  LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
-  return NumElements;
-}
-
 void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
     Value *TripCount, FixedVectorType *VecTy) {
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
@@ -543,23 +488,15 @@
   Type *Ty = IntegerType::get(M->getContext(), 32);
   unsigned VectorWidth = VecTy->getNumElements();
 
-  // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
-  // is one less than the trip count. So we need to find or create
-  // %num.elements = %BTC + 1 in the preheader.
-  Value *BTC = ActiveLaneMask->getOperand(1);
-  Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
-  Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
-
   // Insert a phi to count the number of elements processed by the loop.
   Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
   PHINode *Processed = Builder.CreatePHI(Ty, 2);
-  Processed->addIncoming(NumElements, L->getLoopPreheader());
+  Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader());
 
-  // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and thus
-  // represent the effect of tail predication.
+  // Replace @llvm.get.active.lane.mask() with the ARM-specific VCTP
+  // intrinsic, and thus represent the effect of tail predication.
   Builder.SetInsertPoint(ActiveLaneMask);
-  ConstantInt *Factor =
-      ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
+  ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
 
   Intrinsic::ID VCTPID;
   switch (VectorWidth) {
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -37,7 +37,7 @@
   %tmp = getelementptr inbounds i8, i8* %a, i32 %index
 
 ; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i8* %tmp to <16 x i8>*
   %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
@@ -94,7 +94,7 @@
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
 
 ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
@@ -150,7 +150,7 @@
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 
 ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
@@ -204,7 +204,7 @@
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
@@ -264,7 +264,7 @@
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 
 ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
@@ -323,7 +323,7 @@
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 
 ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
@@ -352,10 +352,10 @@
 ;
 ; CHECK-LABEL: interleave4
 ; CHECK: vector.body:
-; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
-; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1)
-; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1)
-; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1)
+; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
+; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
+; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
+; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
 ;
 define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
 entry:
@@ -386,13 +386,13 @@
   %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
   %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
   %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %v7 = add i32 %index, 4
-  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1)
+  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
   %v8 = add i32 %v7, 4
-  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1)
+  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
   %v9 = add i32 %v8, 4
-  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1)
+  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
   %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1
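
For reference, the overflow reasoning in the comments above can be checked with a standalone sketch. This is not part of the patch and uses plain 32-bit unsigned arithmetic instead of SCEV constant ranges; the helper names are hypothetical, chosen only to mirror the formulas `upperbound(TC) <= UINT_MAX - VectorWidth` and `ceil(ElementCount / VectorWidth) >= TripCount` quoted in the pass.

```cpp
// Standalone sketch (not part of the patch): models the two overflow
// checks from MVETailPredication with uint32_t instead of SCEV/APInt.
#include <cassert>
#include <cstdint>

// Mirrors "upperbound(TC) <= UINT_MAX - VectorWidth": if this holds,
// ElementCount + (VectorWidth - 1) cannot wrap around in 32 bits.
static bool roundingCannotOverflow(uint32_t ElementCount,
                                   uint32_t VectorWidth) {
  return ElementCount <= UINT32_MAX - VectorWidth;
}

// Mirrors "Ceil = (ElementCount + (VW - 1)) / VW": the number of vector
// iterations needed to process ElementCount elements.
static uint32_t ceilIterations(uint32_t ElementCount, uint32_t VectorWidth) {
  assert(roundingCannotOverflow(ElementCount, VectorWidth));
  return (ElementCount + (VectorWidth - 1)) / VectorWidth;
}

// The property the pass must prove before replacing the active lane mask
// with VCTP: ceil(ElementCount / VectorWidth) >= TripCount, so that the
// element counter decremented by VectorWidth each iteration never wraps.
static bool decrementCannotUnderflow(uint32_t ElementCount,
                                     uint32_t VectorWidth,
                                     uint32_t TripCount) {
  return ceilIterations(ElementCount, VectorWidth) >= TripCount;
}
```

For example, with `%N` = 10 elements and a vector width of 4, Ceil = (10 + 3) / 4 = 3, so a vector trip count of 3 satisfies the condition and tail predication is safe.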