Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -22,23 +22,13 @@
 /// The HardwareLoops pass inserts intrinsics identifying loops that the
 /// backend will attempt to convert into a low-overhead loop. The vectorizer is
 /// responsible for generating a vectorized loop in which the lanes are
-/// predicated upon the iteration counter. This pass looks at these predicated
-/// vector loops, that are targets for low-overhead loops, and prepares it for
-/// code generation. Once the vectorizer has produced a masked loop, there's a
-/// couple of final forms:
-/// - A tail-predicated loop, with implicit predication.
-/// - A loop containing multiple VCPT instructions, predicating multiple VPT
-/// blocks of instructions operating on different vector types.
-///
-/// This pass:
-/// 1) Checks if the predicates of the masked load/store instructions are
-/// generated by intrinsic @llvm.get.active.lanes(). This intrinsic consumes
-/// the the scalar loop tripcount as its second argument, which we extract
-/// to set up the number of elements processed by the loop.
-/// 2) Intrinsic @llvm.get.active.lanes() is then replaced by the MVE target
-/// specific VCTP intrinsic to represent the effect of tail predication.
-/// This will be picked up by the ARM Low-overhead loop pass, which performs
-/// the final transformation to a DLSTP or WLSTP tail-predicated loop.
+/// predicated upon a get.active.lane.mask intrinsic. This pass looks at these
+/// get.active.lane.mask intrinsics and attempts to convert them to VCTP
+/// instructions. This will be picked up by the ARM Low-overhead loop pass later
+/// in the backend, which performs the final transformation to a DLSTP or WLSTP
+/// tail-predicated loop. 
+// +//===----------------------------------------------------------------------===// #include "ARM.h" #include "ARMSubtarget.h" @@ -57,6 +47,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" @@ -112,23 +103,18 @@ bool runOnLoop(Loop *L, LPPassManager&) override; private: - /// Perform the relevant checks on the loop and convert if possible. - bool TryConvert(Value *TripCount); - - /// Return whether this is a vectorized loop, that contains masked - /// load/stores. - bool IsPredicatedVectorLoop(); + /// Perform the relevant checks on the loop and convert active lane masks if + /// possible. + bool TryConvertActiveLaneMask(Value *TripCount); /// Perform several checks on the arguments of @llvm.get.active.lane.mask /// intrinsic. E.g., check that the loop induction variable and the element /// count are of the form we expect, and also perform overflow checks for /// the new expressions that are created. - bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount, - FixedVectorType *VecTy); + bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount); /// Insert the intrinsic to represent the effect of tail predication. 
- void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, - FixedVectorType *VecTy); + void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount); /// Rematerialize the iteration count in exit blocks, which enables /// ARMLowOverheadLoops to better optimise away loop update statements inside @@ -138,25 +124,6 @@ } // end namespace -static bool IsDecrement(Instruction &I) { - auto *Call = dyn_cast(&I); - if (!Call) - return false; - - Intrinsic::ID ID = Call->getIntrinsicID(); - return ID == Intrinsic::loop_decrement_reg; -} - -static bool IsMasked(Instruction *I) { - auto *Call = dyn_cast(I); - if (!Call) - return false; - - Intrinsic::ID ID = Call->getIntrinsicID(); - return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load || - isGatherScatter(Call); -} - bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { if (skipLoop(L) || !EnableTailPredication) return false; @@ -207,147 +174,11 @@ return false; } - // Search for the hardware loop intrinic that decrements the loop counter. 
- IntrinsicInst *Decrement = nullptr; - for (auto *BB : L->getBlocks()) { - for (auto &I : *BB) { - if (IsDecrement(I)) { - Decrement = cast(&I); - break; - } - } - } + LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"); - if (!Decrement) - return false; - - LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" - << *Decrement << "\n"); - - if (!TryConvert(Setup->getArgOperand(0))) { - LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); - return false; - } - - return true; -} + bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0)); -static FixedVectorType *getVectorType(IntrinsicInst *I) { - unsigned ID = I->getIntrinsicID(); - FixedVectorType *VecTy; - if (ID == Intrinsic::masked_load || isGather(I)) { - if (ID == Intrinsic::arm_mve_vldr_gather_base_wb || - ID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated) - // then the type is a StructType - VecTy = dyn_cast(I->getType()->getContainedType(0)); - else - VecTy = dyn_cast(I->getType()); - } else if (ID == Intrinsic::masked_store) { - VecTy = dyn_cast(I->getOperand(0)->getType()); - } else { - VecTy = dyn_cast(I->getOperand(2)->getType()); - } - assert(VecTy && "No scalable vectors expected here"); - return VecTy; -} - -bool MVETailPredication::IsPredicatedVectorLoop() { - // Check that the loop contains at least one masked load/store intrinsic. - // We only support 'normal' vector instructions - other than masked - // load/stores. 
- bool ActiveLaneMask = false; - for (auto *BB : L->getBlocks()) { - for (auto &I : *BB) { - auto *Int = dyn_cast(&I); - if (!Int) - continue; - - switch (Int->getIntrinsicID()) { - case Intrinsic::get_active_lane_mask: - ActiveLaneMask = true; - continue; - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - case Intrinsic::vector_reduce_add: - continue; - case Intrinsic::fma: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::round: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::fabs: - if (ST->hasMVEFloatOps()) - continue; - break; - default: - break; - } - if (IsMasked(&I)) { - auto *VecTy = getVectorType(Int); - unsigned Lanes = VecTy->getNumElements(); - unsigned ElementWidth = VecTy->getScalarSizeInBits(); - // MVE vectors are 128-bit, but don't support 128 x i1. - // TODO: Can we support vectors larger than 128-bits? - unsigned MaxWidth = TTI->getRegisterBitWidth(true); - if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) - return false; - MaskedInsts.push_back(cast(&I)); - continue; - } - - for (const Use &U : Int->args()) { - if (isa(U->getType())) - return false; - } - } - } - - if (!ActiveLaneMask) { - LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n"); - return false; - } - return !MaskedInsts.empty(); -} - -// Look through the exit block to see whether there's a duplicate predicate -// instruction. This can happen when we need to perform a select on values -// from the last and previous iteration. Instead of doing a straight -// replacement of that predicate with the vctp, clone the vctp and place it -// in the block. This means that the VPR doesn't have to be live into the -// exit block which should make it easier to convert this loop into a proper -// tail predicated loop. 
-static void Cleanup(SetVector &MaybeDead, Loop *L) { - BasicBlock *Exit = L->getUniqueExitBlock(); - if (!Exit) { - LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n"); - return; - } - - // Drop references and add operands to check for dead. - SmallPtrSet Dead; - while (!MaybeDead.empty()) { - auto *I = MaybeDead.front(); - MaybeDead.remove(I); - if (I->hasNUsesOrMore(1)) - continue; - - for (auto &U : I->operands()) - if (auto *OpI = dyn_cast(U)) - MaybeDead.insert(OpI); - - Dead.insert(I); - } - - for (auto *I : Dead) { - LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump()); - I->eraseFromParent(); - } - - for (auto I : L->blocks()) - DeleteDeadPHIs(I); + return Changed; } // The active lane intrinsic has this form: @@ -368,7 +199,7 @@ // 3) The IV must be an induction phi with an increment equal to the // vector width. bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, - Value *TripCount, FixedVectorType *VecTy) { + Value *TripCount) { bool ForceTailPredication = EnableTailPredication == TailPredication::ForceEnabledNoReductions || EnableTailPredication == TailPredication::ForceEnabled; @@ -376,7 +207,8 @@ Value *ElemCount = ActiveLaneMask->getOperand(1); auto *EC= SE->getSCEV(ElemCount); auto *TC = SE->getSCEV(TripCount); - int VectorWidth = VecTy->getNumElements(); + int VectorWidth = + cast(ActiveLaneMask->getType())->getNumElements(); ConstantInt *ConstElemCount = nullptr; // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to @@ -503,21 +335,21 @@ if (VectorWidth == StepValue) return true; - LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match " - "vector width " << VectorWidth << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue + << " doesn't match vector width " << VectorWidth << "\n"); return false; } void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, - Value *TripCount, FixedVectorType *VecTy) { + Value *TripCount) { IRBuilder<> 
Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); - unsigned VectorWidth = VecTy->getNumElements(); + unsigned VectorWidth = cast(ActiveLaneMask->getType())->getNumElements(); // Insert a phi to count the number of elements processed by the loop. - Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI() ); + Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI()); PHINode *Processed = Builder.CreatePHI(Ty, 2); Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader()); @@ -553,50 +385,36 @@ << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n"); } -bool MVETailPredication::TryConvert(Value *TripCount) { - if (!IsPredicatedVectorLoop()) { - LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n"); +bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) { + SmallVector ActiveLaneMasks; + for (auto *BB : L->getBlocks()) + for (auto &I : *BB) + if (auto *Int = dyn_cast(&I)) + if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask) + ActiveLaneMasks.push_back(Int); + + if (ActiveLaneMasks.empty()) return false; - } LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); - SetVector Predicates; - - auto getPredicateOp = [](IntrinsicInst *I) { - unsigned IntrinsicID = I->getIntrinsicID(); - if (IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated || - IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated) - return 5; - return (IntrinsicID == Intrinsic::masked_load || isGather(I)) ? 2 : 3; - }; - - // Walk through the masked intrinsics and try to find whether the predicate - // operand is generated by intrinsic @llvm.get.active.lane.mask(). 
- for (auto *I : MaskedInsts) { - Value *PredOp = I->getArgOperand(getPredicateOp(I)); - auto *Predicate = dyn_cast(PredOp); - if (!Predicate || Predicates.count(Predicate)) - continue; - auto *ActiveLaneMask = dyn_cast(Predicate); - if (!ActiveLaneMask || - ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask) - continue; - - Predicates.insert(Predicate); + for (auto *ActiveLaneMask : ActiveLaneMasks) { LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask << "\n"); - auto *VecTy = getVectorType(I); - if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) { + if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) { LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); return false; } LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n"); - InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy); + InsertVCTPIntrinsic(ActiveLaneMask, TripCount); } - Cleanup(Predicates, L); + // Remove dead instructions and now dead phis. + for (auto *II : ActiveLaneMasks) + RecursivelyDeleteTriviallyDeadInstructions(II); + for (auto I : L->blocks()) + DeleteDeadPHIs(I); return true; } Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll @@ -241,42 +241,18 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB5_1: @ %vector.ph -; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vdup.32 q1, r2 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: adr r3, .LCPI5_0 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q0, r12 -; 
CHECK-NEXT: vdup.32 q3, r12 -; CHECK-NEXT: vcmp.u32 hi, q3, q2 -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpstt -; CHECK-NEXT: vcmpt.u32 hi, q1, q2 -; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vrintr.f32 s15, s11 -; CHECK-NEXT: vrintr.f32 s14, s10 -; CHECK-NEXT: vrintr.f32 s13, s9 -; CHECK-NEXT: vrintr.f32 s12, s8 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q3, [r1], #16 -; CHECK-NEXT: le lr, .LBB5_2 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vrintr.f32 s7, s3 +; CHECK-NEXT: vrintr.f32 s6, s2 +; CHECK-NEXT: vrintr.f32 s5, s1 +; CHECK-NEXT: vrintr.f32 s4, s0 +; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: letp lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.4: -; CHECK-NEXT: .LCPI5_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %cmp5 = icmp eq i32 %n, 0 br i1 %cmp5, label %for.cond.cleanup, label %vector.ph