Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -15,6 +15,10 @@ /// - t2LoopDec - placed within in the loop body. /// - t2LoopEnd - the loop latch terminator. /// +/// In addition to this, we also look for the presence of the VCTP instruction, +/// which determines whether we can generated the tail-predicated low-overhead +/// loop form. +/// //===----------------------------------------------------------------------===// #include "ARM.h" Index: llvm/lib/Target/ARM/MVETailPredication.cpp =================================================================== --- llvm/lib/Target/ARM/MVETailPredication.cpp +++ llvm/lib/Target/ARM/MVETailPredication.cpp @@ -20,6 +20,11 @@ /// - A tail-predicated loop, with implicit predication. /// - A loop containing multiple VCPT instructions, predicating multiple VPT /// blocks of instructions operating on different vector types. +/// +/// This pass inserts the inserts the VCTP intrinsic to represent the effect of +/// tail predication. This will be picked up by the ARM Low-overhead loop pass, +/// which performs the final transformation to a DLSTP or WLSTP tail-predicated +/// loop. #include "ARM.h" #include "ARMSubtarget.h" @@ -86,6 +91,12 @@ /// Is the icmp that generates an i1 vector, based upon a loop counter /// and a limit that is defined outside the loop. bool isTailPredicate(Instruction *Predicate, Value *NumElements); + + /// Insert the intrinsic to represent the effect of tail predication. + void InsertVCTPIntrinsic(Instruction *Predicate, + DenseMap &NewPredicates, + VectorType *VecTy, + Value *NumElements); }; } // end namespace @@ -124,7 +135,7 @@ // The MVE and LOB extensions are combined to enable tail-predication, but // there's nothing preventing us from generating VCTP instructions for v8.1m. if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) { - LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Not a v8.1m.main+mve target.\n"); return false; } @@ -149,7 +160,7 @@ // Look for the hardware loop intrinsic that sets the iteration count. IntrinsicInst *Setup = FindLoopIterations(Preheader); - // The test.set iteration could live in the pre- preheader. + // The test.set iteration could live in the pre-preheader. if (!Setup) { if (!Preheader->getSinglePredecessor()) return false; @@ -172,11 +183,9 @@ if (!Decrement) return false; - LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L - << *Setup << "\n" + LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" << *Decrement << "\n"); - bool Changed = TryConvert(Setup->getArgOperand(0)); - return Changed; + return TryConvert(Setup->getArgOperand(0)); } bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { @@ -235,7 +244,7 @@ return false; // Now back to searching inside the loop body... - // Find the add with takes the index iv and adds a constant vector to it. + // Find the add with takes the index iv and adds a constant vector to it. Instruction *BroadcastSplat = nullptr; Constant *Const = nullptr; if (!match(Induction, m_Add(m_Instruction(BroadcastSplat), @@ -270,14 +279,14 @@ Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader()); if (!match(OnEntry, m_Zero())) return false; - + Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch()); unsigned Lanes = cast(Insert->getType())->getNumElements(); Instruction *LHS = nullptr; if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes)))) return false; - + return LHS == Phi; } @@ -299,7 +308,7 @@ unsigned ElementWidth = VecTy->getScalarSizeInBits(); // MVE vectors are 128-bit, but don't support 128 x i1. // TODO: Can we support vectors larger than 128-bits? - unsigned MaxWidth = TTI->getRegisterBitWidth(true); + unsigned MaxWidth = TTI->getRegisterBitWidth(true); if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) return false; MaskedInsts.push_back(cast(&I)); @@ -400,19 +409,25 @@ // tail predicated loop. static void Cleanup(DenseMap &NewPredicates, SetVector &MaybeDead, Loop *L) { - if (BasicBlock *Exit = L->getUniqueExitBlock()) { - for (auto &Pair : NewPredicates) { - Instruction *OldPred = Pair.first; - Instruction *NewPred = Pair.second; - - for (auto &I : *Exit) { - if (I.isSameOperationAs(OldPred)) { - Instruction *PredClone = NewPred->clone(); - PredClone->insertBefore(&I); - I.replaceAllUsesWith(PredClone); - MaybeDead.insert(&I); - break; - } + BasicBlock *Exit = L->getUniqueExitBlock(); + if (!Exit) { + LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n"); + return; + } + + for (auto &Pair : NewPredicates) { + Instruction *OldPred = Pair.first; + Instruction *NewPred = Pair.second; + + for (auto &I : *Exit) { + if (I.isSameOperationAs(OldPred)) { + Instruction *PredClone = NewPred->clone(); + PredClone->insertBefore(&I); + I.replaceAllUsesWith(PredClone); + MaybeDead.insert(&I); + LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump(); + dbgs() << "ARM TP: with: "; PredClone->dump()); + break; } } } @@ -433,23 +448,69 @@ Dead.insert(I); } - for (auto *I : Dead) + for (auto *I : Dead) { + LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump()); I->eraseFromParent(); + } for (auto I : L->blocks()) DeleteDeadPHIs(I); } +void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate, + DenseMap &NewPredicates, + VectorType *VecTy, Value *NumElements) { + IRBuilder<> Builder(L->getHeader()->getFirstNonPHI()); + Module *M = L->getHeader()->getModule(); + Type *Ty = IntegerType::get(M->getContext(), 32); + + // Insert a phi to count the number of elements processed by the loop. + PHINode *Processed = Builder.CreatePHI(Ty, 2); + Processed->addIncoming(NumElements, L->getLoopPreheader()); + + // Insert the intrinsic to represent the effect of tail predication. + Builder.SetInsertPoint(cast(Predicate)); + ConstantInt *Factor = + ConstantInt::get(cast(Ty), VecTy->getNumElements()); + + Intrinsic::ID VCTPID; + switch (VecTy->getNumElements()) { + default: + llvm_unreachable("unexpected number of lanes"); + case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; + case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; + case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; + + // FIXME: vctp64 currently not supported because the predicate + // vector wants to be <2 x i1>, but v2i1 is not a legal MVE + // type, so problems happen at isel time. + // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics + // purposes, but takes a v4i1 instead of a v2i1. + } + Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); + Value *TailPredicate = Builder.CreateCall(VCTP, Processed); + Predicate->replaceAllUsesWith(TailPredicate); + NewPredicates[Predicate] = cast(TailPredicate); + + // Add the incoming value to the new phi. + // TODO: This add likely already exists in the loop. + Value *Remaining = Builder.CreateSub(Processed, Factor); + Processed->addIncoming(Remaining, L->getLoopLatch()); + LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: " + << *Processed << "\n" + << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n"); +} + bool MVETailPredication::TryConvert(Value *TripCount) { - if (!IsPredicatedVectorLoop()) + if (!IsPredicatedVectorLoop()) { + LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop"); return false; + } - LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); // Walk through the masked intrinsics and try to find whether the predicate // operand is generated from an induction variable. - Module *M = L->getHeader()->getModule(); - Type *Ty = IntegerType::get(M->getContext(), 32); SetVector Predicates; DenseMap NewPredicates; @@ -466,48 +527,14 @@ continue; if (!isTailPredicate(Predicate, NumElements)) { - LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n"); continue; } - LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n"); Predicates.insert(Predicate); - // Insert a phi to count the number of elements processed by the loop. - IRBuilder<> Builder(L->getHeader()->getFirstNonPHI()); - PHINode *Processed = Builder.CreatePHI(Ty, 2); - Processed->addIncoming(NumElements, L->getLoopPreheader()); - - // Insert the intrinsic to represent the effect of tail predication. - Builder.SetInsertPoint(cast(Predicate)); - ConstantInt *Factor = - ConstantInt::get(cast(Ty), VecTy->getNumElements()); - Intrinsic::ID VCTPID; - switch (VecTy->getNumElements()) { - default: - llvm_unreachable("unexpected number of lanes"); - case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; - case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; - case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; - - // FIXME: vctp64 currently not supported because the predicate - // vector wants to be <2 x i1>, but v2i1 is not a legal MVE - // type, so problems happen at isel time. - // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics - // purposes, but takes a v4i1 instead of a v2i1. - } - Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); - Value *TailPredicate = Builder.CreateCall(VCTP, Processed); - Predicate->replaceAllUsesWith(TailPredicate); - NewPredicates[Predicate] = cast(TailPredicate); - - // Add the incoming value to the new phi. - // TODO: This add likely already exists in the loop. - Value *Remaining = Builder.CreateSub(Processed, Factor); - Processed->addIncoming(Remaining, L->getLoopLatch()); - LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: " - << *Processed << "\n" - << "TP: Inserted VCTP: " << *TailPredicate << "\n"); + InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements); } // Now clean up.