Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -35,6 +35,8 @@ enum ID : unsigned; } +class AssumptionCache; +class BranchInst; class Function; class GlobalValue; class IntrinsicInst; @@ -44,6 +46,7 @@ class ScalarEvolution; class StoreInst; class SwitchInst; +class TargetLibraryInfo; class Type; class User; class Value; @@ -445,6 +448,25 @@ void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP) const; + struct HardwareLoopInfo { + HardwareLoopInfo(Loop *L) : L(L) { } + Loop *L = nullptr; + BasicBlock *ExitBlock = nullptr; + BranchInst *ExitBranch = nullptr; + const SCEV *ExitCount = nullptr; + Instruction *Predicate = nullptr; + IntegerType *CountType = nullptr; + bool PerformTest = false; + bool IsNestingLegal = false; + bool InsertPHICounter = false; + unsigned NumElements = 1; + }; + + bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + HardwareLoopInfo &HWLoopInfo) const; + /// @} /// \name Scalar Target Information @@ -1073,6 +1095,10 @@ virtual bool isLoweredToCall(const Function *F) = 0; virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP) = 0; + virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + HardwareLoopInfo &HWLoopInfo) = 0; virtual bool isLegalAddImmediate(int64_t Imm) = 0; virtual bool isLegalICmpImmediate(int64_t Imm) = 0; virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, @@ -1304,6 +1330,12 @@ UnrollingPreferences &UP) override { return Impl.getUnrollingPreferences(L, SE, UP); } + bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + HardwareLoopInfo &HWLoopInfo) override { + return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); + } bool isLegalAddImmediate(int64_t Imm) override { return Impl.isLegalAddImmediate(Imm); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -190,6 +190,13 @@ return true; } + bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + TTI::HardwareLoopInfo &HWLoopInfo) { + return false; + } + void getUnrollingPreferences(Loop *, ScalarEvolution &, TTI::UnrollingPreferences &) {} Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -485,6 +485,13 @@ UP.BEInsns = 2; } + bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + TTI::HardwareLoopInfo &HWLoopInfo) { + return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); + } + int getInstructionLatency(const Instruction *I) { if (isa(I)) return getST()->getSchedModel().DefaultLoadLatency; Index: include/llvm/CodeGen/Passes.h =================================================================== --- include/llvm/CodeGen/Passes.h +++ include/llvm/CodeGen/Passes.h @@ -446,6 +446,8 @@ /// Creates CFI Instruction Inserter pass. 
\see CFIInstrInserter.cpp FunctionPass *createCFIInstrInserter(); + FunctionPass *createHardwareLoops(); + } // End llvm namespace #endif Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -1180,6 +1180,25 @@ [llvm_anyvector_ty], [IntrNoMem]>; +def int_set_loop_iterations : + Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_test_set_loop_iterations : + Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_set_loop_elements : + Intrinsic<[], [llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_test_set_loop_elements : + Intrinsic<[llvm_i1_ty], [llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_loop_dec : + Intrinsic<[llvm_anyint_ty], + [llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_get_active_mask_4 : + Intrinsic<[llvm_v4i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>; + //===----- Intrinsics that are used to provide predicate information -----===// def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -162,6 +162,7 @@ void initializeGlobalSplitPass(PassRegistry&); void initializeGlobalsAAWrapperPassPass(PassRegistry&); void initializeGuardWideningLegacyPassPass(PassRegistry&); +void initializeHardwareLoopsPass(PassRegistry&); void initializeHotColdSplittingLegacyPassPass(PassRegistry&); void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &); void initializeIPCPPass(PassRegistry&); Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -135,6 +135,12 @@ return TTIImpl->getUnrollingPreferences(L, SE, UP); } +bool TargetTransformInfo::isHardwareLoopProfitable( + Loop *L, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const { + return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); +} + bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const { return TTIImpl->isLegalAddImmediate(Imm); } Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -33,6 +33,7 @@ GCRootLowering.cpp GCStrategy.cpp GlobalMerge.cpp + HardwareLoops.cpp IfConversion.cpp ImplicitNullChecks.cpp IndirectBrExpandPass.cpp Index: lib/CodeGen/HardwareLoops.cpp =================================================================== --- /dev/null +++ lib/CodeGen/HardwareLoops.cpp @@ -0,0 +1,483 @@ +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/PassSupport.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" 
+#include "llvm/IR/Value.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "hardware-loops" + +#define HW_LOOPS_NAME "Hardware Loop Insertion" + +STATISTIC(NumHWLoops, "Number of loops converted to hardware loops"); + +namespace { + + using TTI = TargetTransformInfo; + + class HardwareLoops : public FunctionPass { + public: + static char ID; + + HardwareLoops() : FunctionPass(ID) { + initializeHardwareLoopsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + bool TryConvertLoop(Loop *L); + bool TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo); + void ConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo); + + private: + ScalarEvolution *SE = nullptr; + LoopInfo *LI = nullptr; + const DataLayout *DL = nullptr; + const TargetTransformInfo *TTI = nullptr; + DominatorTree *DT = nullptr; + bool PreserveLCSSA = false; + AssumptionCache *AC = nullptr; + TargetLibraryInfo *LibInfo = nullptr; + Module *M = nullptr; + }; +} + +char HardwareLoops::ID = 0; + +bool HardwareLoops::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n"); + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + + LI = &getAnalysis().getLoopInfo(); + SE = &getAnalysis().getSE(); + DT = &getAnalysis().getDomTree(); + TTI = &getAnalysis().getTTI(F); + DL = &F.getParent()->getDataLayout(); + auto *TLIP = getAnalysisIfAvailable(); + LibInfo = TLIP ? &TLIP->getTLI() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + AC = &getAnalysis().getAssumptionCache(F); + M = F.getParent(); + + bool MadeChange = false; + + for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) { + Loop *L = *I; + + if (!L->getParentLoop()) + MadeChange |= TryConvertLoop(L); + } + + return MadeChange; +} + +bool HardwareLoops::TryConvertLoop(Loop *L) { + bool MadeChange = false; + + // Process nested loops first. + for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { + MadeChange |= TryConvertLoop(*I); + } + + if (MadeChange) + return true; + + // Bail out if the loop has irreducible control flow. 
+ LoopBlocksRPO RPOT(L); + RPOT.perform(LI); + if (containsIrreducibleCFG(RPOT, *LI)) { + LLVM_DEBUG(dbgs() << "HWLoops: Loop contains irreducible CFG.\n"); + return false; + } + + TTI::HardwareLoopInfo HWLoopInfo(L); + if (!TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo)) { + LLVM_DEBUG(dbgs() << "HWLoops: Not profitable to convert loop.\n"); + return MadeChange; + } + + MadeChange |= TryConvertLoop(HWLoopInfo); + return MadeChange; +} + +bool HardwareLoops::TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) { + + Loop *L = HWLoopInfo.L; + //BasicBlock *CountedExitBlock = nullptr; + //const SCEV *ExitCount = nullptr; + //BranchInst *CountedExitBranch = nullptr; + SmallVector ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable do loop: " + << *L << "\n"); + + for (SmallVectorImpl::iterator I = ExitingBlocks.begin(), + IE = ExitingBlocks.end(); I != IE; ++I) { + const SCEV *EC = SE->getExitCount(L, *I); + LLVM_DEBUG(dbgs() << "HWLoops: Exit Count for " << *L << " from block " + << (*I)->getName() << ": " << *EC << "\n"); + if (isa(EC)) + continue; + if (const SCEVConstant *ConstEC = dyn_cast(EC)) { + if (ConstEC->getValue()->isZero()) + continue; + } else if (!SE->isLoopInvariant(EC, L)) + continue; + + if (SE->getTypeSizeInBits(EC->getType()) > + HWLoopInfo.CountType->getBitWidth()) + continue; + + // If this exiting block is contained in a nested loop, it is not eligible + // for insertion of the branch-and-decrement since the inner loop would + // end up messing up the value in the CTR. + if (!HWLoopInfo.IsNestingLegal && LI->getLoopFor(*I) != L) + continue; + + // We now have a loop-invariant count of loop iterations (which is not the + // constant zero) for which we know that this loop will not exit via this + // existing block. + + // We need to make sure that this block will run on every loop iteration. + // For this to be true, we must dominate all blocks with backedges. Such + // blocks are in-loop predecessors to the header block. + bool NotAlways = false; + for (pred_iterator PI = pred_begin(L->getHeader()), + PIE = pred_end(L->getHeader()); PI != PIE; ++PI) { + if (!L->contains(*PI)) + continue; + + if (!DT->dominates(*I, *PI)) { + NotAlways = true; + break; + } + } + + if (NotAlways) + continue; + + // Make sure this blocks ends with a conditional branch. + Instruction *TI = (*I)->getTerminator(); + if (!TI) + continue; + + if (BranchInst *BI = dyn_cast(TI)) { + if (!BI->isConditional()) + continue; + + HWLoopInfo.ExitBranch = BI; + } else + continue; + + // Note that this block may not be the loop latch block, even if the loop + // has a latch block. + HWLoopInfo.ExitBlock = *I; + HWLoopInfo.ExitCount = EC; + break; + } + + if (!HWLoopInfo.ExitBlock) { + LLVM_DEBUG(dbgs() << "HWLoops: Unable to find CountExitBlock.\n"); + return false; + } + + BasicBlock *Preheader = L->getLoopPreheader(); + + // If we don't have a preheader, then insert one. If we already have a + // preheader, then we can use it (except if the preheader contains a use of + // the CTR register because some such uses might be reordered by the + // selection DAG after the mtctr instruction). 
+ if (!Preheader)// || mightUseCTR(Preheader)) + Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA); + if (!Preheader) + return false; + + LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() + << "\n"); + + + ConvertLoop(HWLoopInfo); + LLVM_DEBUG(dbgs() << "Converted Loop: " << *L << "\n"); + ++NumHWLoops; + return true; +} + +static const SCEV* CalcTotalElts(ConstantInt *Factor, + const SCEV *TripCount, + ScalarEvolution &SE) { + if (Factor->equalsInt(1)) + return TripCount; + + const SCEV *FactorSCEV = SE.getSCEV(Factor); + IntegerType *Int32Ty = Factor->getType(); + + if (auto *Count = dyn_cast(TripCount)) { + const SCEV *Elts = SE.getMulExpr(TripCount, FactorSCEV); + unsigned Rem = Count->getAPInt().urem(Factor->getZExtValue()); + if (Rem == 0) + return Elts; + else + return SE.getAddExpr(Elts, SE.getSCEV(ConstantInt::get(Int32Ty, Rem))); + } + + auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* { + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: VisitAdd " << *S << "\n"); + if (auto *Const = dyn_cast(S->getOperand(0))) { + if (Const->getAPInt() != -Factor->getValue()) + return nullptr; + } else + return nullptr; + return dyn_cast(S->getOperand(1)); + }; + + auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* { + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: VisitMul " << *S << "\n"); + if (auto *Const = dyn_cast(S->getOperand(0))) { + if (Const->getValue() != Factor) + return nullptr; + } else + return nullptr; + return dyn_cast(S->getOperand(1)); + }; + + auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* { + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: VisitDiv " << *S << "\n"); + if (auto *Const = dyn_cast(S->getRHS())) { + if (Const->getValue() != Factor) + return nullptr; + } else + return nullptr; + + if (auto *RoundUp = dyn_cast(S->getLHS())) { + if (auto *Const = dyn_cast(RoundUp->getOperand(0))) { + if (Const->getAPInt() != (Factor->getValue() - 1)) + return nullptr; + } else + return nullptr; + + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Elements: " + << *RoundUp->getOperand(1) << "\n"); + return RoundUp->getOperand(1); + } + return nullptr; + }; + + // (1 + ((-4 + (4 * ((3 + %N) /u 4))) /u 4)) + if (auto *TC = dyn_cast(TripCount)) + if (auto *Div = dyn_cast(TC->getOperand(1))) + if (auto *Add = dyn_cast(Div->getLHS())) + if (auto *Mul = VisitAdd(Add)) + if (auto *Div = VisitMul(Mul)) + if (auto *Elts = VisitDiv(Div)) + return Elts; + + return nullptr; +} + +// Insert the count into the preheader and replace the condition used by the +// selected branch. 
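Concretely, for a plain counted loop (no predication, PerformTest left false), the intent of the code below is to rewrite the IR into roughly the following shape. This is a sketch only: block and value names are invented, and the intrinsic manglings are approximate, derived from the Intrinsics.td additions above.

  ; sketch only: names and intrinsic manglings are illustrative
  preheader:
    call void @llvm.set.loop.iterations.i32(i32 %count)
    br label %loop

  loop:
    %remaining = phi i32 [ %count, %preheader ], [ %remaining.next, %loop ]
    ; ... loop body ...
    %remaining.next = call i32 @llvm.loop.dec.i32.i32.i32(i32 %remaining, i32 1)
    %cmp = icmp ne i32 %remaining.next, 0
    br i1 %cmp, label %loop, label %exit

The phi is only created when InsertPHICounter is set; when it is not (the PPC configuration), the decrement intrinsic simply takes the initial count and the running value is kept implicitly in the target's count register.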
+void HardwareLoops::ConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) { + + auto InitLoopCount = [this](TTI::HardwareLoopInfo &HWLoopInfo, + BasicBlock *BB) { + const SCEV *ExitCount = HWLoopInfo.ExitCount; + + Type *CountType = HWLoopInfo.CountType; + SCEVExpander SCEVE(*SE, *DL, "loopcnt"); + if (!ExitCount->getType()->isPointerTy() && + ExitCount->getType() != CountType) + ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); + + ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType)); + + if (HWLoopInfo.Predicate) { + ConstantInt *Factor = cast( + ConstantInt::get(ExitCount->getType(), HWLoopInfo.NumElements)); + ExitCount = CalcTotalElts(Factor, ExitCount, *SE); + } + + return SCEVE.expandCodeFor(ExitCount, CountType, BB->getTerminator()); + }; + + auto InsertIterationSetup = [this](TTI::HardwareLoopInfo &HWLoopInfo, + Value *LoopCountInit, BasicBlock *BB) { + IRBuilder<> Builder(BB->getTerminator()); + Type *Ty = LoopCountInit->getType(); + + if (HWLoopInfo.PerformTest) { + Function *LoopIter = + Intrinsic::getDeclaration(M, Intrinsic::test_set_loop_iterations, + { Ty, Ty }); + Value *Call = Builder.CreateCall(LoopIter, LoopCountInit); + LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop setup: " << *Call << "\n"); + + auto *LoopGuard = dyn_cast(BB->getTerminator()); + assert((LoopGuard && LoopGuard->isConditional()) && + "Expected conditional branch for while loop"); + //Value *Cmp = Builder.CreateICmpNE(Call, ConstantInt::get(Ty, 0)); + LoopGuard->setCondition(Call); + + if (LoopGuard->getSuccessor(0) != HWLoopInfo.L->getLoopPreheader()) + LoopGuard->swapSuccessors(); + } else { + Function *LoopIter = + Intrinsic::getDeclaration(M, Intrinsic::set_loop_iterations, Ty); + Builder.CreateCall(LoopIter, LoopCountInit); + } + }; + + auto InsertElementSetup = [this](TTI::HardwareLoopInfo &HWLoopInfo, + Value *NumElts, BasicBlock *BB) { + Type *Ty = HWLoopInfo.CountType; + IRBuilder<> Builder(BB->getTerminator()); + Value *Ops[] = { NumElts, ConstantInt::get(Ty, HWLoopInfo.NumElements) }; + + + if (HWLoopInfo.PerformTest) { + Function *Setup = + Intrinsic::getDeclaration(M, Intrinsic::test_set_loop_elements, + { Ty, Ty }); + Instruction *Call = Builder.CreateCall(Setup, Ops); + LLVM_DEBUG(dbgs() << "HWLoops: Insert loop elements: " << *Call << "\n"); + + auto *LoopGuard = dyn_cast(BB->getTerminator()); + assert((LoopGuard && LoopGuard->isConditional()) && + "Expected conditional branch for while loop"); + //Value *Cmp = Builder.CreateICmpNE(Call, ConstantInt::get(Ty, 0)); + LoopGuard->setCondition(Call); + + if (LoopGuard->getSuccessor(0) != HWLoopInfo.L->getLoopPreheader()) + LoopGuard->swapSuccessors(); + } else { + Function *Setup = + Intrinsic::getDeclaration(M, Intrinsic::set_loop_elements, + { Ty, Ty }); + Builder.CreateCall(Setup, Ops); + } + }; + + auto InsertCounterPHI = [](TTI::HardwareLoopInfo &HWLoopInfo, + Value *NumElts, Value *EltsRem) { + BasicBlock *Preheader = HWLoopInfo.L->getLoopPreheader(); + BasicBlock *Header = HWLoopInfo.L->getHeader(); + BasicBlock *Latch = HWLoopInfo.ExitBranch->getParent(); + IRBuilder<> Builder(Header->getFirstNonPHI()); + PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2); + Index->addIncoming(NumElts, Preheader); + Index->addIncoming(EltsRem, Latch); + LLVM_DEBUG(dbgs() << "HWLoops: Index PHI: " << *Index << "\n"); + return Index; + }; + + auto InsertDec = [this](TTI::HardwareLoopInfo &HWLoopInfo, Value *NumElts) { + BranchInst *ExitBranch = HWLoopInfo.ExitBranch; + IRBuilder<> CondBuilder(ExitBranch); + Value *Factor = 
ConstantInt::get(NumElts->getType(), + HWLoopInfo.NumElements); + Function *DecFunc = + Intrinsic::getDeclaration(M, Intrinsic::loop_dec, + { NumElts->getType(), NumElts->getType(), + Factor->getType()}); + Value *Ops[] = { NumElts, Factor }; + Value *Call = CondBuilder.CreateCall(DecFunc, Ops); + Value *NewCond = + CondBuilder.CreateICmpNE(Call, + ConstantInt::get(NumElts->getType(), 0)); + Value *OldCond = ExitBranch->getCondition(); + ExitBranch->setCondition(NewCond); + + // The false branch must exit the loop. + if (!HWLoopInfo.L->contains(ExitBranch->getSuccessor(0))) + ExitBranch->swapSuccessors(); + + // The old condition may be dead now, and may have even created a dead PHI + // (the original induction variable). + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + + LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n"); + return cast(Call); + }; + + auto InsertActiveMask = [this](TTI::HardwareLoopInfo &HWLoopInfo, + Value *Elts) { + IRBuilder<> Builder(HWLoopInfo.Predicate); + Function *F = + Intrinsic::getDeclaration(M, Intrinsic::get_active_mask_4, Elts->getType()); + Value *Ops[] = { Elts }; + Instruction *ActiveMask = Builder.CreateCall(F, Ops); + LLVM_DEBUG(dbgs() << "HWLoops: Active Lane Mask: " << *ActiveMask << "\n"); + HWLoopInfo.Predicate->replaceAllUsesWith(ActiveMask); + }; + + BasicBlock *BeginBB = HWLoopInfo.PerformTest ? + HWLoopInfo.L->getLoopPreheader()->getUniquePredecessor() : + HWLoopInfo.L->getLoopPreheader(); + + Value *LoopCountInit = InitLoopCount(HWLoopInfo, BeginBB); + Value *EltsRem = LoopCountInit; + + if (HWLoopInfo.Predicate) { + InsertElementSetup(HWLoopInfo, LoopCountInit, BeginBB); + } else + InsertIterationSetup(HWLoopInfo, LoopCountInit, BeginBB); + + Instruction *LoopDec = InsertDec(HWLoopInfo, EltsRem); + if (HWLoopInfo.InsertPHICounter) { + EltsRem = InsertCounterPHI(HWLoopInfo, LoopCountInit, LoopDec); + LoopDec->setOperand(0, EltsRem); + } + if (HWLoopInfo.Predicate) + InsertActiveMask(HWLoopInfo, EltsRem); + + // Run through the basic blocks of the loop and see if any of them have dead + // PHIs that can be removed. + for (auto I : HWLoopInfo.L->blocks()) + DeleteDeadPHIs(I); +} + +INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false) + +FunctionPass *llvm::createHardwareLoops() { return new HardwareLoops(); } Index: lib/Target/ARM/ARM.h =================================================================== --- lib/Target/ARM/ARM.h +++ lib/Target/ARM/ARM.h @@ -37,6 +37,7 @@ Pass *createARMParallelDSPPass(); +FunctionPass *createARMFinaliseHardwareLoopsPass(); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createA15SDOptimizerPass(); Index: lib/Target/ARM/ARMFinalizeHardwareLoops.cpp =================================================================== --- /dev/null +++ lib/Target/ARM/ARMFinalizeHardwareLoops.cpp @@ -0,0 +1,256 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-finalise-hardware-loops" +#define ARM_FINALISE_HW_LOOPS_NAME "ARM hardware loop finalisation pass" + +namespace { + + class ARMFinaliseHWLoops : public MachineFunctionPass { + const ARMBaseInstrInfo *TII = nullptr; + + public: + static char ID; + + ARMFinaliseHWLoops() : MachineFunctionPass(ID) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + bool ProcessLoop(MachineLoop *ML); + + void Expand(MachineInstr *Start, MachineInstr *Dec, MachineInstr *End, + MachineInstr *ActiveMask, + SmallVectorImpl &Predicated); + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return ARM_FINALISE_HW_LOOPS_NAME; + } + }; +} + +char ARMFinaliseHWLoops::ID = 0; + +bool ARMFinaliseHWLoops::runOnMachineFunction(MachineFunction &MF) { + auto &MLI = getAnalysis(); + TII = + static_cast(MF.getSubtarget().getInstrInfo()); + LLVM_DEBUG(dbgs() << " ------- ARM HWLOOPS on " << MF.getName() << "\n"); + + bool Changed = false; + for (auto ML : MLI) { + if (!ML->getExitingBlock() || !ML->getHeader() || !ML->getLoopLatch()) + continue; + Changed |= ProcessLoop(ML); + } + return Changed; +} + +bool ARMFinaliseHWLoops::ProcessLoop(MachineLoop *ML) { + + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Processing " << *ML); + auto SearchForStart = [](MachineBasicBlock *MBB) -> MachineInstr* { + for (auto &MI : *MBB) { + if (MI.getOpcode() == ARM::t2LoopStart) + return &MI; + } + return nullptr; + }; + + MachineInstr *Start = nullptr; + + if (auto *Preheader = ML->getLoopPreheader()) { + Start = SearchForStart(Preheader); + if (!Start) { + if (Preheader->pred_size() == 1) { + MachineBasicBlock *PrePreheader = *Preheader->pred_begin(); + Start = SearchForStart(PrePreheader); + } + } + } + + if (!Start) + return false; + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Found Loop Start: " << *Start); + + auto IsLoopDec = [](MachineInstr &MI) { + return MI.getOpcode() == ARM::t2LoopDec; + }; + + auto IsLoopEnd = [](MachineInstr &MI) { + return MI.getOpcode() == ARM::t2LoopEnd; + }; + + auto IsActiveMask = [](MachineInstr &MI) { + return MI.getOpcode() == ARM::t2ActiveMask; + }; + + auto IsPredicated = [](MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + break; + case ARM::VMSTR32: + case ARM::VMLDR32: + return true; + } + return false; + }; + + MachineInstr *Dec = nullptr; + MachineInstr *End = nullptr; + MachineInstr *ActiveMask = nullptr; + bool FoundPredicated = false; + bool IsProfitable = true; + SmallVector Predicated; + + for (auto *MBB : ML->getBlocks()) { + for (auto &MI : *MBB) { + // TODO: For scalar loops, check for any instructions that means a + // low-overhead loop wouldn't be profitable. Should we bail if LR has + // been spilt? We'd still need a register to control the loop count but + // the loop index may increase whereas LE(TP) decrement it... 
+      //
+      // Not inserting a low-overhead loop for a vector loop is not really an
+      // option here as we'd either:
+      // - Need to reconstruct a vector loop and a scalar epilogue.
+      // - Try to use VIDUP and create a VPT block to predicate the lanes,
+      //   which would require using a Q register, which may already be
+      //   allocated, for the VIDUP result. It looks like VIDUP wouldn't even
+      //   be helpful for 16xi8 vectors because the instruction can only
+      //   increment by a maximum of 8.
+
+      if (IsLoopDec(MI))
+        Dec = &MI;
+      else if (IsLoopEnd(MI))
+        End = &MI;
+      else if (IsActiveMask(MI))
+        ActiveMask = &MI;
+      else if (IsPredicated(MI)) {
+        FoundPredicated = true;
+        Predicated.push_back(&MI);
+      }
+    }
+  }
+
+  // Check that we've found the necessary components.
+  if (!Dec || !End || (FoundPredicated && !ActiveMask))
+    return false;
+
+  if (!IsProfitable)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Found Loop Dec: " << *Dec);
+  LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Found Loop End: " << *End);
+
+  // TODO: Verify that the cmp and br from the WLS either branch to the header
+  // or the exit block.
+  // TODO: Verify that the cmp and br from the LE either branch to the header
+  // or the exit block.
+  // TODO: Verify that all predicated instructions are using ActiveMask.
+
+  Expand(Start, Dec, End, ActiveMask, Predicated);
+  return true;
+}
+
+void ARMFinaliseHWLoops::Expand(MachineInstr *Start, MachineInstr *Dec,
+                                MachineInstr *End, MachineInstr *ActiveMask,
+                                SmallVectorImpl<MachineInstr *> &Predicated) {
+  auto ExpandLoopStart = [this](MachineInstr *Start) {
+    MachineBasicBlock &MBB = *Start->getParent();
+    MachineInstrBuilder MIB = BuildMI(MBB, Start, Start->getDebugLoc(),
+                                      TII->get(ARM::t2WLSTP));
+    MIB.addDef(ARM::LR);
+    unsigned OpIdx = 0;
+    MIB.add(Start->getOperand(OpIdx++));
+    MIB.add(Start->getOperand(OpIdx++));
+    MIB.add(Start->getOperand(OpIdx++));
+    MIB.add(predOps(ARMCC::AL));
+    LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Inserted WLSTP: " << *MIB << "\n");
+    Start->eraseFromParent();
+  };
+
+  auto ExpandLoad = [this](MachineInstr *MI) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
+                                      TII->get(ARM::t2VLDRW));
+    unsigned OpIdx = 0;
+    MIB.add(MI->getOperand(OpIdx++));
+    MIB.add(MI->getOperand(OpIdx++));
+    MIB.add(predOps(ARMCC::AL));
+    MI->eraseFromParent();
+  };
+
+  auto ExpandStore = [this](MachineInstr *MI) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
+                                      TII->get(ARM::t2VSTRW));
+    unsigned OpIdx = 0;
+    MIB.add(MI->getOperand(OpIdx++));
+    MIB.add(MI->getOperand(OpIdx++));
+    MIB.add(predOps(ARMCC::AL));
+    MI->eraseFromParent();
+  };
+
+  auto RemoveActiveMask = [](MachineInstr *MI) {
+    MI->eraseFromParent();
+  };
+
+  // Combine the LoopDec and LoopEnd instructions into LE(TP).
+  auto ExpandLoopEnd = [this](MachineInstr *Dec, MachineInstr *End) {
+    // TODO: Check and handle the cases where LR is spilt between Dec and End.
+ MachineBasicBlock &MBB = *End->getParent(); + MachineInstrBuilder MIB = BuildMI(MBB, End, End->getDebugLoc(), + TII->get(ARM::t2LETP)); + MIB.addDef(ARM::LR); + unsigned OpIdx = 0; + MIB.add(End->getOperand(OpIdx++)); + MIB.add(End->getOperand(OpIdx++)); + MIB.add(predOps(ARMCC::AL)); + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Inserted LETP: " << *MIB << "\n"); + End->eraseFromParent(); + Dec->eraseFromParent(); + }; + + ExpandLoopStart(Start); + ExpandLoopEnd(Dec, End); + + if (ActiveMask) { + for (auto *MI : Predicated) { + if (MI->mayLoad()) + ExpandLoad(MI); + else if (MI->mayStore()) + ExpandStore(MI); + else + llvm_unreachable("unhandled predicated instruction"); + } + RemoveActiveMask(ActiveMask); + } +} + +FunctionPass *llvm::createARMFinaliseHardwareLoopsPass() { + return new ARMFinaliseHWLoops(); +} Index: lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- lib/Target/ARM/ARMISelDAGToDAG.cpp +++ lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2959,6 +2959,19 @@ // Other cases are autogenerated. break; } + case ARMISD::WhileLoopStart: { + SDValue Size = CurDAG->getTargetConstant( + cast(N->getOperand(1))->getZExtValue(), dl, MVT::i32); + SDValue Ops[] = { Size, + N->getOperand(2), + N->getOperand(3), + N->getOperand(0) }; + SDNode *LoopStart = + CurDAG->getMachineNode(ARM::t2LoopStart, dl, MVT::Other, Ops); + ReplaceUses(N, LoopStart); + CurDAG->RemoveDeadNode(N); + return; + } case ARMISD::BRCOND: { // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc) @@ -2985,7 +2998,39 @@ unsigned CC = (unsigned) cast(N2)->getZExtValue(); - if (InFlag.getOpcode() == ARMISD::CMPZ) { + // Handle loops. + if (InFlag.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) { + if (InFlag.getOpcode() == ARMISD::CMPZ) { + // Handle loops. + SDValue Int = InFlag.getOperand(0); + LLVM_DEBUG(dbgs() << "Int: "; Int.dump()); + uint64_t ID = cast(Int->getOperand(1))->getZExtValue(); + + if (ID == Intrinsic::loop_dec) { + SDValue Elements = Int.getOperand(2); + SDValue Size = CurDAG->getTargetConstant( + cast(Int.getOperand(3))->getZExtValue(), dl, + MVT::i32); + + SDValue Args[] = { Elements, Size, Int.getOperand(0) }; + SDNode *LoopDec = + CurDAG->getMachineNode(ARM::t2LoopDec, dl, + CurDAG->getVTList(MVT::i32, MVT::Other), + Args); + ReplaceUses(Int.getNode(), LoopDec); + + SDValue EndArgs[] = { SDValue(LoopDec, 0), N1, Chain }; + SDNode *LoopEnd = + CurDAG->getMachineNode(ARM::t2LoopEnd, dl, MVT::Other, EndArgs); + + ReplaceUses(N, LoopEnd); + CurDAG->RemoveDeadNode(N); + CurDAG->RemoveDeadNode(InFlag.getNode()); + CurDAG->RemoveDeadNode(Int.getNode()); + return; + } + } + bool SwitchEQNEToPLMI; SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI); InFlag = N->getOperand(4); Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -233,6 +233,8 @@ // instructions. 
MEMCPY, + WhileLoopStart, + // Vector load N-element structure to all lanes: VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD2DUP, Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -526,6 +526,10 @@ setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } + const MVT pTypes[] = { MVT::v16i1, MVT::v8i1, MVT::v4i1 }; + for (auto VT : pTypes) + addRegisterClass(VT, &ARM::VCCRRegClass); + for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -703,6 +707,7 @@ setOperationAction(ISD::FMA, MVT::v4f32, Expand); } + setTargetDAGCombine(ISD::BRCOND); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); @@ -12670,6 +12675,31 @@ return V; } +static SDValue PerformHWLoopCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *ST) { + SDValue CC = N->getOperand(1); + + if (CC->getOperand(0)->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return SDValue(); + + SDValue Int = CC->getOperand(0); + unsigned IntOp = cast(Int.getOperand(1))->getZExtValue(); + if (IntOp != Intrinsic::test_set_loop_elements) + return SDValue(); + + SDValue Chain = N->getOperand(0); + SDValue Elements = Int.getOperand(2); + SDValue Size = Int.getOperand(3); + SDValue ExitBlock = N->getOperand(2); + SDLoc dl(Int); + + SDValue Ops[] = { Chain, Size, Elements, ExitBlock }; + SDValue Res = DCI.DAG.getNode(ARMISD::WhileLoopStart, dl, MVT::Other, Ops); + DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); + return Res; +} + /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. SDValue ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { @@ -12684,6 +12714,7 @@ SDValue RHS = Cmp.getOperand(1); SDValue Chain = N->getOperand(0); SDValue BB = N->getOperand(1); + SDValue ARMcc = N->getOperand(2); ARMCC::CondCodes CC = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); @@ -12901,6 +12932,7 @@ case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); + case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); Index: lib/Target/ARM/ARMInstrInfo.td =================================================================== --- lib/Target/ARM/ARMInstrInfo.td +++ lib/Target/ARM/ARMInstrInfo.td @@ -13,6 +13,12 @@ //===----------------------------------------------------------------------===// // ARM specific DAG Nodes. // +def SDT_ARMWhileLoop : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisVT<2, OtherVT>]>; + +def ARMWLS : SDNode<"ARMISD::WhileLoopStart", SDT_ARMWhileLoop, + [SDNPHasChain]>; // Type profiles. 
def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, Index: lib/Target/ARM/ARMInstrThumb2.td =================================================================== --- lib/Target/ARM/ARMInstrThumb2.td +++ lib/Target/ARM/ARMInstrThumb2.td @@ -1235,6 +1235,89 @@ 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>; +let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in { + def t2LoopStart : + t2PseudoInst<(outs), + (ins imm0_7:$size, rGPR:$elts, brtarget:$target), + 4, IIC_Br, []>, Sched<[WriteBr]>; + def t2WLSTP : + T2I<(outs GPRlr:$Rm), (ins imm0_7:$size, GPRlr:$elts, brtarget:$target), IIC_Br, + "wlstp.$size", "\t$Rm, $elts, $target", []>, Sched<[WriteBr]> { + bits<5> Rm; + bits<2> size; + bits<5> elts; + bits<12> target; + } +} + +def t2LoopDec : + t2PseudoInst<(outs GPRlr:$Rm), + (ins GPRlr:$Rn, imm0_7:$size), + 4, IIC_Br, + []>, + Sched<[WriteBr]>; + +let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in { + def t2LoopEnd : + t2PseudoInst<(outs), + (ins GPRlr:$elts, brtarget:$target), + 4, IIC_Br, []>, Sched<[WriteBr]>; + def t2LETP : + T2I<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target), IIC_Br, + "letp", "\t$target", []>, Sched<[WriteBr]> { + bits<5> Rm; + bits<5> elts; + bits<12> target; + } +} + +def t2ActiveMask : + t2PseudoInst<(outs VCCR:$pred), + (ins rGPR:$elts), + 4, IIC_Br, + [(set VCCR:$pred, (int_get_active_mask_4 rGPR:$elts))]>, + Sched<[WriteBr]>; + +def nonext_masked_load : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (masked_load node:$ptr, node:$pred, node:$def), [{ + return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; +def nontrunc_masked_store : + PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_store node:$val, node:$ptr, node:$pred), [{ + return !cast(N)->isTruncatingStore(); +}]>; + +def VMLDR32 : t2PseudoInst<(outs QPR:$vec), + (ins t2addrmode_imm12:$addr, VCCR:$pred, i32imm:$imm), 4, + NoItinerary, []>, Sched<[WriteLd]>; +let mayLoad = 1 in +def t2VLDRW : T2I<(outs QPR:$Rm), + (ins rGPR:$addr), NoItinerary, + "vldrw", "\t$Rm, [$addr]", []>, Sched<[WriteLd]> { + bits<6> Rm; + bits<5> addr; +} + +def VMSTR32 : t2PseudoInst<(outs), + (ins QPR:$vec, t2addrmode_imm12:$addr, VCCR:$pred, i32imm:$imm), 4, + NoItinerary, []>, Sched<[WriteST]>; +let mayStore = 1 in +def t2VSTRW : T2I<(outs), + (ins QPR:$Rm, rGPR:$addr), NoItinerary, + "vstrw", "\t$Rm, [$addr]", []>, Sched<[WriteST]> { + bits<6> Rm; + bits<5> addr; +} + +def : Pat<(v4i32 (nonext_masked_load rGPR:$addr, (v4i1 VCCR:$pred), undef)), + (v4i32 (VMLDR32 rGPR:$addr, (i32 0), (v4i1 VCCR:$pred), (i32 2)))>; +def : Pat<(nontrunc_masked_store (v4i32 QPR:$vec), rGPR:$addr, (v4i1 VCCR:$pred)), + (VMSTR32 (v4i32 QPR:$vec), rGPR:$addr, (i32 0), (v4i1 VCCR:$pred), + (i32 2))>; + + //===----------------------------------------------------------------------===// // Load / store Instructions. Index: lib/Target/ARM/ARMRegisterInfo.td =================================================================== --- lib/Target/ARM/ARMRegisterInfo.td +++ lib/Target/ARM/ARMRegisterInfo.td @@ -254,6 +254,11 @@ let DiagnosticString = "operand must be a register sp"; } +def GPRlr : RegisterClass<"ARM", [i32], 32, (add LR)>; + +def VPR : ARMReg<32, "vpr">; +def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1], 32, (add VPR)>; + // restricted GPR register class. Many Thumb2 instructions allow the full // register range for operands, but have undefined behaviours when PC // or SP (R13 or R15) are used. 
The ARM ISA refers to these operands
Index: lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- lib/Target/ARM/ARMTargetMachine.cpp
+++ lib/Target/ARM/ARMTargetMachine.cpp
@@ -410,6 +410,9 @@
   TargetPassConfig::addIRPasses();
 
+  addPass(createHardwareLoops());
+  addPass(createDeadCodeEliminationPass());
+
   // Run the parallel DSP pass.
   if (getOptLevel() == CodeGenOpt::Aggressive)
     addPass(createARMParallelDSPPass());
@@ -494,6 +497,8 @@
     addPass(createBreakFalseDeps());
   }
 
+  addPass(createARMFinaliseHardwareLoopsPass());
+
   // Expand some pseudo instructions into multiple instructions to allow
   // proper scheduling.
   addPass(createARMExpandPseudoPass());
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -180,6 +180,15 @@
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
 
+  bool isLegalMaskedStore(Type *Ty) { return true; }
+
+  bool isLegalMaskedLoad(Type *Ty) { return true; }
+
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -628,6 +628,70 @@
                                            UseMaskForCond, UseMaskForGaps);
 }
 
+bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  if (!L->getExitBlock() || !SE.getBackedgeTakenCount(L))
+    return false;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return false;
+
+  const SCEV *TripCountSCEV =
+    SE.getAddExpr(BackedgeTakenCount,
+                  SE.getOne(BackedgeTakenCount->getType()));
+
+  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+    return false;
+
+  auto CheckForPredicates = [&HWLoopInfo](Loop *L) {
+    VectorType *VecTy = nullptr;
+    // Inspect the instructions for vector operations.
+    for (auto *BB : L->getBlocks()) {
+      for (auto &I : *BB) {
+        if (!isa<VectorType>(I.getType()))
+          continue;
+
+        auto *VTy = cast<VectorType>(I.getType());
+        if (!VecTy)
+          VecTy = VTy;
+        else if (VecTy->getNumElements() != VTy->getNumElements())
+          return false;
+
+        if (!isa<IntrinsicInst>(&I))
+          continue;
+
+        auto *Call = dyn_cast<IntrinsicInst>(&I);
+        if (Call->getIntrinsicID() != Intrinsic::masked_load &&
+            Call->getIntrinsicID() != Intrinsic::masked_store)
+          continue;
+
+        if (!HWLoopInfo.Predicate)
+          HWLoopInfo.Predicate = cast<Instruction>(Call->getOperand(2));
+        else if (HWLoopInfo.Predicate != cast<Instruction>(Call->getOperand(2)))
+          return false;
+      }
+    }
+    return true;
+  };
+
+  if (!CheckForPredicates(L))
+    return false;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (auto *BI = dyn_cast<BranchInst>(Preheader->getTerminator()))
+    if (BI->isUnconditional() && Preheader->getUniquePredecessor())
+      HWLoopInfo.PerformTest = true;
+
+  LLVMContext &C = L->getHeader()->getParent()->getParent()->getContext();
+  HWLoopInfo.InsertPHICounter = true;
+  HWLoopInfo.CountType = Type::getInt32Ty(C);
+  HWLoopInfo.NumElements = 4;
+  return true;
+}
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
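To make the ARM configuration above concrete: with NumElements set to 4, the predicate taken from the masked memory operations, and PerformTest enabled, the HardwareLoops pass is expected to produce IR along these lines. This is illustrative only; value names, alignment and the exact intrinsic manglings are not taken from the patch, and the masked load/store calls use the standard llvm.masked.* forms.

  ; sketch only: names and intrinsic manglings are illustrative
  entry:
    %guard = call i1 @llvm.test.set.loop.elements.i32.i32(i32 %N, i32 4)
    br i1 %guard, label %vector.ph, label %exit

  vector.ph:
    br label %vector.body

  vector.body:
    %elts = phi i32 [ %N, %vector.ph ], [ %elts.rem, %vector.body ]
    %mask = call <4 x i1> @llvm.get.active.mask.4.i32(i32 %elts)
    %wide = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %src, i32 4, <4 x i1> %mask, <4 x i32> undef)
    ; ... vector body ...
    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %dst, i32 4, <4 x i1> %mask)
    %elts.rem = call i32 @llvm.loop.dec.i32.i32.i32(i32 %elts, i32 4)
    %cmp = icmp ne i32 %elts.rem, 0
    br i1 %cmp, label %vector.body, label %exit

ARMFinaliseHWLoops then matches the resulting t2LoopStart/t2LoopDec/t2LoopEnd/t2ActiveMask pseudos and rewrites them to WLSTP/LETP with predicated loads and stores.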
Index: lib/Target/ARM/CMakeLists.txt =================================================================== --- lib/Target/ARM/CMakeLists.txt +++ lib/Target/ARM/CMakeLists.txt @@ -29,6 +29,7 @@ ARMConstantPoolValue.cpp ARMExpandPseudoInsts.cpp ARMFastISel.cpp + ARMFinalizeHardwareLoops.cpp ARMFrameLowering.cpp ARMHazardRecognizer.cpp ARMInstructionSelector.cpp Index: lib/Target/PowerPC/PPCCTRLoops.cpp =================================================================== --- lib/Target/PowerPC/PPCCTRLoops.cpp +++ lib/Target/PowerPC/PPCCTRLoops.cpp @@ -71,63 +71,7 @@ static cl::opt CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1)); #endif -// The latency of mtctr is only justified if there are more than 4 -// comparisons that will be removed as a result. -static cl::opt -SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, - cl::desc("Loops with a constant trip count smaller than " - "this value will not use the count register.")); - -STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops"); - namespace { - struct PPCCTRLoops : public FunctionPass { - -#ifndef NDEBUG - static int Counter; -#endif - - public: - static char ID; - - PPCCTRLoops() : FunctionPass(ID) { - initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } - - private: - bool mightUseCTR(BasicBlock *BB); - bool convertToCTRLoop(Loop *L); - - private: - const PPCTargetMachine *TM; - const PPCSubtarget *STI; - const PPCTargetLowering *TLI; - const DataLayout *DL; - const TargetLibraryInfo *LibInfo; - const TargetTransformInfo *TTI; - LoopInfo *LI; - ScalarEvolution *SE; - DominatorTree *DT; - bool PreserveLCSSA; - TargetSchedModel SchedModel; - }; - - char PPCCTRLoops::ID = 0; -#ifndef NDEBUG - int PPCCTRLoops::Counter = 0; -#endif #ifndef NDEBUG struct PPCCTRLoopsVerify : public MachineFunctionPass { @@ -153,16 +97,6 @@ #endif // NDEBUG } // end anonymous namespace -INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", - false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", - false, false) - -FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); } - #ifndef NDEBUG INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops Verify", false, false) @@ -175,512 +109,6 @@ } #endif // NDEBUG -bool PPCCTRLoops::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable(); - if (!TPC) - return false; - - TM = &TPC->getTM(); - STI = TM->getSubtargetImpl(F); - TLI = STI->getTargetLowering(); - - LI = &getAnalysis().getLoopInfo(); - SE = &getAnalysis().getSE(); - DT = &getAnalysis().getDomTree(); - TTI = &getAnalysis().getTTI(F); - DL = &F.getParent()->getDataLayout(); - auto *TLIP = getAnalysisIfAvailable(); - LibInfo = TLIP ? 
&TLIP->getTLI() : nullptr; - PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - SchedModel.init(STI); - - bool MadeChange = false; - - for (LoopInfo::iterator I = LI->begin(), E = LI->end(); - I != E; ++I) { - Loop *L = *I; - if (!L->getParentLoop()) - MadeChange |= convertToCTRLoop(L); - } - - return MadeChange; -} - -static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) { - if (IntegerType *ITy = dyn_cast(Ty)) - return ITy->getBitWidth() > (Is32Bit ? 32U : 64U); - - return false; -} - -// Determining the address of a TLS variable results in a function call in -// certain TLS models. -static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) { - const auto *GV = dyn_cast(MemAddr); - if (!GV) { - // Recurse to check for constants that refer to TLS global variables. - if (const auto *CV = dyn_cast(MemAddr)) - for (const auto &CO : CV->operands()) - if (memAddrUsesCTR(TM, CO)) - return true; - - return false; - } - - if (!GV->isThreadLocal()) - return false; - TLSModel::Model Model = TM.getTLSModel(GV); - return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic; -} - -// Loop through the inline asm constraints and look for something that clobbers -// ctr. -static bool asmClobbersCTR(InlineAsm *IA) { - InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); - for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { - InlineAsm::ConstraintInfo &C = CIV[i]; - if (C.Type != InlineAsm::isInput) - for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) - if (StringRef(C.Codes[j]).equals_lower("{ctr}")) - return true; - } - return false; -} - -bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { - for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); - J != JE; ++J) { - if (CallInst *CI = dyn_cast(J)) { - // Inline ASM is okay, unless it clobbers the ctr register. - if (InlineAsm *IA = dyn_cast(CI->getCalledValue())) { - if (asmClobbersCTR(IA)) - return true; - continue; - } - - if (Function *F = CI->getCalledFunction()) { - // Most intrinsics don't become function calls, but some might. - // sin, cos, exp and log are always calls. - unsigned Opcode = 0; - if (F->getIntrinsicID() != Intrinsic::not_intrinsic) { - switch (F->getIntrinsicID()) { - default: continue; - // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr - // we're definitely using CTR. - case Intrinsic::ppc_is_decremented_ctr_nonzero: - case Intrinsic::ppc_mtctr: - return true; - -// VisualStudio defines setjmp as _setjmp -#if defined(_MSC_VER) && defined(setjmp) && \ - !defined(setjmp_undefined_for_msvc) -# pragma push_macro("setjmp") -# undef setjmp -# define setjmp_undefined_for_msvc -#endif - - case Intrinsic::setjmp: - -#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc) - // let's return it to _setjmp state -# pragma pop_macro("setjmp") -# undef setjmp_undefined_for_msvc -#endif - - case Intrinsic::longjmp: - - // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp - // because, although it does clobber the counter register, the - // control can't then return to inside the loop unless there is also - // an eh_sjlj_setjmp. 
- case Intrinsic::eh_sjlj_setjmp: - - case Intrinsic::memcpy: - case Intrinsic::memmove: - case Intrinsic::memset: - case Intrinsic::powi: - case Intrinsic::log: - case Intrinsic::log2: - case Intrinsic::log10: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::pow: - case Intrinsic::sin: - case Intrinsic::cos: - return true; - case Intrinsic::copysign: - if (CI->getArgOperand(0)->getType()->getScalarType()-> - isPPC_FP128Ty()) - return true; - else - continue; // ISD::FCOPYSIGN is never a library call. - case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; - case Intrinsic::floor: Opcode = ISD::FFLOOR; break; - case Intrinsic::ceil: Opcode = ISD::FCEIL; break; - case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; - case Intrinsic::rint: Opcode = ISD::FRINT; break; - case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; - case Intrinsic::round: Opcode = ISD::FROUND; break; - case Intrinsic::minnum: Opcode = ISD::FMINNUM; break; - case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break; - case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break; - case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break; - } - } - - // PowerPC does not use [US]DIVREM or other library calls for - // operations on regular types which are not otherwise library calls - // (i.e. soft float or atomics). If adapting for targets that do, - // additional care is required here. - - LibFunc Func; - if (!F->hasLocalLinkage() && F->hasName() && LibInfo && - LibInfo->getLibFunc(F->getName(), Func) && - LibInfo->hasOptimizedCodeGen(Func)) { - // Non-read-only functions are never treated as intrinsics. - if (!CI->onlyReadsMemory()) - return true; - - // Conversion happens only for FP calls. - if (!CI->getArgOperand(0)->getType()->isFloatingPointTy()) - return true; - - switch (Func) { - default: return true; - case LibFunc_copysign: - case LibFunc_copysignf: - continue; // ISD::FCOPYSIGN is never a library call. - case LibFunc_copysignl: - return true; - case LibFunc_fabs: - case LibFunc_fabsf: - case LibFunc_fabsl: - continue; // ISD::FABS is never a library call. - case LibFunc_sqrt: - case LibFunc_sqrtf: - case LibFunc_sqrtl: - Opcode = ISD::FSQRT; break; - case LibFunc_floor: - case LibFunc_floorf: - case LibFunc_floorl: - Opcode = ISD::FFLOOR; break; - case LibFunc_nearbyint: - case LibFunc_nearbyintf: - case LibFunc_nearbyintl: - Opcode = ISD::FNEARBYINT; break; - case LibFunc_ceil: - case LibFunc_ceilf: - case LibFunc_ceill: - Opcode = ISD::FCEIL; break; - case LibFunc_rint: - case LibFunc_rintf: - case LibFunc_rintl: - Opcode = ISD::FRINT; break; - case LibFunc_round: - case LibFunc_roundf: - case LibFunc_roundl: - Opcode = ISD::FROUND; break; - case LibFunc_trunc: - case LibFunc_truncf: - case LibFunc_truncl: - Opcode = ISD::FTRUNC; break; - case LibFunc_fmin: - case LibFunc_fminf: - case LibFunc_fminl: - Opcode = ISD::FMINNUM; break; - case LibFunc_fmax: - case LibFunc_fmaxf: - case LibFunc_fmaxl: - Opcode = ISD::FMAXNUM; break; - } - } - - if (Opcode) { - EVT EVTy = - TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true); - - if (EVTy == MVT::Other) - return true; - - if (TLI->isOperationLegalOrCustom(Opcode, EVTy)) - continue; - else if (EVTy.isVector() && - TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType())) - continue; - - return true; - } - } - - return true; - } else if (isa(J) && - J->getType()->getScalarType()->isPPC_FP128Ty()) { - // Most operations on ppc_f128 values become calls. 
- return true; - } else if (isa(J) || isa(J) || - isa(J) || isa(J)) { - CastInst *CI = cast(J); - if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() || - CI->getDestTy()->getScalarType()->isPPC_FP128Ty() || - isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) || - isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType())) - return true; - } else if (isLargeIntegerTy(!TM->isPPC64(), - J->getType()->getScalarType()) && - (J->getOpcode() == Instruction::UDiv || - J->getOpcode() == Instruction::SDiv || - J->getOpcode() == Instruction::URem || - J->getOpcode() == Instruction::SRem)) { - return true; - } else if (!TM->isPPC64() && - isLargeIntegerTy(false, J->getType()->getScalarType()) && - (J->getOpcode() == Instruction::Shl || - J->getOpcode() == Instruction::AShr || - J->getOpcode() == Instruction::LShr)) { - // Only on PPC32, for 128-bit integers (specifically not 64-bit - // integers), these might be runtime calls. - return true; - } else if (isa(J) || isa(J)) { - // On PowerPC, indirect jumps use the counter register. - return true; - } else if (SwitchInst *SI = dyn_cast(J)) { - if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries()) - return true; - } - - // FREM is always a call. - if (J->getOpcode() == Instruction::FRem) - return true; - - if (STI->useSoftFloat()) { - switch(J->getOpcode()) { - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FCmp: - return true; - } - } - - for (Value *Operand : J->operands()) - if (memAddrUsesCTR(*TM, Operand)) - return true; - } - - return false; -} -bool PPCCTRLoops::convertToCTRLoop(Loop *L) { - bool MadeChange = false; - - // Do not convert small short loops to CTR loop. - unsigned ConstTripCount = SE->getSmallConstantTripCount(L); - if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) { - SmallPtrSet EphValues; - auto AC = getAnalysis().getAssumptionCache( - *L->getHeader()->getParent()); - CodeMetrics::collectEphemeralValues(L, &AC, EphValues); - CodeMetrics Metrics; - for (BasicBlock *BB : L->blocks()) - Metrics.analyzeBasicBlock(BB, *TTI, EphValues); - // 6 is an approximate latency for the mtctr instruction. - if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth())) - return false; - } - - // Process nested loops first. - for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { - MadeChange |= convertToCTRLoop(*I); - LLVM_DEBUG(dbgs() << "Nested loop converted\n"); - } - - // If a nested loop has been converted, then we can't convert this loop. - if (MadeChange) - return MadeChange; - - // Bail out if the loop has irreducible control flow. - LoopBlocksRPO RPOT(L); - RPOT.perform(LI); - if (containsIrreducibleCFG(RPOT, *LI)) - return false; - -#ifndef NDEBUG - // Stop trying after reaching the limit (if any). - int Limit = CTRLoopLimit; - if (Limit >= 0) { - if (Counter >= CTRLoopLimit) - return false; - Counter++; - } -#endif - - // We don't want to spill/restore the counter register, and so we don't - // want to use the counter register if the loop contains calls. 
- for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); - I != IE; ++I) - if (mightUseCTR(*I)) - return MadeChange; - - SmallVector ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - - // If there is an exit edge known to be frequently taken, - // we should not transform this loop. - for (auto &BB : ExitingBlocks) { - Instruction *TI = BB->getTerminator(); - if (!TI) continue; - - if (BranchInst *BI = dyn_cast(TI)) { - uint64_t TrueWeight = 0, FalseWeight = 0; - if (!BI->isConditional() || - !BI->extractProfMetadata(TrueWeight, FalseWeight)) - continue; - - // If the exit path is more frequent than the loop path, - // we return here without further analysis for this loop. - bool TrueIsExit = !L->contains(BI->getSuccessor(0)); - if (( TrueIsExit && FalseWeight < TrueWeight) || - (!TrueIsExit && FalseWeight > TrueWeight)) - return MadeChange; - } - } - - BasicBlock *CountedExitBlock = nullptr; - const SCEV *ExitCount = nullptr; - BranchInst *CountedExitBranch = nullptr; - for (SmallVectorImpl::iterator I = ExitingBlocks.begin(), - IE = ExitingBlocks.end(); I != IE; ++I) { - const SCEV *EC = SE->getExitCount(L, *I); - LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block " - << (*I)->getName() << ": " << *EC << "\n"); - if (isa(EC)) - continue; - if (const SCEVConstant *ConstEC = dyn_cast(EC)) { - if (ConstEC->getValue()->isZero()) - continue; - } else if (!SE->isLoopInvariant(EC, L)) - continue; - - if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32)) - continue; - - // If this exiting block is contained in a nested loop, it is not eligible - // for insertion of the branch-and-decrement since the inner loop would - // end up messing up the value in the CTR. - if (LI->getLoopFor(*I) != L) - continue; - - // We now have a loop-invariant count of loop iterations (which is not the - // constant zero) for which we know that this loop will not exit via this - // existing block. - - // We need to make sure that this block will run on every loop iteration. - // For this to be true, we must dominate all blocks with backedges. Such - // blocks are in-loop predecessors to the header block. - bool NotAlways = false; - for (pred_iterator PI = pred_begin(L->getHeader()), - PIE = pred_end(L->getHeader()); PI != PIE; ++PI) { - if (!L->contains(*PI)) - continue; - - if (!DT->dominates(*I, *PI)) { - NotAlways = true; - break; - } - } - - if (NotAlways) - continue; - - // Make sure this blocks ends with a conditional branch. - Instruction *TI = (*I)->getTerminator(); - if (!TI) - continue; - - if (BranchInst *BI = dyn_cast(TI)) { - if (!BI->isConditional()) - continue; - - CountedExitBranch = BI; - } else - continue; - - // Note that this block may not be the loop latch block, even if the loop - // has a latch block. - CountedExitBlock = *I; - ExitCount = EC; - break; - } - - if (!CountedExitBlock) - return MadeChange; - - BasicBlock *Preheader = L->getLoopPreheader(); - - // If we don't have a preheader, then insert one. If we already have a - // preheader, then we can use it (except if the preheader contains a use of - // the CTR register because some such uses might be reordered by the - // selection DAG after the mtctr instruction). 
-  if (!Preheader || mightUseCTR(Preheader))
-    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
-  if (!Preheader)
-    return MadeChange;
-
-  LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
-                    << "\n");
-
-  // Insert the count into the preheader and replace the condition used by the
-  // selected branch.
-  MadeChange = true;
-
-  SCEVExpander SCEVE(*SE, *DL, "loopcnt");
-  LLVMContext &C = SE->getContext();
-  Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
-  if (!ExitCount->getType()->isPointerTy() &&
-      ExitCount->getType() != CountType)
-    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
-  ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
-  Value *ECValue =
-      SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
-
-  IRBuilder<> CountBuilder(Preheader->getTerminator());
-  Module *M = Preheader->getParent()->getParent();
-  Function *MTCTRFunc =
-      Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, CountType);
-  CountBuilder.CreateCall(MTCTRFunc, ECValue);
-
-  IRBuilder<> CondBuilder(CountedExitBranch);
-  Function *DecFunc =
-      Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
-  Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
-  Value *OldCond = CountedExitBranch->getCondition();
-  CountedExitBranch->setCondition(NewCond);
-
-  // The false branch must exit the loop.
-  if (!L->contains(CountedExitBranch->getSuccessor(0)))
-    CountedExitBranch->swapSuccessors();
-
-  // The old condition may be dead now, and may have even created a dead PHI
-  // (the original induction variable).
-  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
-  // Run through the basic blocks of the loop and see if any of them have dead
-  // PHIs that can be removed.
-  for (auto I : L->blocks())
-    DeleteDeadPHIs(I);
-
-  ++NumCTRLoops;
-  return MadeChange;
-}
-
 #ifndef NDEBUG
 static bool clobbersCTR(const MachineInstr &MI) {
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9764,11 +9764,11 @@
   }
   case ISD::INTRINSIC_W_CHAIN: {
     if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
-            Intrinsic::ppc_is_decremented_ctr_nonzero)
+            Intrinsic::loop_dec)
       break;

-    assert(N->getValueType(0) == MVT::i1 &&
-           "Unexpected result type for CTR decrement intrinsic");
+    //assert(N->getValueType(0) == MVT::i1 &&
+    //       "Unexpected result type for CTR decrement intrinsic");
     EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                  N->getValueType(0));
     SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
@@ -13454,10 +13454,12 @@
   SDValue Cond = N->getOperand(1);
   SDValue Target = N->getOperand(2);

-  if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
-      cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero) {
+  if (Cond.getOpcode() == ISD::SETCC &&
+      Cond.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+      cast<ConstantSDNode>(Cond.getOperand(0).getOperand(1))->getZExtValue() ==
+          Intrinsic::loop_dec) {
+    Cond = Cond.getOperand(0);

     // We now need to make the intrinsic dead (it cannot be instruction
     // selected).
     DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
@@ -13482,14 +13484,14 @@
   if (LHS.getOpcode() == ISD::AND &&
       LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
       cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+          Intrinsic::loop_dec &&
       isa<ConstantSDNode>(LHS.getOperand(1)) &&
       !isNullConstant(LHS.getOperand(1)))
     LHS = LHS.getOperand(0);

   if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
       cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+          Intrinsic::loop_dec &&
       isa<ConstantSDNode>(RHS)) {
     assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
            "Counter decrement comparison is not EQ or NE");
Index: lib/Target/PowerPC/PPCInstr64Bit.td
===================================================================
--- lib/Target/PowerPC/PPCInstr64Bit.td
+++ lib/Target/PowerPC/PPCInstr64Bit.td
@@ -382,7 +382,7 @@
                  PPC970_DGroup_First, PPC970_Unit_FXU;
 }
 let hasSideEffects = 1, Defs = [CTR8] in {
-let Pattern = [(int_ppc_mtctr i64:$rS)] in
+let Pattern = [(int_set_loop_iterations i64:$rS)] in
 def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
                                "mtctr $rS", IIC_SprMTSPR>,
                  PPC970_DGroup_First, PPC970_Unit_FXU;
Index: lib/Target/PowerPC/PPCInstrInfo.td
===================================================================
--- lib/Target/PowerPC/PPCInstrInfo.td
+++ lib/Target/PowerPC/PPCInstrInfo.td
@@ -2600,7 +2600,7 @@
                 PPC970_DGroup_First, PPC970_Unit_FXU;
 }
 let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
-let Pattern = [(int_ppc_mtctr i32:$rS)] in
+let Pattern = [(int_set_loop_iterations i32:$rS)] in
 def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
                               "mtctr $rS", IIC_SprMTSPR>,
                 PPC970_DGroup_First, PPC970_Unit_FXU;
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -101,7 +101,6 @@
   RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());

   PassRegistry &PR = *PassRegistry::getPassRegistry();
-  initializePPCCTRLoopsPass(PR);
 #ifndef NDEBUG
   initializePPCCTRLoopsVerifyPass(PR);
 #endif
@@ -422,7 +421,7 @@
     addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));

   if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
-    addPass(createPPCCTRLoops());
+    addPass(createHardwareLoops());

   return false;
 }
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -53,6 +53,11 @@
   unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);

   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+  bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -7,10 +7,12 @@
 //===----------------------------------------------------------------------===//

 #include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
"llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -31,6 +33,13 @@ cl::desc("Enable using coldcc calling conv for cold " "internal functions")); +// The latency of mtctr is only justified if there are more than 4 +// comparisons that will be removed as a result. +static cl::opt +SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, + cl::desc("Loops with a constant trip count smaller than " + "this value will not use the count register.")); + //===----------------------------------------------------------------------===// // // PPC cost model. @@ -204,6 +213,341 @@ return BaseT::getUserCost(U, Operands); } +bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, + TargetLibraryInfo *LibInfo) { + const PPCTargetMachine &TM = ST->getTargetMachine(); + + // Loop through the inline asm constraints and look for something that + // clobbers ctr. + auto asmClobbersCTR = [](InlineAsm *IA) { + InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); + for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { + InlineAsm::ConstraintInfo &C = CIV[i]; + if (C.Type != InlineAsm::isInput) + for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) + if (StringRef(C.Codes[j]).equals_lower("{ctr}")) + return true; + } + return false; + }; + + // Determining the address of a TLS variable results in a function call in + // certain TLS models. + std::function memAddrUsesCTR = + [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool { + const auto *GV = dyn_cast(MemAddr); + if (!GV) { + // Recurse to check for constants that refer to TLS global variables. + if (const auto *CV = dyn_cast(MemAddr)) + for (const auto &CO : CV->operands()) + if (memAddrUsesCTR(CO)) + return true; + + return false; + } + + if (!GV->isThreadLocal()) + return false; + TLSModel::Model Model = TM.getTLSModel(GV); + return Model == TLSModel::GeneralDynamic || + Model == TLSModel::LocalDynamic; + }; + + auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) { + if (IntegerType *ITy = dyn_cast(Ty)) + return ITy->getBitWidth() > (Is32Bit ? 32U : 64U); + + return false; + }; + + for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); + J != JE; ++J) { + if (CallInst *CI = dyn_cast(J)) { + // Inline ASM is okay, unless it clobbers the ctr register. + if (InlineAsm *IA = dyn_cast(CI->getCalledValue())) { + if (asmClobbersCTR(IA)) + return true; + continue; + } + + if (Function *F = CI->getCalledFunction()) { + // Most intrinsics don't become function calls, but some might. + // sin, cos, exp and log are always calls. + unsigned Opcode = 0; + if (F->getIntrinsicID() != Intrinsic::not_intrinsic) { + switch (F->getIntrinsicID()) { + default: continue; + // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr + // we're definitely using CTR. 
+          case Intrinsic::set_loop_iterations:
+          case Intrinsic::loop_dec:
+            return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                       !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+          // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
+          case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:     Opcode = ISD::FROUND;     break;
+          case Intrinsic::minnum:    Opcode = ISD::FMINNUM;    break;
+          case Intrinsic::maxnum:    Opcode = ISD::FMAXNUM;    break;
+          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
+          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
+          }
+        }
+
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls.
+          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc_copysign:
+          case LibFunc_copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc_copysignl:
+            return true;
+          case LibFunc_fabs:
+          case LibFunc_fabsf:
+          case LibFunc_fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc_sqrt:
+          case LibFunc_sqrtf:
+          case LibFunc_sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc_floor:
+          case LibFunc_floorf:
+          case LibFunc_floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc_nearbyint:
+          case LibFunc_nearbyintf:
+          case LibFunc_nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc_ceil:
+          case LibFunc_ceilf:
+          case LibFunc_ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc_rint:
+          case LibFunc_rintf:
+          case LibFunc_rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc_round:
+          case LibFunc_roundf:
+          case LibFunc_roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc_trunc:
+          case LibFunc_truncf:
+          case LibFunc_truncl:
+            Opcode = ISD::FTRUNC; break;
+          case LibFunc_fmin:
+          case LibFunc_fminf:
+          case LibFunc_fminl:
+            Opcode = ISD::FMINNUM; break;
+          case LibFunc_fmax:
+          case LibFunc_fmaxf:
+          case LibFunc_fmaxl:
+            Opcode = ISD::FMAXNUM; break;
+          }
+        }
+
+        if (Opcode) {
+          EVT EVTy =
+              TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
+
+          if (EVTy == MVT::Other)
+            return true;
+
+          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
+            continue;
+          else if (EVTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
+            continue;
+
+          return true;
+        }
+      }
+
+      return true;
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
+        return true;
+    } else if (isLargeIntegerTy(!TM.isPPC64(),
+                                J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (!TM.isPPC64() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+        return true;
+    }
+
+    // FREM is always a call.
+    if (J->getOpcode() == Instruction::FRem)
+      return true;
+
+    if (ST->useSoftFloat()) {
+      switch(J->getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FCmp:
+        return true;
+      }
+    }
+
+    for (Value *Operand : J->operands())
+      if (memAddrUsesCTR(Operand))
+        return true;
+  }
+
+  return false;
+}
+
+bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+  TargetSchedModel SchedModel;
+  SchedModel.init(ST);
+
+  // Do not convert small short loops to CTR loop.
+  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
+  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+    SmallPtrSet<const Value *, 4> EphValues;
+    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+    CodeMetrics Metrics;
+    for (BasicBlock *BB : L->blocks())
+      Metrics.analyzeBasicBlock(BB, *this, EphValues);
+    // 6 is an approximate latency for the mtctr instruction.
+    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+      return false;
+  }
+
+  // We don't want to spill/restore the counter register, and so we don't
+  // want to use the counter register if the loop contains calls.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(*I, LibInfo))
+      return false;
+
+  SmallVector<BasicBlock *, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  // If there is an exit edge known to be frequently taken,
+  // we should not transform this loop.
+  for (auto &BB : ExitingBlocks) {
+    Instruction *TI = BB->getTerminator();
+    if (!TI) continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      uint64_t TrueWeight = 0, FalseWeight = 0;
+      if (!BI->isConditional() ||
+          !BI->extractProfMetadata(TrueWeight, FalseWeight))
+        continue;
+
+      // If the exit path is more frequent than the loop path,
+      // we return here without further analysis for this loop.
+      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+      if (( TrueIsExit && FalseWeight < TrueWeight) ||
+          (!TrueIsExit && FalseWeight > TrueWeight))
+        return false;
+    }
+  }
+
+  LLVMContext &C = L->getHeader()->getParent()->getParent()->getContext();
+  HWLoopInfo.CountType = TM.isPPC64() ?
+    Type::getInt64Ty(C) : Type::getInt32Ty(C);
+
+  return true;
+}
+
 void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   if (ST->getDarwinDirective() == PPC::DIR_A2) {
Index: test/CodeGen/PowerPC/ctrloop-intrin.ll
===================================================================
--- test/CodeGen/PowerPC/ctrloop-intrin.ll
+++ test/CodeGen/PowerPC/ctrloop-intrin.ll
@@ -263,7 +263,7 @@
   %8 = sub i64 0, %int_part_ptr.02534
   %scevgep5 = getelementptr i8, i8* %call109, i64 %8
   %scevgep56 = ptrtoint i8* %scevgep5 to i64
-  call void @llvm.ppc.mtctr.i64(i64 %scevgep56)
+  call void @llvm.set.loop.iterations.i64(i64 %scevgep56)
   br label %for.body.116

 for.cond.cleanup:       ; preds = %if.end.138, %if.end.105
@@ -298,8 +298,9 @@
   %conv134 = trunc i32 %add133 to i8
   %scevgep = getelementptr i8, i8* inttoptr (i64 -1 to i8*), i64 %call109.pn2
   store i8 %conv134, i8* %scevgep, align 1, !tbaa !10
-  %12 = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
-  br i1 %12, label %for.body.116, label %for.cond.cleanup.115
+  %12 = call i64 @llvm.loop.dec(i64 %scevgep56, i64 1)
+  %dec.cmp = icmp ne i64 %12, 0
+  br i1 %dec.cmp, label %for.body.116, label %for.cond.cleanup.115

 if.then.136:       ; preds = %for.cond.cleanup.115
   %incdec.ptr137 = getelementptr inbounds i8, i8* %int_part_ptr.0253, i64 -1
@@ -323,10 +324,10 @@
 declare i8* @memcpy(i8*, i8* nocapture readonly, i64) #1

 ; Function Attrs: nounwind
-declare void @llvm.ppc.mtctr.i64(i64) #0
+declare void @llvm.set.loop.iterations.i64(i64) #0

 ; Function Attrs: nounwind
-declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #0
+declare i64 @llvm.loop.dec(i64, i64) #0

 attributes #0 = { nounwind }
"use-soft-float"="false" } Index: test/CodeGen/PowerPC/ppc-passname.ll =================================================================== --- test/CodeGen/PowerPC/ppc-passname.ll +++ test/CodeGen/PowerPC/ppc-passname.ll @@ -1,15 +1,3 @@ -; Test pass name: ppc-ctr-loops. -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-CTR-LOOPS -; STOP-BEFORE-CTR-LOOPS-NOT: -ppc-ctr-loops -; STOP-BEFORE-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered. -; STOP-BEFORE-CTR-LOOPS-NOT: PowerPC CTR Loops - -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-CTR-LOOPS -; STOP-AFTER-CTR-LOOPS: -ppc-ctr-loops -; STOP-AFTER-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered. -; STOP-AFTER-CTR-LOOPS: PowerPC CTR Loops - - ; Test pass name: ppc-loop-preinc-prep. ; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-PREINC-PREP ; STOP-BEFORE-LOOP-PREINC-PREP-NOT: -ppc-loop-preinc-prep Index: test/CodeGen/Thumb2/mve-tailpred.ll =================================================================== --- /dev/null +++ test/CodeGen/Thumb2/mve-tailpred.ll @@ -0,0 +1,78 @@ +; RUN: opt -mtriple=thumbv8 -mcpu=cortex-a72 %s -arm-hardware-loops -dce -S -o - | FileCheck %s --check-prefix=OPT +; RUN: llc -mtriple=thumbv8 -mcpu=cortex-a72 %s -S -o - | FileCheck %s --check-prefix=LLC + +; CHECK-OPT-LABEL: mul_N +; CHECK-OPT: %0 = call i32 @llvm.arm.while.setup(i32 %N, i32 4) +; CHECK-OPT: br i1 %1, label %vector.ph, label %for.cond.cleanup + +; CHECK-OPT: vector.ph: +; CHECK-OPT: br label %vector.body + +; CHECK-OPT: vecctor.body: +; CHECK-OPT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-OPT: %2 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] +; CHECK-OPT: %3 = getelementptr inbounds i32, i32* %a, i32 %index +; CHECK-OPT: %4 = call <4 x i1> @llvm.arm.get.active.mask.4(i32 %2 +; CHECK-OPT: %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %5, i32 4, <4 x i1> %4, <4 x i32> undef) +; CHECK-OPT: %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %7, i32 4, <4 x i1> %4, <4 x i32> undef) +; CHECK-OPT: %8 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load +; CHECK-OPT: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %4) +; CHECK-OPT: %index.next = add i32 %index, 4 +; CHECK-OPT: %11 = call i32 @llvm.arm.loop.end(i32 %2, i32 4) +; CHECk-OPT: %12 = icmp ne i32 %11, 0 +; CHECK-OPT: br i1 %12, label %vector.body, label %for.cond.cleanup + +; CHECK-LLC-LABEL: mul_N +; CHECK-LLC:: wlstp.#4 lr, r3, .LBB0_3 +; CHECK-LLC: .LBB0_2: +; CHECK-LLC: vldrw q8, [r0] +; CHECK-LLC: vldrw q9, [r1] +; CHECK-LLC: adds r0, #16 +; CHECK-LLC: adds r1, #16 +; CHECK-LLC: adds r3, #4 +; CHECK-LLC: vmul.i32 q8, q9, q8 +; CHECK-LLC: vstrw q8, [r2] +; CHECK-LLC: adds r2, #16 +; CHECK-LLC: letp .LBB0_2 +; CHECK-LLC: b .LBB0_3 + +define dso_local arm_aapcs_vfpcc void @mul_N(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %0 = getelementptr inbounds i32, i32* %a, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  %6 = getelementptr inbounds i32, i32* %c, i32 %index
+  %7 = bitcast i32* %6 to <4 x i32>*
+  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %7, i32 4, <4 x i1> %1)
+  %index.next = add i32 %index, 4
+  %8 = icmp eq i32 %index.next, %n.vec
+  br i1 %8, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
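Note: the IR below is an illustrative sketch and not part of the patch. It shows the loop shape the generic HardwareLoops pass is expected to produce with the new llvm.set.loop.iterations / llvm.loop.dec intrinsics, following the same usage as the updated ctrloop-intrin.ll test above. The function name, block labels, and the i32 counter width are hypothetical.

; A counted loop after hardware-loop conversion (assumes %N > 0).
define void @hwloop_sketch(i32* %p, i32 %N) {
entry:
  ; The trip count is registered once in the preheader.
  call void @llvm.set.loop.iterations.i32(i32 %N)
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %count = phi i32 [ %N, %entry ], [ %count.next, %loop ]
  %addr = getelementptr inbounds i32, i32* %p, i32 %iv
  store i32 %iv, i32* %addr
  %iv.next = add i32 %iv, 1
  ; Decrement the remaining iteration count by 1; a non-zero result
  ; means another iteration should run.
  %count.next = call i32 @llvm.loop.dec(i32 %count, i32 1)
  %cmp = icmp ne i32 %count.next, 0
  br i1 %cmp, label %loop, label %exit

exit:
  ret void
}

declare void @llvm.set.loop.iterations.i32(i32)
declare i32 @llvm.loop.dec(i32, i32)

On PowerPC, the backend changes above are then expected to match llvm.set.loop.iterations to the MTCTRloop/MTCTR8loop patterns and the llvm.loop.dec/compare/branch sequence to the counter-based branch, as in the PPCISelLowering.cpp and .td hunks.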