[HardwareLoops] Build the trip count in isHardwareLoopCandidate

Summary of the change carried by this patch:

* HardwareLoopInfo no longer exposes the raw ExitCount; it exposes TripCount,
  i.e. ExitCount + 1 expressed in CountType. ExitCount becomes a local of
  HardwareLoopInfo::isHardwareLoopCandidate.
* isHardwareLoopCandidate takes a new, defaulted ForceGuardLoopEntry
  parameter, so existing callers keep compiling unchanged.
* The zero-extend / +1 construction and the isLoopEntryGuardedByCond check
  move out of HardwareLoops.cpp into isHardwareLoopCandidate. When the loop
  entry is guarded by (ExitCount + 1) != 0 and a zero extend is required, the
  +1 is applied in the narrow type before extending. Otherwise the previous
  order (extend, then +1) is kept and PerformEntryTest is cleared.
* The PowerPC tests are updated accordingly: the `addi -1` / `clrldi` /
  `addi 1` sequence before `mtctr` becomes a single `clrldi`.

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -98,7 +98,7 @@
   Loop *L = nullptr;
   BasicBlock *ExitBlock = nullptr;
   BranchInst *ExitBranch = nullptr;
-  const SCEV *ExitCount = nullptr;
+  const SCEV *TripCount = nullptr;
   IntegerType *CountType = nullptr;
   Value *LoopDecrement = nullptr; // Decrement the loop counter by this
                                   // value in every iteration.
@@ -111,7 +111,8 @@
                                   // produces an i1 to guard the loop entry.
   bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
                                DominatorTree &DT, bool ForceNestedLoop = false,
-                               bool ForceHardwareLoopPHI = false);
+                               bool ForceHardwareLoopPHI = false,
+                               bool ForceGuardLoopEntry = false);
   bool canAnalyze(LoopInfo &LI);
 };
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -100,10 +100,12 @@
 bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE,
                                                LoopInfo &LI, DominatorTree &DT,
                                                bool ForceNestedLoop,
-                                               bool ForceHardwareLoopPHI) {
+                                               bool ForceHardwareLoopPHI,
+                                               bool ForceGuardLoopEntry) {
   SmallVector<BasicBlock *, 4> ExitingBlocks;
   L->getExitingBlocks(ExitingBlocks);
 
+  const SCEV *ExitCount = nullptr;
   for (BasicBlock *BB : ExitingBlocks) {
     // If we pass the updated counter back through a phi, we need to know
     // which latch the updated value will be coming from.
@@ -173,6 +175,35 @@
 
   if (!ExitBlock)
     return false;
+
+  assert(ExitCount->getType()->isIntegerTy() &&
+         "Count type must be integer!\n");
+
+  assert((SE.getTypeSizeInBits(ExitCount->getType()) <=
+          CountType->getBitWidth()) &&
+         "Invalid loop count type!\n");
+
+  // If we can prove that ExitCount + 1 doesn't overflow (i.e. can't be zero):
+  // 1: we can use the 'test and set' form of the loop count.
+  // 2: if a zero extend is needed, we can do the +1 before the extend so
+  //    that more constants can be folded.
+  if (SE.isLoopEntryGuardedByCond(
+          L, ICmpInst::ICMP_NE,
+          SE.getAddExpr(ExitCount, SE.getOne(ExitCount->getType())),
+          SE.getZero(ExitCount->getType()))) {
+    if (ExitCount->getType() != CountType)
+      TripCount = SE.getZeroExtendExpr(
+          SE.getAddExpr(ExitCount, SE.getOne(ExitCount->getType())), CountType);
+    else
+      TripCount = SE.getAddExpr(SE.getNoopOrZeroExtend(ExitCount, CountType),
+                                SE.getOne(CountType));
+    PerformEntryTest |= ForceGuardLoopEntry;
+  } else {
+    TripCount = SE.getAddExpr(SE.getNoopOrZeroExtend(ExitCount, CountType),
+                              SE.getOne(CountType));
+    PerformEntryTest = false;
+  }
+
   return true;
 }
 
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -184,15 +184,12 @@
 
   public:
     HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE,
-                 const DataLayout &DL,
-                 OptimizationRemarkEmitter *ORE) :
-      SE(SE), DL(DL), ORE(ORE), L(Info.L), M(L->getHeader()->getModule()),
-      ExitCount(Info.ExitCount),
-      CountType(Info.CountType),
-      ExitBranch(Info.ExitBranch),
-      LoopDecrement(Info.LoopDecrement),
-      UsePHICounter(Info.CounterInReg),
-      UseLoopGuard(Info.PerformEntryTest) { }
+                 const DataLayout &DL, OptimizationRemarkEmitter *ORE)
+        : SE(SE), DL(DL), ORE(ORE), L(Info.L), M(L->getHeader()->getModule()),
+          TripCount(Info.TripCount), CountType(Info.CountType),
+          ExitBranch(Info.ExitBranch), LoopDecrement(Info.LoopDecrement),
+          UsePHICounter(Info.CounterInReg),
+          UseLoopGuard(Info.PerformEntryTest) {}
 
     void Create();
 
@@ -200,15 +197,15 @@
     ScalarEvolution &SE;
     const DataLayout &DL;
     OptimizationRemarkEmitter *ORE = nullptr;
-    Loop *L                 = nullptr;
-    Module *M               = nullptr;
-    const SCEV *ExitCount   = nullptr;
-    Type *CountType         = nullptr;
-    BranchInst *ExitBranch  = nullptr;
-    Value *LoopDecrement    = nullptr;
-    bool UsePHICounter      = false;
-    bool UseLoopGuard       = false;
-    BasicBlock *BeginBB     = nullptr;
+    Loop *L = nullptr;
+    Module *M = nullptr;
+    const SCEV *TripCount = nullptr;
+    Type *CountType = nullptr;
+    BranchInst *ExitBranch = nullptr;
+    Value *LoopDecrement = nullptr;
+    bool UsePHICounter = false;
+    bool UseLoopGuard = false;
+    BasicBlock *BeginBB = nullptr;
   };
 }
 
@@ -287,7 +284,8 @@
   LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
 
   if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT, ForceNestedLoop,
-                                          ForceHardwareLoopPHI)) {
+                                          ForceHardwareLoopPHI,
+                                          ForceGuardLoopEntry)) {
     // TODO: there can be many reasons a loop is not considered a
     // candidate, so we should let isHardwareLoopCandidate fill in the
     // reason and then report a better message here.
@@ -296,7 +294,7 @@
   }
 
   assert(
-      (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) &&
+      (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.TripCount) &&
       "Hardware Loop must have set exit info.");
 
   BasicBlock *Preheader = L->getLoopPreheader();
@@ -381,43 +379,29 @@
   // loop counter and tests that is not zero?
 
   SCEVExpander SCEVE(SE, DL, "loopcnt");
-  if (!ExitCount->getType()->isPointerTy() &&
-      ExitCount->getType() != CountType)
-    ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
-
-  ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
-
-  // If we're trying to use the 'test and set' form of the intrinsic, we need
-  // to replace a conditional branch that is controlling entry to the loop. It
-  // is likely (guaranteed?) that the preheader has an unconditional branch to
-  // the loop header, so also check if it has a single predecessor.
-  if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount,
-                                  SE.getZero(ExitCount->getType()))) {
-    LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n");
-    UseLoopGuard |= ForceGuardLoopEntry;
-  } else
-    UseLoopGuard = false;
+  // If we're trying to use the 'test and set' form of the intrinsic, it is
+  // likely (guaranteed?) that the preheader has an unconditional branch to the
+  // loop header, so also check if it has a single predecessor.
 
   BasicBlock *BB = L->getLoopPreheader();
   if (UseLoopGuard && BB->getSinglePredecessor() &&
       cast<BranchInst>(BB->getTerminator())->isUnconditional()) {
     BasicBlock *Predecessor = BB->getSinglePredecessor();
     // If it's not safe to create a while loop then don't force it and create a
     // do-while loop instead
-    if (!isSafeToExpandAt(ExitCount, Predecessor->getTerminator(), SE))
-        UseLoopGuard = false;
+    if (!isSafeToExpandAt(TripCount, Predecessor->getTerminator(), SE))
+      UseLoopGuard = false;
     else
-        BB = Predecessor;
+      BB = Predecessor;
   }
 
-  if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) {
-    LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
-               << *ExitCount << "\n");
+  if (!isSafeToExpandAt(TripCount, BB->getTerminator(), SE)) {
+    LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand TripCount " << *TripCount
+                      << "\n");
     return nullptr;
   }
 
-  Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
-                                     BB->getTerminator());
+  Value *Count = SCEVE.expandCodeFor(TripCount, CountType, BB->getTerminator());
 
   // FIXME: We've expanded Count where we hope to insert the counter setting
   // intrinsic. But, in the case of the 'test and set' form, we may fallback to
diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll
--- a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll
@@ -20,10 +20,8 @@
 ; CHECK-NEXT:    blt cr0, .LBB0_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    addi r6, r3, 5
-; CHECK-NEXT:    addi r3, r4, -1
+; CHECK-NEXT:    clrldi r3, r4, 32
 ; CHECK-NEXT:    extsw r5, r5
-; CHECK-NEXT:    clrldi r3, r3, 32
-; CHECK-NEXT:    addi r3, r3, 1
 ; CHECK-NEXT:    mtctr r3
 ; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    .p2align 5
@@ -87,23 +85,21 @@
 ; CHECK-NEXT:    cmpwi r4, 1
 ; CHECK-NEXT:    blt cr0, .LBB1_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    addi r4, r4, -1
-; CHECK-NEXT:    addi r3, r3, 1000
+; CHECK-NEXT:    addi r6, r3, 1000
+; CHECK-NEXT:    clrldi r3, r4, 32
 ; CHECK-NEXT:    extsw r5, r5
-; CHECK-NEXT:    li r6, 0
-; CHECK-NEXT:    clrldi r4, r4, 32
-; CHECK-NEXT:    addi r4, r4, 1
-; CHECK-NEXT:    mtctr r4
 ; CHECK-NEXT:    li r4, 0
+; CHECK-NEXT:    mtctr r3
+; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_2: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lbzx r7, r3, r6
-; CHECK-NEXT:    add r6, r6, r5
-; CHECK-NEXT:    add r4, r7, r4
+; CHECK-NEXT:    lbzx r7, r6, r4
+; CHECK-NEXT:    add r4, r4, r5
+; CHECK-NEXT:    add r3, r7, r3
 ; CHECK-NEXT:    bdnz .LBB1_2
 ; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
-; CHECK-NEXT:    clrldi r3, r4, 56
+; CHECK-NEXT:    clrldi r3, r3, 56
 ; CHECK-NEXT:    blr
 ; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    li r3, 0
diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll
--- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll
@@ -263,9 +263,7 @@
 ; CHECK-NEXT:    cmpwi r4, 1
 ; CHECK-NEXT:    blt cr0, .LBB3_5
 ; CHECK-NEXT:  .LBB3_3: # %for.body.preheader
-; CHECK-NEXT:    addi r3, r4, -1
-; CHECK-NEXT:    clrldi r3, r3, 32
-; CHECK-NEXT:    addi r3, r3, 1
+; CHECK-NEXT:    clrldi r3, r4, 32
 ; CHECK-NEXT:    mtctr r3
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB3_4: # %for.body
@@ -295,9 +293,7 @@
 ; CHECK-BE-NEXT:    cmpwi r4, 1
 ; CHECK-BE-NEXT:    blt cr0, .LBB3_5
 ; CHECK-BE-NEXT:  .LBB3_3: # %for.body.preheader
-; CHECK-BE-NEXT:    addi r3, r4, -1
-; CHECK-BE-NEXT:    clrldi r3, r3, 32
-; CHECK-BE-NEXT:    addi r3, r3, 1
+; CHECK-BE-NEXT:    clrldi r3, r4, 32
 ; CHECK-BE-NEXT:    mtctr r3
 ; CHECK-BE-NEXT:    .p2align 4
 ; CHECK-BE-NEXT:  .LBB3_4: # %for.body