diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -98,7 +98,7 @@ Loop *L = nullptr; BasicBlock *ExitBlock = nullptr; BranchInst *ExitBranch = nullptr; - const SCEV *ExitCount = nullptr; + const SCEV *TripCount = nullptr; IntegerType *CountType = nullptr; Value *LoopDecrement = nullptr; // Decrement the loop counter by this // value in every iteration. diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -104,6 +104,7 @@ SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); + const SCEV *ExitCount = nullptr; for (BasicBlock *BB : ExitingBlocks) { // If we pass the updated counter back through a phi, we need to know // which latch the updated value will be coming from. @@ -173,6 +174,35 @@ if (!ExitBlock) return false; + + assert(ExitCount->getType()->isIntegerTy() && + "Count type must be integer!\n"); + + assert((SE.getTypeSizeInBits(ExitCount->getType()) <= + CountType->getBitWidth()) && + "Invalid loop count type!\n"); + + // Check if ExitCount + 1 will overflow. + // Please note, if the ExitCount type is the same as CountType, we also treat + // it as not overflowing. Hardware count register will handle the count 0 + // case well. For example, on PowerPC, if the value in the Count Register is + // 0 before being decremented, it is -1 afterward. 
+ bool WillNotOverflow = + (SE.getTypeSizeInBits(ExitCount->getType()) == + CountType->getBitWidth()) || + SE.isLoopEntryGuardedByCond( + L, ICmpInst::ICMP_NE, + SE.getAddExpr(ExitCount, SE.getOne(ExitCount->getType())), + SE.getZero(ExitCount->getType())); + + TripCount = SE.getTripCountFromExitCount(ExitCount, !WillNotOverflow); + + assert((SE.getTypeSizeInBits(TripCount->getType()) <= + CountType->getBitWidth()) && + "Invalid loop trip count!\n"); + + TripCount = SE.getNoopOrZeroExtend(TripCount, CountType); + return true; } diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp --- a/llvm/lib/CodeGen/HardwareLoops.cpp +++ b/llvm/lib/CodeGen/HardwareLoops.cpp @@ -184,15 +184,12 @@ public: HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE, - const DataLayout &DL, - OptimizationRemarkEmitter *ORE) : - SE(SE), DL(DL), ORE(ORE), L(Info.L), M(L->getHeader()->getModule()), - ExitCount(Info.ExitCount), - CountType(Info.CountType), - ExitBranch(Info.ExitBranch), - LoopDecrement(Info.LoopDecrement), - UsePHICounter(Info.CounterInReg), - UseLoopGuard(Info.PerformEntryTest) { } + const DataLayout &DL, OptimizationRemarkEmitter *ORE) + : SE(SE), DL(DL), ORE(ORE), L(Info.L), M(L->getHeader()->getModule()), + TripCount(Info.TripCount), CountType(Info.CountType), + ExitBranch(Info.ExitBranch), LoopDecrement(Info.LoopDecrement), + UsePHICounter(Info.CounterInReg), + UseLoopGuard(Info.PerformEntryTest) {} void Create(); @@ -200,15 +197,15 @@ ScalarEvolution &SE; const DataLayout &DL; OptimizationRemarkEmitter *ORE = nullptr; - Loop *L = nullptr; - Module *M = nullptr; - const SCEV *ExitCount = nullptr; - Type *CountType = nullptr; - BranchInst *ExitBranch = nullptr; - Value *LoopDecrement = nullptr; - bool UsePHICounter = false; - bool UseLoopGuard = false; - BasicBlock *BeginBB = nullptr; + Loop *L = nullptr; + Module *M = nullptr; + const SCEV *TripCount = nullptr; + Type *CountType = nullptr; + BranchInst *ExitBranch = nullptr; + Value 
*LoopDecrement = nullptr; + bool UsePHICounter = false; + bool UseLoopGuard = false; + BasicBlock *BeginBB = nullptr; }; } @@ -296,7 +293,7 @@ } assert( - (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) && + (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.TripCount) && "Hardware Loop must have set exit info."); BasicBlock *Preheader = L->getLoopPreheader(); @@ -387,50 +384,37 @@ // loop counter and tests that is not zero? SCEVExpander SCEVE(SE, DL, "loopcnt"); - if (!ExitCount->getType()->isPointerTy() && - ExitCount->getType() != CountType) - ExitCount = SE.getZeroExtendExpr(ExitCount, CountType); - - ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType)); - - // If we're trying to use the 'test and set' form of the intrinsic, we need - // to replace a conditional branch that is controlling entry to the loop. It - // is likely (guaranteed?) that the preheader has an unconditional branch to - // the loop header, so also check if it has a single predecessor. - if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount, - SE.getZero(ExitCount->getType()))) { - LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n"); - UseLoopGuard |= ForceGuardLoopEntry; - } else - UseLoopGuard = false; + UseLoopGuard |= ForceGuardLoopEntry; + + // If we're trying to use the 'test and set' form of the intrinsic, it is + // likely (guaranteed?) that the preheader has an unconditional branch to the + // loop header, so also check if it has a single predecessor. 
BasicBlock *BB = L->getLoopPreheader(); if (UseLoopGuard && BB->getSinglePredecessor() && cast(BB->getTerminator())->isUnconditional()) { BasicBlock *Predecessor = BB->getSinglePredecessor(); // If it's not safe to create a while loop then don't force it and create a // do-while loop instead - if (!isSafeToExpandAt(ExitCount, Predecessor->getTerminator(), SE)) - UseLoopGuard = false; + if (!isSafeToExpandAt(TripCount, Predecessor->getTerminator(), SE)) + UseLoopGuard = false; else - BB = Predecessor; + BB = Predecessor; } - if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) { - LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount " - << *ExitCount << "\n"); + if (!isSafeToExpandAt(TripCount, BB->getTerminator(), SE)) { + LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand TripCount " << *TripCount + << "\n"); return nullptr; } - Value *Count = SCEVE.expandCodeFor(ExitCount, CountType, - BB->getTerminator()); + Value *Count = SCEVE.expandCodeFor(TripCount, CountType, BB->getTerminator()); // FIXME: We've expanded Count where we hope to insert the counter setting // intrinsic. But, in the case of the 'test and set' form, we may fallback to // the just 'set' form and in which case the insertion block is most likely // different. It means there will be instruction(s) in a block that possibly - // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue, - // but it's doesn't appear to work in all cases. + // aren't needed. UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count); BeginBB = UseLoopGuard ? 
BB : L->getLoopPreheader(); diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll @@ -20,10 +20,8 @@ ; CHECK-NEXT: blt cr0, .LBB0_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: addi r6, r3, 5 -; CHECK-NEXT: addi r3, r4, -1 +; CHECK-NEXT: clrldi r3, r4, 32 ; CHECK-NEXT: extsw r5, r5 -; CHECK-NEXT: clrldi r3, r3, 32 -; CHECK-NEXT: addi r3, r3, 1 ; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 5 @@ -90,9 +88,7 @@ ; CHECK-NEXT: extsw r5, r5 ; CHECK-NEXT: sub r3, r3, r5 ; CHECK-NEXT: addi r6, r3, 1000 -; CHECK-NEXT: addi r3, r4, -1 -; CHECK-NEXT: clrldi r3, r3, 32 -; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: clrldi r3, r4, 32 ; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -263,9 +263,7 @@ ; CHECK-NEXT: cmpwi r4, 1 ; CHECK-NEXT: blt cr0, .LBB3_5 ; CHECK-NEXT: .LBB3_3: # %for.body.preheader -; CHECK-NEXT: addi r3, r4, -1 -; CHECK-NEXT: clrldi r3, r3, 32 -; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: clrldi r3, r4, 32 ; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB3_4: # %for.body @@ -295,9 +293,7 @@ ; CHECK-BE-NEXT: cmpwi r4, 1 ; CHECK-BE-NEXT: blt cr0, .LBB3_5 ; CHECK-BE-NEXT: .LBB3_3: # %for.body.preheader -; CHECK-BE-NEXT: addi r3, r4, -1 -; CHECK-BE-NEXT: clrldi r3, r3, 32 -; CHECK-BE-NEXT: addi r3, r3, 1 +; CHECK-BE-NEXT: clrldi r3, r4, 32 ; CHECK-BE-NEXT: mtctr r3 ; CHECK-BE-NEXT: .p2align 4 ; CHECK-BE-NEXT: .LBB3_4: # %for.body diff --git a/llvm/test/Transforms/HardwareLoops/loop-guards.ll b/llvm/test/Transforms/HardwareLoops/loop-guards.ll --- 
a/llvm/test/Transforms/HardwareLoops/loop-guards.ll +++ b/llvm/test/Transforms/HardwareLoops/loop-guards.ll @@ -90,9 +90,9 @@ ; CHECK: entry: ; CHECK-LATCH: br i1 %brmerge.demorgan, label %while.cond ; CHECK-LATCH-NOT: @llvm{{.*}}loop.iterations +; CHECK-EXIT: [[COUNT:%[^ ]+]] = add i32 %N, 1 ; CHECK-EXIT: br i1 %brmerge.demorgan, label %while.cond.preheader ; CHECK-EXIT: while.cond.preheader: -; CHECK-EXIT: [[COUNT:%[^ ]+]] = add i32 %N, 1 ; CHECK-EXIT: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) ; CHECK-EXIT: br label %while.cond define void @test4(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { diff --git a/llvm/test/Transforms/HardwareLoops/scalar-while.ll b/llvm/test/Transforms/HardwareLoops/scalar-while.ll --- a/llvm/test/Transforms/HardwareLoops/scalar-while.ll +++ b/llvm/test/Transforms/HardwareLoops/scalar-while.ll @@ -290,10 +290,10 @@ ; CHECK-GUARD-LABEL: @while_gte( ; CHECK-GUARD-NEXT: entry: ; CHECK-GUARD-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]] -; CHECK-GUARD-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK-GUARD: while.body.preheader: ; CHECK-GUARD-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1 ; CHECK-GUARD-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]] +; CHECK-GUARD-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK-GUARD: while.body.preheader: ; CHECK-GUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP1]]) ; CHECK-GUARD-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-GUARD: while.body: @@ -309,10 +309,10 @@ ; CHECK-PHIGUARD-LABEL: @while_gte( ; CHECK-PHIGUARD-NEXT: entry: ; CHECK-PHIGUARD-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]] -; CHECK-PHIGUARD-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK-PHIGUARD: while.body.preheader: ; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1 ; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]] +; 
CHECK-PHIGUARD-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK-PHIGUARD: while.body.preheader: ; CHECK-PHIGUARD-NEXT: [[TMP2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP1]]) ; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHIGUARD: while.body: