Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -93,6 +93,9 @@ // another hardware loop? bool CounterInReg = false; // Should loop counter be updated in // the loop via a phi? + bool PerformEntryTest = false; // Generate the intrinsic which also performs + // icmp ne zero on the loop counter value and + // produces an i1 to guard the loop entry. bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop = false, bool ForceHardwareLoopPHI = false); Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -1190,6 +1190,12 @@ def int_set_loop_iterations : Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>; +// Specify that the value given is the number of iterations that the next loop +// will execute. Also test that the given count is not zero, allowing it to +// control entry to a 'while' loop. +def int_test_set_loop_iterations : + Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>; + // Decrement loop counter by the given argument. Return false if the loop // should exit. 
def int_loop_decrement : Index: lib/CodeGen/HardwareLoops.cpp =================================================================== --- lib/CodeGen/HardwareLoops.cpp +++ lib/CodeGen/HardwareLoops.cpp @@ -68,6 +68,11 @@ CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32), cl::desc("Set the loop counter bitwidth")); +static cl::opt +ForceGuardLoopEntry( + "force-hardware-loop-guard", cl::Hidden, cl::init(false), + cl::desc("Force generation of loop guard intrinsic")); + STATISTIC(NumHWLoops, "Number of loops converted to hardware loops"); namespace { @@ -116,10 +121,10 @@ class HardwareLoop { // Expand the trip count scev into a value that we can use. - Value *InitLoopCount(BasicBlock *BB); + Value *InitLoopCount(); // Insert the set_loop_iteration intrinsic. - void InsertIterationSetup(Value *LoopCountInit, BasicBlock *BB); + void InsertIterationSetup(Value *LoopCountInit); // Insert the loop_decrement intrinsic. void InsertLoopDec(); @@ -144,7 +149,8 @@ CountType(Info.CountType), ExitBranch(Info.ExitBranch), LoopDecrement(Info.LoopDecrement), - UsePHICounter(Info.CounterInReg) { } + UsePHICounter(Info.CounterInReg), + UseLoopGuard(Info.PerformEntryTest) { } void Create(); @@ -156,8 +162,10 @@ const SCEV *ExitCount = nullptr; Type *CountType = nullptr; BranchInst *ExitBranch = nullptr; - Value *LoopDecrement = nullptr; + Value *LoopDecrement = nullptr; bool UsePHICounter = false; + bool UseLoopGuard = false; + BasicBlock *BeginBB = nullptr; }; } @@ -249,12 +257,12 @@ void HardwareLoop::Create() { LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n"); - BasicBlock *BeginBB = L->getLoopPreheader(); - Value *LoopCountInit = InitLoopCount(BeginBB); + + Value *LoopCountInit = InitLoopCount(); if (!LoopCountInit) return; - InsertIterationSetup(LoopCountInit, BeginBB); + InsertIterationSetup(LoopCountInit); if (UsePHICounter || ForceHardwareLoopPHI) { Instruction *LoopDec = InsertLoopRegDec(LoopCountInit); @@ -270,7 +278,46 @@ DeleteDeadPHIs(I); } 
-Value *HardwareLoop::InitLoopCount(BasicBlock *BB) { +static bool CanGenerateTest(Loop *L, Value *Count) { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader->getSinglePredecessor()) + return false; + + BasicBlock *Pred = Preheader->getSinglePredecessor(); + if (!isa(Pred->getTerminator())) + return false; + + auto *BI = cast(Pred->getTerminator()); + if (BI->isUnconditional() || !isa(BI->getCondition())) + return false; + + // Check that the icmp is checking for equality of Count and zero and that + // a non-zero value results in entering the loop. + auto ICmp = cast(BI->getCondition()); + if (!ICmp->isEquality()) + return false; + + auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) { + if (auto *Const = dyn_cast(ICmp->getOperand(OpIdx))) + return Const->isZero() && ICmp->getOperand(OpIdx ^ 1) == Count; + return false; + }; + + if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1)) + return false; + + unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1; + if (BI->getSuccessor(SuccIdx) != Preheader) + return false; + + return true; +} + +Value *HardwareLoop::InitLoopCount() { + LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n"); + // Can we replace a conditional branch with an intrinsic that sets the + // loop counter and tests that it is not zero? + SCEVExpander SCEVE(SE, DL, "loopcnt"); if (!ExitCount->getType()->isPointerTy() && ExitCount->getType() != CountType) @@ -278,25 +325,67 @@ ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType)); + // If we're trying to use the 'test and set' form of the intrinsic, we need + // to replace a conditional branch that is controlling entry to the loop. It + // is likely (guaranteed?) that the preheader has an unconditional branch to + // the loop header, so also check if it has a single predecessor. 
+ if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount, + SE.getZero(ExitCount->getType()))) { + LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n"); + UseLoopGuard |= ForceGuardLoopEntry; + } else + UseLoopGuard = false; + + BasicBlock *BB = L->getLoopPreheader(); + if (UseLoopGuard && BB->getSinglePredecessor() && + cast(BB->getTerminator())->isUnconditional()) + BB = BB->getSinglePredecessor(); + if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) { - LLVM_DEBUG(dbgs() << "HWLoops: Bailing, unsafe to expand ExitCount " + LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount " << *ExitCount << "\n"); return nullptr; } Value *Count = SCEVE.expandCodeFor(ExitCount, CountType, BB->getTerminator()); - LLVM_DEBUG(dbgs() << "HWLoops: Loop Count: " << *Count << "\n"); + + // FIXME: We've expanded Count where we hope to insert the counter setting + // intrinsic. But, in the case of the 'test and set' form, we may fall back to + // just the 'set' form, in which case the insertion block is most likely + // different. It means there will be instruction(s) in a block that possibly + // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue, + // but it doesn't appear to work in all cases. + + UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count); + BeginBB = UseLoopGuard ? 
BB : L->getLoopPreheader(); + LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n" + << " - Expanded Count in " << BB->getName() << "\n" + << " - Will insert set counter intrinsic into: " + << BeginBB->getName() << "\n"); return Count; } -void HardwareLoop::InsertIterationSetup(Value *LoopCountInit, - BasicBlock *BB) { - IRBuilder<> Builder(BB->getTerminator()); +void HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { + IRBuilder<> Builder(BeginBB->getTerminator()); Type *Ty = LoopCountInit->getType(); - Function *LoopIter = - Intrinsic::getDeclaration(M, Intrinsic::set_loop_iterations, Ty); - Builder.CreateCall(LoopIter, LoopCountInit); + Intrinsic::ID ID = UseLoopGuard ? + Intrinsic::test_set_loop_iterations : Intrinsic::set_loop_iterations; + Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty); + Value *SetCount = Builder.CreateCall(LoopIter, LoopCountInit); + + // Use the return value of the intrinsic to control the entry of the loop. + if (UseLoopGuard) { + assert((isa(BeginBB->getTerminator()) && + cast(BeginBB->getTerminator())->isConditional()) && + "Expected conditional branch"); + auto *LoopGuard = cast(BeginBB->getTerminator()); + LoopGuard->setCondition(SetCount); + if (LoopGuard->getSuccessor(0) != L->getLoopPreheader()) + LoopGuard->swapSuccessors(); + } + LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " + << *SetCount << "\n"); } void HardwareLoop::InsertLoopDec() { Index: test/Transforms/HardwareLoops/loop-guards.ll =================================================================== --- /dev/null +++ test/Transforms/HardwareLoops/loop-guards.ll @@ -0,0 +1,339 @@ +; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-guard=true -S %s -o - | FileCheck %s + +; CHECK-LABEL: test1 +; CHECK: entry: +; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %N, 2 +; CHECK: [[MAX:%[^ ]+]] = select i1 [[CMP]], i32 %N, i32 2 +; CHECK: [[COUNT:%[^ ]+]] = add i32 [[MAX]], 
-1 +; CHECK: br i1 %t1, label %do.body.preheader +; CHECK: do.body.preheader: +; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK: br label %do.body +define void @test1(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + br i1 %t1, label %do.body, label %if.end + +do.body: ; preds = %do.body, %entry + %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ] + %a.addr.0 = phi i32* [ %incdec.ptr1, %do.body ], [ %a, %entry ] + %i.0 = phi i32 [ %inc, %do.body ], [ 1, %entry ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1 + %tmp = load i32, i32* %b.addr.0, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + store i32 %tmp, i32* %a.addr.0, align 4 + %inc = add nuw i32 %i.0, 1 + %cmp = icmp ult i32 %inc, %N + br i1 %cmp, label %do.body, label %if.end + +if.end: ; preds = %do.body, %entry + ret void +} + +; CHECK-LABEL: test2 +; CHECK-NOT: call i1 @llvm.test.set.loop.iterations +; CHECK-NOT: call void @llvm.set.loop.iterations +define void @test2(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + br i1 %t1, label %do.body, label %if.end + +do.body: ; preds = %do.body, %entry + %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ] + %a.addr.0 = phi i32* [ %incdec.ptr1, %do.body ], [ %a, %entry ] + %i.0 = phi i32 [ %add, %do.body ], [ 1, %entry ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1 + %tmp = load i32, i32* %b.addr.0, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + store i32 %tmp, i32* %a.addr.0, align 4 + %add = add i32 %i.0, 2 + %cmp = icmp ult i32 %add, %N + br i1 %cmp, label %do.body, label %if.end + +if.end: ; preds = %do.body, %entry + ret void +} + +; CHECK-LABEL: test3 +; CHECK: entry: +; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %N, 1 +; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %N, i32 1 +; CHECK: br i1 %brmerge.demorgan, label %do.body.preheader +; CHECK: 
do.body.preheader: +; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK: br label %do.body +define void @test3(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %brmerge.demorgan = and i1 %t1, %t2 + br i1 %brmerge.demorgan, label %do.body, label %if.end + +do.body: ; preds = %do.body, %entry + %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ] + %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %entry ] + %i.0 = phi i32 [ %inc, %do.body ], [ 0, %entry ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1 + %tmp = load i32, i32* %b.addr.0, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + store i32 %tmp, i32* %a.addr.0, align 4 + %inc = add nuw i32 %i.0, 1 + %cmp = icmp ult i32 %inc, %N + br i1 %cmp, label %do.body, label %if.end + +if.end: ; preds = %do.body, %entry + ret void +} + +; CHECK-LABEL: test4 +; CHECK: entry: +; CHECK: br i1 %brmerge.demorgan, label %while.cond.preheader +; CHECK: while.cond.preheader: +; CHECK: [[COUNT:%[^ ]+]] = add i32 %N, 1 +; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK: br label %while.cond +define void @test4(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %brmerge.demorgan = and i1 %t1, %t2 + br i1 %brmerge.demorgan, label %while.cond, label %if.end + +while.cond: ; preds = %while.body, %entry + %b.addr.0 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %entry ] + %a.addr.0 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %entry ] + %i.0 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %exitcond = icmp eq i32 %i.0, %N + br i1 %exitcond, label %if.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1 + %tmp = load i32, i32* %b.addr.0, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + store i32 %tmp, i32* %a.addr.0, align 4 + %inc = add 
i32 %i.0, 1 + br label %while.cond + +if.end: ; preds = %while.cond, %entry + ret void +} + +; CHECK-LABEL: test5 +; CHECK: entry: +; CHECK: br i1 %or.cond, label %while.body.preheader +; CHECK: while.body.preheader: +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: br label %while.body +define void @test5(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %brmerge.demorgan = and i1 %t1, %t2 + %cmp6 = icmp ne i32 %N, 0 + %or.cond = and i1 %brmerge.demorgan, %cmp6 + br i1 %or.cond, label %while.body, label %if.end + +while.body: ; preds = %while.body, %entry + %i.09 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %entry ] + %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %entry ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1 + %tmp = load i32, i32* %b.addr.07, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1 + store i32 %tmp, i32* %a.addr.08, align 4 + %inc = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %if.end, label %while.body + +if.end: ; preds = %while.body, %entry + ret void +} + +; CHECK-LABEL: test6 +; CHECK: entry: +; CHECK: br i1 %brmerge.demorgan, label %while.preheader +; CHECK: while.preheader: +; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) +; CHECK: br i1 [[TEST]], label %while.body.preheader, label %if.end +; CHECK: while.body.preheader: +; CHECK: br label %while.body +; CHECK: while.body: +; CHECK: [[DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1) +; CHECK: br i1 [[DEC]], label %while.body, label %if.end +define void @test6(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %brmerge.demorgan = and i1 %t1, %t2 + br i1 %brmerge.demorgan, label %while.preheader, label %if.end + +while.preheader: ; preds = %entry + %cmp = icmp ne i32 %N, 0 + br i1 %cmp, 
label %while.body, label %if.end + +while.body: ; preds = %while.body, %while.preheader + %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ] + %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ] + %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1 + %tmp = load i32, i32* %b.addr.07, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1 + store i32 %tmp, i32* %a.addr.08, align 4 + %inc = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %if.end, label %while.body + +if.end: ; preds = %while.body, %while.preheader, %entry + ret void +} + +; CHECK-LABEL: test7 +; CHECK: entry: +; CHECK: br i1 %brmerge.demorgan, label %while.preheader +; CHECK: while.preheader: +; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) +; CHECK: br i1 [[TEST]], label %while.body.preheader, label %if.end +; CHECK: while.body.preheader: +; CHECK: br label %while.body +define void @test7(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %brmerge.demorgan = and i1 %t1, %t2 + br i1 %brmerge.demorgan, label %while.preheader, label %if.end + +while.preheader: ; preds = %entry + %cmp = icmp eq i32 %N, 0 + br i1 %cmp, label %if.end, label %while.body + +while.body: ; preds = %while.body, %while.preheader + %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ] + %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ] + %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1 + %tmp = load i32, i32* %b.addr.07, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1 + store i32 %tmp, i32* %a.addr.08, align 4 + %inc = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %if.end, label 
%while.body + +if.end: ; preds = %while.body, %while.preheader, %entry + ret void +} + +; TODO: Can we rearrange the conditional blocks so that we can use the test form? +; CHECK-LABEL: test8 +; CHECK: entry: +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 %N, 0 +; CHECK: br i1 [[CMP]], label %while.preheader +; CHECK: while.preheader: +; CHECK: br i1 %brmerge.demorgan, label %while.body.preheader +; CHECK: while.body.preheader: +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: br label %while.body +; CHECK: while.body: +; CHECK: [[DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1) +; CHECK: br i1 [[DEC]], label %while.body, label %if.end +define void @test8(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %cmp = icmp ne i32 %N, 0 + br i1 %cmp, label %while.preheader, label %if.end + +while.preheader: ; preds = %entry + %brmerge.demorgan = and i1 %t1, %t2 + br i1 %brmerge.demorgan, label %while.body, label %if.end + +while.body: ; preds = %while.body, %while.preheader + %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ] + %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ] + %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1 + %tmp = load i32, i32* %b.addr.07, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1 + store i32 %tmp, i32* %a.addr.08, align 4 + %inc = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %if.end, label %while.body + +if.end: ; preds = %while.body, %while.preheader, %entry + ret void +} + +; CHECK-LABEL: test9 +; CHECK: entry: +; CHECK: br i1 %brmerge.demorgan, label %do.body.preheader +; CHECK: do.body.preheader: +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: br label %do.body +define void @test9(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: 
+ %cmp = icmp ne i32 %N, 0 + %brmerge.demorgan = and i1 %t1, %cmp + br i1 %brmerge.demorgan, label %do.body, label %if.end + +do.body: ; preds = %do.body, %entry + %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ] + %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %entry ] + %i.0 = phi i32 [ %inc, %do.body ], [ 0, %entry ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1 + %tmp = load i32, i32* %b.addr.0, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + store i32 %tmp, i32* %a.addr.0, align 4 + %inc = add nuw i32 %i.0, 1 + %cmp.1 = icmp ult i32 %inc, %N + br i1 %cmp.1, label %do.body, label %if.end + +if.end: ; preds = %do.body, %entry + ret void +} + +; CHECK-LABEL: test10 +; CHECK: entry: +; CHECK: br i1 %cmp.1, label %do.body.preheader +; CHECK: do.body.preheader: +; CHECK: call void @llvm.set.loop.iterations.i32(i32 +; CHECK: br label %do.body +define void @test10(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %cmp = icmp ne i32 %N, 0 + %sub = sub i32 %N, 1 + %be = select i1 %cmp, i32 0, i32 %sub + %cmp.1 = icmp ne i32 %be, 0 + br i1 %cmp.1, label %do.body, label %if.end + +do.body: ; preds = %do.body, %entry + %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ] + %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %entry ] + %i.0 = phi i32 [ %inc, %do.body ], [ 0, %entry ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1 + %tmp = load i32, i32* %b.addr.0, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + store i32 %tmp, i32* %a.addr.0, align 4 + %inc = add nuw i32 %i.0, 1 + %cmp.2 = icmp ult i32 %inc, %N + br i1 %cmp.2, label %do.body, label %if.end + +if.end: ; preds = %do.body, %entry + ret void +} + +; CHECK-LABEL: test11 +; CHECK: entry: +; CHECK: br label %do.body.preheader +; CHECK: do.body.preheader: +; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) +; CHECK: br i1 [[TEST]], label 
%do.body.preheader1, label %if.end +; CHECK: do.body.preheader1: +; CHECK: br label %do.body +define void @test11(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + br label %do.body.preheader + +do.body.preheader: + %cmp = icmp ne i32 %N, 0 + br i1 %cmp, label %do.body, label %if.end + +do.body: + %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %do.body.preheader ] + %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %do.body.preheader ] + %i.0 = phi i32 [ %inc, %do.body ], [ 0, %do.body.preheader ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1 + %tmp = load i32, i32* %b.addr.0, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + store i32 %tmp, i32* %a.addr.0, align 4 + %inc = add nuw i32 %i.0, 1 + %cmp.1 = icmp ult i32 %inc, %N + br i1 %cmp.1, label %do.body, label %if.end + +if.end: ; preds = %do.body, %entry + ret void +} Index: test/Transforms/HardwareLoops/scalar-while.ll =================================================================== --- test/Transforms/HardwareLoops/scalar-while.ll +++ test/Transforms/HardwareLoops/scalar-while.ll @@ -1,6 +1,8 @@ ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-REGDEC ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC --check-prefix=CHECK-NESTED +; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-guard=true -S %s -o - | FileCheck %s 
--check-prefix=CHECK-GUARD +; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK-GUARD ; CHECK-LABEL: while_lt define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) { @@ -8,6 +10,11 @@ %cmp4 = icmp ult i32 %i, %N br i1 %cmp4, label %while.body, label %while.end +; CHECK-GUARD-LABEL: while_lt +; CHECK-GUARD: [[COUNT:%[^ ]+]] = sub i32 %N, %i +; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-GUARD: br label %while.body + ; CHECK: while.body.preheader: ; CHECK: [[COUNT:%[^ ]+]] = sub i32 %N, %i ; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) @@ -64,6 +71,17 @@ ret void } +; CHECK-GUARD-LABEL: while_gte +; CHECK-GUARD: entry: +; CHECK-GUARD: br i1 %cmp4, label %while.end, label %while.body.preheader +; CHECK-GUARD: while.body.preheader: +; CHECK-GUARD: [[ADD:%[^ ]+]] = add i32 %i, 1 +; CHECK-GUARD: [[SEL:%[^ ]+]] = icmp slt i32 %N, %i +; CHECK-GUARD: [[MIN:%[^ ]+]] = select i1 [[SEL]], i32 %N, i32 %i +; CHECK-GUARD: [[COUNT:%[^ ]+]] = sub i32 [[ADD]], [[MIN]] +; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-GUARD: br label %while.body + ; CHECK-LABEL: while_gte ; CHECK: while.body.preheader: ; CHECK: [[ADD:%[^ ]+]] = add i32 %i, 1 @@ -98,6 +116,80 @@ ret void } +; CHECK-GUARD-LABEL: while_ne +; CHECK-GUARD: entry: +; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) +; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end +; CHECK-GUARD: while.body.preheader: +; CHECK-GUARD: br label %while.body +define void @while_ne(i32 %N, i32* nocapture %A) { +entry: + %cmp = icmp ne i32 %N, 0 + br i1 %cmp, label %while.body, label %while.end + +while.body: + %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05 + 
store i32 %i.addr.05, i32* %arrayidx, align 4 + %inc = add nuw i32 %i.addr.05, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %while.end, label %while.body + +while.end: + ret void +} + +; CHECK-GUARD-LABEL: while_eq +; CHECK-GUARD: entry: +; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) +; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end +; CHECK-GUARD: while.body.preheader: +; CHECK-GUARD: br label %while.body +define void @while_eq(i32 %N, i32* nocapture %A) { +entry: + %cmp = icmp eq i32 %N, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: + %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05 + store i32 %i.addr.05, i32* %arrayidx, align 4 + %inc = add nuw i32 %i.addr.05, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %while.end, label %while.body + +while.end: + ret void +} + +; CHECK-GUARD-LABEL: while_preheader_eq +; CHECK-GUARD: entry: +; CHECK-GUARD: br label %preheader +; CHECK-GUARD: preheader: +; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) +; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end +; CHECK-GUARD: while.body.preheader: +; CHECK-GUARD: br label %while.body +define void @while_preheader_eq(i32 %N, i32* nocapture %A) { +entry: + br label %preheader + +preheader: + %cmp = icmp eq i32 %N, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: + %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05 + store i32 %i.addr.05, i32* %arrayidx, align 4 + %inc = add nuw i32 %i.addr.05, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %while.end, label %while.body + +while.end: + ret void +} + ; CHECK-LABEL: nested ; CHECK-NESTED: call void @llvm.set.loop.iterations.i32(i32 %N) ; CHECK-NESTED: br label 
%while.cond1.preheader.us @@ -115,6 +207,10 @@ ; CHECK-NESTED: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1) ; CHECK-NESTED: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7 +; CHECK-GUARD: while.cond1.preheader.us: +; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK-GUARD: br label %while.body3.us + define void @nested(i32* nocapture %A, i32 %N) { entry: %cmp20 = icmp eq i32 %N, 0 Index: test/Transforms/HardwareLoops/unconditional-latch.ll =================================================================== --- test/Transforms/HardwareLoops/unconditional-latch.ll +++ test/Transforms/HardwareLoops/unconditional-latch.ll @@ -1,46 +1,51 @@ ; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -hardware-loops -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALLOW +; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -hardware-loops -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALLOW ; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -hardware-loops -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LATCH ; CHECK-LABEL: not_rotated ; CHECK-LATCH-NOT: call void @llvm.set.loop.iterations ; CHECK-LATCH-NOT: call i1 @llvm.loop.decrement -; CHECK-ALLOW: call void @llvm.set.loop.iterations.i32(i32 %4) -; CHECK-ALLOW: br label %10 +; CHECK-ALLOW: bb: +; CHECK-ALLOW: [[COUNT:%[^ ]+]] = add i32 %arg, 1 +; CHECK-ALLOW: br label %bb3 +; CHECK-ALLOW: bb5: +; CHECK-ALLOW: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-ALLOW: br label %bb7 ; CHECK-ALLOW: [[CMP:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1) -; CHECK-ALLOW: br i1 [[CMP]], label %13, label %19 - -define void @not_rotated(i32, i16* nocapture, i16 signext) { - br label %4 - -4: - %5 = 
phi i32 [ 0, %3 ], [ %19, %18 ] - %6 = icmp eq i32 %5, %0 - br i1 %6, label %20, label %7 - -7: - %8 = mul i32 %5, %0 - br label %9 - -9: - %10 = phi i32 [ %17, %12 ], [ 0, %7 ] - %11 = icmp eq i32 %10, %0 - br i1 %11, label %18, label %12 - -12: - %13 = add i32 %10, %8 - %14 = getelementptr inbounds i16, i16* %1, i32 %13 - %15 = load i16, i16* %14, align 2 - %16 = add i16 %15, %2 - store i16 %16, i16* %14, align 2 - %17 = add i32 %10, 1 - br label %9 - -18: - %19 = add i32 %5, 1 - br label %4 - -20: +; CHECK-ALLOW: br i1 [[CMP]], label %bb10, label %bb16 +define void @not_rotated(i32 %arg, i16* nocapture %arg1, i16 signext %arg2) { +bb: + br label %bb3 + +bb3: ; preds = %bb16, %bb + %tmp = phi i32 [ 0, %bb ], [ %tmp17, %bb16 ] + %tmp4 = icmp eq i32 %tmp, %arg + br i1 %tmp4, label %bb18, label %bb5 + +bb5: ; preds = %bb3 + %tmp6 = mul i32 %tmp, %arg + br label %bb7 + +bb7: ; preds = %bb10, %bb5 + %tmp8 = phi i32 [ %tmp15, %bb10 ], [ 0, %bb5 ] + %tmp9 = icmp eq i32 %tmp8, %arg + br i1 %tmp9, label %bb16, label %bb10 + +bb10: ; preds = %bb7 + %tmp11 = add i32 %tmp8, %tmp6 + %tmp12 = getelementptr inbounds i16, i16* %arg1, i32 %tmp11 + %tmp13 = load i16, i16* %tmp12, align 2 + %tmp14 = add i16 %tmp13, %arg2 + store i16 %tmp14, i16* %tmp12, align 2 + %tmp15 = add i32 %tmp8, 1 + br label %bb7 + +bb16: ; preds = %bb7 + %tmp17 = add i32 %tmp, 1 + br label %bb3 + +bb18: ; preds = %bb3 ret void }