Add a "test and set" form of the hardware-loop iteration-count intrinsic.

This patch adds int_test_set_loop_iterations (returns an i1 that is used as
the loop-entry branch condition), the PerformEntryTest flag in
TargetTransformInfo.h, and the hidden -force-hardware-loop-guard option in
HardwareLoops.cpp, together with FileCheck coverage under the CHECK-GUARD
prefix.

NOTE(review): the line structure of this patch was reconstructed from a
whitespace-collapsed copy. Hunk line counts match the @@ headers, but the
indentation of context lines and the whitespace-only change on the
LoopDecrement member could not be recovered -- confirm against the tree
before applying.

NOTE(review): CanGenerateTest only checks that the preheader's single
predecessor ends in a conditional branch; the branch condition is then
overwritten without checking that it compared the trip count against zero.
In not_rotated the branch in bb3 tests %tmp == %arg, yet CHECK-GUARD expects
it to be replaced by a test of %arg + 1 -- please confirm this is intended.

Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -93,6 +93,9 @@
                                     // another hardware loop?
   bool CounterInReg = false;        // Should loop counter be updated in
                                     // the loop via a phi?
+  bool PerformEntryTest = false;    // Generate the intrinsic which also performs
+                                    // icmp ne zero on the loop counter value and
+                                    // produces an i1 to guard the loop entry.
   bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
                                DominatorTree &DT, bool ForceNestedLoop = false,
                                bool ForceHardwareLoopPHI = false);
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -1190,6 +1190,11 @@
 def int_set_loop_iterations :
   Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
 
+// Specify that the value given is the number of iterations that the next loop
+// will execute. Also test that the given count is not zero.
+def int_test_set_loop_iterations :
+  Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>;
+
 // Decrement loop counter by the given argument. Return false if the loop
 // should exit.
 def int_loop_decrement :
Index: lib/CodeGen/HardwareLoops.cpp
===================================================================
--- lib/CodeGen/HardwareLoops.cpp
+++ lib/CodeGen/HardwareLoops.cpp
@@ -70,6 +70,11 @@
 CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
                 cl::desc("Set the loop counter bitwidth"));
 
+static cl::opt<bool>
+ForceGuardLoopEntry(
+  "force-hardware-loop-guard", cl::Hidden, cl::init(false),
+  cl::desc("Force generation of loop guard intrinsic"));
+
 STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
 
 namespace {
@@ -146,7 +151,8 @@
       CountType(Info.CountType),
       ExitBranch(Info.ExitBranch),
       LoopDecrement(Info.LoopDecrement),
-      UsePHICounter(Info.CounterInReg) { }
+      UsePHICounter(Info.CounterInReg),
+      UseLoopGuard(Info.PerformEntryTest) { }
 
     void Create();
 
@@ -158,8 +164,9 @@
     const SCEV *ExitCount = nullptr;
     Type *CountType = nullptr;
     BranchInst *ExitBranch = nullptr;
-    Value *LoopDecrement = nullptr;
+    Value *LoopDecrement = nullptr;
     bool UsePHICounter = false;
+    bool UseLoopGuard = false;
   };
 }
 
@@ -254,7 +261,26 @@
 
 void HardwareLoop::Create() {
   LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
-  BasicBlock *BeginBB = L->getLoopPreheader();
+
+  // Can we replace a conditional branch with an intrinsic that sets the
+  // loop counter and tests that it is not zero?
+  auto CanGenerateTest = [](Loop *L) {
+    BasicBlock *Preheader = L->getLoopPreheader();
+    if (!Preheader->getSinglePredecessor())
+      return false;
+
+    BasicBlock *Pred = Preheader->getSinglePredecessor();
+    if (!isa<BranchInst>(Pred->getTerminator()))
+      return false;
+
+    return cast<BranchInst>(Pred->getTerminator())->isConditional();
+  };
+
+  UseLoopGuard = (UseLoopGuard || ForceGuardLoopEntry) && CanGenerateTest(L);
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *BeginBB = UseLoopGuard ?
+    Preheader->getSinglePredecessor() : Preheader;
+
   Value *LoopCountInit = InitLoopCount(BeginBB);
   if (!LoopCountInit)
     return;
@@ -299,9 +325,21 @@
                                         BasicBlock *BB) {
   IRBuilder<> Builder(BB->getTerminator());
   Type *Ty = LoopCountInit->getType();
-  Function *LoopIter =
-    Intrinsic::getDeclaration(M, Intrinsic::set_loop_iterations, Ty);
-  Builder.CreateCall(LoopIter, LoopCountInit);
+  Intrinsic::ID ID = UseLoopGuard ?
+    Intrinsic::test_set_loop_iterations : Intrinsic::set_loop_iterations;
+  Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
+  Value *SetCount = Builder.CreateCall(LoopIter, LoopCountInit);
+
+  // Use the return value of the intrinsic to control the entry of the loop.
+  if (UseLoopGuard) {
+    assert((isa<BranchInst>(BB->getTerminator()) &&
+            cast<BranchInst>(BB->getTerminator())->isConditional()) &&
+           "Expected conditional branch");
+    auto *LoopGuard = cast<BranchInst>(BB->getTerminator());
+    LoopGuard->setCondition(SetCount);
+    if (LoopGuard->getSuccessor(0) != L->getLoopPreheader())
+      LoopGuard->swapSuccessors();
+  }
 }
 
 void HardwareLoop::InsertLoopDec() {
Index: test/Transforms/HardwareLoops/scalar-while.ll
===================================================================
--- test/Transforms/HardwareLoops/scalar-while.ll
+++ test/Transforms/HardwareLoops/scalar-while.ll
@@ -1,6 +1,8 @@
 ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC
 ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-REGDEC
 ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC --check-prefix=CHECK-NESTED
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK-GUARD
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK-GUARD
 
 ; CHECK-LABEL: while_lt
 define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) {
@@ -8,6 +10,14 @@
   %cmp4 = icmp ult i32 %i, %N
   br i1 %cmp4, label %while.body, label %while.end
 
+; CHECK-GUARD-LABEL: while_lt
+; CHECK-GUARD: entry:
+; CHECK-GUARD: [[COUNT:%[^ ]+]] = sub i32 %N, %i
+; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD: br label %while.body
+
 ; CHECK: while.body.preheader:
 ; CHECK: [[COUNT:%[^ ]+]] = sub i32 %N, %i
 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
@@ -64,6 +74,17 @@
   ret void
 }
 
+; CHECK-GUARD-LABEL: while_gte
+; CHECK-GUARD: entry:
+; CHECK-GUARD: [[ADD:%[^ ]+]] = add i32 %i, 1
+; CHECK-GUARD: [[SEL:%[^ ]+]] = icmp slt i32 %N, %i
+; CHECK-GUARD: [[MIN:%[^ ]+]] = select i1 [[SEL]], i32 %N, i32 %i
+; CHECK-GUARD: [[COUNT:%[^ ]+]] = sub i32 [[ADD]], [[MIN]]
+; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD: br label %while.body
+
 ; CHECK-LABEL: while_gte
 ; CHECK: while.body.preheader:
 ; CHECK: [[ADD:%[^ ]+]] = add i32 %i, 1
@@ -98,6 +119,80 @@
   ret void
 }
 
+; CHECK-GUARD-LABEL: while_ne
+; CHECK-GUARD: entry:
+; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD: br label %while.body
+define void @while_ne(i32 %N, i32* nocapture %A) {
+entry:
+  %cmp = icmp ne i32 %N, 0
+  br i1 %cmp, label %while.body, label %while.end
+
+while.body:
+  %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+  store i32 %i.addr.05, i32* %arrayidx, align 4
+  %inc = add nuw i32 %i.addr.05, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+; CHECK-GUARD-LABEL: while_eq
+; CHECK-GUARD: entry:
+; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD: br label %while.body
+define void @while_eq(i32 %N, i32* nocapture %A) {
+entry:
+  %cmp = icmp eq i32 %N, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:
+  %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+  store i32 %i.addr.05, i32* %arrayidx, align 4
+  %inc = add nuw i32 %i.addr.05, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+; CHECK-GUARD-LABEL: while_preheader_eq
+; CHECK-GUARD: entry:
+; CHECK-GUARD: br label %preheader
+; CHECK-GUARD: preheader:
+; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD: br label %while.body
+define void @while_preheader_eq(i32 %N, i32* nocapture %A) {
+entry:
+  br label %preheader
+
+preheader:
+  %cmp = icmp eq i32 %N, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:
+  %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+  store i32 %i.addr.05, i32* %arrayidx, align 4
+  %inc = add nuw i32 %i.addr.05, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
 ; CHECK-LABEL: nested
 ; CHECK-NESTED: call void @llvm.set.loop.iterations.i32(i32 %N)
 ; CHECK-NESTED: br label %while.cond1.preheader.us
@@ -115,6 +210,10 @@
 ; CHECK-NESTED: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
 ; CHECK-NESTED: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7
 
+; CHECK-GUARD: while.cond1.preheader.us:
+; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-GUARD: br label %while.body3.us
+
 define void @nested(i32* nocapture %A, i32 %N) {
 entry:
   %cmp20 = icmp eq i32 %N, 0
Index: test/Transforms/HardwareLoops/unconditional-latch.ll
===================================================================
--- test/Transforms/HardwareLoops/unconditional-latch.ll
+++ test/Transforms/HardwareLoops/unconditional-latch.ll
@@ -1,46 +1,58 @@
 ; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -hardware-loops -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALLOW
+; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -hardware-loops -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-GUARD
 ; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -hardware-loops -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LATCH
 
 ; CHECK-LABEL: not_rotated
 ; CHECK-LATCH-NOT: call void @llvm.set.loop.iterations
 ; CHECK-LATCH-NOT: call i1 @llvm.loop.decrement
 
-; CHECK-ALLOW: call void @llvm.set.loop.iterations.i32(i32 %4)
-; CHECK-ALLOW: br label %10
+; CHECK-ALLOW: bb:
+; CHECK-ALLOW: [[COUNT:%[^ ]+]] = add i32 %arg, 1
+; CHECK-ALLOW: br label %bb3
+; CHECK-ALLOW: bb5:
+; CHECK-ALLOW: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-ALLOW: br label %bb7
+
+; CHECK-GUARD: bb:
+; CHECK-GUARD: [[COUNT:%[^ ]+]] = add i32 %arg, 1
+; CHECK-GUARD: br label %bb3
+; CHECK-GUARD: bb3:
+; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-GUARD: br i1 [[TEST]], label %bb5, label %bb18
 
 ; CHECK-ALLOW: [[CMP:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
-; CHECK-ALLOW: br i1 [[CMP]], label %13, label %19
-
-define void @not_rotated(i32, i16* nocapture, i16 signext) {
-  br label %4
-
-4:
-  %5 = phi i32 [ 0, %3 ], [ %19, %18 ]
-  %6 = icmp eq i32 %5, %0
-  br i1 %6, label %20, label %7
-
-7:
-  %8 = mul i32 %5, %0
-  br label %9
-
-9:
-  %10 = phi i32 [ %17, %12 ], [ 0, %7 ]
-  %11 = icmp eq i32 %10, %0
-  br i1 %11, label %18, label %12
-
-12:
-  %13 = add i32 %10, %8
-  %14 = getelementptr inbounds i16, i16* %1, i32 %13
-  %15 = load i16, i16* %14, align 2
-  %16 = add i16 %15, %2
-  store i16 %16, i16* %14, align 2
-  %17 = add i32 %10, 1
-  br label %9
-
-18:
-  %19 = add i32 %5, 1
-  br label %4
-
-20:
+; CHECK-ALLOW: br i1 [[CMP]], label %bb10, label %bb16
+define void @not_rotated(i32 %arg, i16* nocapture %arg1, i16 signext %arg2) {
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb16, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp17, %bb16 ]
+  %tmp4 = icmp eq i32 %tmp, %arg
+  br i1 %tmp4, label %bb18, label %bb5
+
+bb5:                                              ; preds = %bb3
+  %tmp6 = mul i32 %tmp, %arg
+  br label %bb7
+
+bb7:                                              ; preds = %bb10, %bb5
+  %tmp8 = phi i32 [ %tmp15, %bb10 ], [ 0, %bb5 ]
+  %tmp9 = icmp eq i32 %tmp8, %arg
+  br i1 %tmp9, label %bb16, label %bb10
+
+bb10:                                             ; preds = %bb7
+  %tmp11 = add i32 %tmp8, %tmp6
+  %tmp12 = getelementptr inbounds i16, i16* %arg1, i32 %tmp11
+  %tmp13 = load i16, i16* %tmp12, align 2
+  %tmp14 = add i16 %tmp13, %arg2
+  store i16 %tmp14, i16* %tmp12, align 2
+  %tmp15 = add i32 %tmp8, 1
+  br label %bb7
+
+bb16:                                             ; preds = %bb7
+  %tmp17 = add i32 %tmp, 1
+  br label %bb3
+
+bb18:                                             ; preds = %bb3
   ret void
 }