Index: llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -362,7 +362,7 @@ /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, + bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, AssumptionCache *AC) { SmallPtrSet EphValues; @@ -373,6 +373,7 @@ Metrics.analyzeBasicBlock(BB, TTI, EphValues); NumCalls = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; + Convergent = Metrics.convergent; unsigned LoopSize = Metrics.NumInsts; @@ -568,8 +569,9 @@ unsigned NumInlineCandidates; bool NotDuplicatable; - unsigned LoopSize = - ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, TTI, &AC); + bool Convergent; + unsigned LoopSize = ApproximateLoopSize( + L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); // When computing the unrolled size, note that the conditional branch on the @@ -623,6 +625,7 @@ if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) { AllowRuntime = false; } + bool DecreasedCountDueToConvergence = false; if (Unrolling == Partial) { bool AllowPartial = PragmaEnableUnroll || UP.Partial; if (!AllowPartial && !CountSetExplicitly) { @@ -643,14 +646,40 @@ << "-unroll-runtime not given\n"); return false; } + // Reduce unroll count to be the largest power-of-two factor of // the original count which satisfies the threshold limit. while (Count != 0 && UnrolledSize > UP.PartialThreshold) { Count >>= 1; UnrolledSize = (LoopSize-2) * Count + 2; } + if (Count > UP.MaxCount) Count = UP.MaxCount; + + // If the loop contains a convergent operation, the prelude we'd add + // to do the first few instructions before we hit the unrolled loop + // is unsafe -- it adds a control-flow dependency to the convergent + // operation. Therefore Count must divide TripMultiple. + // + // TODO: This is quite conservative. In practice, convergent_op() + // is likely to be called unconditionally in the loop. In this + // case, the program would be ill-formed (on most architectures) + // unless n were the same on all threads in a thread group. + // Assuming n is the same on all threads, any kind of unrolling is + // safe. But currently llvm's notion of convergence isn't powerful + // enough to express this. + unsigned OrigCount = Count; + while (Convergent && Count != 0 && TripMultiple % Count != 0) { + DecreasedCountDueToConvergence = true; + Count >>= 1; + } + if (OrigCount > Count) { + DEBUG(dbgs() << " loop contains a convergent instruction, so unroll " + "count must divide the trip multiple, " + << TripMultiple << ". Reducing unroll count from " + << OrigCount << " to " << Count << ".\n"); + } DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); } @@ -665,7 +694,16 @@ DebugLoc LoopLoc = L->getStartLoc(); Function *F = Header->getParent(); LLVMContext &Ctx = F->getContext(); - if ((PragmaCount > 0) && Count != OriginalCount) { + if (PragmaCount > 0 && DecreasedCountDueToConvergence) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + Twine("Unable to unroll loop the number of times directed by " + "unroll_count pragma because the loop contains a convergent " + "instruction, and so must have an unroll count that divides " + "the loop trip multiple of ") + + Twine(TripMultiple) + ". Unrolling instead " + Twine(Count) + + " time(s)."); + } else if ((PragmaCount > 0) && Count != OriginalCount) { emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, "Unable to unroll loop the number of times directed by " Index: llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp +++ llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp @@ -273,7 +273,23 @@ // flag is specified. bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime); - if (RuntimeTripCount && + // Loops containing convergent instructions must have a count that divides + // their TripMultiple. + DEBUG( + bool HasConvergent = false; + for (auto &BB : L->blocks()) + for (auto &I : *BB) + if (auto CS = CallSite(&I)) + HasConvergent |= CS.isConvergent(); + assert( + !HasConvergent || TripMultiple % Count == 0 && + "Unroll count must divide trip multiple if loop contains a convergent " + "operation."); + ); + // Don't output the runtime loop prolog if Count is a multiple of + // TripMultiple. Such a prolog is never needed, and is unsafe if the loop + // contains a convergent instruction. + if (RuntimeTripCount && TripMultiple % Count != 0 && !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT, PreserveLCSSA)) return false; Index: llvm/trunk/test/Transforms/LoopUnroll/convergent.ll =================================================================== --- llvm/trunk/test/Transforms/LoopUnroll/convergent.ll +++ llvm/trunk/test/Transforms/LoopUnroll/convergent.ll @@ -0,0 +1,83 @@ +; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-allow-partial -S | FileCheck %s + +declare void @f() convergent + +; Although this loop contains a convergent instruction, it should be +; fully unrolled. +; +; CHECK-LABEL: @full_unroll( +define i32 @full_unroll() { +entry: + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK-NOT: call void @f() + call void @f() ;convergent + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 3 + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction, but it should be partially +; unrolled. The unroll count is the largest power of 2 that divides the +; multiple -- 4, in this case. +; +; CHECK-LABEL: @runtime_unroll( +define i32 @runtime_unroll(i32 %n) { +entry: + %loop_ctl = mul nsw i32 %n, 12 + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK-NOT: call void @f() + call void @f() convergent + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction, so its partial unroll +; count must divide its trip multiple. This overrides its unroll +; pragma -- we unroll exactly 8 times, even though 16 is requested. +; CHECK-LABEL: @pragma_unroll +define i32 @pragma_unroll(i32 %n) { +entry: + %loop_ctl = mul nsw i32 %n, 24 + br label %l3, !llvm.loop !0 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK-NOT: call void @f() + call void @f() convergent + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3, !llvm.loop !0 + +exit: + ret i32 0 +} + +!0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}}