Index: lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -149,7 +149,26 @@
                OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
                unsigned OuterTripMultiple, unsigned OuterLoopSize,
                unsigned InnerTripCount, unsigned InnerLoopSize,
                TargetTransformInfo::UnrollingPreferences &UP) {
-  // Check for explicit Count from the "unroll-and-jam-count" option.
+  // First, use computeUnrollCount from the loop unroller to get a count for
+  // unrolling the outer loop; any loops that require explicit unrolling are
+  // left to the unroller. This uses UP.Threshold / UP.PartialThreshold /
+  // UP.MaxCount to come up with sensible loop values.
+  // We have already checked that the loop has no unroll.* pragmas.
+  unsigned MaxTripCount = 0;
+  bool UseUpperBound = false;
+  bool ExplicitUnroll = computeUnrollCount(
+      L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
+      OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+  if (ExplicitUnroll || UseUpperBound) {
+    // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
+    // for the unroller instead.
+    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; explicit count set by "
+                         "computeUnrollCount\n");
+    UP.Count = 0;
+    return false;
+  }
+
+  // Override with any explicit Count from the "unroll-and-jam-count" option.
   bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
   if (UserUnrollCount) {
     UP.Count = UnrollAndJamCount;
@@ -174,80 +193,76 @@
     return true;
   }
 
-  // Use computeUnrollCount from the loop unroller to get a sensible count
-  // for the unrolling the outer loop. This uses UP.Threshold /
-  // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
-  // We have already checked that the loop has no unroll.* pragmas.
-  unsigned MaxTripCount = 0;
-  bool UseUpperBound = false;
-  bool ExplicitUnroll = computeUnrollCount(
-      L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
-      OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
-  if (ExplicitUnroll || UseUpperBound) {
-    // If the user explicitly set the loop as unrolled, dont UnJ it. Leave it
-    // for the unroller instead.
-    UP.Count = 0;
-    return false;
-  }
-
   bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
-  ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;
+  bool ExplicitUnrollAndJamCount = PragmaCount > 0 || UserUnrollCount;
+  bool ExplicitUnrollAndJam = PragmaEnableUnroll || ExplicitUnrollAndJamCount;
 
   // If the loop has an unrolling pragma, we want to be more aggressive with
   // unrolling limits.
-  if (ExplicitUnroll && OuterTripCount != 0)
+  if (ExplicitUnrollAndJam)
     UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
 
   if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
                                 UP.UnrollAndJamInnerLoopThreshold) {
+    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't create remainder and "
+                         "inner loop too large\n");
     UP.Count = 0;
     return false;
   }
 
+  // We have a sensible limit for the outer loop; now adjust it for the inner
+  // loop and UP.UnrollAndJamInnerLoopThreshold. If the outer limit was set
+  // explicitly, we want to stick to it.
+  if (!ExplicitUnrollAndJamCount && UP.AllowRemainder) {
+    while (UP.Count != 0 && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+                                UP.UnrollAndJamInnerLoopThreshold)
+      UP.Count--;
+  }
+
+  // If we are explicitly unroll and jamming, we are done. Otherwise there are a
+  // number of extra performance heuristics to check.
+  if (ExplicitUnrollAndJam)
+    return true;
+
   // If the inner loop count is known and small, leave the entire loop nest to
   // be the unroller
-  if (!ExplicitUnroll && InnerTripCount &&
-      InnerLoopSize * InnerTripCount < UP.Threshold) {
+  if (InnerTripCount && InnerLoopSize * InnerTripCount < UP.Threshold) {
+    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; small inner loop count is "
+                         "being left for the unroller\n");
     UP.Count = 0;
     return false;
   }
 
-  // We have a sensible limit for the outer loop, now adjust it for the inner
-  // loop and UP.UnrollAndJamInnerLoopThreshold.
-  while (UP.Count != 0 && UP.AllowRemainder &&
-         getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
-             UP.UnrollAndJamInnerLoopThreshold)
-    UP.Count--;
-
-  if (!ExplicitUnroll) {
-    // Check for situations where UnJ is likely to be unprofitable. Including
-    // subloops with more than 1 block.
-    if (SubLoop->getBlocks().size() != 1) {
-      UP.Count = 0;
-      return false;
-    }
+  // Check for situations where UnJ is likely to be unprofitable, including
+  // subloops with more than one block.
+  if (SubLoop->getBlocks().size() != 1) {
+    LLVM_DEBUG(
+        dbgs() << "Won't unroll-and-jam; more than one inner loop block\n");
+    UP.Count = 0;
+    return false;
+  }
 
-    // Limit to loops where there is something to gain from unrolling and
-    // jamming the loop. In this case, look for loads that are invariant in the
-    // outer loop and can become shared.
-    unsigned NumInvariant = 0;
-    for (BasicBlock *BB : SubLoop->getBlocks()) {
-      for (Instruction &I : *BB) {
-        if (auto *Ld = dyn_cast<LoadInst>(&I)) {
-          Value *V = Ld->getPointerOperand();
-          const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
-          if (SE.isLoopInvariant(LSCEV, L))
-            NumInvariant++;
-        }
+  // Limit to loops where there is something to gain from unrolling and
+  // jamming the loop. In this case, look for loads that are invariant in the
+  // outer loop and can become shared.
+  unsigned NumInvariant = 0;
+  for (BasicBlock *BB : SubLoop->getBlocks()) {
+    for (Instruction &I : *BB) {
+      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+        Value *V = Ld->getPointerOperand();
+        const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
+        if (SE.isLoopInvariant(LSCEV, L))
+          NumInvariant++;
       }
     }
-    if (NumInvariant == 0) {
-      UP.Count = 0;
-      return false;
-    }
+  }
+  if (NumInvariant == 0) {
+    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; no loop-invariant loads\n");
+    UP.Count = 0;
+    return false;
   }
 
-  return ExplicitUnroll;
+  return false;
 }
 
 static LoopUnrollResult

Index: test/Transforms/LoopUnrollAndJam/pragma-explicit.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopUnrollAndJam/pragma-explicit.ll
@@ -0,0 +1,144 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: function
+; The explicit metadata here should force this to be unroll and jammed 4 times
+; CHECK: %indvars.iv.3 = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next.3, %for.body4.us ]
+; CHECK-NOT: %indvars.iv.4 = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next.4, %for.body4.us ]
+define void @function(i8* noalias nocapture %dst, i32 %dst_stride, i8* noalias nocapture readonly %src, i32 %src_stride, i32 %A, i32 %B, i32 %C, i32 %D, i32 %width, i32 %height) {
+entry:
+  %idxprom = sext i32 %src_stride to i64
+  %cmp52 = icmp sgt i32 %height, 0
+  br i1 %cmp52, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp249 = icmp sgt i32 %width, 0
+  %idx.ext = sext i32 %dst_stride to i64
+  br i1 %cmp249, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.us.preheader:                 ; preds = %for.cond1.preheader.lr.ph
+  %.pre.pre = load i8, i8* %src, align 1
+  %wide.trip.count = zext i32 %width to i64
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
+  %.pre = phi i8 [ %.pre60, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %.pre.pre, %for.cond1.preheader.us.preheader ]
+  %srcp.056.us.pn = phi i8* [ %srcp.056.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src, %for.cond1.preheader.us.preheader ]
+  %y.055.us = phi i32 [ %inc30.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %dst.addr.054.us = phi i8* [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ]
+  %srcp.056.us = getelementptr inbounds i8, i8* %srcp.056.us.pn, i64 %idxprom
+  %.pre60 = load i8, i8* %srcp.056.us, align 1
+  br label %for.body4.us
+
+for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
+  %0 = phi i8 [ %.pre60, %for.cond1.preheader.us ], [ %3, %for.body4.us ]
+  %1 = phi i8 [ %.pre, %for.cond1.preheader.us ], [ %2, %for.body4.us ]
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.body4.us ]
+  %conv.us = zext i8 %1 to i32
+  %mul.us = mul nsw i32 %conv.us, %A
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx8.us = getelementptr inbounds i8, i8* %srcp.056.us.pn, i64 %indvars.iv.next
+  %2 = load i8, i8* %arrayidx8.us, align 1
+  %conv9.us = zext i8 %2 to i32
+  %mul10.us = mul nsw i32 %conv9.us, %B
+  %conv14.us = zext i8 %0 to i32
+  %mul15.us = mul nsw i32 %conv14.us, %C
+  %arrayidx19.us = getelementptr inbounds i8, i8* %srcp.056.us, i64 %indvars.iv.next
+  %3 = load i8, i8* %arrayidx19.us, align 1
+  %conv20.us = zext i8 %3 to i32
+  %mul21.us = mul nsw i32 %conv20.us, %D
+  %add11.us = add i32 %mul.us, 32
+  %add16.us = add i32 %add11.us, %mul10.us
+  %add22.us = add i32 %add16.us, %mul15.us
+  %add23.us = add i32 %add22.us, %mul21.us
+  %4 = lshr i32 %add23.us, 6
+  %conv24.us = trunc i32 %4 to i8
+  %arrayidx26.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %indvars.iv
+  store i8 %conv24.us, i8* %arrayidx26.us, align 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us
+  %add.ptr.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %idx.ext
+  %inc30.us = add nuw nsw i32 %y.055.us, 1
+  %exitcond58 = icmp eq i32 %inc30.us, %height
+  br i1 %exitcond58, label %for.cond.cleanup, label %for.cond1.preheader.us, !llvm.loop !5
+
+for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
+  ret void
+}
+
+; CHECK-LABEL: function2
+; The explicit metadata here should force this to be unroll and jammed, but
+; the count is left to the thresholds; in this case 2.
+; CHECK: %indvars.iv.1 = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next.1, %for.body4.us ]
+; CHECK-NOT: %indvars.iv.2 = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next.2, %for.body4.us ]
+define void @function2(i8* noalias nocapture %dst, i32 %dst_stride, i8* noalias nocapture readonly %src, i32 %src_stride, i32 %A, i32 %B, i32 %C, i32 %D, i32 %width, i32 %height) {
+entry:
+  %idxprom = sext i32 %src_stride to i64
+  %cmp52 = icmp sgt i32 %height, 0
+  br i1 %cmp52, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp249 = icmp sgt i32 %width, 0
+  %idx.ext = sext i32 %dst_stride to i64
+  br i1 %cmp249, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.us.preheader:                 ; preds = %for.cond1.preheader.lr.ph
+  %.pre.pre = load i8, i8* %src, align 1
+  %wide.trip.count = zext i32 %width to i64
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
+  %.pre = phi i8 [ %.pre60, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %.pre.pre, %for.cond1.preheader.us.preheader ]
+  %srcp.056.us.pn = phi i8* [ %srcp.056.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src, %for.cond1.preheader.us.preheader ]
+  %y.055.us = phi i32 [ %inc30.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %dst.addr.054.us = phi i8* [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ]
+  %srcp.056.us = getelementptr inbounds i8, i8* %srcp.056.us.pn, i64 %idxprom
+  %.pre60 = load i8, i8* %srcp.056.us, align 1
+  br label %for.body4.us
+
+for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
+  %0 = phi i8 [ %.pre60, %for.cond1.preheader.us ], [ %3, %for.body4.us ]
+  %1 = phi i8 [ %.pre, %for.cond1.preheader.us ], [ %2, %for.body4.us ]
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.body4.us ]
+  %conv.us = zext i8 %1 to i32
+  %mul.us = mul nsw i32 %conv.us, %A
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx8.us = getelementptr inbounds i8, i8* %srcp.056.us.pn, i64 %indvars.iv.next
+  %2 = load i8, i8* %arrayidx8.us, align 1
+  %conv9.us = zext i8 %2 to i32
+  %mul10.us = mul nsw i32 %conv9.us, %B
+  %conv14.us = zext i8 %0 to i32
+  %mul15.us = mul nsw i32 %conv14.us, %C
+  %arrayidx19.us = getelementptr inbounds i8, i8* %srcp.056.us, i64 %indvars.iv.next
+  %3 = load i8, i8* %arrayidx19.us, align 1
+  %conv20.us = zext i8 %3 to i32
+  %mul21.us = mul nsw i32 %conv20.us, %D
+  %add11.us = add i32 %mul.us, 32
+  %add16.us = add i32 %add11.us, %mul10.us
+  %add22.us = add i32 %add16.us, %mul15.us
+  %add23.us = add i32 %add22.us, %mul21.us
+  %4 = lshr i32 %add23.us, 6
+  %conv24.us = trunc i32 %4 to i8
+  %arrayidx26.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %indvars.iv
+  store i8 %conv24.us, i8* %arrayidx26.us, align 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us
+  %add.ptr.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %idx.ext
+  %inc30.us = add nuw nsw i32 %y.055.us, 1
+  %exitcond58 = icmp eq i32 %inc30.us, %height
+  br i1 %exitcond58, label %for.cond.cleanup, label %for.cond1.preheader.us, !llvm.loop !7
+
+for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
+  ret void
+}
+
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.unroll_and_jam.count", i32 4}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.unroll_and_jam.enable"}
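
For context, the llvm.loop.unroll_and_jam.count / .enable metadata that these
tests attach via !5..!8 is the form clang emits for source-level
unroll-and-jam pragmas. A minimal C sketch, assuming clang's
#pragma unroll_and_jam support (function and variable names here are
illustrative only, not taken from the patch):

    /* The outer loop carries an explicit unroll-and-jam request. Compiled
     * with clang, the outer loop's llvm.loop metadata would include
     * "llvm.loop.unroll_and_jam.count", i32 4, matching the !5/!6 metadata
     * checked in @function above. */
    void row_avg(unsigned char *dst, const unsigned char *src, int w, int h) {
    #pragma unroll_and_jam (4)
      for (int y = 0; y < h; y++)
        for (int x = 0; x + 1 < w; x++)
          dst[y * w + x] = (src[y * w + x] + src[y * w + x + 1]) >> 1;
    }

With a count pragma the new code takes the ExplicitUnrollAndJamCount path and
keeps the requested count; with a bare enable pragma (as in !7/!8) the count
is still derived from the thresholds, which is what @function2 checks.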