diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -270,6 +270,12 @@
     cl::desc(
         "Enable runtime interleaving until load/store ports are saturated"));
 
+/// Interleave small loops with scalar reductions.
+static cl::opt<bool> InterleaveSmallLoopScalarReduction(
+    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
+    cl::desc("Enable interleaving for loops with small iteration counts that "
+             "contain scalar reductions to expose ILP."));
+
 /// The number of stores in a loop that are allowed to need predication.
 static cl::opt<unsigned> NumberOfStoresToPredicate(
     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -5519,10 +5525,15 @@
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
 
-  // Do not interleave loops with a relatively small known or estimated trip
-  // count.
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
-  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
+  const bool HasReductions = !Legal->getReductionVars().empty();
+  // Do not interleave loops with a relatively small known or estimated trip
+  // count. However, if InterleaveSmallLoopScalarReduction is enabled and the
+  // loop has scalar reductions (HasReductions && VF == 1), we still
+  // interleave, because in that case interleaving can expose ILP and break
+  // the cross-iteration dependences of the reductions.
+  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
+      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
     return 1;
 
   RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5550,7 +5561,7 @@
     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                       << " registers of "
                       << TTI.getRegisterClassName(pair.first) << " register class\n");
-    if (VF == 1) {
+    if (VF.isScalar()) {
       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
         TargetNumRegisters = ForceTargetNumScalarRegs;
     } else {
@@ -5579,7 +5590,7 @@
       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
 
   // Check if the user has overridden the max.
-  if (VF == 1) {
+  if (VF.isScalar()) {
     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
   } else {
@@ -5610,7 +5621,7 @@
 
   // Interleave if we vectorized this loop and there is a reduction that could
   // benefit from interleaving.
-  if (VF.isVector() && !Legal->getReductionVars().empty()) {
+  if (VF.isVector() && HasReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
     return IC;
   }
@@ -5622,7 +5633,11 @@
 
   // We want to interleave small loops in order to reduce the loop overhead and
   // potentially expose ILP opportunities.
-  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
+                    << "LV: IC is " << IC << '\n'
+                    << "LV: VF is " << VF.getKnownMinValue() << '\n');
+  const bool AggressivelyInterleaveReductions =
+      TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
     // We assume that the cost overhead is 1 and we use the cost model
     // to estimate the cost of the loop and interleave until the cost of the
@@ -5641,7 +5656,7 @@
     // by this point), we can increase the critical path length if the loop
     // we're interleaving is inside another loop. Limit, by default to 2, so the
     // critical path only gets increased by one reduction operation.
-    if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
+    if (HasReductions && TheLoop->getLoopDepth() > 1) {
       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
       SmallIC = std::min(SmallIC, F);
       StoresIC = std::min(StoresIC, F);
@@ -5655,14 +5670,23 @@
       return std::max(StoresIC, LoadsIC);
     }
 
-    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
-    return SmallIC;
+    // If there are scalar reductions and TTI has enabled aggressive
+    // interleaving for reductions, we will interleave to expose ILP.
+    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
+        AggressivelyInterleaveReductions) {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+      // Interleave by no less than SmallIC, but not as aggressively as the
+      // normal IC, to cover the rare case where resources are too limited.
+      return std::max(IC / 2, SmallIC);
+    } else {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+      return SmallIC;
+    }
   }
 
   // Interleave if this is a large loop (small loops are already dealt with by
   // this point) that could benefit from interleaving.
-  bool HasReductions = !Legal->getReductionVars().empty();
-  if (TTI.enableAggressiveInterleaving(HasReductions)) {
+  if (AggressivelyInterleaveReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
     return IC;
   }
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -loop-vectorize -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+
+; CHECK-LABEL: vector.body
+; CHECK: load double, double*
+; CHECK-NEXT: load double, double*
+; CHECK-NEXT: load double, double*
+; CHECK-NEXT: load double, double*
+
+; CHECK: fmul fast double
+; CHECK-NEXT: fmul fast double
+; CHECK-NEXT: fmul fast double
+; CHECK-NEXT: fmul fast double
+
+; CHECK: fadd fast double
+; CHECK-NEXT: fadd fast double
+; CHECK-NEXT: fadd fast double
+; CHECK-NEXT: fadd fast double
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+define dso_local void @test(i32*** %arg, double** %arg1) align 2 {
+bb:
+  %tpm15 = load i32**, i32*** %arg, align 8
+  %tpm19 = load double*, double** %arg1, align 8
+  br label %bb22
+bb22:                                             ; preds = %bb33, %bb
+  %tpm26 = add i64 0, 1
+  %tpm10 = alloca i32, align 8
+  %tpm27 = getelementptr inbounds i32, i32* %tpm10, i64 %tpm26
+  %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 0
+  %tpm29 = load i32*, i32** %tpm28, align 8
+  %tpm17 = alloca double, align 8
+  %tpm32 = getelementptr inbounds double, double* %tpm17, i64 %tpm26
+  br label %bb40
+bb33:                                             ; preds = %bb40
+  %tpm35 = getelementptr inbounds double, double* %tpm19, i64 0
+  %tpm37 = fsub fast double 0.000000e+00, %tpm50
+  store double %tpm37, double* %tpm35, align 8
+  br label %bb22
+bb40:                                             ; preds = %bb40, %bb22
+  %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ]
+  %tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ]
+  %tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ]
+  %tpm44 = load double, double* %tpm42, align 8
+  %tpm45 = load i32, i32* %tpm41, align 4
+  %tpm46 = zext i32 %tpm45 to i64
+  %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46
+  %tpm48 = load double, double* %tpm47, align 8
+  %tpm49 = fmul fast double %tpm48, %tpm44
+  %tpm50 = fadd fast double %tpm49, %tpm43
+  %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1
+  %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1
+  %tpm53 = icmp eq i32* %tpm51, %tpm29
+  br i1 %tpm53, label %bb33, label %bb40
+}
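
For context, here is a minimal sketch (not part of the patch) of the kind of source loop the new -interleave-small-loop-scalar-reduction flag targets: a small-trip-count inner loop with a scalar floating-point reduction that stays at VF == 1, for example because of an indirect load like the one in the IR test above. The function and variable names below are illustrative only.

// Illustrative only; assumes fast-math so the fadd reduction may be reassociated.
double sparse_dot(const double *a, const double *b, const int *idx, int n) {
  double sum = 0.0;
  for (int i = 0; i < n; ++i)   // n is small (below the tiny trip-count threshold)
    sum += a[i] * b[idx[i]];    // scalar reduction; the indirect load keeps VF == 1
  return sum;
}

With the flag enabled, the cost model above may choose an interleave count greater than one for this scalar loop, giving each interleaved copy its own accumulator. That removes the single cross-iteration dependence on the accumulator and exposes ILP, which is what the PowerPC test checks for with its groups of four consecutive loads, fmuls, and fadds.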