diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1169,6 +1169,18 @@
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
+  /// with factor VF. Return the cost of the instruction, including
+  /// scalarization overhead if it's needed.
+  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
+
+  /// Estimate cost of a call instruction CI if it were vectorized with factor
+  /// VF. Return the cost of the instruction, including scalarization overhead
+  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
+  /// scalarized -
+  /// i.e. either vector version isn't available, or is too expensive.
+  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1221,6 +1233,10 @@
   /// element)
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
+  /// Estimate the overhead of scalarizing an instruction. This is a
+  /// convenience wrapper for the type-based getScalarizationOverhead API.
+  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1315,6 +1331,28 @@
 
   DecisionList WideningDecisions;
 
+  /// Returns true if \p V is expected to be vectorized and it needs to be
+  /// extracted.
+  bool needsExtract(Value *V, unsigned VF) const {
+    // Assume we can vectorize V (and hence we need extraction) if the
+    // scalars are not computed yet. This can happen, because it is called
+    // via getScalarizationOverhead from setCostBasedWideningDecision, before
+    // the scalars are collected.
+    if (VF > 1 && Scalars.find(VF) == Scalars.end())
+      return true;
+
+    Instruction *I = dyn_cast<Instruction>(V);
+    return I && TheLoop->contains(I) && !TheLoop->isLoopInvariant(I) &&
+           !isScalarAfterVectorization(I, VF);
+  };
+
+  /// Returns a range containing only operands needing to be extracted.
+  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+                                                   unsigned VF) {
+    return SmallVector<Value *, 4>(make_filter_range(
+        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+  }
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -3057,45 +3095,9 @@
   }
 }
 
-/// Estimate the overhead of scalarizing an instruction. This is a
-/// convenience wrapper for the type-based getScalarizationOverhead API.
-static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
-                                         const TargetTransformInfo &TTI) {
-  if (VF == 1)
-    return 0;
-
-  unsigned Cost = 0;
-  Type *RetTy = ToVectorTy(I->getType(), VF);
-  if (!RetTy->isVoidTy() &&
-      (!isa<LoadInst>(I) ||
-       !TTI.supportsEfficientVectorElementLoadStore()))
-    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
-
-  // Some targets keep addresses scalar.
-  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
-    return Cost;
-
-  if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    SmallVector<const Value *, 4> Operands(CI->arg_operands());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-  else if (!isa<StoreInst>(I) ||
-           !TTI.supportsEfficientVectorElementLoadStore()) {
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-
-  return Cost;
-}
-
-// Estimate cost of a call instruction CI if it were vectorized with factor VF.
-// Return the cost of the instruction, including scalarization overhead if it's
-// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
-// i.e. either vector version isn't available, or is too expensive.
-static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
-                                  const TargetTransformInfo &TTI,
-                                  const TargetLibraryInfo *TLI,
-                                  bool &NeedToScalarize) {
+unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+                                                       unsigned VF,
+                                                       bool &NeedToScalarize) {
   Function *F = CI->getCalledFunction();
   StringRef FnName = CI->getCalledFunction()->getName();
   Type *ScalarRetTy = CI->getType();
@@ -3118,7 +3120,7 @@
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
 
   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
 
@@ -3137,12 +3139,8 @@
   return Cost;
 }
 
-// Estimate cost of an intrinsic call instruction CI if it were vectorized with
-// factor VF. Return the cost of the instruction, including scalarization
-// overhead if it's needed.
-static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
-                                       const TargetTransformInfo &TTI,
-                                       const TargetLibraryInfo *TLI) {
+unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
+                                                            unsigned VF) {
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
 
@@ -3150,8 +3148,11 @@
   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
     FMF = FPMO->getFastMathFlags();
 
-  SmallVector<Value *, 4> Operands(CI->arg_operands());
-  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
+  // Skip operands that do not require extraction/scalarization and do not incur
+  // any overhead.
+  return TTI.getIntrinsicInstrCost(
+      ID, CI->getType(), filterExtractingOperands(CI->arg_operands(), VF), FMF,
+      VF);
 }
 
 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -4126,9 +4127,9 @@
   // version of the instruction.
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool NeedToScalarize;
-  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+  unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
   bool UseVectorIntrinsic =
-      ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+      ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
   assert((UseVectorIntrinsic || !NeedToScalarize) &&
          "Instruction should be scalarized elsewhere.");
 
@@ -5352,15 +5353,6 @@
     return true;
   };
 
-  // Returns true if an operand that cannot be scalarized must be extracted
-  // from a vector. We will account for this scalarization overhead below. Note
-  // that the non-void predicated instructions are placed in their own blocks,
-  // and their return values are inserted into vectors. Thus, an extract would
-  // still be required.
-  auto needsExtract = [&](Instruction *I) -> bool {
-    return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
-  };
-
   // Compute the expected cost discount from scalarizing the entire expression
   // feeding the predicated instruction. We currently only consider expressions
   // that are single-use instruction chains.
@@ -5400,7 +5392,7 @@
                "Instruction has non-scalar type");
         if (canBeScalarized(J))
           Worklist.push_back(J);
-        else if (needsExtract(J))
+        else if (needsExtract(J, VF))
           ScalarCost += TTI.getScalarizationOverhead(
                               ToVectorTy(J->getType(),VF), false, true);
       }
@@ -5522,7 +5514,7 @@
 
     // Get the overhead of the extractelement and insertelement instructions
     // we might create due to scalarization.
-    Cost += getScalarizationOverhead(I, VF, TTI);
+    Cost += getScalarizationOverhead(I, VF);
 
     // If we have a predicated store, it may not be executed for each vector
     // lane. Scale the cost by the probability of executing the predicated
@@ -5674,6 +5666,36 @@
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
+unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+                                                              unsigned VF) {
+
+  if (VF == 1)
+    return 0;
+
+  unsigned Cost = 0;
+  Type *RetTy = ToVectorTy(I->getType(), VF);
+  if (!RetTy->isVoidTy() &&
+      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
+    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+
+  // Some targets keep addresses scalar.
+  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+    return Cost;
+
+  // Some targets support efficient element stores.
+  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
+    return Cost;
+
+  // Collect operands to consider.
+  CallInst *CI = dyn_cast<CallInst>(I);
+  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+
+  // Skip operands that do not require extraction/scalarization and do not incur
+  // any overhead.
+  return Cost + TTI.getOperandsScalarizationOverhead(
+                    filterExtractingOperands(Ops, VF), VF);
+}
+
 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
   if (VF == 1)
     return;
@@ -5914,7 +5936,7 @@
 
       // The cost of insertelement and extractelement instructions needed for
       // scalarization.
-      Cost += getScalarizationOverhead(I, VF, TTI);
+      Cost += getScalarizationOverhead(I, VF);
 
       // Scale the cost by the probability of executing the predicated blocks.
       // This assumes the predicated block for each vector lane is equally
@@ -6035,16 +6057,16 @@
   case Instruction::Call: {
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
-    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
     if (getVectorIntrinsicIDForCall(CI, TLI))
-      return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
     return CallCost;
   }
   default:
     // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
-           getScalarizationOverhead(I, VF, TTI);
+           getScalarizationOverhead(I, VF);
   } // end of switch.
 }
 
@@ -6638,9 +6660,9 @@
     // version of the instruction.
     // Is it beneficial to perform intrinsic call compared to lib call?
     bool NeedToScalarize;
-    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
     bool UseVectorIntrinsic =
-        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
     return UseVectorIntrinsic || !NeedToScalarize;
   }
   if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
@@ -6828,7 +6850,7 @@
   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
   auto Plan = llvm::make_unique<VPlan>(VPBB);
 
-  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
+  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
   // Represent values that will have defs inside VPlan.
   for (Value *V : NeedDef)
     Plan->addVPValue(V);
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -29,9 +29,6 @@
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 
-  /// Target Transform Info.
-  const TargetTransformInfo *TTI;
-
   /// The legality analysis.
   LoopVectorizationLegality *Legal;
 
@@ -104,11 +101,9 @@
 
 public:
   VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
-                  const TargetTransformInfo *TTI,
                   LoopVectorizationLegality *Legal,
                   LoopVectorizationCostModel &CM, VPBuilder &Builder)
-      : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
-        Builder(Builder) {}
+      : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {}
 
   /// Check if a recipe can be create for \p I withing the given VF \p Range.
   /// If a recipe can be created, it adds it to \p VPBB.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -0,0 +1,109 @@
+; REQUIRES: asserts
+
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 %s -S | FileCheck --check-prefix=FORCED %s
+
+; Test case from PR41294.
+
+; Check scalar cost for extractvalue. The constant and loop invariant operands are free,
+; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
+
+; CM: LV: Scalar loop costs: 7.
+; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1
+
+; Check that the extractvalue operands are actually free in vector code.
+
+; FORCED-LABEL: vector.body:                                      ; preds = %vector.body, %vector.ph
+; FORCED-NEXT:    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT:    %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
+; FORCED-NEXT:    %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
+; FORCED-NEXT:    %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
+; FORCED-NEXT:    %0 = add i32 %index, 0
+; FORCED-NEXT:    %1 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT:    %2 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT:    %3 = insertelement <2 x i64> undef, i64 %1, i32 0
+; FORCED-NEXT:    %4 = insertelement <2 x i64> %3, i64 %2, i32 1
+; FORCED-NEXT:    %5 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT:    %6 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT:    %7 = insertelement <2 x i64> undef, i64 %5, i32 0
+; FORCED-NEXT:    %8 = insertelement <2 x i64> %7, i64 %6, i32 1
+; FORCED-NEXT:    %9 = getelementptr i64, i64* %dst, i32 %0
+; FORCED-NEXT:    %10 = add <2 x i64> %4, %8
+; FORCED-NEXT:    %11 = getelementptr i64, i64* %9, i32 0
+; FORCED-NEXT:    %12 = bitcast i64* %11 to <2 x i64>*
+; FORCED-NEXT:    store <2 x i64> %10, <2 x i64>* %12, align 4
+; FORCED-NEXT:    %index.next = add i32 %index, 2
+; FORCED-NEXT:    %13 = icmp eq i32 %index.next, 0
+; FORCED-NEXT:    br i1 %13, label %middle.block, label %vector.body, !llvm.loop !0
+
+define void @test1(i64* %dst, {i64, i64} %sv) {
+entry:
+  br label %loop.body
+
+loop.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+  %a = extractvalue { i64, i64 } %sv, 0
+  %b = extractvalue { i64, i64 } %sv, 1
+  %addr = getelementptr i64, i64* %dst, i32 %iv
+  %add = add i64 %a, %b
+  store i64 %add, i64* %addr
+  %iv.next = add nsw i32 %iv, 1
+  %cond = icmp ne i32 %iv.next, 0
+  br i1 %cond, label %loop.body, label %exit
+
+exit:
+  ret void
+}
+
+
+; Similar to the test case above, but checks getVectorCallCost as well.
+declare float @powf(float, float) readnone nounwind
+
+; CM: LV: Scalar loop costs: 16.
+; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1
+
+; FORCED-LABEL: define void @test_getVectorCallCost
+
+; FORCED-LABEL: vector.body:                                      ; preds = %vector.body, %vector.ph
+; FORCED-NEXT:    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT:    %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
+; FORCED-NEXT:    %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
+; FORCED-NEXT:    %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
+; FORCED-NEXT:    %0 = add i32 %index, 0
+; FORCED-NEXT:    %1 = extractvalue { float, float } %sv, 0
+; FORCED-NEXT:    %2 = extractvalue { float, float } %sv, 0
+; FORCED-NEXT:    %3 = insertelement <2 x float> undef, float %1, i32 0
+; FORCED-NEXT:    %4 = insertelement <2 x float> %3, float %2, i32 1
+; FORCED-NEXT:    %5 = extractvalue { float, float } %sv, 1
+; FORCED-NEXT:    %6 = extractvalue { float, float } %sv, 1
+; FORCED-NEXT:    %7 = insertelement <2 x float> undef, float %5, i32 0
+; FORCED-NEXT:    %8 = insertelement <2 x float> %7, float %6, i32 1
+; FORCED-NEXT:    %9 = getelementptr float, float* %dst, i32 %0
+; FORCED-NEXT:    %10 = call <2 x float> @llvm.pow.v2f32(<2 x float> %4, <2 x float> %8)
+; FORCED-NEXT:    %11 = getelementptr float, float* %9, i32 0
+; FORCED-NEXT:    %12 = bitcast float* %11 to <2 x float>*
+; FORCED-NEXT:    store <2 x float> %10, <2 x float>* %12, align 4
+; FORCED-NEXT:    %index.next = add i32 %index, 2
+; FORCED-NEXT:    %13 = icmp eq i32 %index.next, 0
+; FORCED-NEXT:    br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4
+
+define void @test_getVectorCallCost(float* %dst, {float, float} %sv) {
+entry:
+  br label %loop.body
+
+loop.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+  %a = extractvalue { float, float } %sv, 0
+  %b = extractvalue { float, float } %sv, 1
+  %addr = getelementptr float, float* %dst, i32 %iv
+  %p = call float @powf(float %a, float %b)
+  store float %p, float* %addr
+  %iv.next = add nsw i32 %iv, 1
+  %cond = icmp ne i32 %iv.next, 0
+  br i1 %cond, label %loop.body, label %exit
+
+exit:
+  ret void
+}
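Note on the idiom used by the patch: filterExtractingOperands builds a SmallVector directly from make_filter_range, so only operands for which needsExtract returns true reach the TTI overhead queries; loop-invariant values such as the extractvalue results in the tests above no longer inflate the scalarization cost. The standalone sketch below illustrates just that construction. It is illustration only, not part of the patch; the integer values and the evenness predicate are made-up stand-ins for operands and needsExtract.

// Sketch: constructing a SmallVector from a filtered range, as
// filterExtractingOperands does. Requires only LLVM's ADT headers.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

int main() {
  llvm::SmallVector<int, 8> Ops{1, 2, 3, 4, 5, 6};
  // The predicate plays the role of needsExtract: keep only "operands" that
  // would really need an extractelement (here: the even stand-ins).
  llvm::SmallVector<int, 8> NeedExtract(
      llvm::make_filter_range(Ops, [](int V) { return V % 2 == 0; }));
  for (int V : NeedExtract)
    std::printf("%d\n", V); // prints 2, 4, 6
  return 0;
}

Because only the filtered operands are handed to TTI.getOperandsScalarizationOverhead and TTI.getIntrinsicInstrCost, the CM lines in the tests report no extraction cost for the loop-invariant struct operands.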