diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3031,6 +3031,7 @@
 /// Estimate the overhead of scalarizing an instruction. This is a
 /// convenience wrapper for the type-based getScalarizationOverhead API.
 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
+                                         Loop *OrigLoop,
                                          const TargetTransformInfo &TTI) {
   if (VF == 1)
     return 0;
@@ -3046,14 +3047,25 @@
   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
     return Cost;
 
+  // Skip loop-invariant operands, as they do not incur any scalarization
+  // overhead.
+  auto FilterOperands =
+      [OrigLoop](
+          Instruction::const_op_range Ops) -> SmallVector<const Value *, 4> {
+    return SmallVector<const Value *, 4>(
+        make_filter_range(Ops, [OrigLoop](const Value *V) {
+          return !OrigLoop->isLoopInvariant(V);
+        }));
+  };
+
   if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    SmallVector<const Value *, 4> Operands(CI->arg_operands());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+    Cost += TTI.getOperandsScalarizationOverhead(
+        FilterOperands(CI->arg_operands()), VF);
   } else if (!isa<StoreInst>(I) ||
              !TTI.supportsEfficientVectorElementLoadStore()) {
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+    Cost +=
+        TTI.getOperandsScalarizationOverhead(FilterOperands(I->operands()), VF);
   }
 
   return Cost;
@@ -3063,7 +3075,7 @@
 // Return the cost of the instruction, including scalarization overhead if it's
 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
 // i.e. either vector version isn't available, or is too expensive.
-static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
+static unsigned getVectorCallCost(CallInst *CI, unsigned VF, Loop *TheLoop,
                                   const TargetTransformInfo &TTI,
                                   const TargetLibraryInfo *TLI,
                                   bool &NeedToScalarize) {
@@ -3089,7 +3101,7 @@
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TheLoop, TTI);
 
   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
 
@@ -4097,7 +4109,8 @@
     // version of the instruction.
     // Is it beneficial to perform intrinsic call compared to lib call?
     bool NeedToScalarize;
-    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    unsigned CallCost =
+        getVectorCallCost(CI, VF, OrigLoop, *TTI, TLI, NeedToScalarize);
     bool UseVectorIntrinsic =
         ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
     assert((UseVectorIntrinsic || !NeedToScalarize) &&
@@ -5493,7 +5506,7 @@
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
-  Cost += getScalarizationOverhead(I, VF, TTI);
+  Cost += getScalarizationOverhead(I, VF, TheLoop, TTI);
 
   // If we have a predicated store, it may not be executed for each vector
   // lane. Scale the cost by the probability of executing the predicated
@@ -5885,7 +5898,7 @@
 
     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
-    Cost += getScalarizationOverhead(I, VF, TTI);
+    Cost += getScalarizationOverhead(I, VF, TheLoop, TTI);
 
     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
@@ -6006,7 +6019,8 @@
   case Instruction::Call: {
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
-    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+    unsigned CallCost =
+        getVectorCallCost(CI, VF, TheLoop, TTI, TLI, NeedToScalarize);
     if (getVectorIntrinsicIDForCall(CI, TLI))
       return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
     return CallCost;
@@ -6015,7 +6029,7 @@
     // The cost of executing VF copies of the scalar instruction. This opcode
     // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
-           getScalarizationOverhead(I, VF, TTI);
+           getScalarizationOverhead(I, VF, TheLoop, TTI);
   } // end of switch.
 }
 
@@ -6603,7 +6617,8 @@
   // version of the instruction.
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool NeedToScalarize;
-  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+  unsigned CallCost =
+      getVectorCallCost(CI, VF, OrigLoop, *TTI, TLI, NeedToScalarize);
   bool UseVectorIntrinsic =
       ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
   return UseVectorIntrinsic || !NeedToScalarize;
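
The change above rests on a simple observation: when an instruction is scalarized, each of the VF scalar copies needs its operands extracted from a vector lane, except for operands defined outside the loop, which stay scalar and can be used by every copy as-is. Below is a minimal standalone model of that costing rule, not the actual TTI interface; the names Operand and operandScalarizationCost are invented for illustration.

#include <algorithm>
#include <iostream>
#include <vector>

// Hypothetical stand-in for an IR operand, reduced to the one property the
// cost model cares about here.
struct Operand {
  const char *Name;
  bool LoopInvariant; // defined outside the loop being vectorized?
};

// Model of the operand half of getScalarizationOverhead: charge one unit per
// extractelement, i.e. one per lane (VF) for every operand that varies across
// iterations. Loop-invariant operands are filtered out, mirroring the
// FilterOperands lambda in the patch.
unsigned operandScalarizationCost(const std::vector<Operand> &Ops,
                                  unsigned VF) {
  unsigned Varying =
      std::count_if(Ops.begin(), Ops.end(),
                    [](const Operand &Op) { return !Op.LoopInvariant; });
  return Varying * VF;
}

int main() {
  // An instruction with one loop-invariant operand (the struct %sv from the
  // test below) and one operand that varies per iteration (%iv).
  std::vector<Operand> Ops = {{"%sv", true}, {"%iv", false}};
  // Only %iv is charged: 1 varying operand * VF 2 = 2 units. Without the
  // filter, the model would have charged 2 operands * 2 lanes = 4.
  std::cout << operandScalarizationCost(Ops, 2) << "\n"; // prints 2
}
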
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/instructions-with-struct-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/instructions-with-struct-ops.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/instructions-with-struct-ops.ll
@@ -0,0 +1,52 @@
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 %s -S | FileCheck --check-prefix=FORCED %s
+
+; Check the scalar cost for extractvalue. The constant and loop invariant operands are free,
+; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
+
+; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1
+
+; Check that the extractvalue operands are actually free in vector code.
+
+; FORCED-LABEL: vector.body:   ; preds = %vector.body, %vector.ph
+; FORCED-NEXT:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT:   %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
+; FORCED-NEXT:   %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
+; FORCED-NEXT:   %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
+; FORCED-NEXT:   %0 = add i32 %index, 0
+; FORCED-NEXT:   %1 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT:   %2 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT:   %3 = insertelement <2 x i64> undef, i64 %1, i32 0
+; FORCED-NEXT:   %4 = insertelement <2 x i64> %3, i64 %2, i32 1
+; FORCED-NEXT:   %5 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT:   %6 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT:   %7 = insertelement <2 x i64> undef, i64 %5, i32 0
+; FORCED-NEXT:   %8 = insertelement <2 x i64> %7, i64 %6, i32 1
+; FORCED-NEXT:   %9 = getelementptr i64, i64* %dst, i32 %0
+; FORCED-NEXT:   %10 = add <2 x i64> %4, %8
+; FORCED-NEXT:   %11 = getelementptr i64, i64* %9, i32 0
+; FORCED-NEXT:   %12 = bitcast i64* %11 to <2 x i64>*
+; FORCED-NEXT:   store <2 x i64> %10, <2 x i64>* %12, align 4
+; FORCED-NEXT:   %index.next = add i32 %index, 2
+; FORCED-NEXT:   %13 = icmp eq i32 %index.next, 0
+; FORCED-NEXT:   br i1 %13, label %middle.block, label %vector.body, !llvm.loop !0
+
+define void @test1(i64* %dst, {i64, i64} %sv) {
+entry:
+  br label %loop.body
+
+loop.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+  %a = extractvalue { i64, i64 } %sv, 0
+  %b = extractvalue { i64, i64 } %sv, 1
+  %addr = getelementptr i64, i64* %dst, i32 %iv
+  %add = add i64 %a, %b
+  store i64 %add, i64* %addr
+  %iv.next = add nsw i32 %iv, 1
+  %cond = icmp ne i32 %iv.next, 0
+  br i1 %cond, label %loop.body, label %exit
+
+exit:
+  ret void
+}
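
To connect the CM check lines with the comment at the top of the test: for VF 2, each extractvalue is costed at 2 units for executing one scalar copy per lane plus 3 units for packing the two scalar results into a <2 x i64> via insertelement, giving the reported 2 + 3 = 5. The loop-invariant struct operand %sv contributes nothing, and the FORCED block confirms this in the generated code: the scalarized extractvalues read %sv directly, with no extractelement instructions for the operand in the vector body.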