Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3542,10 +3542,7 @@ /// \brief Estimate the overhead of scalarizing a value based on its type. /// Insert and Extract are set if the result needs to be inserted and/or /// extracted from vectors. -/// If the instruction is also to be predicated, add the cost of a PHI -/// node to the insertion cost. static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, - bool Predicated, const TargetTransformInfo &TTI) { if (Ty->isVoidTy()) return 0; @@ -3556,33 +3553,23 @@ for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) { if (Extract) Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I); - if (Insert) { + if (Insert) Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); - if (Predicated) - Cost += TTI.getCFInstrCost(Instruction::PHI); - } } - // If we have a predicated instruction, it may not be executed for each - // vector lane. Scale the cost by the probability of executing the - // predicated block. - if (Predicated) - Cost /= getReciprocalPredBlockProb(); - return Cost; } /// \brief Estimate the overhead of scalarizing an Instruction based on the /// types of its operands and return value. static unsigned getScalarizationOverhead(SmallVectorImpl &OpTys, - Type *RetTy, bool Predicated, + Type *RetTy, const TargetTransformInfo &TTI) { unsigned ScalarizationCost = - getScalarizationOverhead(RetTy, true, false, Predicated, TTI); + getScalarizationOverhead(RetTy, true, false, TTI); for (Type *Ty : OpTys) - ScalarizationCost += - getScalarizationOverhead(Ty, false, true, Predicated, TTI); + ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI); return ScalarizationCost; } @@ -3590,7 +3577,6 @@ /// \brief Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, - bool Predicated, const TargetTransformInfo &TTI) { if (VF == 1) return 0; @@ -3602,7 +3588,7 @@ for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd) OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF)); - return getScalarizationOverhead(OpTys, RetTy, Predicated, TTI); + return getScalarizationOverhead(OpTys, RetTy, TTI); } // Estimate cost of a call instruction CI if it were vectorized with factor VF. @@ -3635,7 +3621,7 @@ // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, false, TTI); + unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI); unsigned Cost = ScalarCallCost * VF + ScalarizationCost; @@ -6536,10 +6522,27 @@ // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF > 1 && Legal->isScalarWithPredication(I)) - return VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy) / - getReciprocalPredBlockProb() + - getScalarizationOverhead(I, VF, true, TTI); + if (VF > 1 && Legal->isScalarWithPredication(I)) { + unsigned Cost = 0; + + // These instructions have a non-void type, so account for the phi nodes + // that we will create. This cost is likely to be zero. The phi node + // cost, if any, should be scaled by the block probability because it + // models a copy at the end of each predicated block. + Cost += VF * TTI.getCFInstrCost(Instruction::PHI); + + // The cost of the non-predicated instruction. + Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); + + // The cost of insertelement and extractelement instructions needed for + // scalarization. + Cost += getScalarizationOverhead(I, VF, TTI); + + // Scale the cost by the probability of executing the predicated blocks. + // This assumes the predicated block for each vector lane is equally + // likely. + return Cost / getReciprocalPredBlockProb(); + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -6695,7 +6698,7 @@ // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF, false, TTI); + Cost += getScalarizationOverhead(I, VF, TTI); return Cost; } @@ -6782,7 +6785,7 @@ // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + - getScalarizationOverhead(I, VF, false, TTI); + getScalarizationOverhead(I, VF, TTI); } // end of switch. } Index: test/Transforms/LoopVectorize/AArch64/predication_costs.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -0,0 +1,59 @@ +; REQUIRES: asserts +; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnu" + +; CHECK-LABEL: predicated_udiv +; +; This test checks that we correctly compute the cost of the predicated udiv +; instruction. It ensures that we don't accumulate rounding errors by scaling +; intermediate costs by the block probability. If we assume the block +; probability is 50%, we want to compute the cost like: +; +; Cost for vector lane zero: +; (udiv(1) + 2 * extractelement(0) + insertelement(0)) / 2 = 0 +; Cost for vector lane one: +; (udiv(1) + 2 * extractelement(3) + insertelement(3)) / 2 = 5 +; +; Rather than like: +; +; Cost for vector lane zero: +; udiv(1) / 2 + (2 * extractelement(0) + insertelement(0)) / 2 = 0 +; Cost for vector lane one: +; udiv(1) / 2 + (2 * extractelement(3) + insertelement(3)) / 2 = 4 +; +; The truncations from the integer division in the second case accumulate so +; that the resulting cost is less than it is in the first case. +; +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 +; +define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] + %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp0, align 4 + %tmp3 = load i32, i32* %tmp1, align 4 + br i1 %c, label %if.then, label %for.inc + +if.then: + %tmp4 = udiv i32 %tmp2, %tmp3 + br label %for.inc + +for.inc: + %tmp5 = phi i32 [ %tmp3, %for.body ], [ %tmp4, %if.then] + %tmp6 = add i32 %r, %tmp5 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %tmp7 = phi i32 [ %tmp6, %for.inc ] + ret i32 %tmp7 +}