[LV] Avoid rounding errors for predicated instruction costs

mssimpso · mssimpso · commit 6cdb5a6f9663 · 2016-10-13T14:19:48.000Z
This patch modifies the cost calculation of predicated instructions (div and rem) to avoid the accumulation of rounding errors due to multiple truncating integer divisions. The calculation for predicated stores will be addressed in a follow-on patch since we currently don't scale the cost of predicated stores by block probability. Differential Revision: https://reviews.llvm.org/D25333 llvm-svn: 284123
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3542,10 +3542,7 @@ static Value *addFastMathFlag(Value *V) {
 /// \brief Estimate the overhead of scalarizing a value based on its type.
 /// Insert and Extract are set if the result needs to be inserted and/or
 /// extracted from vectors.
-/// If the instruction is also to be predicated, add the cost of a PHI
-/// node to the insertion cost.
 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
-                                         bool Predicated,
                                          const TargetTransformInfo &TTI) {
   if (Ty->isVoidTy())
     return 0;
@@ -3556,41 +3553,30 @@ static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
   for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
     if (Extract)
       Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
-    if (Insert) {
+    if (Insert)
       Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
-      if (Predicated)
-        Cost += TTI.getCFInstrCost(Instruction::PHI);
-    }
   }
 
-  // If we have a predicated instruction, it may not be executed for each
-  // vector lane. Scale the cost by the probability of executing the
-  // predicated block.
-  if (Predicated)
-    Cost /= getReciprocalPredBlockProb();
-
   return Cost;
 }
 
 /// \brief Estimate the overhead of scalarizing an Instruction based on the
 /// types of its operands and return value.
 static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
-                                         Type *RetTy, bool Predicated,
+                                         Type *RetTy,
                                          const TargetTransformInfo &TTI) {
   unsigned ScalarizationCost =
-      getScalarizationOverhead(RetTy, true, false, Predicated, TTI);
+      getScalarizationOverhead(RetTy, true, false, TTI);
 
   for (Type *Ty : OpTys)
-    ScalarizationCost +=
-        getScalarizationOverhead(Ty, false, true, Predicated, TTI);
+    ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
 
   return ScalarizationCost;
 }
 
 /// \brief Estimate the overhead of scalarizing an instruction. This is a
 /// convenience wrapper for the type-based getScalarizationOverhead API.
 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
-                                         bool Predicated,
                                          const TargetTransformInfo &TTI) {
   if (VF == 1)
     return 0;
@@ -3602,7 +3588,7 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
   for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
     OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
 
-  return getScalarizationOverhead(OpTys, RetTy, Predicated, TTI);
+  return getScalarizationOverhead(OpTys, RetTy, TTI);
 }
 
 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
@@ -3635,7 +3621,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, false, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI);
 
   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
 
@@ -6536,10 +6522,27 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // vector lane. Get the scalarization cost and scale this amount by the
     // probability of executing the predicated block. If the instruction is not
     // predicated, we fall through to the next case.
-    if (VF > 1 && Legal->isScalarWithPredication(I))
-      return VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy) /
-                 getReciprocalPredBlockProb() +
-             getScalarizationOverhead(I, VF, true, TTI);
+    if (VF > 1 && Legal->isScalarWithPredication(I)) {
+      unsigned Cost = 0;
+
+      // These instructions have a non-void type, so account for the phi nodes
+      // that we will create. This cost is likely to be zero. The phi node
+      // cost, if any, should be scaled by the block probability because it
+      // models a copy at the end of each predicated block.
+      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
+
+      // The cost of the non-predicated instruction.
+      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
+
+      // The cost of insertelement and extractelement instructions needed for
+      // scalarization.
+      Cost += getScalarizationOverhead(I, VF, TTI);
+
+      // Scale the cost by the probability of executing the predicated blocks.
+      // This assumes the predicated block for each vector lane is equally
+      // likely.
+      return Cost / getReciprocalPredBlockProb();
+    }
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
@@ -6695,7 +6698,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
 
       // Get the overhead of the extractelement and insertelement instructions
       // we might create due to scalarization.
-      Cost += getScalarizationOverhead(I, VF, false, TTI);
+      Cost += getScalarizationOverhead(I, VF, TTI);
 
       return Cost;
     }
@@ -6782,7 +6785,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // The cost of executing VF copies of the scalar instruction. This opcode
     // is unknown. Assume that it is the same as 'mul'.
     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
-           getScalarizationOverhead(I, VF, false, TTI);
+           getScalarizationOverhead(I, VF, TTI);
   } // end of switch.
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -0,0 +1,53 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; Check predication-related cost calculations, including scalarization overhead
+; and block probability scaling. Note that the functionality being tested is
+; not specific to AArch64. We specify a target to get actual values for the
+; instruction costs.
+
+; CHECK-LABEL: predicated_udiv
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction. If we assume the block probability is 50%, we compute the cost
+; as:
+;
+; Cost for vector lane zero:
+;   (udiv(1) + 2 * extractelement(0) + insertelement(0)) / 2 = 0
+; Cost for vector lane one:
+;   (udiv(1) + 2 * extractelement(3) + insertelement(3)) / 2 = 5
+;
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+;
+define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp3 = load i32, i32* %tmp1, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp3, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}