Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -342,7 +342,13 @@
 ///
 /// TODO: We should use actual block probability here, if available. Currently,
 /// we always assume predicated blocks have a 50% chance of executing.
-static unsigned getReciprocalPredBlockProb() { return 2; }
+
+static unsigned getReciprocalPredBlockProb1(BlockFrequencyInfo *BFI,
+                                            BasicBlock *BB,
+                                            BasicBlock *HeaderBB) {
+  return ((BFI->getBlockFreq(HeaderBB)).getFrequency()) /
+         ((BFI->getBlockFreq(BB)).getFrequency());
+}
 
 /// A helper function that adds a 'fast' flag to floating-point operations.
 static Value *addFastMathFlag(Value *V) {
@@ -1815,9 +1821,10 @@
                              const TargetLibraryInfo *TLI, DemandedBits *DB,
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
-                             const LoopVectorizeHints *Hints)
+                             const LoopVectorizeHints *Hints,
+                             BlockFrequencyInfo *BFI)
       : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
-        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
+        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), BFI(BFI) {}
 
   /// \return An upper bound for the vectorization factor, or None if
   /// vectorization should be avoided up front.
@@ -2168,6 +2175,8 @@
   const Function *TheFunction;
   /// Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+  /// Block Frequency Info.
+  BlockFrequencyInfo *BFI;
   /// Values to ignore in the cost model.
   SmallPtrSet<const Value *, 16> ValuesToIgnore;
   /// Values to ignore in the cost model when VF > 1.
@@ -6928,8 +6937,10 @@
                                 ToVectorTy(J->getType(), VF), false, true);
     }
 
+    // Scale the total scalar cost by block probability.
-    ScalarCost /= getReciprocalPredBlockProb();
+    ScalarCost /=
+        getReciprocalPredBlockProb1(BFI, I->getParent(), TheLoop->getHeader());
 
     // Compute the discount. A non-negative discount means the vector version
     // of the instruction costs more, and scalarizing would be beneficial.
@@ -6984,7 +6995,8 @@
     // the predicated block. Thus, scale the block's cost by the probability of
     // executing it.
     if (VF == 1 && Legal->blockNeedsPredication(BB))
-      BlockCost.first /= getReciprocalPredBlockProb();
+      BlockCost.first /=
+          getReciprocalPredBlockProb1(BFI, BB, TheLoop->getHeader());
 
     Cost.first += BlockCost.first;
     Cost.second |= BlockCost.second;
@@ -7055,7 +7067,8 @@
     // lane. Scale the cost by the probability of executing the predicated
     // block.
     if (Legal->isScalarWithPredication(I))
-      Cost /= getReciprocalPredBlockProb();
+      Cost /=
+          getReciprocalPredBlockProb1(BFI, I->getParent(), TheLoop->getHeader());
 
     return Cost;
   }
@@ -7398,7 +7411,8 @@
       // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
-      return Cost / getReciprocalPredBlockProb();
+      return Cost / getReciprocalPredBlockProb1(BFI, I->getParent(),
+                                                TheLoop->getHeader());
     }
     LLVM_FALLTHROUGH;
   case Instruction::Add:
@@ -7862,7 +7876,7 @@
 
   // Use the cost model.
   LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints);
+                                &Hints, BFI);
   CM.collectValuesToIgnore();
 
   // Use the planner for vectorization.
Index: test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -12,7 +12,7 @@
 ; %tmp4 a lower scalarization overhead.
 ;
 ; COST-LABEL: predicated_udiv_scalarized_operand
-; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+; COST: LV: Found an estimated cost of 11 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
 ;
 ; CHECK-LABEL: @predicated_udiv_scalarized_operand(
 ; CHECK: vector.body:
@@ -22,13 +22,13 @@
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], %broadcast.splat2
 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
 ; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
 ; CHECK: [[PRED_UDIV_IF]]:
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x
-; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]]
 ; CHECK: [[PRED_UDIV_CONTINUE]]:
@@ -37,9 +37,8 @@
 ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
 ; CHECK: [[PRED_UDIV_IF1]]:
 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x
-; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP12]]
 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]]
 ; CHECK: [[PRED_UDIV_CONTINUE2]]:
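Illustration (not part of the patch): the new helper computes the reciprocal of a predicated block's execution probability as the loop header's block frequency divided by the predicated block's frequency, replacing the previous hard-coded divisor of 2 (the old 50% assumption). The standalone C++ sketch below shows that arithmetic with made-up frequency values; the function name, the numbers, and the zero-frequency guard are inventions for this example and do not appear in the patch, where the frequencies come from BlockFrequencyInfo::getBlockFreq.

// Standalone sketch of the cost-scaling arithmetic (no LLVM dependencies).
#include <cassert>
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the patch's getReciprocalPredBlockProb1: the
// reciprocal of the predicated block's execution probability is
// header frequency / predicated-block frequency.
static unsigned reciprocalPredBlockProb(uint64_t HeaderFreq, uint64_t BlockFreq) {
  assert(BlockFreq != 0 && "zero guard added for this example only");
  return static_cast<unsigned>(HeaderFreq / BlockFreq);
}

int main() {
  // Example: the header is 4x hotter than the predicated block, so the block
  // executes with probability 1/4 and the scalar cost is divided by 4 instead
  // of the previous fixed divisor of 2.
  uint64_t HeaderFreq = 1000, PredBlockFreq = 250;
  unsigned Recip = reciprocalPredBlockProb(HeaderFreq, PredBlockFreq);
  unsigned ScalarCost = 8; // arbitrary per-lane cost accumulated by the model
  std::cout << "reciprocal probability: " << Recip << "\n"                // 4
            << "scaled scalar cost:     " << ScalarCost / Recip << "\n";  // 2
  return 0;
}

Run as-is, this prints a reciprocal probability of 4, so a rarely executed predicated block is discounted more heavily than under the old fixed 50% model, which is the behavioral change the updated COST line in the test reflects.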