Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -336,14 +336,6 @@ return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); } -/// A helper function that returns the reciprocal of the block probability of -/// predicated blocks. If we return X, we are assuming the predicated block -/// will execute once for for every X iterations of the loop header. -/// -/// TODO: We should use actual block probability here, if available. Currently, -/// we always assume predicated blocks have a 50% chance of executing. -static unsigned getReciprocalPredBlockProb() { return 2; } - /// A helper function that adds a 'fast' flag to floating-point operations. static Value *addFastMathFlag(Value *V) { if (isa(V)) { @@ -1815,9 +1807,10 @@ const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, - const LoopVectorizeHints *Hints) + const LoopVectorizeHints *Hints, + BlockFrequencyInfo *BFI ) : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {} + AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), BFI(BFI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization should be avoided up front. @@ -2002,6 +1995,19 @@ return Legal->isInductionVariable(Op); } + /// Returns the reciprocal of the block probability of predicated blocks + /// If we return X, we are assuming the predicated block will execute + /// once for for every X iterations of the loop header. + + unsigned getReciprocalPredBlockProb(const BasicBlock *BB) const { + if (!BFI || (BFI->getBlockFreq(BB)).getFrequency() == 0 || + (BFI->getBlockFreq(TheLoop->getHeader())).getFrequency() == 0) + return 2; + return (BFI->getBlockFreq(TheLoop->getHeader())).getFrequency() / + (BFI->getBlockFreq(BB)).getFrequency(); + } + + private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. @@ -2168,6 +2174,8 @@ const Function *TheFunction; /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; + /// Block Frequency Info + BlockFrequencyInfo *BFI; /// Values to ignore in the cost model. SmallPtrSet ValuesToIgnore; /// Values to ignore in the cost model when VF > 1. @@ -6929,7 +6937,7 @@ } // Scale the total scalar cost by block probability. - ScalarCost /= getReciprocalPredBlockProb(); + ScalarCost /= getReciprocalPredBlockProb(I->getParent()); // Compute the discount. A non-negative discount means the vector version // of the instruction costs more, and scalarizing would be beneficial. @@ -6983,8 +6991,13 @@ // unconditionally executed. For the scalar case, we may not always execute // the predicated block. Thus, scale the block's cost by the probability of // executing it. - if (VF == 1 && Legal->blockNeedsPredication(BB)) - BlockCost.first /= getReciprocalPredBlockProb(); + if (VF == 1 && Legal->blockNeedsPredication(BB)) { + BlockCost.first /= getReciprocalPredBlockProb(BB); + DEBUG(dbgs() << "LV: Found an estimated cost of " << BlockCost.first + << " for VF " << VF << " For BasicBlock: " << BB->getName() + << '\n'); + } + Cost.first += BlockCost.first; Cost.second |= BlockCost.second; @@ -7055,7 +7068,7 @@ // lane. Scale the cost by the probability of executing the predicated // block. if (Legal->isScalarWithPredication(I)) - Cost /= getReciprocalPredBlockProb(); + Cost /= getReciprocalPredBlockProb(I->getParent()); return Cost; } @@ -7398,7 +7411,7 @@ // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally // likely. - return Cost / getReciprocalPredBlockProb(); + return Cost / getReciprocalPredBlockProb(I->getParent()); } LLVM_FALLTHROUGH; case Instruction::Add: @@ -7862,7 +7875,7 @@ // Use the cost model. LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints); + &Hints, BFI); CM.collectValuesToIgnore(); // Use the planner for vectorization. Index: test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -12,7 +12,7 @@ ; %tmp4 a lower scalarization overhead. ; ; COST-LABEL: predicated_udiv_scalarized_operand -; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3 +; COST: LV: Found an estimated cost of 11 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3 ; ; CHECK-LABEL: @predicated_udiv_scalarized_operand( ; CHECK: vector.body: @@ -22,13 +22,13 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], %broadcast.splat2 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 ; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]] ; CHECK: [[PRED_UDIV_IF]]: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x -; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]] ; CHECK: [[PRED_UDIV_CONTINUE]]: @@ -37,9 +37,8 @@ ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]] ; CHECK: [[PRED_UDIV_IF1]]: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x -; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]] ; CHECK: [[PRED_UDIV_CONTINUE2]]: Index: test/Transforms/LoopVectorize/if-pred-blockFreq.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/if-pred-blockFreq.ll @@ -0,0 +1,70 @@ +; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + + +@a = global i32* null, align 8 +@b = global i32* null, align 8 +@c = global i32* null, align 8 + +; This test checks that the frequency of the predicated block(for computing the cost) +; is obtained from BlockFrequencyInfo Class +; COST-LABEL: foo +; COST: LV: Found an estimated cost of 4 for VF 1 For BasicBlock: if.then3.i + + +define void @foo(i32 %p1) { +entry: + %cmp88 = icmp sgt i32 %p1, 0 + br i1 %cmp88, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %0 = load i32*, i32** @b, align 8 + %1 = load i32*, i32** @a, align 8 + %2 = load i32*, i32** @c, align 8 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %if.end3.i ] + %arrayidx = getelementptr inbounds i32, i32* %0, i64 %indvars.iv + %3 = load i32, i32* %arrayidx, align 4 %4 = trunc i64 %indvars.iv to i32 + %and.i = and i32 %4, 1 + %tobool.i.i = icmp eq i32 %and.i, 0 + br i1 %tobool.i.i, label %if.end.i, label %if.then.i + +if.then.i: + %and.i.i = lshr i32 %3, 2 + %cmp.i = icmp sgt i32 %and.i.i, 0 + %conv.i = zext i1 %cmp.i to i32 + br label %if.end.i + +if.end.i: + %p1.addr.0.i = phi i32 [ %conv.i, %if.then.i ], [ %3, %for.body ] + %5 = trunc i64 %indvars.iv to i32 + %and1.i = and i32 %5, 7 + %tobool2.i = icmp eq i32 %and1.i, 0 + br i1 %tobool2.i, label %if.end3.i, label %if.then3.i + +if.then3.i: + %p1.addr.0.lobit.i = lshr i32 %p1.addr.0.i, 31 + %and6.i = and i32 %p1.addr.0.i, 1 + %or.i = or i32 %p1.addr.0.lobit.i, %and6.i + br label %if.end3.i + +if.end3.i: +%p1.addr.1.i = phi i32 [ %or.i, %if.then3.i ], [ %p1.addr.0.i, %if.end.i ] + + %6 = trunc i64 %indvars.iv to i32 + %shr.i = ashr i32 %6, 1 + %arrayidx7 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv + store i32 %p1.addr.1.i, i32* %arrayidx7, align 4 %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %p1 + br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: + br label %for.end + +for.end: + ret void +}