diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -114,6 +114,11 @@ "to speculatively execute to fold a 2-entry PHI node into a " "select (default = 4)")); +static cl::opt TwoEntryPHINodeMispredictPercentage( + "two-entry-phi-node-mispredict-percentage", cl::Hidden, cl::init(50), + cl::desc("Misprediction percentage to consider when folding 2-entry PHI " + "nodes into selects")); + static cl::opt HoistCommon("simplifycfg-hoist-common", cl::Hidden, cl::init(true), cl::desc("Hoist common instructions up to the parent block")); @@ -2828,14 +2833,6 @@ // Okay, we found that we can merge this two-entry phi node into a select. // Doing so would require us to fold *all* two entry phi nodes in this block. - // At some point this becomes non-profitable (particularly if the target - // doesn't support cmov's). Only do this transformation if there are two or - // fewer PHI nodes in this block. - unsigned NumPhis = 0; - for (BasicBlock::iterator I = BB->begin(); isa(I); ++NumPhis, ++I) - if (NumPhis > 2) - return false; - // Loop over the PHI's seeing if we can promote them all to select // instructions. While we are at it, keep track of the instructions // that need to be moved to the dominating block. @@ -2845,7 +2842,9 @@ TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; bool Changed = false; + unsigned NumPhis = 0; for (BasicBlock::iterator II = BB->begin(); isa(II);) { + ++NumPhis; PHINode *PN = cast(II++); if (Value *V = SimplifyInstruction(PN, {DL, PN})) { PN->replaceAllUsesWith(V); @@ -2861,6 +2860,20 @@ return Changed; } + // FIXME: MCSchedModel contains the exact value of MispredictPenalty for each target. + InstructionCost MispredictCost = 10 * TargetTransformInfo::TCC_Basic; + // Say p is the percentage of branch misprediction, + // If cost(true_bb) + cost(false_bb) + cost(selects) > + // (1-p) * (cost(true_bb) + cost(false_bb)) / 2 + + // p * (cost(true_bb) + cost(false_bb) + mispredictcost) + // branch is more profitable than using selects. + if (Cost.isValid() && + (Cost + NumPhis * TargetTransformInfo::TCC_Basic > + ((100 - TwoEntryPHINodeMispredictPercentage) * (Cost / 2) + + 1 * TwoEntryPHINodeMispredictPercentage * (Cost + MispredictCost)) / + 100)) + return Changed; + // If we folded the first phi, PN dangles at this point. Refresh it. If // we ran out of PHIs then we simplified them all. PN = dyn_cast(BB->begin()); diff --git a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-node-mispredict.ll b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-node-mispredict.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-node-mispredict.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 \ +; RUN: -two-entry-phi-node-mispredict-percentage=10 | \ +; RUN: FileCheck --check-prefix=CHECK-LOW-MISS %s +; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 \ +; RUN: -two-entry-phi-node-mispredict-percentage=90 | \ +; RUN: FileCheck --check-prefix=CHECK-HIGH-MISS %s + +define dso_local void @foobar(float* %a, i32 signext %n, i32* %imax, float* %max) local_unnamed_addr #0 { +; CHECK-LOW-MISS-LABEL: @foobar( +; CHECK-LOW-MISS-NEXT: entry: +; CHECK-LOW-MISS-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 +; CHECK-LOW-MISS-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], 2.000000e+00 +; CHECK-LOW-MISS-NEXT: br label [[FOR_COND:%.*]] +; CHECK-LOW-MISS: for.cond: +; CHECK-LOW-MISS-NEXT: [[C_0:%.*]] = phi float [ [[MUL]], [[ENTRY:%.*]] ], [ [[C_1:%.*]], [[FOR_INC:%.*]] ] +; CHECK-LOW-MISS-NEXT: [[CI_0:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[CI_1:%.*]], [[FOR_INC]] ] +; CHECK-LOW-MISS-NEXT: [[I_0:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_INC]] ] +; CHECK-LOW-MISS-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N:%.*]] +; CHECK-LOW-MISS-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-LOW-MISS: for.cond.cleanup: +; CHECK-LOW-MISS-NEXT: store i32 [[CI_0]], i32* [[IMAX:%.*]], align 4 +; CHECK-LOW-MISS-NEXT: store float [[C_0]], float* [[MAX:%.*]], align 4 +; CHECK-LOW-MISS-NEXT: ret void +; CHECK-LOW-MISS: for.body: +; CHECK-LOW-MISS-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; CHECK-LOW-MISS-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDXPROM]] +; CHECK-LOW-MISS-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; CHECK-LOW-MISS-NEXT: [[CMP2:%.*]] = fcmp ogt float [[TMP1]], [[C_0]] +; CHECK-LOW-MISS-NEXT: br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-LOW-MISS: if.then: +; CHECK-LOW-MISS-NEXT: [[MUL5:%.*]] = fmul float [[TMP1]], 2.000000e+00 +; CHECK-LOW-MISS-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[I_0]], 7 +; CHECK-LOW-MISS-NEXT: br label [[FOR_INC]] +; CHECK-LOW-MISS: for.inc: +; CHECK-LOW-MISS-NEXT: [[C_1]] = phi float [ [[MUL5]], [[IF_THEN]] ], [ [[C_0]], [[FOR_BODY]] ] +; CHECK-LOW-MISS-NEXT: [[CI_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[CI_0]], [[FOR_BODY]] ] +; CHECK-LOW-MISS-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 +; CHECK-LOW-MISS-NEXT: br label [[FOR_COND]] +; +; CHECK-HIGH-MISS-LABEL: @foobar( +; CHECK-HIGH-MISS-NEXT: entry: +; CHECK-HIGH-MISS-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 +; CHECK-HIGH-MISS-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], 2.000000e+00 +; CHECK-HIGH-MISS-NEXT: br label [[FOR_COND:%.*]] +; CHECK-HIGH-MISS: for.cond: +; CHECK-HIGH-MISS-NEXT: [[C_0:%.*]] = phi float [ [[MUL]], [[ENTRY:%.*]] ], [ [[C_1:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-HIGH-MISS-NEXT: [[CI_0:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[CI_1:%.*]], [[FOR_BODY]] ] +; CHECK-HIGH-MISS-NEXT: [[I_0:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-HIGH-MISS-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N:%.*]] +; CHECK-HIGH-MISS-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-HIGH-MISS: for.cond.cleanup: +; CHECK-HIGH-MISS-NEXT: store i32 [[CI_0]], i32* [[IMAX:%.*]], align 4 +; CHECK-HIGH-MISS-NEXT: store float [[C_0]], float* [[MAX:%.*]], align 4 +; CHECK-HIGH-MISS-NEXT: ret void +; CHECK-HIGH-MISS: for.body: +; CHECK-HIGH-MISS-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; CHECK-HIGH-MISS-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDXPROM]] +; CHECK-HIGH-MISS-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; CHECK-HIGH-MISS-NEXT: [[CMP2:%.*]] = fcmp ogt float [[TMP1]], [[C_0]] +; CHECK-HIGH-MISS-NEXT: [[MUL5:%.*]] = fmul float [[TMP1]], 2.000000e+00 +; CHECK-HIGH-MISS-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[I_0]], 7 +; CHECK-HIGH-MISS-NEXT: [[C_1]] = select i1 [[CMP2]], float [[MUL5]], float [[C_0]] +; CHECK-HIGH-MISS-NEXT: [[CI_1]] = select i1 [[CMP2]], i32 [[ADD]], i32 [[CI_0]] +; CHECK-HIGH-MISS-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 +; CHECK-HIGH-MISS-NEXT: br label [[FOR_COND]] +; +entry: + %0 = load float, float* %a, align 4 + %mul = fmul float %0, 2.000000e+00 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %c.0 = phi float [ %mul, %entry ], [ %c.1, %for.inc ] + %ci.0 = phi i32 [ 7, %entry ], [ %ci.1, %for.inc ] + %i.0 = phi i32 [ 1, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %n + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + store i32 %ci.0, i32* %imax, align 4 + store float %c.0, float* %max, align 4 + ret void + +for.body: ; preds = %for.cond + %idxprom = zext i32 %i.0 to i64 + %arrayidx1 = getelementptr inbounds float, float* %a, i64 %idxprom + %1 = load float, float* %arrayidx1, align 4 + %cmp2 = fcmp ogt float %1, %c.0 + br i1 %cmp2, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %mul5 = fmul float %1, 2.000000e+00 + %add = add nuw nsw i32 %i.0, 7 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %c.1 = phi float [ %mul5, %if.then ], [ %c.0, %for.body ] + %ci.1 = phi i32 [ %add, %if.then ], [ %ci.0, %for.body ] + %inc = add nuw nsw i32 %i.0, 1 + br label %for.cond +}