diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -143,6 +143,12 @@
     "disable-cgp-select2branch", cl::Hidden, cl::init(false),
     cl::desc("Disable select to branch conversion."));
 
+static cl::opt<unsigned> SinkSelectOperandRatioAgainstMisprediction(
+    "cgp-sink-select-operand-ratio-against-misprediction", cl::Hidden,
+    cl::init(0),
+    cl::desc("If (select operand cycles) * ratio > (branch misprediction "
+             "penalty), sink the operand."));
+
 static cl::opt<bool> AddrSinkUsingGEPs(
     "addr-sink-using-gep", cl::Hidden, cl::init(true),
     cl::desc("Address sinking in CGP using GEPs."));
@@ -6563,20 +6569,51 @@
 }
 
 /// Check if V (an operand of a select instruction) is an expensive instruction
-/// that is only used once.
-static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
+/// used only by selects in ASI, and always as the same (true/false) operand.
+static bool sinkSelectOperand(const TargetTransformInfo *TTI,
+                              const TargetSubtargetInfo *SubtargetInfo,
+                              ArrayRef<SelectInst *> ASI, Value *V) {
   auto *I = dyn_cast<Instruction>(V);
+  if (!I || I->user_empty())
+    return false;
+  // V can be sunk into only one successor block, so across ASI it must be
+  // exclusively a true operand or exclusively a false operand.
+  auto IsUseConsistent = [&]() {
+    bool IsTrueOperand = false, IsFalseOperand = false;
+    for (SelectInst *SI : ASI) {
+      IsTrueOperand |= V == SI->getTrueValue();
+      IsFalseOperand |= V == SI->getFalseValue();
+    }
+    return !(IsTrueOperand && IsFalseOperand);
+  };
+  const MCSchedModel &SM = SubtargetInfo->getSchedModel();
+  // FIXME: InstructionCost is not measured at cycle dimension, but
+  // MispredictPenalty is, so we can't compare getUserCost and MispredictPenalty
+  // directly.
+  static const unsigned CheapInstCycles = 1;
   // If it's safe to speculatively execute, then it should not have side
   // effects; therefore, it's safe to sink and possibly *not* execute.
-  return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
-         TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
-         TargetTransformInfo::TCC_Expensive;
+  return all_of(I->users(),
+                [&](User *user) {
+                  // Every user must be a select in ASI, and those selects
+                  // must all use V on the same arm.
+                  SelectInst *U = dyn_cast<SelectInst>(user);
+                  return U &&
+                         std::find(ASI.begin(), ASI.end(), U) != ASI.end() &&
+                         IsUseConsistent();
+                }) &&
+         isSafeToSpeculativelyExecute(I) &&
+         (TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
+              TargetTransformInfo::TCC_Expensive ||
+          (CheapInstCycles * SinkSelectOperandRatioAgainstMisprediction >
+           SM.MispredictPenalty));
 }
 
 /// Returns true if a SelectInst should be turned into an explicit branch.
 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
                                                 const TargetLowering *TLI,
-                                                SelectInst *SI) {
+                                                const TargetSubtargetInfo *SubtargetInfo,
+                                                ArrayRef<SelectInst *> ASI) {
   // If even a predictable select is cheap, then a branch can't be cheaper.
   if (!TLI->isPredictableSelectExpensive())
     return false;
@@ -6586,6 +6623,8 @@
 
   // If metadata tells us that the select condition is obviously predictable,
   // then we want to replace the select with a branch.
+  assert(!ASI.empty() && "ASI should have at least one SelectInst.");
+  SelectInst *SI = ASI.back();
   uint64_t TrueWeight, FalseWeight;
   if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
     uint64_t Max = std::max(TrueWeight, FalseWeight);
@@ -6600,16 +6639,22 @@
   CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
 
   // If a branch is predictable, an out-of-order CPU can avoid blocking on its
-  // comparison condition. If the compare has more than one use, there's
-  // probably another cmov or setcc around, so it's not worth emitting a branch.
-  if (!Cmp || !Cmp->hasOneUse())
+  // comparison condition. If the compare's uses are all selects in the same
+  // basic block, we try to form branch since select is considered expensive at
+  // this point.
+  if (!Cmp || !all_of(Cmp->uses(), [&](const Use &use) {
+        SelectInst *UserSI = dyn_cast<SelectInst>(use.getUser());
+        return UserSI && std::find(ASI.begin(), ASI.end(), UserSI) != ASI.end();
+      }))
     return false;
 
   // If either operand of the select is expensive and only needed on one side
   // of the select, we should form a branch.
-  if (sinkSelectOperand(TTI, SI->getTrueValue()) ||
-      sinkSelectOperand(TTI, SI->getFalseValue()))
-    return true;
+  for (SelectInst *Sel : ASI) {
+    if (sinkSelectOperand(TTI, SubtargetInfo, ASI, Sel->getTrueValue()) ||
+        sinkSelectOperand(TTI, SubtargetInfo, ASI, Sel->getFalseValue()))
+      return true;
+  }
 
   return false;
 }
@@ -6741,8 +6786,8 @@
     SelectKind = TargetLowering::ScalarValSelect;
 
   if (TLI->isSelectSupported(SelectKind) &&
-      (!isFormingBranchFromSelectProfitable(TTI, TLI, SI) || OptSize ||
-       llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())))
+      (!isFormingBranchFromSelectProfitable(TTI, TLI, SubtargetInfo, ASI) ||
+       OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())))
     return false;
 
   // The DominatorTree needs to be rebuilt by any consumers after this
@@ -6795,7 +6840,7 @@
   // Sink expensive instructions into the conditional blocks to avoid executing
   // them speculatively.
   for (SelectInst *SI : ASI) {
-    if (sinkSelectOperand(TTI, SI->getTrueValue())) {
+    if (sinkSelectOperand(TTI, SubtargetInfo, ASI, SI->getTrueValue())) {
       if (TrueBlock == nullptr) {
         TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
                                        EndBlock->getParent(), EndBlock);
@@ -6805,7 +6850,7 @@
       auto *TrueInst = cast<Instruction>(SI->getTrueValue());
       TrueInst->moveBefore(TrueBranch);
     }
-    if (sinkSelectOperand(TTI, SI->getFalseValue())) {
+    if (sinkSelectOperand(TTI, SubtargetInfo, ASI, SI->getFalseValue())) {
       if (FalseBlock == nullptr) {
         FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
                                         EndBlock->getParent(), EndBlock);
diff --git a/llvm/test/CodeGen/PowerPC/cgp-select.ll b/llvm/test/CodeGen/PowerPC/cgp-select.ll
--- a/llvm/test/CodeGen/PowerPC/cgp-select.ll
+++ b/llvm/test/CodeGen/PowerPC/cgp-select.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -O3 -mcpu=pwr9 -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
+; RUN: llc -O3 -mcpu=pwr9 -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -cgp-sink-select-operand-ratio-against-misprediction=20 < %s | FileCheck --check-prefix=CHECK-SINK %s
 
 define dso_local void @wibble(float* nocapture readonly %arg, i32 signext %arg1, i32* nocapture %arg2, float* nocapture %arg3) {
 ; CHECK-LABEL: wibble:
@@ -34,6 +36,40 @@
 ; CHECK-NEXT:    stw 7, 0(5)
 ; CHECK-NEXT:    stfs 0, 0(6)
 ; CHECK-NEXT:    blr
+;
+; CHECK-SINK-LABEL: wibble:
+; CHECK-SINK:       # %bb.0: # %bb
+; CHECK-SINK-NEXT:    lfs 0, 0(3)
+; CHECK-SINK-NEXT:    li 7, 7
+; CHECK-SINK-NEXT:    cmpwi 4, 2
+; CHECK-SINK-NEXT:    xsaddsp 0, 0, 0
+; CHECK-SINK-NEXT:    blt 0, .LBB0_5
+; CHECK-SINK-NEXT:  # %bb.1: # %bb6
+; CHECK-SINK-NEXT:    clrldi 4, 4, 32
+; CHECK-SINK-NEXT:    li 7, 7
+; CHECK-SINK-NEXT:    addi 4, 4, -1
+; CHECK-SINK-NEXT:    mtctr 4
+; CHECK-SINK-NEXT:    li 4, 8
+; CHECK-SINK-NEXT:    b .LBB0_3
+; CHECK-SINK-NEXT:    .p2align 5
+; CHECK-SINK-NEXT:  .LBB0_2: # %select.end
+; CHECK-SINK-NEXT:    #
+; CHECK-SINK-NEXT:    addi 4, 4, 1
+; CHECK-SINK-NEXT:    bdz .LBB0_5
+; CHECK-SINK-NEXT:  .LBB0_3: # %bb11
+; CHECK-SINK-NEXT:    #
+; CHECK-SINK-NEXT:    lfsu 1, 4(3)
+; CHECK-SINK-NEXT:    fcmpu 0, 1, 0
+; CHECK-SINK-NEXT:    bc 4, 1, .LBB0_2
+; CHECK-SINK-NEXT:  # %bb.4: # %select.true.sink
+; CHECK-SINK-NEXT:    #
+; CHECK-SINK-NEXT:    xsaddsp 0, 1, 1
+; CHECK-SINK-NEXT:    mr 7, 4
+; CHECK-SINK-NEXT:    b .LBB0_2
+; CHECK-SINK-NEXT:  .LBB0_5: # %bb8
+; CHECK-SINK-NEXT:    stw 7, 0(5)
+; CHECK-SINK-NEXT:    stfs 0, 0(6)
+; CHECK-SINK-NEXT:    blr
 bb:
   %tmp = load float, float* %arg, align 4
   %tmp4 = fmul float %tmp, 2.000000e+00
@@ -67,3 +103,114 @@
   %tmp24 = icmp eq i64 %tmp23, %tmp7
   br i1 %tmp24, label %bb8, label %bb11
 }
+
+define dso_local void @foo(float* nocapture readonly %arg, i32 signext %arg1, i32* nocapture %arg2, float* nocapture %arg3) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    lfs 0, 0(3)
+; CHECK-NEXT:    li 9, 7
+; CHECK-NEXT:    cmpwi 4, 2
+; CHECK-NEXT:    xsaddsp 0, 0, 0
+; CHECK-NEXT:    blt 0, .LBB1_5
+; CHECK-NEXT:  # %bb.1: # %bb6
+; CHECK-NEXT:    clrldi 4, 4, 32
+; CHECK-NEXT:    li 8, 7
+; CHECK-NEXT:    li 7, 1
+; CHECK-NEXT:    b .LBB1_3
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB1_2: # %bb11
+; CHECK-NEXT:    #
+; CHECK-NEXT:    addi 10, 7, 7
+; CHECK-NEXT:    addi 7, 7, 1
+; CHECK-NEXT:    iselgt 9, 10, 8
+; CHECK-NEXT:    iselgt 8, 8, 10
+; CHECK-NEXT:    clrldi 8, 8, 32
+; CHECK-NEXT:    sub 8, 4, 8
+; CHECK-NEXT:    cmpld 8, 7
+; CHECK-NEXT:    mr 8, 9
+; CHECK-NEXT:    beq 0, .LBB1_5
+; CHECK-NEXT:  .LBB1_3: # %bb11
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lfsu 1, 4(3)
+; CHECK-NEXT:    fcmpu 0, 1, 0
+; CHECK-NEXT:    ble 0, .LBB1_2
+; CHECK-NEXT:  # %bb.4:
+; CHECK-NEXT:    xsaddsp 0, 1, 1
+; CHECK-NEXT:    b .LBB1_2
+; CHECK-NEXT:  .LBB1_5: # %bb8
+; CHECK-NEXT:    stw 9, 0(5)
+; CHECK-NEXT:    stfs 0, 0(6)
+; CHECK-NEXT:    blr
+;
+; CHECK-SINK-LABEL: foo:
+; CHECK-SINK:       # %bb.0: # %bb
+; CHECK-SINK-NEXT:    lfs 0, 0(3)
+; CHECK-SINK-NEXT:    li 9, 7
+; CHECK-SINK-NEXT:    cmpwi 4, 2
+; CHECK-SINK-NEXT:    xsaddsp 0, 0, 0
+; CHECK-SINK-NEXT:    blt 0, .LBB1_5
+; CHECK-SINK-NEXT:  # %bb.1: # %bb6
+; CHECK-SINK-NEXT:    clrldi 4, 4, 32
+; CHECK-SINK-NEXT:    li 8, 7
+; CHECK-SINK-NEXT:    li 7, 1
+; CHECK-SINK-NEXT:    b .LBB1_3
+; CHECK-SINK-NEXT:    .p2align 4
+; CHECK-SINK-NEXT:  .LBB1_2: # %bb11
+; CHECK-SINK-NEXT:    #
+; CHECK-SINK-NEXT:    addi 10, 7, 7
+; CHECK-SINK-NEXT:    addi 7, 7, 1
+; CHECK-SINK-NEXT:    iselgt 9, 10, 8
+; CHECK-SINK-NEXT:    iselgt 8, 8, 10
+; CHECK-SINK-NEXT:    clrldi 8, 8, 32
+; CHECK-SINK-NEXT:    sub 8, 4, 8
+; CHECK-SINK-NEXT:    cmpld 8, 7
+; CHECK-SINK-NEXT:    mr 8, 9
+; CHECK-SINK-NEXT:    beq 0, .LBB1_5
+; CHECK-SINK-NEXT:  .LBB1_3: # %bb11
+; CHECK-SINK-NEXT:    #
+; CHECK-SINK-NEXT:    lfsu 1, 4(3)
+; CHECK-SINK-NEXT:    fcmpu 0, 1, 0
+; CHECK-SINK-NEXT:    ble 0, .LBB1_2
+; CHECK-SINK-NEXT:  # %bb.4:
+; CHECK-SINK-NEXT:    xsaddsp 0, 1, 1
+; CHECK-SINK-NEXT:    b .LBB1_2
+; CHECK-SINK-NEXT:  .LBB1_5: # %bb8
+; CHECK-SINK-NEXT:    stw 9, 0(5)
+; CHECK-SINK-NEXT:    stfs 0, 0(6)
+; CHECK-SINK-NEXT:    blr
+bb:
+  %tmp = load float, float* %arg, align 4
+  %tmp4 = fmul float %tmp, 2.000000e+00
+  %tmp5 = icmp sgt i32 %arg1, 1
+  br i1 %tmp5, label %bb6, label %bb8
+
+bb6:                                              ; preds = %bb
+  %tmp7 = zext i32 %arg1 to i64
+  br label %bb11
+
+bb8:                                              ; preds = %bb11, %bb
+  %tmp9 = phi float [ %tmp4, %bb ], [ %tmp19, %bb11 ]
+  %tmp10 = phi i32 [ 7, %bb ], [ %tmp22, %bb11 ]
+  store i32 %tmp10, i32* %arg2, align 4
+  store float %tmp9, float* %arg3, align 4
+  ret void
+
+bb11:                                             ; preds = %bb11, %bb6
+  %tmp12 = phi i64 [ 1, %bb6 ], [ %tmp23, %bb11 ]
+  %tmp13 = phi i32 [ 7, %bb6 ], [ %tmp22, %bb11 ]
+  %tmp14 = phi float [ %tmp4, %bb6 ], [ %tmp19, %bb11 ]
+  %tmp15 = getelementptr inbounds float, float* %arg, i64 %tmp12
+  %tmp16 = load float, float* %tmp15, align 4
+  %tmp17 = fcmp ogt float %tmp16, %tmp14
+  %tmp18 = fmul float %tmp16, 2.000000e+00
+  %tmp19 = select i1 %tmp17, float %tmp18, float %tmp14
+  %tmp20 = trunc i64 %tmp12 to i32
+  %tmp21 = add i32 %tmp20, 7
+  %tmp22 = select i1 %tmp17, i32 %tmp21, i32 %tmp13 ; %tmp21 is the true operand here...
+  %tmp23 = add nuw nsw i64 %tmp12, 1
+  %tmp24 = select i1 %tmp17, i32 %tmp13, i32 %tmp21 ; ...and the false operand here, under the same condition
+  %tmp24.ext = zext i32 %tmp24 to i64
+  %tmp25 = add nuw nsw i64 %tmp23, %tmp24.ext
+  %tmp26 = icmp eq i64 %tmp25, %tmp7
+  br i1 %tmp26, label %bb8, label %bb11
+}