diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -134,7 +134,8 @@ SelectGroups &ProfSIGroups); bool isConvertToBranchProfitableBase(const SmallVector &ASI); bool hasExpensiveColdOperand(const SmallVector &ASI); - void getOneUseSlice(Instruction *I, SmallVector &Slice); + void getOneUseSlice(Instruction *I, std::stack &Slice, + bool ForSinking = false); bool isSelectHighlyPredictable(const SelectInst *SI); bool checkLoopHeuristics(CostInfo LoopDepth[2]); bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups, @@ -270,7 +271,11 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { for (SelectGroup &ASI : ProfSIGroups) { - // TODO: eliminate the redundancy of logic transforming selects to branches + // The code transformation here is a modified version of the sinking + // transformation in CodeGenPrepare::optimizeSelectInst with a more + // aggressive strategy of which instructions to sink. + // + // TODO: eliminate the redundancy of logic transforming selects to branche // by removing CodeGenPrepare::optimizeSelectInst and optimizing here // selects for all cases (with and without profile information). @@ -283,13 +288,72 @@ // start: // %cmp = cmp uge i32 %a, %b // %cmp.frozen = freeze %cmp - // br i1 %cmp.frozen, label %select.end, label %select.false + // br i1 %cmp.frozen, label %select.true, label %select.false + // select.true: + // br label %select.end // select.false: // br label %select.end // select.end: - // %sel = phi i32 [ %c, %start ], [ %d, %select.false ] + // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] // // %cmp should be frozen, otherwise it may introduce undefined behavior. + // In addition, we may sink instructions that produce %c or %d into the + // destination(s) of the new branch. + // If the true or false blocks do not contain a sunken instruction, that + // block and its branch may be optimized away. In that case, one side of the + // first branch will point directly to select.end, and the corresponding PHI + // predecessor block will be the start block. + + // Find all the instructions that can be soundly sunk to the true/false + // blocks. These are instructions that are computed solely for producing the + // operands of the select instructions in the group and can be sunk without + // breaking the semantics of the LLVM IR (e.g., cannot sink instructions + // with side effects). + SmallVector, 2> TrueSlices, FalseSlices; + unsigned long maxTrueSliceLen = 0, maxFalseSliceLen = 0; + for (SelectInst *SI : ASI) { + // For each select, compute the sinkable dependence chains of the true and + // false operands. + if (auto *TI = dyn_cast(SI->getTrueValue())) { + std::stack TrueSlice; + getOneUseSlice(TI, TrueSlice, true); + maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size()); + TrueSlices.push_back(TrueSlice); + } + if (auto *FI = dyn_cast(SI->getFalseValue())) { + std::stack FalseSlice; + getOneUseSlice(FI, FalseSlice, true); + maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size()); + FalseSlices.push_back(FalseSlice); + } + } + // In the case of multiple select instructions in the same group, the order + // of non-dependent instructions (instructions of different dependence + // slices) in the true/false blocks appears to affect performance. + // Interleaving the slices seems to experimentally be the optimal approach. + // This interleaving scheduling allows for more ILP (with a natural downside + // of increasing a bit register pressure) compared to a simple ordering of + // one whole chain after another. One would expect that this ordering would + // not matter since the scheduling in the backend of the compiler would + // take care of it, but apparently the scheduler fails to deliver optimal + // ILP with a naive ordering here. + SmallVector TrueSlicesInterleaved, FalseSlicesInterleaved; + for (unsigned long IS = 0; IS < maxTrueSliceLen; ++IS) { + for (auto &S : TrueSlices) { + if (!S.empty()) { + TrueSlicesInterleaved.push_back(S.top()); + S.pop(); + } + } + } + for (unsigned long IS = 0; IS < maxFalseSliceLen; ++IS) { + for (auto &S : FalseSlices) { + if (!S.empty()) { + FalseSlicesInterleaved.push_back(S.top()); + S.pop(); + } + } + } // We split the block containing the select(s) into two blocks. SelectInst *SI = ASI.front(); @@ -315,24 +379,55 @@ } // These are the new basic blocks for the conditional branch. - // For now, no instruction sinking to the true/false blocks. - // Thus both True and False blocks will be empty. + // At least one will become an actual new basic block. BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; - - // Use the 'false' side for a new input value to the PHI. - FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", - EndBlock->getParent(), EndBlock); - auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); - FalseBranch->setDebugLoc(SI->getDebugLoc()); - - // For the 'true' side the path originates from the start block from the - // point view of the new PHI. - TrueBlock = StartBlock; + BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; + if (!TrueSlicesInterleaved.empty()) { + TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink", + EndBlock->getParent(), EndBlock); + TrueBranch = BranchInst::Create(EndBlock, TrueBlock); + TrueBranch->setDebugLoc(LastSI->getDebugLoc()); + for (Instruction *TrueInst : TrueSlicesInterleaved) + TrueInst->moveBefore(TrueBranch); + } + if (!FalseSlicesInterleaved.empty()) { + FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink", + EndBlock->getParent(), EndBlock); + FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + FalseBranch->setDebugLoc(LastSI->getDebugLoc()); + for (Instruction *FalseInst : FalseSlicesInterleaved) + FalseInst->moveBefore(FalseBranch); + } + // If there was nothing to sink, then arbitrarily choose the 'false' side + // for a new input value to the PHI. + if (TrueBlock == FalseBlock) { + assert(TrueBlock == nullptr && + "Unexpected basic block transform while optimizing select"); + + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", + EndBlock->getParent(), EndBlock); + auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + FalseBranch->setDebugLoc(SI->getDebugLoc()); + } // Insert the real conditional branch based on the original condition. + // If we did not create a new block for one of the 'true' or 'false' paths + // of the condition, it means that side of the branch goes to the end block + // directly and the path originates from the start block from the point of + // view of the new PHI. BasicBlock *TT, *FT; - TT = EndBlock; - FT = FalseBlock; + if (TrueBlock == nullptr) { + TT = EndBlock; + FT = FalseBlock; + TrueBlock = StartBlock; + } else if (FalseBlock == nullptr) { + TT = TrueBlock; + FT = EndBlock; + FalseBlock = StartBlock; + } else { + TT = TrueBlock; + FT = FalseBlock; + } IRBuilder<> IB(SI); auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); @@ -505,18 +600,21 @@ else ColdI = dyn_cast(SI->getFalseValue()); if (ColdI) { - SmallVector ColdSlice; + std::stack ColdSlice; getOneUseSlice(ColdI, ColdSlice); - for (auto *ColdII : ColdSlice) - if (isExpensiveInst(ColdII)) + while (!ColdSlice.empty()) { + if (isExpensiveInst(ColdSlice.top())) return true; + ColdSlice.pop(); + } } } return false; } void SelectOptimize::getOneUseSlice(Instruction *I, - SmallVector &Slice) { + std::stack &Slice, + bool ForSinking) { std::queue Worklist; Worklist.push(I); SmallPtrSet Visited; @@ -532,9 +630,16 @@ if (!II->hasOneUse()) continue; - // Avoid considering instructions in outer or independent loops, or non-loop - // instructions (instructions with potentially much less frequency than the - // operand of the select). + // Cannot soundly sink instructions with side-effects. + // Terminator instructions cannot be sunk. + // Avoid sinking other select instructions (should be handled separetely). + if (ForSinking && + (II->isTerminator() || II->mayHaveSideEffects() || isa(II))) + continue; + + // Avoid considering/sinking instructions from outer or independent loops, + // or non-loop instructions (instructions with potentially much less + // frequency than the operand of the select). const Loop *LII = LI->getLoopFor(II->getParent()); if (L && L != LII && (!LII || !L->contains(LII))) continue; @@ -542,13 +647,15 @@ // Avoid phi nodes that are not part of a loop-carried dependence. // Could source from a cold basic block. if (isa(II)) { + if (ForSinking) + continue; if (!LII) continue; if (LII->getHeader() != II->getParent()) continue; } - Slice.push_back(II); + Slice.push(II); for (unsigned k = 0; k < II->getNumOperands(); ++k) if (auto *OpI = dyn_cast(II->getOperand(k))) Worklist.push(OpI); diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -151,13 +151,13 @@ ; sink load define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) { ; CHECK-LABEL: @expensive_val_operand1( -; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8 ; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] -; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]] -; CHECK: select.false: +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]] +; CHECK: select.true.sink: +; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8 ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: -; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ] +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ] ; CHECK-NEXT: ret i32 [[SEL]] ; %load = load i32, i32* %a, align 8 @@ -181,14 +181,14 @@ ; into a branch with sinked dependence slice. define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) { ; CHECK-LABEL: @expensive_val_operand3( +; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]] +; CHECK: select.true.sink: ; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8 ; CHECK-NEXT: [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]] -; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] -; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]] -; CHECK: select.false: ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: -; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ] +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ] ; CHECK-NEXT: ret i32 [[SEL]] ; %load = load i32, i32* %a, align 8 @@ -242,14 +242,14 @@ ; CHECK-NEXT: [[X1:%.*]] = phi double [ [[X2:%.*]], [[SELECT_END]] ], [ [[X]], [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[R:%.*]] = load double, double* [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]] ; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt double [[X1]], [[R]] ; CHECK-NEXT: [[X2_FROZEN:%.*]] = freeze i1 [[CMP2]] -; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]], !prof [[PROF20:![0-9]+]] -; CHECK: select.false: +; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END]], !prof [[PROF27:![0-9]+]] +; CHECK: select.true.sink: +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]] ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: -; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[FOR_BODY]] ], [ [[X1]], [[SELECT_FALSE]] ] +; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[SELECT_TRUE_SINK]] ], [ [[X1]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]