diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -134,7 +134,8 @@ SelectGroups &ProfSIGroups); bool isConvertToBranchProfitableBase(const SmallVector &ASI); bool hasExpensiveColdOperand(const SmallVector &ASI); - void getOneUseSlice(Instruction *I, SmallVector &Slice); + void getOneUseSlice(Instruction *I, std::stack &Slice, + bool ForSinking = false); bool isSelectHighlyPredictable(const SelectInst *SI); bool checkLoopHeuristics(CostInfo LoopDepth[2]); bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups, @@ -270,7 +271,11 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { for (SelectGroup &ASI : ProfSIGroups) { - // TODO: eliminate the redundancy of logic transforming selects to branches + // The code transformation here is a modified version of the sinking + // transformation in CodeGenPrepare::optimizeSelectInst with a more + // aggressive strategy of which instructions to sink. + // + // TODO: eliminate the redundancy of logic transforming selects to branche // by removing CodeGenPrepare::optimizeSelectInst and optimizing here // selects for all cases (with and without profile information). @@ -283,13 +288,72 @@ // start: // %cmp = cmp uge i32 %a, %b // %cmp.frozen = freeze %cmp - // br i1 %cmp.frozen, label %select.end, label %select.false + // br i1 %cmp.frozen, label %select.true, label %select.false + // select.true: + // br label %select.end // select.false: // br label %select.end // select.end: - // %sel = phi i32 [ %c, %start ], [ %d, %select.false ] + // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] // // %cmp should be frozen, otherwise it may introduce undefined behavior. + // In addition, we may sink instructions that produce %c or %d into the + // destination(s) of the new branch. + // If the true or false blocks do not contain a sunken instruction, that + // block and its branch may be optimized away. In that case, one side of the + // first branch will point directly to select.end, and the corresponding PHI + // predecessor block will be the start block. + + // Find all the instructions that can be soundly sunk to the true/false + // blocks. These are instructions that are computed solely for producing the + // operands of the select instructions in the group and can be sunk without + // breaking the semantics of the LLVM IR (e.g., cannot sink instructions + // with side effects). + SmallVector, 2> TrueSlices, FalseSlices; + unsigned long maxTrueSliceLen = 0, maxFalseSliceLen = 0; + for (SelectInst *SI : ASI) { + // For each select, compute the sinkable dependence chains of the true and + // false operands. + if (auto *TI = dyn_cast(SI->getTrueValue())) { + std::stack TrueSlice; + getOneUseSlice(TI, TrueSlice, true); + maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size()); + TrueSlices.push_back(TrueSlice); + } + if (auto *FI = dyn_cast(SI->getFalseValue())) { + std::stack FalseSlice; + getOneUseSlice(FI, FalseSlice, true); + maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size()); + FalseSlices.push_back(FalseSlice); + } + } + // In the case of multiple select instructions in the same group, the order + // of non-dependent instructions (instructions of different dependence + // slices) in the true/false blocks appears to affect performance. + // Interleaving the slices seems to experimentally be the optimal approach. + // This interleaving scheduling allows for more ILP (with a natural downside + // of increasing a bit register pressure) compared to a simple ordering of + // one whole chain after another. One would expect that this ordering would + // not matter since the scheduling in the backend of the compiler would + // take care of it, but apparently the scheduler fails to deliver optimal + // ILP with a naive ordering here. + SmallVector TrueSlicesInterleaved, FalseSlicesInterleaved; + for (unsigned long IS = 0; IS < maxTrueSliceLen; ++IS) { + for (auto &S : TrueSlices) { + if (!S.empty()) { + TrueSlicesInterleaved.push_back(S.top()); + S.pop(); + } + } + } + for (unsigned long IS = 0; IS < maxFalseSliceLen; ++IS) { + for (auto &S : FalseSlices) { + if (!S.empty()) { + FalseSlicesInterleaved.push_back(S.top()); + S.pop(); + } + } + } // We split the block containing the select(s) into two blocks. SelectInst *SI = ASI.front(); @@ -302,24 +366,55 @@ StartBlock->getTerminator()->eraseFromParent(); // These are the new basic blocks for the conditional branch. - // For now, no instruction sinking to the true/false blocks. - // Thus both True and False blocks will be empty. + // At least one will become an actual new basic block. BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; - - // Use the 'false' side for a new input value to the PHI. - FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", - EndBlock->getParent(), EndBlock); - auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); - FalseBranch->setDebugLoc(SI->getDebugLoc()); - - // For the 'true' side the path originates from the start block from the - // point view of the new PHI. - TrueBlock = StartBlock; + BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; + if (!TrueSlicesInterleaved.empty()) { + TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink", + EndBlock->getParent(), EndBlock); + TrueBranch = BranchInst::Create(EndBlock, TrueBlock); + TrueBranch->setDebugLoc(LastSI->getDebugLoc()); + for (Instruction *TrueInst : TrueSlicesInterleaved) + TrueInst->moveBefore(TrueBranch); + } + if (!FalseSlicesInterleaved.empty()) { + FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink", + EndBlock->getParent(), EndBlock); + FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + FalseBranch->setDebugLoc(LastSI->getDebugLoc()); + for (Instruction *FalseInst : FalseSlicesInterleaved) + FalseInst->moveBefore(FalseBranch); + } + // If there was nothing to sink, then arbitrarily choose the 'false' side + // for a new input value to the PHI. + if (TrueBlock == FalseBlock) { + assert(TrueBlock == nullptr && + "Unexpected basic block transform while optimizing select"); + + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", + EndBlock->getParent(), EndBlock); + auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + FalseBranch->setDebugLoc(SI->getDebugLoc()); + } // Insert the real conditional branch based on the original condition. + // If we did not create a new block for one of the 'true' or 'false' paths + // of the condition, it means that side of the branch goes to the end block + // directly and the path originates from the start block from the point of + // view of the new PHI. BasicBlock *TT, *FT; - TT = EndBlock; - FT = FalseBlock; + if (TrueBlock == nullptr) { + TT = EndBlock; + FT = FalseBlock; + TrueBlock = StartBlock; + } else if (FalseBlock == nullptr) { + TT = TrueBlock; + FT = EndBlock; + FalseBlock = StartBlock; + } else { + TT = TrueBlock; + FT = FalseBlock; + } IRBuilder<> IB(SI); auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); @@ -489,18 +584,21 @@ else ColdI = dyn_cast(SI->getFalseValue()); if (ColdI) { - SmallVector ColdSlice; + std::stack ColdSlice; getOneUseSlice(ColdI, ColdSlice); - for (auto *ColdII : ColdSlice) - if (isExpensiveInst(ColdII)) + while (!ColdSlice.empty()) { + if (isExpensiveInst(ColdSlice.top())) return true; + ColdSlice.pop(); + } } } return false; } void SelectOptimize::getOneUseSlice(Instruction *I, - SmallVector &Slice) { + std::stack &Slice, + bool ForSinking) { std::queue Worklist; Worklist.push(I); SmallPtrSet Visited; @@ -516,9 +614,16 @@ if (!II->hasOneUse()) continue; - // Avoid considering instructions in outer or independent loops, or non-loop - // instructions (instructions with potentially much less frequency than the - // operand of the select). + // Cannot soundly sink instructions with side-effects. + // Terminator instructions cannot be sunk. + // Avoid sinking other select instructions (should be handled separetely). + if (ForSinking && + (II->isTerminator() || II->mayHaveSideEffects() || isa(II))) + continue; + + // Avoid considering/sinking instructions from outer or independent loops, + // or non-loop instructions (instructions with potentially much less + // frequency than the operand of the select). const Loop *LII = LI->getLoopFor(II->getParent()); if (L && L != LII && (!LII || !L->contains(LII))) continue; @@ -526,13 +631,15 @@ // Avoid phi nodes that are not part of a loop-carried dependence. // Could source from a cold basic block. if (isa(II)) { + if (ForSinking) + continue; if (!LII) continue; if (LII->getHeader() != II->getParent()) continue; } - Slice.push_back(II); + Slice.push(II); for (unsigned k = 0; k < II->getNumOperands(); ++k) if (auto *OpI = dyn_cast(II->getOperand(k))) Worklist.push(OpI); diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -158,12 +158,12 @@ define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) { ; CHECK-LABEL: expensive_val_operand1: ; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: testb $1, %dl ; CHECK-NEXT: jne .LBB9_1 -; CHECK-NEXT: # %bb.2: # %select.false -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: # %bb.2: # %select.end ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB9_1: +; CHECK-NEXT: .LBB9_1: # %select.true.sink ; CHECK-NEXT: movl (%rdi), %eax ; CHECK-NEXT: retq %load = load i32, i32* %a, align 8 @@ -189,12 +189,12 @@ define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) { ; CHECK-LABEL: expensive_val_operand3: ; CHECK: # %bb.0: +; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: jne .LBB11_1 -; CHECK-NEXT: # %bb.2: # %select.false -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: # %bb.2: # %select.end ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB11_1: +; CHECK-NEXT: .LBB11_1: # %select.true.sink ; CHECK-NEXT: addl (%rdi), %esi ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: retq @@ -254,7 +254,8 @@ ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: jbe .LBB13_4 -; CHECK-NEXT: # %bb.3: # in Loop: Header=BB13_2 Depth=1 +; CHECK-NEXT: # %bb.3: # %select.true.sink +; CHECK-NEXT: # in Loop: Header=BB13_2 Depth=1 ; CHECK-NEXT: subsd %xmm1, %xmm0 ; CHECK-NEXT: jmp .LBB13_4 ; CHECK-NEXT: .LBB13_5: # %for.cond.cleanup