diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -179,8 +179,8 @@ // For a given source instruction, collect its backwards dependence slice // consisting of instructions exclusively computed for producing the operands // of the source instruction. - void getExclBackwardsSlice(Instruction *I, - SmallVector &Slice); + void getExclBackwardsSlice(Instruction *I, std::stack &Slice, + bool ForSinking = false); // Returns true if the condition of the select is highly predictable. bool isSelectHighlyPredictable(const SelectInst *SI); @@ -329,6 +329,10 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { for (SelectGroup &ASI : ProfSIGroups) { + // The code transformation here is a modified version of the sinking + // transformation in CodeGenPrepare::optimizeSelectInst with a more + // aggressive strategy of which instructions to sink. + // // TODO: eliminate the redundancy of logic transforming selects to branches // by removing CodeGenPrepare::optimizeSelectInst and optimizing here // selects for all cases (with and without profile information). @@ -342,13 +346,72 @@ // start: // %cmp = cmp uge i32 %a, %b // %cmp.frozen = freeze %cmp - // br i1 %cmp.frozen, label %select.end, label %select.false + // br i1 %cmp.frozen, label %select.true, label %select.false + // select.true: + // br label %select.end // select.false: // br label %select.end // select.end: - // %sel = phi i32 [ %c, %start ], [ %d, %select.false ] + // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] // // %cmp should be frozen, otherwise it may introduce undefined behavior. + // In addition, we may sink instructions that produce %c or %d into the + // destination(s) of the new branch. + // If the true or false blocks do not contain a sunken instruction, that + // block and its branch may be optimized away. In that case, one side of the + // first branch will point directly to select.end, and the corresponding PHI + // predecessor block will be the start block. + + // Find all the instructions that can be soundly sunk to the true/false + // blocks. These are instructions that are computed solely for producing the + // operands of the select instructions in the group and can be sunk without + // breaking the semantics of the LLVM IR (e.g., cannot sink instructions + // with side effects). + SmallVector, 2> TrueSlices, FalseSlices; + unsigned long maxTrueSliceLen = 0, maxFalseSliceLen = 0; + for (SelectInst *SI : ASI) { + // For each select, compute the sinkable dependence chains of the true and + // false operands. + if (auto *TI = dyn_cast(SI->getTrueValue())) { + std::stack TrueSlice; + getExclBackwardsSlice(TI, TrueSlice, true); + maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size()); + TrueSlices.push_back(TrueSlice); + } + if (auto *FI = dyn_cast(SI->getFalseValue())) { + std::stack FalseSlice; + getExclBackwardsSlice(FI, FalseSlice, true); + maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size()); + FalseSlices.push_back(FalseSlice); + } + } + // In the case of multiple select instructions in the same group, the order + // of non-dependent instructions (instructions of different dependence + // slices) in the true/false blocks appears to affect performance. + // Interleaving the slices seems to experimentally be the optimal approach. + // This interleaving scheduling allows for more ILP (with a natural downside + // of increasing a bit register pressure) compared to a simple ordering of + // one whole chain after another. One would expect that this ordering would + // not matter since the scheduling in the backend of the compiler would + // take care of it, but apparently the scheduler fails to deliver optimal + // ILP with a naive ordering here. + SmallVector TrueSlicesInterleaved, FalseSlicesInterleaved; + for (unsigned long IS = 0; IS < maxTrueSliceLen; ++IS) { + for (auto &S : TrueSlices) { + if (!S.empty()) { + TrueSlicesInterleaved.push_back(S.top()); + S.pop(); + } + } + } + for (unsigned long IS = 0; IS < maxFalseSliceLen; ++IS) { + for (auto &S : FalseSlices) { + if (!S.empty()) { + FalseSlicesInterleaved.push_back(S.top()); + S.pop(); + } + } + } // We split the block containing the select(s) into two blocks. SelectInst *SI = ASI.front(); @@ -374,24 +437,55 @@ } // These are the new basic blocks for the conditional branch. - // For now, no instruction sinking to the true/false blocks. - // Thus both True and False blocks will be empty. + // At least one will become an actual new basic block. BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; - - // Use the 'false' side for a new input value to the PHI. - FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", - EndBlock->getParent(), EndBlock); - auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); - FalseBranch->setDebugLoc(SI->getDebugLoc()); - - // For the 'true' side the path originates from the start block from the - // point view of the new PHI. - TrueBlock = StartBlock; + BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; + if (!TrueSlicesInterleaved.empty()) { + TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink", + EndBlock->getParent(), EndBlock); + TrueBranch = BranchInst::Create(EndBlock, TrueBlock); + TrueBranch->setDebugLoc(LastSI->getDebugLoc()); + for (Instruction *TrueInst : TrueSlicesInterleaved) + TrueInst->moveBefore(TrueBranch); + } + if (!FalseSlicesInterleaved.empty()) { + FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink", + EndBlock->getParent(), EndBlock); + FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + FalseBranch->setDebugLoc(LastSI->getDebugLoc()); + for (Instruction *FalseInst : FalseSlicesInterleaved) + FalseInst->moveBefore(FalseBranch); + } + // If there was nothing to sink, then arbitrarily choose the 'false' side + // for a new input value to the PHI. + if (TrueBlock == FalseBlock) { + assert(TrueBlock == nullptr && + "Unexpected basic block transform while optimizing select"); + + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", + EndBlock->getParent(), EndBlock); + auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + FalseBranch->setDebugLoc(SI->getDebugLoc()); + } // Insert the real conditional branch based on the original condition. + // If we did not create a new block for one of the 'true' or 'false' paths + // of the condition, it means that side of the branch goes to the end block + // directly and the path originates from the start block from the point of + // view of the new PHI. BasicBlock *TT, *FT; - TT = EndBlock; - FT = FalseBlock; + if (TrueBlock == nullptr) { + TT = EndBlock; + FT = FalseBlock; + TrueBlock = StartBlock; + } else if (FalseBlock == nullptr) { + TT = TrueBlock; + FT = EndBlock; + FalseBlock = StartBlock; + } else { + TT = TrueBlock; + FT = FalseBlock; + } IRBuilder<> IB(SI); auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); @@ -586,12 +680,13 @@ HotWeight = TrueWeight; } if (ColdI) { - SmallVector ColdSlice; + std::stack ColdSlice; getExclBackwardsSlice(ColdI, ColdSlice); InstructionCost SliceCost = 0; - for (auto *ColdII : ColdSlice) { - SliceCost += - TTI->getInstructionCost(ColdII, TargetTransformInfo::TCK_Latency); + while (!ColdSlice.empty()) { + SliceCost += TTI->getInstructionCost(ColdSlice.top(), + TargetTransformInfo::TCK_Latency); + ColdSlice.pop(); } // The colder the cold value operand of the select is the more expensive // the cmov becomes for computing the cold value operand every time. Thus, @@ -613,8 +708,9 @@ // (sufficiently-accurate in practice), we populate this set with the // instructions of the backwards dependence slice that only have one-use and // form an one-use chain that leads to the source instruction. -void SelectOptimize::getExclBackwardsSlice( - Instruction *I, SmallVector &Slice) { +void SelectOptimize::getExclBackwardsSlice(Instruction *I, + std::stack &Slice, + bool ForSinking) { SmallPtrSet Visited; std::queue Worklist; Worklist.push(I); @@ -630,13 +726,20 @@ if (!II->hasOneUse()) continue; + // Cannot soundly sink instructions with side-effects. + // Terminator or phi instructions cannot be sunk. + // Avoid sinking other select instructions (should be handled separetely). + if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() || + isa(II) || isa(II))) + continue; + // Avoid considering instructions with less frequency than the source // instruction (i.e., avoid colder code regions of the dependence slice). if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent())) continue; // Eligible one-use instruction added to the dependence slice. - Slice.push_back(II); + Slice.push(II); // Explore all the operands of the current instruction to expand the slice. for (unsigned k = 0; k < II->getNumOperands(); ++k) diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -105,20 +105,28 @@ ; If select group predictable, turn it into a branch. define i32 @weighted_select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) !prof !19 { ; CHECK-LABEL: @weighted_select_group( +; CHECK-NEXT: [[A1:%.*]] = add i32 [[A:%.*]], 1 ; CHECK-NEXT: [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] -; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]] -; CHECK: select.false: +; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]], !prof [[PROF16]] +; CHECK: select.true.sink: +; CHECK-NEXT: [[C1:%.*]] = add i32 [[C:%.*]], 1 +; CHECK-NEXT: br label [[SELECT_END:%.*]] +; CHECK: select.false.sink: +; CHECK-NEXT: [[B1:%.*]] = add i32 [[B:%.*]], 1 ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: -; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ] -; CHECK-NEXT: [[SEL2:%.*]] = phi i32 [ [[C:%.*]], [[TMP0]] ], [ [[A]], [[SELECT_FALSE]] ] +; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[A1]], [[SELECT_TRUE_SINK]] ], [ [[B1]], [[SELECT_FALSE_SINK]] ] +; CHECK-NEXT: [[SEL2:%.*]] = phi i32 [ [[C1]], [[SELECT_TRUE_SINK]] ], [ [[A1]], [[SELECT_FALSE_SINK]] ] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[SEL1]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL1]], [[SEL2]] ; CHECK-NEXT: ret i32 [[ADD]] ; - %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15 + %a1 = add i32 %a, 1 + %b1 = add i32 %b, 1 + %c1 = add i32 %c, 1 + %sel1 = select i1 %cmp, i32 %a1, i32 %b1, !prof !15 call void @llvm.dbg.value(metadata i32 %sel1, metadata !24, metadata !DIExpression()), !dbg !DILocation(scope: !23) - %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !15 + %sel2 = select i1 %cmp, i32 %c1, i32 %a1, !prof !15 %add = add i32 %sel1, %sel2 ret i32 %add } @@ -151,13 +159,13 @@ ; sink load define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) { ; CHECK-LABEL: @expensive_val_operand1( -; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8 ; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] -; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]] -; CHECK: select.false: +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]] +; CHECK: select.true.sink: +; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8 ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: -; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ] +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ] ; CHECK-NEXT: ret i32 [[SEL]] ; %load = load i32, i32* %a, align 8 @@ -181,14 +189,14 @@ ; into a branch with sinked dependence slice. define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) { ; CHECK-LABEL: @expensive_val_operand3( +; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]] +; CHECK: select.true.sink: ; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8 ; CHECK-NEXT: [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]] -; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] -; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]] -; CHECK: select.false: ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: -; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ] +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ] ; CHECK-NEXT: ret i32 [[SEL]] ; %load = load i32, i32* %a, align 8 @@ -242,14 +250,14 @@ ; CHECK-NEXT: [[X1:%.*]] = phi double [ [[X2:%.*]], [[SELECT_END]] ], [ [[X]], [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[R:%.*]] = load double, double* [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]] ; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt double [[X1]], [[R]] ; CHECK-NEXT: [[X2_FROZEN:%.*]] = freeze i1 [[CMP2]] -; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]], !prof [[PROF27:![0-9]+]] -; CHECK: select.false: +; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END]], !prof [[PROF27:![0-9]+]] +; CHECK: select.true.sink: +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]] ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: -; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[FOR_BODY]] ], [ [[X1]], [[SELECT_FALSE]] ] +; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[SELECT_TRUE_SINK]] ], [ [[X1]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]