diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -52,6 +53,7 @@
           "Number of select groups not converted due to unpredictability");
 STATISTIC(NumSelectConvertedLoop,
           "Number of select groups converted due to loop-level analysis");
+STATISTIC(NumSelectsConverted, "Number of selects converted");
 
 static cl::opt<unsigned> ColdOperandThreshold(
     "cold-operand-threshold",
@@ -132,7 +134,8 @@
                                   SelectGroups &ProfSIGroups);
   bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
   bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
-  void getOneUseSlice(Instruction *I, SmallVector<Instruction *, 2> &Slice);
+  void getOneUseSlice(Instruction *I, std::stack<Instruction *> &Slice,
+                      bool ForSinking = false);
   bool isSelectHighlyPredictable(const SelectInst *SI);
   bool checkLoopHeuristics(CostInfo LoopDepth[2]);
   bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
@@ -248,8 +251,195 @@
   }
 }
 
+/// If \p isTrue is true, return the true value of \p SI, otherwise return the
+/// false value of \p SI. If the true/false value of \p SI is defined by any
+/// select instructions in \p Selects, look through the defining select
+/// instruction until the true/false value is not defined in \p Selects.
+static Value *
+getTrueOrFalseValue(SelectInst *SI, bool isTrue,
+                    const SmallPtrSet<const Instruction *, 2> &Selects) {
+  Value *V = nullptr;
+  for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
+       DefSI = dyn_cast<SelectInst>(V)) {
+    assert(DefSI->getCondition() == SI->getCondition() &&
+           "The condition of DefSI does not match with SI");
+    V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
+  }
+  assert(V && "Failed to get select true/false value");
+  return V;
+}
+
 void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
-  llvm_unreachable("Unimplemented");
+  for (SelectGroup &ASI : ProfSIGroups) {
+    // The code transformation here is a modified version of the sinking
+    // transformation in CodeGenPrepare::optimizeSelectInst with a more
+    // aggressive strategy of which instructions to sink.
+    //
+    // TODO: eliminate the redundancy by removing
+    // CodeGenPrepare::optimizeSelectInst and optimizing selects here for all
+    // cases (with and without profile information).
+
+    // Transform a sequence like this:
+    //    start:
+    //       %cmp = cmp uge i32 %a, %b
+    //       %sel = select i1 %cmp, i32 %c, i32 %d
+    //
+    // Into:
+    //    start:
+    //       %cmp = cmp uge i32 %a, %b
+    //       %cmp.frozen = freeze %cmp
+    //       br i1 %cmp.frozen, label %select.true, label %select.false
+    //    select.true:
+    //       br label %select.end
+    //    select.false:
+    //       br label %select.end
+    //    select.end:
+    //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
+    //
+    // %cmp should be frozen, otherwise it may introduce undefined behavior.
+    // In addition, we may sink instructions that produce %c or %d into the
+    // destination(s) of the new branch.
+    // If the true or false blocks do not contain a sunken instruction, that
+    // block and its branch may be optimized away. In that case, one side of
+    // the first branch will point directly to select.end, and the
+    // corresponding PHI predecessor block will be the start block.
+
+    // Find all the instructions that can be soundly sunk to the true/false
+    // blocks.
+    // These are instructions that are computed solely for producing the
+    // operands of the select instructions in the group and can be sunk
+    // without breaking the semantics of the LLVM IR (e.g., cannot sink
+    // instructions with side effects).
+    SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
+    unsigned long maxTrueSliceLen = 0, maxFalseSliceLen = 0;
+    for (SelectInst *SI : ASI) {
+      // For each select, compute the sinkable dependence chains of the true
+      // and false operands.
+      if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
+        std::stack<Instruction *> TrueSlice;
+        getOneUseSlice(TI, TrueSlice, true);
+        maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
+        TrueSlices.push_back(TrueSlice);
+      }
+      if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
+        std::stack<Instruction *> FalseSlice;
+        getOneUseSlice(FI, FalseSlice, true);
+        maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
+        FalseSlices.push_back(FalseSlice);
+      }
+    }
+    // In the case of multiple select instructions in the same group, the
+    // order of non-dependent instructions (instructions of different
+    // dependence slices) in the true/false blocks appears to affect
+    // performance. Interleaving the slices seems to experimentally be the
+    // optimal approach. This interleaved scheduling allows for more ILP
+    // (with the natural downside of slightly increased register pressure)
+    // compared to a simple ordering of one whole chain after another. One
+    // would expect that this ordering would not matter since the backend
+    // scheduler should take care of it, but apparently the scheduler fails
+    // to deliver optimal ILP with a naive ordering here.
+    SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
+    for (unsigned long IS = 0; IS < maxTrueSliceLen; ++IS) {
+      for (auto &S : TrueSlices) {
+        if (!S.empty()) {
+          TrueSlicesInterleaved.push_back(S.top());
+          S.pop();
+        }
+      }
+    }
+    for (unsigned long IS = 0; IS < maxFalseSliceLen; ++IS) {
+      for (auto &S : FalseSlices) {
+        if (!S.empty()) {
+          FalseSlicesInterleaved.push_back(S.top());
+          S.pop();
+        }
+      }
+    }
+
+    // We split the block containing the select(s) into two blocks.
+    SelectInst *SI = ASI.front();
+    SelectInst *LastSI = ASI.back();
+    BasicBlock *StartBlock = SI->getParent();
+    BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
+    BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
+    // Delete the unconditional branch that was just created by the split.
+    StartBlock->getTerminator()->eraseFromParent();
+
+    // These are the new basic blocks for the conditional branch.
+    // At least one will become an actual new basic block.
+    BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
+    BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
+    if (!TrueSlicesInterleaved.empty()) {
+      TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
+                                     EndBlock->getParent(), EndBlock);
+      TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
+      TrueBranch->setDebugLoc(LastSI->getDebugLoc());
+      for (Instruction *TrueInst : TrueSlicesInterleaved)
+        TrueInst->moveBefore(TrueBranch);
+    }
+    if (!FalseSlicesInterleaved.empty()) {
+      FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
+                                      EndBlock->getParent(), EndBlock);
+      FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+      FalseBranch->setDebugLoc(LastSI->getDebugLoc());
+      for (Instruction *FalseInst : FalseSlicesInterleaved)
+        FalseInst->moveBefore(FalseBranch);
+    }
+    // If there was nothing to sink, then arbitrarily choose the 'false' side
+    // for a new input value to the PHI.
+    if (TrueBlock == FalseBlock) {
+      assert(TrueBlock == nullptr &&
+             "Unexpected basic block transform while optimizing select");
+
+      FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
+                                      EndBlock->getParent(), EndBlock);
+      auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+      FalseBranch->setDebugLoc(SI->getDebugLoc());
+    }
+
+    // Insert the real conditional branch based on the original condition.
+    // If we did not create a new block for one of the 'true' or 'false' paths
+    // of the condition, it means that side of the branch goes to the end
+    // block directly and the path originates from the start block from the
+    // point of view of the new PHI.
+    BasicBlock *TT, *FT;
+    if (TrueBlock == nullptr) {
+      TT = EndBlock;
+      FT = FalseBlock;
+      TrueBlock = StartBlock;
+    } else if (FalseBlock == nullptr) {
+      TT = TrueBlock;
+      FT = EndBlock;
+      FalseBlock = StartBlock;
+    } else {
+      TT = TrueBlock;
+      FT = FalseBlock;
+    }
+    IRBuilder<> IB(SI);
+    auto *CondFr =
+        IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
+    IB.CreateCondBr(CondFr, TT, FT, SI);
+
+    SmallPtrSet<const Instruction *, 2> INS;
+    INS.insert(ASI.begin(), ASI.end());
+    // Use a reverse iterator because a later select may use the value of an
+    // earlier select, and we need to propagate the value through that earlier
+    // select to get the PHI operand.
+    for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
+      SelectInst *SI = *It;
+      // The select itself is replaced with a PHI Node.
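+      // For example, a select "%sel = select i1 %cmp, i32 %c, i32 %d" whose
+      // false-side slice was not sunk would become, roughly (illustrative
+      // sketch only, block names as created above):
+      //   %sel = phi i32 [ %c, %select.true.sink ], [ %d, %StartBlock ]
+      // getTrueOrFalseValue looks through any other selects of the group
+      // that feed the incoming values.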
+      PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
+      PN->takeName(SI);
+      PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
+      PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
+      PN->setDebugLoc(SI->getDebugLoc());
+
+      SI->replaceAllUsesWith(PN);
+      SI->eraseFromParent();
+      INS.erase(SI);
+      ++NumSelectsConverted;
+    }
+  }
 }
 
 void SelectOptimize::collectSelectGroups(BasicBlock &BB,
@@ -394,18 +584,21 @@
     else
       ColdI = dyn_cast<Instruction>(SI->getFalseValue());
     if (ColdI) {
-      SmallVector<Instruction *, 2> ColdSlice;
+      std::stack<Instruction *> ColdSlice;
       getOneUseSlice(ColdI, ColdSlice);
-      for (auto *ColdII : ColdSlice)
-        if (isExpensiveInst(ColdII))
+      while (!ColdSlice.empty()) {
+        if (isExpensiveInst(ColdSlice.top()))
           return true;
+        ColdSlice.pop();
+      }
     }
   }
   return false;
 }
 
 void SelectOptimize::getOneUseSlice(Instruction *I,
-                                    SmallVector<Instruction *, 2> &Slice) {
+                                    std::stack<Instruction *> &Slice,
+                                    bool ForSinking) {
   std::queue<Instruction *> Worklist;
   Worklist.push(I);
   SmallPtrSet<Instruction *, 2> Visited;
@@ -421,9 +614,16 @@
     if (!II->hasOneUse())
       continue;
 
-    // Avoid considering instructions in outer or independent loops, or non-loop
-    // instructions (instructions with potentially much less frequency than the
-    // operand of the select).
+    // Cannot soundly sink instructions with side effects.
+    // Terminator instructions cannot be sunk.
+    // Avoid sinking other select instructions (should be handled separately).
+    if (ForSinking &&
+        (II->isTerminator() || II->mayHaveSideEffects() || isa<SelectInst>(II)))
+      continue;
+
+    // Avoid considering/sinking instructions from outer or independent loops,
+    // or non-loop instructions (instructions with potentially much less
+    // frequency than the operand of the select).
     const Loop *LII = LI->getLoopFor(II->getParent());
     if (L && L != LII && (!LII || !L->contains(LII)))
       continue;
@@ -431,13 +631,15 @@
     // Avoid phi nodes that are not part of a loop-carried dependence.
     // Could source from a cold basic block.
    if (isa<PHINode>(II)) {
+      if (ForSinking)
+        continue;
      if (!LII)
        continue;
      if (LII->getHeader() != II->getParent())
        continue;
    }
 
-    Slice.push_back(II);
+    Slice.push(II);
    for (unsigned k = 0; k < II->getNumOperands(); ++k)
      if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
        Worklist.push(OpI);
diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/select-optimize.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -disable-select-optimize=false -x86-cmov-converter=false -disable-cgp-select2branch=true < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test base heuristic 1:
+;; highly-biased selects assumed to be highly predictable, converted to branches
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; If a select is obviously predictable, turn it into a branch.
+define i32 @weighted_select1(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: weighted_select1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %select.false
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:  .LBB0_2: # %select.end
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  ret i32 %sel
+}
+
+; If a select is obviously predictable (reversed profile weights),
+; turn it into a branch.
+define i32 @weighted_select2(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: weighted_select2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    je .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %select.end
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB1_1: # %select.false
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
+  ret i32 %sel
+}
+
+; Not obviously predictable select.
+define i32 @weighted_select3(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: weighted_select3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17
+  ret i32 %sel
+}
+
+; Unpredictable select should not form a branch.
+define i32 @unpred_select(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: unpred_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !unpredictable !20
+  ret i32 %sel
+}
+
+; Predictable select in a function with the optsize attribute should not form
+; a branch.
+define i32 @weighted_select_optsize(i32 %a, i32 %b, i1 %cmp) optsize {
+; CHECK-LABEL: weighted_select_optsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  ret i32 %sel
+}
+
+define i32 @weighted_select_pgso(i32 %a, i32 %b, i1 %cmp) !prof !14 {
+; CHECK-LABEL: weighted_select_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  ret i32 %sel
+}
+
+; If two selects in a row are predictable, turn them into branches.
+define i32 @weighted_selects(i32 %a, i32 %b) !prof !19 {
+; CHECK-LABEL: weighted_selects:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    jne .LBB6_2
+; CHECK-NEXT:  # %bb.1: # %select.false
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:  .LBB6_2: # %select.end
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    jne .LBB6_4
+; CHECK-NEXT:  # %bb.3: # %select.false2
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:  .LBB6_4: # %select.end1
+; CHECK-NEXT:    retq
+  %cmp = icmp ne i32 %a, 0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  %cmp1 = icmp ne i32 %sel, 0
+  %sel1 = select i1 %cmp1, i32 %b, i32 %a, !prof !15
+  ret i32 %sel1
+}
+
+; If the select group is predictable, turn it into a branch.
+define i32 @weighted_select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) !prof !19 {
+; CHECK-LABEL: weighted_select_group:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    jne .LBB7_1
+; CHECK-NEXT:  # %bb.2: # %select.false
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    addl %edi, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB7_1:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    retq
+  %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !15
+  %add = add i32 %sel1, %sel2
+  ret i32 %add
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test base heuristic 2:
+;; look for expensive instructions in the one-use slice of the cold path
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; A select with a cold one-use load value operand should form a branch and
+; sink the load.
+define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) {
+; CHECK-LABEL: expensive_val_operand1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    jne .LBB8_1
+; CHECK-NEXT:  # %bb.2: # %select.end
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB8_1: # %select.true.sink
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    retq
+  %load = load i32, i32* %a, align 8
+  %sel = select i1 %cmp, i32 %load, i32 %y, !prof !17
+  ret i32 %sel
+}
+
+; Expensive hot value operand and cheap cold value operand.
+define i32 @expensive_val_operand2(i32* nocapture %a, i32 %x, i1 %cmp) {
+; CHECK-LABEL: expensive_val_operand2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel (%rdi), %eax
+; CHECK-NEXT:    retq
+  %load = load i32, i32* %a, align 8
+  %sel = select i1 %cmp, i32 %x, i32 %load, !prof !17
+  ret i32 %sel
+}
+
+; A cold value operand with a load in its one-use dependence slice should
+; result in a branch with the dependence slice sunk.
+define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) {
+; CHECK-LABEL: expensive_val_operand3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    jne .LBB10_1
+; CHECK-NEXT:  # %bb.2: # %select.end
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB10_1: # %select.true.sink
+; CHECK-NEXT:    addl (%rdi), %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
+  %load = load i32, i32* %a, align 8
+  %x = add i32 %load, %b
+  %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17
+  ret i32 %sel
+}
+
+; Multiple uses of the load value operand.
+define i32 @expensive_val_operand4(i32 %a, i32* nocapture %b, i32 %x, i1 %cmp) {
+; CHECK-LABEL: expensive_val_operand4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    cmovel %eax, %edx
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    retq
+  %load = load i32, i32* %b, align 4
+  %sel = select i1 %cmp, i32 %x, i32 %load
+  %add = add i32 %sel, %load
+  ret i32 %add
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test loop heuristic: loop-level critical-path analysis
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Use of a cmov in this test would put a load and an fsub on the critical path.
+;; Loop-level analysis should decide to form a branch.
+;;
+;; double cmov_on_critical_path(int n, double x, double *a) {
+;;   for (int i = 0; i < n; i++) {
+;;     double r = a[i];
+;;     if (x > r)
+;;       // 50% of iterations
+;;       x -= r;
+;;   }
+;;   return x;
+;; }
+define double @cmov_on_critical_path(i32 %n, double %x, double* nocapture %a) {
+; CHECK-LABEL: cmov_on_critical_path:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    jle .LBB12_5
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    jmp .LBB12_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB12_4: # %select.end
+; CHECK-NEXT:    # in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT:    addq $8, %rsi
+; CHECK-NEXT:    decq %rax
+; CHECK-NEXT:    je .LBB12_5
+; CHECK-NEXT:  .LBB12_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    ucomisd %xmm1, %xmm0
+; CHECK-NEXT:    jbe .LBB12_4
+; CHECK-NEXT:  # %bb.3: # %select.true.sink
+; CHECK-NEXT:    # in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    jmp .LBB12_4
+; CHECK-NEXT:  .LBB12_5: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %entry
+  ret double %x
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x1 = phi double [ %x2, %for.body ], [ %x, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  %r = load double, double* %arrayidx, align 8
+  %sub = fsub double %x1, %r
+  %cmp2 = fcmp ogt double %x1, %r
+  %x2 = select i1 %cmp2, double %sub, double %x1, !prof !18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:                                         ; preds = %for.body
+  ret double %x2
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
+!15 = !{!"branch_weights", i32 1, i32 100}
+!16 = !{!"branch_weights", i32 100, i32 1}
+!17 = !{!"branch_weights", i32 1, i32 99}
+!18 = !{!"branch_weights", i32 50, i32 50}
+!19 = !{!"function_entry_count", i64 100}
+!20 = !{}
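+
+; Illustrative sketch (not autogenerated; value/block names are assumed):
+; for @expensive_val_operand3, SelectOptimize is expected to sink the one-use
+; slice {%load, %x} into the true side and produce IR roughly of the form:
+;
+;   %sel.frozen = freeze i1 %cmp
+;   br i1 %sel.frozen, label %select.true.sink, label %select.end
+; select.true.sink:
+;   %load = load i32, i32* %a, align 8
+;   %x = add i32 %load, %b
+;   br label %select.end
+; select.end:
+;   %sel = phi i32 [ %x, %select.true.sink ], [ %y, %0 ]
+;   ret i32 %sel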