Index: llvm/trunk/lib/Target/X86/X86InstrInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.h +++ llvm/trunk/lib/Target/X86/X86InstrInfo.h @@ -29,35 +29,44 @@ namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. - enum CondCode { - COND_A = 0, - COND_AE = 1, - COND_B = 2, - COND_BE = 3, - COND_E = 4, - COND_G = 5, - COND_GE = 6, - COND_L = 7, - COND_LE = 8, - COND_NE = 9, - COND_NO = 10, - COND_NP = 11, - COND_NS = 12, - COND_O = 13, - COND_P = 14, - COND_S = 15, - LAST_VALID_COND = COND_S, - - // Artificial condition codes. These are used by AnalyzeBranch - // to indicate a block terminated with two conditional branches to - // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, - // which can't be represented on x86 with a single condition. These - // are never used in MachineInstrs. - COND_NE_OR_P, - COND_NP_OR_E, +enum CondCode { + COND_A = 0, + COND_AE = 1, + COND_B = 2, + COND_BE = 3, + COND_E = 4, + COND_G = 5, + COND_GE = 6, + COND_L = 7, + COND_LE = 8, + COND_NE = 9, + COND_NO = 10, + COND_NP = 11, + COND_NS = 12, + COND_O = 13, + COND_P = 14, + COND_S = 15, + LAST_VALID_COND = COND_S, + + // Artificial condition codes. These are used by AnalyzeBranch + // to indicate a block terminated with two conditional branches to + // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, + // which can't be represented on x86 with a single condition. These + // are never used in MachineInstrs. + COND_NE_OR_P, + COND_NP_OR_E, + + // Artificial condition codes. These are used to represent the negation of + // the above two conditions. The only scenario in which we need these two + // conditions is when we try to reverse the above two conditions in order + // to remove redundant + // unconditional jumps. Note that both true and false bodies need to be + // available in order to correctly synthesize instructions for them. 
These are + // never used in MachineInstrs. + COND_E_AND_NP, // negate of COND_NE_OR_P + COND_P_AND_NE, // negate of COND_NP_OR_E - COND_INVALID - }; + COND_INVALID +}; // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp @@ -3805,6 +3805,10 @@ case X86::COND_NP: return X86::COND_P; case X86::COND_O: return X86::COND_NO; case X86::COND_NO: return X86::COND_O; + case X86::COND_NE_OR_P: return X86::COND_E_AND_NP; + case X86::COND_NP_OR_E: return X86::COND_P_AND_NE; + case X86::COND_E_AND_NP: return X86::COND_NE_OR_P; + case X86::COND_P_AND_NE: return X86::COND_NP_OR_E; } } @@ -3998,9 +4002,9 @@ MachineBasicBlock::iterator OldInst = I; BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) - .addMBB(UnCondBrIter->getOperand(0).getMBB()); + .addMBB(UnCondBrIter->getOperand(0).getMBB()); BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) - .addMBB(TargetBB); + .addMBB(TargetBB); OldInst->eraseFromParent(); UnCondBrIter->eraseFromParent(); @@ -4024,11 +4028,6 @@ assert(Cond.size() == 1); assert(TBB); - // Only handle the case where all conditional branches branch to the same - // destination. - if (TBB != I->getOperand(0).getMBB()) - return true; - // If the conditions are the same, we can leave them alone. X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); if (OldBranchCode == BranchCode) @@ -4037,17 +4036,40 @@ // If they differ, see if they fit one of the known patterns. Theoretically, // we could handle more patterns here, but we shouldn't expect to see them // if instruction selection has done a reasonable job. 
- if ((OldBranchCode == X86::COND_NP && - BranchCode == X86::COND_E) || - (OldBranchCode == X86::COND_E && - BranchCode == X86::COND_NP)) + auto NewTBB = I->getOperand(0).getMBB(); + if (TBB == NewTBB && + ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_E) || + (OldBranchCode == X86::COND_E && BranchCode == X86::COND_NP))) { BranchCode = X86::COND_NP_OR_E; - else if ((OldBranchCode == X86::COND_P && - BranchCode == X86::COND_NE) || - (OldBranchCode == X86::COND_NE && - BranchCode == X86::COND_P)) + } else if (TBB == NewTBB && + ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) { BranchCode = X86::COND_NE_OR_P; - else + } else if ((OldBranchCode == X86::COND_NE && BranchCode == X86::COND_NP) || + (OldBranchCode == X86::COND_P && BranchCode == X86::COND_E)) { + // X86::COND_P_AND_NE usually has two different branch destinations. + // + // JNP B1 + // JNE B2 + // B1: (fall-through) + // B2: + // + // Here this condition branches to B2 only if P && NE. It has another + // equivalent form: + // + // JE B1 + // JP B2 + // B1: (fall-through) + // B2: + // + // Similarly it branches to B2 only if NE && P. That is why this condition + // is named COND_P_AND_NE. + BranchCode = X86::COND_P_AND_NE; + } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) { + // See comments above for X86::COND_P_AND_NE. + BranchCode = X86::COND_E_AND_NP; + } else return true; // Update the MachineOperand. 
@@ -4156,6 +4178,13 @@ return Count; } +static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB) { + auto I = std::next(MBB->getIterator()); + if (I == MBB->getParent()->end()) + return nullptr; + return &*I; +} + unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef Cond, @@ -4172,6 +4201,9 @@ return 1; } + // If FBB is null, it is implied to be a fall-through block. + bool FallThru = FBB == nullptr; + // Conditional branch. unsigned Count = 0; X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); @@ -4190,13 +4222,39 @@ BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); ++Count; break; + case X86::COND_P_AND_NE: + // Use the next block of MBB as FBB if it is null. + if (FBB == nullptr) { + FBB = getFallThroughMBB(&MBB); + assert(FBB && "MBB cannot be the last block in function when the false " + "body is a fall-through."); + } + // Synthesize NEG_NP_OR_E with two branches. + BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(FBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); + ++Count; + break; + case X86::COND_E_AND_NP: + // Use the next block of MBB as FBB if it is null. + if (FBB == nullptr) { + FBB = getFallThroughMBB(&MBB); + assert(FBB && "MBB cannot be the last block in function when the false " + "body is a fall-through."); + } + // Synthesize NEG_NE_OR_P with two branches. + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); + ++Count; + break; default: { unsigned Opc = GetCondBranchFromCond(CC); BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); ++Count; } } - if (FBB) { + if (!FallThru) { // Two-way Conditional branch. Insert the second branch. 
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); ++Count; @@ -6717,8 +6775,6 @@ ReverseBranchCondition(SmallVectorImpl &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); X86::CondCode CC = static_cast(Cond[0].getImm()); - if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E) - return true; Cond[0].setImm(GetOppositeBranchCondition(CC)); return false; } Index: llvm/trunk/test/CodeGen/X86/block-placement.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/block-placement.ll +++ llvm/trunk/test/CodeGen/X86/block-placement.ll @@ -463,26 +463,23 @@ } define void @fpcmp_unanalyzable_branch(i1 %cond) { -; This function's CFG contains an unanalyzable branch that is likely to be -; split due to having a different high-probability predecessor. +; This function's CFG contains a once-unanalyzable branch (une on floating +; points). Now that it is analyzable, we should get the best layout in which each +; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is +; fall-through. ; CHECK: fpcmp_unanalyzable_branch ; CHECK: %entry +; CHECK: %entry.if.then_crit_edge +; CHECK: %if.then +; CHECK: %if.end ; CHECK: %exit -; CHECK-NOT: %if.then -; CHECK-NOT: %if.end -; CHECK-NOT: jne -; CHECK-NOT: jnp ; CHECK: jne ; CHECK-NEXT: jnp -; CHECK-NEXT: %if.then entry: ; Note that this branch must be strongly biased toward ; 'entry.if.then_crit_edge' to ensure that we would try to form a chain for -; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then'. It is the last edge in that -; chain which would violate the unanalyzable branch in 'exit', but we won't even -; try this trick unless 'if.then' is believed to almost always be reached from -; 'entry.if.then_crit_edge'. +; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end'. 
br i1 %cond, label %entry.if.then_crit_edge, label %lor.lhs.false, !prof !1 entry.if.then_crit_edge: @@ -494,7 +491,7 @@ exit: %cmp.i = fcmp une double 0.000000e+00, undef - br i1 %cmp.i, label %if.then, label %if.end + br i1 %cmp.i, label %if.then, label %if.end, !prof !3 if.then: %0 = phi i8 [ %.pre14, %entry.if.then_crit_edge ], [ undef, %exit ] @@ -507,6 +504,7 @@ } !1 = !{!"branch_weights", i32 1000, i32 1} +!3 = !{!"branch_weights", i32 1, i32 1000} declare i32 @f() declare i32 @g() @@ -665,11 +663,14 @@ ; Ensure that we can handle unanalyzable branches where the destination block ; gets selected as the optimal successor to merge. ; +; This branch is now analyzable and hence the destination block becomes the +; hotter one. The right order is entry->bar->exit->foo. +; ; CHECK: unanalyzable_branch_to_best_succ ; CHECK: %entry -; CHECK: %foo ; CHECK: %bar ; CHECK: %exit +; CHECK: %foo entry: ; Bias this branch toward bar to ensure we form that chain. Index: llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch2.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch2.ll +++ llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch2.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: fcmp_oeq ; CHECK: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jnp {{LBB.+_2}} +; CHECK-NEXT: jp {{LBB.+_1}} %1 = fcmp oeq float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -162,8 +162,7 @@ ; CHECK-LABEL: fcmp_une ; CHECK: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jp {{LBB.+_2}} -; CHECK-NEXT: jmp {{LBB.+_1}} +; CHECK-NEXT: jnp {{LBB.+_1}} %1 = fcmp une float %x, %y br i1 %1, label %bb1, label %bb2 bb2: Index: llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch3.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch3.ll +++ llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch3.ll @@ -17,7 +17,7 @@ ; CHECK: xorps %xmm1, %xmm1 ; CHECK-NEXT: 
ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jnp {{LBB.+_2}} +; CHECK-NEXT: jp {{LBB.+_1}} %1 = fcmp oeq float %x, 0.000000e+00 br i1 %1, label %bb1, label %bb2 bb2: @@ -338,8 +338,7 @@ ; CHECK: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jp {{LBB.+_2}} -; CHECK-NEXT: jmp {{LBB.+_1}} +; CHECK-NEXT: jnp {{LBB.+_1}} %1 = fcmp une float %x, 0.000000e+00 br i1 %1, label %bb1, label %bb2 bb2: Index: llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll +++ llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll @@ -19,12 +19,12 @@ ; addsd ... ; LBB0_2: -; CHECK: func +define float @func1(float %x, float %y) nounwind readnone optsize ssp { +; CHECK: func1 ; CHECK: jne [[LABEL:.*]] ; CHECK-NEXT: jp [[LABEL]] ; CHECK-NOT: jmp - -define float @func(float %x, float %y) nounwind readnone optsize ssp { +; entry: %0 = fpext float %x to double %1 = fpext float %y to double @@ -41,3 +41,30 @@ %.0 = fptrunc double %.0.in to float ret float %.0 } + +define float @func2(float %x, float %y) nounwind readnone optsize ssp { +; CHECK: func2 +; CHECK: jne [[LABEL:.*]] +; CHECK-NEXT: jp [[LABEL]] +; CHECK: %bb2 +; CHECK: %bb1 +; CHECK: jmp +; +entry: + %0 = fpext float %x to double + %1 = fpext float %y to double + %2 = fmul double %0, %1 + %3 = fcmp une double %2, 0.000000e+00 + br i1 %3, label %bb1, label %bb2, !prof !1 + +bb1: + %4 = fadd double %2, -1.000000e+00 + br label %bb2 + +bb2: + %.0.in = phi double [ %4, %bb1 ], [ %2, %entry ] + %.0 = fptrunc double %.0.in to float + ret float %.0 +} + +!1 = !{!"branch_weights", i32 1, i32 1000}