Index: include/llvm/Target/TargetInstrInfo.h =================================================================== --- include/llvm/Target/TargetInstrInfo.h +++ include/llvm/Target/TargetInstrInfo.h @@ -452,6 +452,8 @@ /// If AllowModify is true, then this routine is allowed to modify the basic /// block (e.g. delete instructions after the unconditional branch). /// + /// The CFG information in MBB.Predecessors and MBB.Successors must be valid + /// before calling this function. virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, @@ -521,6 +523,9 @@ /// cases where AnalyzeBranch doesn't apply because there was no original /// branch to analyze. At least this much must be implemented, else tail /// merging needs to be disabled. + /// + /// The CFG information in MBB.Predecessors and MBB.Successors must be valid + /// before calling this function. virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, Index: lib/CodeGen/TailDuplication.cpp =================================================================== --- lib/CodeGen/TailDuplication.cpp +++ lib/CodeGen/TailDuplication.cpp @@ -749,9 +749,6 @@ TII->RemoveBranch(*PredBB); - if (PredTBB) - TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); - if (!PredBB->isSuccessor(NewTarget)) PredBB->replaceSuccessor(TailBB, NewTarget); else { @@ -759,6 +756,9 @@ assert(PredBB->succ_size() <= 1); } + if (PredTBB) + TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); + TDBBs.push_back(PredBB); } return Changed; Index: lib/Target/X86/X86InstrInfo.h =================================================================== --- lib/Target/X86/X86InstrInfo.h +++ lib/Target/X86/X86InstrInfo.h @@ -29,54 +29,54 @@ namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. 
- enum CondCode { - COND_A = 0, - COND_AE = 1, - COND_B = 2, - COND_BE = 3, - COND_E = 4, - COND_G = 5, - COND_GE = 6, - COND_L = 7, - COND_LE = 8, - COND_NE = 9, - COND_NO = 10, - COND_NP = 11, - COND_NS = 12, - COND_O = 13, - COND_P = 14, - COND_S = 15, - LAST_VALID_COND = COND_S, - - // Artificial condition codes. These are used by AnalyzeBranch - // to indicate a block terminated with two conditional branches to - // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, - // which can't be represented on x86 with a single condition. These - // are never used in MachineInstrs. - COND_NE_OR_P, - COND_NP_OR_E, - - COND_INVALID - }; - - // Turn condition code into conditional branch opcode. - unsigned GetCondBranchFromCond(CondCode CC); - - /// \brief Return a set opcode for the given condition and whether it has - /// a memory operand. - unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); - - /// \brief Return a cmov opcode for the given condition, register size in - /// bytes, and operand type. - unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, - bool HasMemoryOperand = false); - - // Turn CMov opcode into condition code. - CondCode getCondFromCMovOpc(unsigned Opc); - - /// GetOppositeBranchCondition - Return the inverse of the specified cond, - /// e.g. turning COND_E to COND_NE. - CondCode GetOppositeBranchCondition(CondCode CC); +enum CondCode { + COND_A = 0, + COND_AE = 1, + COND_B = 2, + COND_BE = 3, + COND_E = 4, + COND_G = 5, + COND_GE = 6, + COND_L = 7, + COND_LE = 8, + COND_NE = 9, + COND_NO = 10, + COND_NP = 11, + COND_NS = 12, + COND_O = 13, + COND_P = 14, + COND_S = 15, + LAST_VALID_COND = COND_S, + + // Artificial condition codes. These are used by AnalyzeBranch + // to indicate a block terminated with two conditional branches that together + // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE, + // which can't be represented on x86 with a single condition. 
These + // are never used in MachineInstrs and are inverses of one another. + COND_NE_OR_P, + COND_E_AND_NP, + + COND_INVALID +}; + +// Turn condition code into conditional branch opcode. +unsigned GetCondBranchFromCond(CondCode CC); + +/// \brief Return a set opcode for the given condition and whether it has +/// a memory operand. +unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); + +/// \brief Return a cmov opcode for the given condition, register size in +/// bytes, and operand type. +unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand = false); + +// Turn CMov opcode into condition code. +CondCode getCondFromCMovOpc(unsigned Opc); + +/// GetOppositeBranchCondition - Return the inverse of the specified cond, +/// e.g. turning COND_E to COND_NE. +CondCode GetOppositeBranchCondition(CondCode CC); } // end namespace X86; Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -3807,6 +3807,8 @@ case X86::COND_NP: return X86::COND_P; case X86::COND_O: return X86::COND_NO; case X86::COND_NO: return X86::COND_O; + case X86::COND_NE_OR_P: return X86::COND_E_AND_NP; + case X86::COND_E_AND_NP: return X86::COND_NE_OR_P; } } @@ -3914,6 +3916,23 @@ return !isPredicated(MI); } +// Given an MBB and its TBB, find the FBB which was a fallthrough MBB (it may not +// be a fallthrough MBB now due to layout changes). Return nullptr if the +// fallthrough MBB cannot be identified. +static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB, +                                            MachineBasicBlock *TBB) { + MachineBasicBlock *FallthroughBB = nullptr; + for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) { + if ((*SI)->isEHPad() || *SI == TBB) + continue; + // Return a nullptr if we found more than one fallthrough successor. 
+ if (FallthroughBB) + return nullptr; + FallthroughBB = *SI; + } + return FallthroughBB; +} + bool X86InstrInfo::AnalyzeBranchImpl( MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, @@ -4026,30 +4045,45 @@ assert(Cond.size() == 1); assert(TBB); - // Only handle the case where all conditional branches branch to the same - // destination. - if (TBB != I->getOperand(0).getMBB()) - return true; - // If the conditions are the same, we can leave them alone. X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); - if (OldBranchCode == BranchCode) + auto NewTBB = I->getOperand(0).getMBB(); + if (OldBranchCode == BranchCode && TBB == NewTBB) continue; // If they differ, see if they fit one of the known patterns. Theoretically, // we could handle more patterns here, but we shouldn't expect to see them // if instruction selection has done a reasonable job. - if ((OldBranchCode == X86::COND_NP && - BranchCode == X86::COND_E) || - (OldBranchCode == X86::COND_E && - BranchCode == X86::COND_NP)) - BranchCode = X86::COND_NP_OR_E; - else if ((OldBranchCode == X86::COND_P && - BranchCode == X86::COND_NE) || - (OldBranchCode == X86::COND_NE && - BranchCode == X86::COND_P)) + if (TBB == NewTBB && + ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) { BranchCode = X86::COND_NE_OR_P; - else + } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) { + if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB))) + return true; + + // X86::COND_E_AND_NP usually has two different branch destinations. + // + // JP B1 + // JE B2 + // JMP B1 + // B1: + // B2: + // + // Here this condition branches to B2 only if NP && E. It has another + // equivalent form: + // + // JNE B1 + // JNP B2 + // JMP B1 + // B1: + // B2: + // + // Similarly it branches to B2 only if E && NP. 
That is why this condition + // is named with COND_E_AND_NP. + BranchCode = X86::COND_E_AND_NP; + } else return true; // Update the MachineOperand. @@ -4174,17 +4208,13 @@ return 1; } + // If FBB is null, it is implied to be a fall-through block. + bool FallThru = FBB == nullptr; + // Conditional branch. unsigned Count = 0; X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); switch (CC) { - case X86::COND_NP_OR_E: - // Synthesize NP_OR_E with two branches. - BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); - ++Count; - BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB); - ++Count; - break; case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); @@ -4192,13 +4222,26 @@ BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); ++Count; break; + case X86::COND_E_AND_NP: + // Use the next block of MBB as FBB if it is null. + if (FBB == nullptr) { + FBB = getFallThroughMBB(&MBB, TBB); + assert(FBB && "MBB cannot be the last block in function when the false " + "body is a fall-through."); + } + // Synthesize COND_E_AND_NP with two branches. + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); + ++Count; + break; default: { unsigned Opc = GetCondBranchFromCond(CC); BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); ++Count; } } - if (FBB) { + if (!FallThru) { // Two-way Conditional branch. Insert the second branch. 
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); ++Count; @@ -6759,8 +6802,6 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm()); - if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E) - return true; Cond[0].setImm(GetOppositeBranchCondition(CC)); return false; } Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -463,26 +463,24 @@ } define void @fpcmp_unanalyzable_branch(i1 %cond) { -; This function's CFG contains an unanalyzable branch that is likely to be -; split due to having a different high-probability predecessor. -; CHECK: fpcmp_unanalyzable_branch -; CHECK: %entry -; CHECK: %exit -; CHECK-NOT: %if.then -; CHECK-NOT: %if.end -; CHECK-NOT: jne -; CHECK-NOT: jnp -; CHECK: jne -; CHECK-NEXT: jnp -; CHECK-NEXT: %if.then +; This function's CFG contains a once-unanalyzable branch (une on floating +; points). Now that it is analyzable, we should get the best layout, in which each +; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is +; fall-through. +; CHECK-LABEL: fpcmp_unanalyzable_branch: +; CHECK: # BB#0: # %entry +; CHECK: # BB#1: # %entry.if.then_crit_edge +; CHECK: .LBB10_4: # %if.then +; CHECK: .LBB10_5: # %if.end +; CHECK: # BB#3: # %exit +; CHECK: jne .LBB10_4 +; CHECK-NEXT: jnp .LBB10_5 +; CHECK-NEXT: jmp .LBB10_4 entry: ; Note that this branch must be strongly biased toward ; 'entry.if.then_crit_edge' to ensure that we would try to form a chain for -; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then'. It is the last edge in that -; chain which would violate the unanalyzable branch in 'exit', but we won't even -; try this trick unless 'if.then' is believed to almost always be reached from -; 'entry.if.then_crit_edge'. 
+; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end'. br i1 %cond, label %entry.if.then_crit_edge, label %lor.lhs.false, !prof !1 entry.if.then_crit_edge: @@ -494,7 +492,7 @@ exit: %cmp.i = fcmp une double 0.000000e+00, undef - br i1 %cmp.i, label %if.then, label %if.end + br i1 %cmp.i, label %if.then, label %if.end, !prof !3 if.then: %0 = phi i8 [ %.pre14, %entry.if.then_crit_edge ], [ undef, %exit ] @@ -507,6 +505,7 @@ } !1 = !{!"branch_weights", i32 1000, i32 1} +!3 = !{!"branch_weights", i32 1, i32 1000} declare i32 @f() declare i32 @g() @@ -665,11 +664,14 @@ ; Ensure that we can handle unanalyzable branches where the destination block ; gets selected as the optimal successor to merge. ; +; This branch is now analyzable and hence the destination block becomes the +; hotter one. The right order is entry->bar->exit->foo. +; ; CHECK: unanalyzable_branch_to_best_succ ; CHECK: %entry -; CHECK: %foo ; CHECK: %bar ; CHECK: %exit +; CHECK: %foo entry: ; Bias this branch toward bar to ensure we form that chain. 
Index: test/CodeGen/X86/fast-isel-cmp-branch2.ll =================================================================== --- test/CodeGen/X86/fast-isel-cmp-branch2.ll +++ test/CodeGen/X86/fast-isel-cmp-branch2.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: fcmp_oeq ; CHECK: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jnp {{LBB.+_2}} +; CHECK-NEXT: jp {{LBB.+_1}} %1 = fcmp oeq float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -162,8 +162,7 @@ ; CHECK-LABEL: fcmp_une ; CHECK: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jp {{LBB.+_2}} -; CHECK-NEXT: jmp {{LBB.+_1}} +; CHECK-NEXT: jnp {{LBB.+_1}} %1 = fcmp une float %x, %y br i1 %1, label %bb1, label %bb2 bb2: Index: test/CodeGen/X86/fast-isel-cmp-branch3.ll =================================================================== --- test/CodeGen/X86/fast-isel-cmp-branch3.ll +++ test/CodeGen/X86/fast-isel-cmp-branch3.ll @@ -17,7 +17,7 @@ ; CHECK: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jnp {{LBB.+_2}} +; CHECK-NEXT: jp {{LBB.+_1}} %1 = fcmp oeq float %x, 0.000000e+00 br i1 %1, label %bb1, label %bb2 bb2: @@ -338,8 +338,7 @@ ; CHECK: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jp {{LBB.+_2}} -; CHECK-NEXT: jmp {{LBB.+_1}} +; CHECK-NEXT: jnp {{LBB.+_1}} %1 = fcmp une float %x, 0.000000e+00 br i1 %1, label %bb1, label %bb2 bb2: Index: test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- test/CodeGen/X86/fp-une-cmp.ll +++ test/CodeGen/X86/fp-une-cmp.ll @@ -48,8 +48,6 @@ ret double %phi } -; FIXME: With branch weights indicated, bb2 should be placed ahead of bb1. 
- define double @profile_metadata(double %x, double %y) { ; CHECK-LABEL: profile_metadata: ; CHECK: # BB#0: # %entry @@ -57,11 +55,12 @@ ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: jne .LBB1_1 -; CHECK-NEXT: jnp .LBB1_2 -; CHECK-NEXT: .LBB1_1: # %bb1 -; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: jp .LBB1_1 ; CHECK-NEXT: .LBB1_2: # %bb2 ; CHECK-NEXT: retq +; CHECK-NEXT: .LBB1_1: # %bb1 +; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: jmp .LBB1_2 entry: %mul = fmul double %x, %y @@ -77,5 +76,32 @@ ret double %phi } -!1 = !{!"branch_weights", i32 1, i32 1000} +; Test if the negation of the non-equality check between floating points is +; translated to jne followed by jnp. +define void @foo(float %f) { +; CHECK-LABEL: foo: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jne .LBB2_2 +; CHECK-NEXT: jnp .LBB2_1 +; CHECK-NEXT: .LBB2_2: # %if.then +; CHECK-NEXT: jmp a # TAILCALL +; CHECK-NEXT: .LBB2_1: # %if.end +; CHECK-NEXT: retq +entry: + %cmp = fcmp une float %f, 0.000000e+00 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @a() + br label %if.end + +if.end: + ret void +} + +declare void @a() + +!1 = !{!"branch_weights", i32 1, i32 1000} Index: test/CodeGen/X86/x86-analyze-branch-jne-jp.s =================================================================== --- /dev/null +++ test/CodeGen/X86/x86-analyze-branch-jne-jp.s @@ -0,0 +1,22 @@ + .text + .file "../llvm/test/CodeGen/X86/x86-analyze-branch-jne-jp.ll" + .globl foo + .p2align 4, 0x90 + .type foo,@function +foo: # @foo + .cfi_startproc +# BB#0: # %entry + xorps %xmm1, %xmm1 + ucomiss %xmm1, %xmm0 + jne .LBB0_2 + jnp .LBB0_1 +.LBB0_2: # %if.then + jmp a # TAILCALL +.LBB0_1: # %if.end + retq +.Lfunc_end0: + .size foo, .Lfunc_end0-foo + .cfi_endproc + + + .section ".note.GNU-stack","",@progbits