Index: llvm/trunk/lib/Target/X86/X86InstrInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.h +++ llvm/trunk/lib/Target/X86/X86InstrInfo.h @@ -29,35 +29,44 @@ namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. - enum CondCode { - COND_A = 0, - COND_AE = 1, - COND_B = 2, - COND_BE = 3, - COND_E = 4, - COND_G = 5, - COND_GE = 6, - COND_L = 7, - COND_LE = 8, - COND_NE = 9, - COND_NO = 10, - COND_NP = 11, - COND_NS = 12, - COND_O = 13, - COND_P = 14, - COND_S = 15, - LAST_VALID_COND = COND_S, - - // Artificial condition codes. These are used by AnalyzeBranch - // to indicate a block terminated with two conditional branches to - // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, - // which can't be represented on x86 with a single condition. These - // are never used in MachineInstrs. - COND_NE_OR_P, - COND_NP_OR_E, +enum CondCode { + COND_A = 0, + COND_AE = 1, + COND_B = 2, + COND_BE = 3, + COND_E = 4, + COND_G = 5, + COND_GE = 6, + COND_L = 7, + COND_LE = 8, + COND_NE = 9, + COND_NO = 10, + COND_NP = 11, + COND_NS = 12, + COND_O = 13, + COND_P = 14, + COND_S = 15, + LAST_VALID_COND = COND_S, + + // Artificial condition codes. These are used by AnalyzeBranch + // to indicate a block terminated with two conditional branches to + // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, + // which can't be represented on x86 with a single condition. These + // are never used in MachineInstrs. + COND_NE_OR_P, + COND_NP_OR_E, + + // Artificial condition codes. These are used to represent the negation of + // the above two conditions. The only scenario in which we need these two + // conditions is when we try to reverse the above two conditions in order + // to remove redundant + // unconditional jumps. Note that both true and false bodies need to be + // available in order to correctly synthesize instructions for them. 
These are + // never used in MachineInstrs. + COND_E_AND_NP, // negate of COND_NE_OR_P + COND_P_AND_NE, // negate of COND_NP_OR_E - COND_INVALID - }; + COND_INVALID +}; // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp @@ -3805,6 +3805,10 @@ case X86::COND_NP: return X86::COND_P; case X86::COND_O: return X86::COND_NO; case X86::COND_NO: return X86::COND_O; + case X86::COND_NE_OR_P: return X86::COND_E_AND_NP; + case X86::COND_NP_OR_E: return X86::COND_P_AND_NE; + case X86::COND_E_AND_NP: return X86::COND_NE_OR_P; + case X86::COND_P_AND_NE: return X86::COND_NP_OR_E; } } @@ -3998,9 +4002,9 @@ MachineBasicBlock::iterator OldInst = I; BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) - .addMBB(UnCondBrIter->getOperand(0).getMBB()); + .addMBB(UnCondBrIter->getOperand(0).getMBB()); BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) - .addMBB(TargetBB); + .addMBB(TargetBB); OldInst->eraseFromParent(); UnCondBrIter->eraseFromParent(); @@ -4024,11 +4028,6 @@ assert(Cond.size() == 1); assert(TBB); - // Only handle the case where all conditional branches branch to the same - // destination. - if (TBB != I->getOperand(0).getMBB()) - return true; - // If the conditions are the same, we can leave them alone. X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); if (OldBranchCode == BranchCode) @@ -4037,17 +4036,40 @@ // If they differ, see if they fit one of the known patterns. Theoretically, // we could handle more patterns here, but we shouldn't expect to see them // if instruction selection has done a reasonable job. 
- if ((OldBranchCode == X86::COND_NP && - BranchCode == X86::COND_E) || - (OldBranchCode == X86::COND_E && - BranchCode == X86::COND_NP)) + auto NewTBB = I->getOperand(0).getMBB(); + if (TBB == NewTBB && + ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_E) || + (OldBranchCode == X86::COND_E && BranchCode == X86::COND_NP))) { BranchCode = X86::COND_NP_OR_E; - else if ((OldBranchCode == X86::COND_P && - BranchCode == X86::COND_NE) || - (OldBranchCode == X86::COND_NE && - BranchCode == X86::COND_P)) + } else if (TBB == NewTBB && + ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) { BranchCode = X86::COND_NE_OR_P; - else + } else if ((OldBranchCode == X86::COND_NE && BranchCode == X86::COND_NP) || + (OldBranchCode == X86::COND_P && BranchCode == X86::COND_E)) { + // X86::COND_P_AND_NE usually has two different branch destinations. + // + // JNP B1 + // JNE B2 + // B1: (fall-through) + // B2: + // + // Here this condition branches to B2 only if P && NE. It has another + // equivalent form: + // + // JE B1 + // JP B2 + // B1: (fall-through) + // B2: + // + // Similarly it branches to B2 only if NE && P. That is why this condition + // is named COND_P_AND_NE. + BranchCode = X86::COND_P_AND_NE; + } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) { + // See comments above for X86::COND_P_AND_NE. + BranchCode = X86::COND_E_AND_NP; + } else return true; // Update the MachineOperand. 
@@ -4156,6 +4178,13 @@ return Count; } +static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB) { + auto I = std::next(MBB->getIterator()); + if (I == MBB->getParent()->end()) + return nullptr; + return &*I; +} + unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef Cond, @@ -4172,6 +4201,9 @@ return 1; } + // If FBB is null, it is implied to be a fall-through block. + bool FallThru = FBB == nullptr; + // Conditional branch. unsigned Count = 0; X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); @@ -4190,13 +4222,39 @@ BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); ++Count; break; + case X86::COND_P_AND_NE: + // Use the next block of MBB as FBB if it is null. + if (FBB == nullptr) { + FBB = getFallThroughMBB(&MBB); + assert(FBB && "MBB cannot be the last block in function when the false " + "body is a fall-through."); + } + // Synthesize NEG_NP_OR_E with two branches. + BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(FBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); + ++Count; + break; + case X86::COND_E_AND_NP: + // Use the next block of MBB as FBB if it is null. + if (FBB == nullptr) { + FBB = getFallThroughMBB(&MBB); + assert(FBB && "MBB cannot be the last block in function when the false " + "body is a fall-through."); + } + // Synthesize NEG_NE_OR_P with two branches. + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); + ++Count; + break; default: { unsigned Opc = GetCondBranchFromCond(CC); BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); ++Count; } } - if (FBB) { + if (!FallThru) { // Two-way Conditional branch. Insert the second branch. 
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); ++Count; @@ -6717,8 +6775,6 @@ ReverseBranchCondition(SmallVectorImpl &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); X86::CondCode CC = static_cast(Cond[0].getImm()); - if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E) - return true; Cond[0].setImm(GetOppositeBranchCondition(CC)); return false; } Index: llvm/trunk/test/CodeGen/X86/block-placement.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/block-placement.ll +++ llvm/trunk/test/CodeGen/X86/block-placement.ll @@ -463,26 +463,23 @@ } define void @fpcmp_unanalyzable_branch(i1 %cond) { -; This function's CFG contains an unanalyzable branch that is likely to be -; split due to having a different high-probability predecessor. +; This function's CFG contains a once-unanalyzable branch (une on floating +; points). Now that it is analyzable, we should get the best layout in which each +; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is +; fall-through. ; CHECK: fpcmp_unanalyzable_branch ; CHECK: %entry +; CHECK: %entry.if.then_crit_edge +; CHECK: %if.then +; CHECK: %if.end ; CHECK: %exit -; CHECK-NOT: %if.then -; CHECK-NOT: %if.end -; CHECK-NOT: jne -; CHECK-NOT: jnp ; CHECK: jne ; CHECK-NEXT: jnp -; CHECK-NEXT: %if.then entry: ; Note that this branch must be strongly biased toward ; 'entry.if.then_crit_edge' to ensure that we would try to form a chain for -; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then'. It is the last edge in that -; chain which would violate the unanalyzable branch in 'exit', but we won't even -; try this trick unless 'if.then' is believed to almost always be reached from -; 'entry.if.then_crit_edge'. +; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end'. 
br i1 %cond, label %entry.if.then_crit_edge, label %lor.lhs.false, !prof !1 entry.if.then_crit_edge: @@ -494,7 +491,7 @@ exit: %cmp.i = fcmp une double 0.000000e+00, undef - br i1 %cmp.i, label %if.then, label %if.end + br i1 %cmp.i, label %if.then, label %if.end, !prof !3 if.then: %0 = phi i8 [ %.pre14, %entry.if.then_crit_edge ], [ undef, %exit ] @@ -507,6 +504,7 @@ } !1 = !{!"branch_weights", i32 1000, i32 1} +!3 = !{!"branch_weights", i32 1, i32 1000} declare i32 @f() declare i32 @g() @@ -665,11 +663,14 @@ ; Ensure that we can handle unanalyzable branches where the destination block ; gets selected as the optimal successor to merge. ; +; This branch is now analyzable and hence the destination block becomes the +; hotter one. The right order is entry->bar->exit->foo. +; ; CHECK: unanalyzable_branch_to_best_succ ; CHECK: %entry -; CHECK: %foo ; CHECK: %bar ; CHECK: %exit +; CHECK: %foo entry: ; Bias this branch toward bar to ensure we form that chain. Index: llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch2.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch2.ll +++ llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch2.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: fcmp_oeq ; CHECK: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jnp {{LBB.+_2}} +; CHECK-NEXT: jp {{LBB.+_1}} %1 = fcmp oeq float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -162,8 +162,7 @@ ; CHECK-LABEL: fcmp_une ; CHECK: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jp {{LBB.+_2}} -; CHECK-NEXT: jmp {{LBB.+_1}} +; CHECK-NEXT: jnp {{LBB.+_1}} %1 = fcmp une float %x, %y br i1 %1, label %bb1, label %bb2 bb2: Index: llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch3.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch3.ll +++ llvm/trunk/test/CodeGen/X86/fast-isel-cmp-branch3.ll @@ -17,7 +17,7 @@ ; CHECK: xorps %xmm1, %xmm1 ; CHECK-NEXT: 
ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jnp {{LBB.+_2}} +; CHECK-NEXT: jp {{LBB.+_1}} %1 = fcmp oeq float %x, 0.000000e+00 br i1 %1, label %bb1, label %bb2 bb2: @@ -338,8 +338,7 @@ ; CHECK: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jp {{LBB.+_2}} -; CHECK-NEXT: jmp {{LBB.+_1}} +; CHECK-NEXT: jnp {{LBB.+_1}} %1 = fcmp une float %x, 0.000000e+00 br i1 %1, label %bb1, label %bb2 bb2: Index: llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll +++ llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll @@ -19,12 +19,12 @@ ; addsd ... ; LBB0_2: -; CHECK: func +define float @func1(float %x, float %y) nounwind readnone optsize ssp { +; CHECK: func1 ; CHECK: jne [[LABEL:.*]] ; CHECK-NEXT: jp [[LABEL]] ; CHECK-NOT: jmp - -define float @func(float %x, float %y) nounwind readnone optsize ssp { +; entry: %0 = fpext float %x to double %1 = fpext float %y to double @@ -41,3 +41,30 @@ %.0 = fptrunc double %.0.in to float ret float %.0 } + +define float @func2(float %x, float %y) nounwind readnone optsize ssp { +; CHECK: func2 +; CHECK: jne [[LABEL:.*]] +; CHECK-NEXT: jp [[LABEL]] +; CHECK: %bb2 +; CHECK: %bb1 +; CHECK: jmp +; +entry: + %0 = fpext float %x to double + %1 = fpext float %y to double + %2 = fmul double %0, %1 + %3 = fcmp une double %2, 0.000000e+00 + br i1 %3, label %bb1, label %bb2, !prof !1 + +bb1: + %4 = fadd double %2, -1.000000e+00 + br label %bb2 + +bb2: + %.0.in = phi double [ %4, %bb1 ], [ %2, %entry ] + %.0 = fptrunc double %.0.in to float + ret float %.0 +} + +!1 = !{!"branch_weights", i32 1, i32 1000}