Index: include/llvm/Target/TargetInstrInfo.h =================================================================== --- include/llvm/Target/TargetInstrInfo.h +++ include/llvm/Target/TargetInstrInfo.h @@ -452,6 +452,8 @@ /// If AllowModify is true, then this routine is allowed to modify the basic /// block (e.g. delete instructions after the unconditional branch). /// + /// The CFG information in MBB.Predecessors and MBB.Successors must be valid + /// before calling this function. virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, @@ -521,6 +523,9 @@ /// cases where AnalyzeBranch doesn't apply because there was no original /// branch to analyze. At least this much must be implemented, else tail /// merging needs to be disabled. + /// + /// The CFG information in MBB.Predecessors and MBB.Successors must be valid + /// before calling this function. virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, Index: lib/CodeGen/TailDuplication.cpp =================================================================== --- lib/CodeGen/TailDuplication.cpp +++ lib/CodeGen/TailDuplication.cpp @@ -749,9 +749,6 @@ TII->RemoveBranch(*PredBB); - if (PredTBB) - TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); - if (!PredBB->isSuccessor(NewTarget)) PredBB->replaceSuccessor(TailBB, NewTarget); else { @@ -759,6 +756,9 @@ assert(PredBB->succ_size() <= 1); } + if (PredTBB) + TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); + TDBBs.push_back(PredBB); } return Changed; Index: lib/Target/X86/X86InstrInfo.h =================================================================== --- lib/Target/X86/X86InstrInfo.h +++ lib/Target/X86/X86InstrInfo.h @@ -29,54 +29,54 @@ namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. 
- enum CondCode { - COND_A = 0, - COND_AE = 1, - COND_B = 2, - COND_BE = 3, - COND_E = 4, - COND_G = 5, - COND_GE = 6, - COND_L = 7, - COND_LE = 8, - COND_NE = 9, - COND_NO = 10, - COND_NP = 11, - COND_NS = 12, - COND_O = 13, - COND_P = 14, - COND_S = 15, - LAST_VALID_COND = COND_S, - - // Artificial condition codes. These are used by AnalyzeBranch - // to indicate a block terminated with two conditional branches to - // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, - // which can't be represented on x86 with a single condition. These - // are never used in MachineInstrs. - COND_NE_OR_P, - COND_NP_OR_E, - - COND_INVALID - }; - - // Turn condition code into conditional branch opcode. - unsigned GetCondBranchFromCond(CondCode CC); - - /// \brief Return a set opcode for the given condition and whether it has - /// a memory operand. - unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); - - /// \brief Return a cmov opcode for the given condition, register size in - /// bytes, and operand type. - unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, - bool HasMemoryOperand = false); - - // Turn CMov opcode into condition code. - CondCode getCondFromCMovOpc(unsigned Opc); - - /// GetOppositeBranchCondition - Return the inverse of the specified cond, - /// e.g. turning COND_E to COND_NE. - CondCode GetOppositeBranchCondition(CondCode CC); +enum CondCode { + COND_A = 0, + COND_AE = 1, + COND_B = 2, + COND_BE = 3, + COND_E = 4, + COND_G = 5, + COND_GE = 6, + COND_L = 7, + COND_LE = 8, + COND_NE = 9, + COND_NO = 10, + COND_NP = 11, + COND_NS = 12, + COND_O = 13, + COND_P = 14, + COND_S = 15, + LAST_VALID_COND = COND_S, + + // Artificial condition codes. These are used by AnalyzeBranch + // to indicate a block terminated with two conditional branches that together + // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE, + // which can't be represented on x86 with a single condition. 
These + // are never used in MachineInstrs and are inverses of one another. + COND_NE_OR_P, + COND_E_AND_NP, + + COND_INVALID +}; + +// Turn condition code into conditional branch opcode. +unsigned GetCondBranchFromCond(CondCode CC); + +/// \brief Return a set opcode for the given condition and whether it has +/// a memory operand. +unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); + +/// \brief Return a cmov opcode for the given condition, register size in +/// bytes, and operand type. +unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand = false); + +// Turn CMov opcode into condition code. +CondCode getCondFromCMovOpc(unsigned Opc); + +/// GetOppositeBranchCondition - Return the inverse of the specified cond, +/// e.g. turning COND_E to COND_NE. +CondCode GetOppositeBranchCondition(CondCode CC); } // end namespace X86; Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -3807,6 +3807,8 @@ case X86::COND_NP: return X86::COND_P; case X86::COND_O: return X86::COND_NO; case X86::COND_NO: return X86::COND_O; + case X86::COND_NE_OR_P: return X86::COND_E_AND_NP; + case X86::COND_E_AND_NP: return X86::COND_NE_OR_P; } } @@ -3914,6 +3916,23 @@ return !isPredicated(MI); } +// Given an MBB and its TBB, find the FBB which was a fallthrough MBB (it may not +// be a fallthrough MBB now due to layout changes). Return nullptr if the +// fallthrough MBB cannot be identified. +static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB, +                                            MachineBasicBlock *TBB) { + MachineBasicBlock *FallthroughBB = nullptr; + for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) { + if ((*SI)->isEHPad() || *SI == TBB) + continue; + // Return a nullptr if we found more than one fallthrough successor. 
+ if (FallthroughBB) + return nullptr; + FallthroughBB = *SI; + } + return FallthroughBB; +} + bool X86InstrInfo::AnalyzeBranchImpl( MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, @@ -4026,30 +4045,45 @@ assert(Cond.size() == 1); assert(TBB); - // Only handle the case where all conditional branches branch to the same - // destination. - if (TBB != I->getOperand(0).getMBB()) - return true; - // If the conditions are the same, we can leave them alone. X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); - if (OldBranchCode == BranchCode) + auto NewTBB = I->getOperand(0).getMBB(); + if (OldBranchCode == BranchCode && TBB == NewTBB) continue; // If they differ, see if they fit one of the known patterns. Theoretically, // we could handle more patterns here, but we shouldn't expect to see them // if instruction selection has done a reasonable job. - if ((OldBranchCode == X86::COND_NP && - BranchCode == X86::COND_E) || - (OldBranchCode == X86::COND_E && - BranchCode == X86::COND_NP)) - BranchCode = X86::COND_NP_OR_E; - else if ((OldBranchCode == X86::COND_P && - BranchCode == X86::COND_NE) || - (OldBranchCode == X86::COND_NE && - BranchCode == X86::COND_P)) + if (TBB == NewTBB && + ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) { BranchCode = X86::COND_NE_OR_P; - else + } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) { + if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB))) + return true; + + // X86::COND_E_AND_NP usually has two different branch destinations. + // + // JP B1 + // JE B2 + // JMP B1 + // B1: + // B2: + // + // Here this condition branches to B2 only if NP && E. It has another + // equivalent form: + // + // JNE B1 + // JNP B2 + // JMP B1 + // B1: + // B2: + // + // Similarly it branches to B2 only if E && NP. 
That is why this condition + // is named with COND_E_AND_NP. + BranchCode = X86::COND_E_AND_NP; + } else return true; // Update the MachineOperand. @@ -4174,17 +4208,13 @@ return 1; } + // If FBB is null, it is implied to be a fall-through block. + bool FallThru = FBB == nullptr; + // Conditional branch. unsigned Count = 0; X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); switch (CC) { - case X86::COND_NP_OR_E: - // Synthesize NP_OR_E with two branches. - BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); - ++Count; - BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB); - ++Count; - break; case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); @@ -4192,13 +4222,26 @@ BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); ++Count; break; + case X86::COND_E_AND_NP: + // Use the next block of MBB as FBB if it is null. + if (FBB == nullptr) { + FBB = getFallThroughMBB(&MBB, TBB); + assert(FBB && "MBB cannot be the last block in function when the false " + "body is a fall-through."); + } + // Synthesize COND_E_AND_NP with two branches. + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); + ++Count; + break; default: { unsigned Opc = GetCondBranchFromCond(CC); BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); ++Count; } } - if (FBB) { + if (!FallThru) { // Two-way Conditional branch. Insert the second branch. 
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); ++Count; @@ -6759,8 +6802,6 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm()); - if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E) - return true; Cond[0].setImm(GetOppositeBranchCondition(CC)); return false; } Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -463,26 +463,24 @@ } define void @fpcmp_unanalyzable_branch(i1 %cond) { -; This function's CFG contains an unanalyzable branch that is likely to be -; split due to having a different high-probability predecessor. -; CHECK: fpcmp_unanalyzable_branch -; CHECK: %entry -; CHECK: %exit -; CHECK-NOT: %if.then -; CHECK-NOT: %if.end -; CHECK-NOT: jne -; CHECK-NOT: jnp -; CHECK: jne -; CHECK-NEXT: jnp -; CHECK-NEXT: %if.then +; This function's CFG contains a once-unanalyzable branch (une on floating +; points). Now that it is analyzable, we should get the best layout, in which each +; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is +; fall-through. +; CHECK-LABEL: fpcmp_unanalyzable_branch: +; CHECK: # BB#0: # %entry +; CHECK: # BB#1: # %entry.if.then_crit_edge +; CHECK: .LBB10_4: # %if.then +; CHECK: .LBB10_5: # %if.end +; CHECK: # BB#3: # %exit +; CHECK: jne .LBB10_4 +; CHECK-NEXT: jnp .LBB10_5 +; CHECK-NEXT: jmp .LBB10_4 entry: ; Note that this branch must be strongly biased toward ; 'entry.if.then_crit_edge' to ensure that we would try to form a chain for -; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then'. It is the last edge in that -; chain which would violate the unanalyzable branch in 'exit', but we won't even -; try this trick unless 'if.then' is believed to almost always be reached from -; 'entry.if.then_crit_edge'. 
+; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end'. br i1 %cond, label %entry.if.then_crit_edge, label %lor.lhs.false, !prof !1 entry.if.then_crit_edge: @@ -494,7 +492,7 @@ exit: %cmp.i = fcmp une double 0.000000e+00, undef - br i1 %cmp.i, label %if.then, label %if.end + br i1 %cmp.i, label %if.then, label %if.end, !prof !3 if.then: %0 = phi i8 [ %.pre14, %entry.if.then_crit_edge ], [ undef, %exit ] @@ -507,6 +505,7 @@ } !1 = !{!"branch_weights", i32 1000, i32 1} +!3 = !{!"branch_weights", i32 1, i32 1000} declare i32 @f() declare i32 @g() @@ -665,11 +664,14 @@ ; Ensure that we can handle unanalyzable branches where the destination block ; gets selected as the optimal successor to merge. ; +; This branch is now analyzable and hence the destination block becomes the +; hotter one. The right order is entry->bar->exit->foo. +; ; CHECK: unanalyzable_branch_to_best_succ ; CHECK: %entry -; CHECK: %foo ; CHECK: %bar ; CHECK: %exit +; CHECK: %foo entry: ; Bias this branch toward bar to ensure we form that chain. 
Index: test/CodeGen/X86/fast-isel-cmp-branch2.ll =================================================================== --- test/CodeGen/X86/fast-isel-cmp-branch2.ll +++ test/CodeGen/X86/fast-isel-cmp-branch2.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: fcmp_oeq ; CHECK: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jnp {{LBB.+_2}} +; CHECK-NEXT: jp {{LBB.+_1}} %1 = fcmp oeq float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -162,8 +162,7 @@ ; CHECK-LABEL: fcmp_une ; CHECK: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jp {{LBB.+_2}} -; CHECK-NEXT: jmp {{LBB.+_1}} +; CHECK-NEXT: jnp {{LBB.+_1}} %1 = fcmp une float %x, %y br i1 %1, label %bb1, label %bb2 bb2: Index: test/CodeGen/X86/fast-isel-cmp-branch3.ll =================================================================== --- test/CodeGen/X86/fast-isel-cmp-branch3.ll +++ test/CodeGen/X86/fast-isel-cmp-branch3.ll @@ -17,7 +17,7 @@ ; CHECK: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jnp {{LBB.+_2}} +; CHECK-NEXT: jp {{LBB.+_1}} %1 = fcmp oeq float %x, 0.000000e+00 br i1 %1, label %bb1, label %bb2 bb2: @@ -338,8 +338,7 @@ ; CHECK: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jp {{LBB.+_2}} -; CHECK-NEXT: jmp {{LBB.+_1}} +; CHECK-NEXT: jnp {{LBB.+_1}} %1 = fcmp une float %x, 0.000000e+00 br i1 %1, label %bb1, label %bb2 bb2: Index: test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- test/CodeGen/X86/fp-une-cmp.ll +++ test/CodeGen/X86/fp-une-cmp.ll @@ -48,8 +48,6 @@ ret double %phi } -; FIXME: With branch weights indicated, bb2 should be placed ahead of bb1. 
- define double @profile_metadata(double %x, double %y) { ; CHECK-LABEL: profile_metadata: ; CHECK: # BB#0: # %entry @@ -57,11 +55,12 @@ ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: jne .LBB1_1 -; CHECK-NEXT: jnp .LBB1_2 -; CHECK-NEXT: .LBB1_1: # %bb1 -; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: jp .LBB1_1 ; CHECK-NEXT: .LBB1_2: # %bb2 ; CHECK-NEXT: retq +; CHECK-NEXT: .LBB1_1: # %bb1 +; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: jmp .LBB1_2 entry: %mul = fmul double %x, %y @@ -77,5 +76,32 @@ ret double %phi } -!1 = !{!"branch_weights", i32 1, i32 1000} +; Test if the negation of the non-equality check between floating points is +; translated to jne followed by jnp. +define void @foo(float %f) { +; CHECK-LABEL: foo: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jne .LBB2_2 +; CHECK-NEXT: jnp .LBB2_1 +; CHECK-NEXT: .LBB2_2: # %if.then +; CHECK-NEXT: jmp a # TAILCALL +; CHECK-NEXT: .LBB2_1: # %if.end +; CHECK-NEXT: retq +entry: + %cmp = fcmp une float %f, 0.000000e+00 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @a() + br label %if.end + +if.end: + ret void +} + +declare void @a() + +!1 = !{!"branch_weights", i32 1, i32 1000} Index: test/CodeGen/X86/x86-analyze-branch-jne-jp.s =================================================================== --- /dev/null +++ test/CodeGen/X86/x86-analyze-branch-jne-jp.s @@ -0,0 +1,22 @@ + .text + .file "../llvm/test/CodeGen/X86/x86-analyze-branch-jne-jp.ll" + .globl foo + .p2align 4, 0x90 + .type foo,@function +foo: # @foo + .cfi_startproc +# BB#0: # %entry + xorps %xmm1, %xmm1 + ucomiss %xmm1, %xmm0 + jne .LBB0_2 + jnp .LBB0_1 +.LBB0_2: # %if.then + jmp a # TAILCALL +.LBB0_1: # %if.end + retq +.Lfunc_end0: + .size foo, .Lfunc_end0-foo + .cfi_endproc + + + .section ".note.GNU-stack","",@progbits