Index: llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
+++ llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
@@ -72,6 +72,11 @@
                        cl::desc("Minimum gain per loop (in cycles) threshold."),
                        cl::init(4), cl::Hidden);
 
+static cl::opt<bool> ForceMemOperand(
+    "x86-cmov-converter-force-mem-operand",
+    cl::desc("Convert cmovs to branches whenever they have memory operands."),
+    cl::init(true), cl::Hidden);
+
 /// Converts X86 cmov instructions into branches when profitable.
 class X86CmovConverterPass : public MachineFunctionPass {
 public:
@@ -86,8 +91,9 @@
   /// Pass identification, replacement for typeid.
   static char ID;
 
-  const MachineRegisterInfo *MRI;
+  MachineRegisterInfo *MRI;
   const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
   TargetSchedModel TSchedModel;
 
   /// List of consecutive CMOV instructions.
@@ -101,7 +107,8 @@
   /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
   /// \returns true iff it found any CMOV-group-candidate.
   bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
-                             CmovGroups &CmovInstGroups);
+                             CmovGroups &CmovInstGroups,
+                             bool IncludeLoads = false);
 
   /// Check if it is profitable to transform each CMOV-group-candidates into
   /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
@@ -139,10 +146,36 @@
   const TargetSubtargetInfo &STI = MF.getSubtarget();
   MRI = &MF.getRegInfo();
   TII = STI.getInstrInfo();
+  TRI = STI.getRegisterInfo();
   TSchedModel.init(STI.getSchedModel(), &STI, TII);
 
+  // Before we handle the more subtle cases of register-register CMOVs inside
+  // of potentially hot loops, we want to quickly remove all CMOVs with
+  // a memory operand. The CMOV will risk a stall waiting for the load to
+  // complete that speculative execution behind a branch is better suited to
+  // handle on modern x86 chips.
+  if (ForceMemOperand) {
+    CmovGroups AllCmovGroups;
+    SmallVector<MachineBasicBlock *, 4> Blocks;
+    for (auto &MBB : MF)
+      Blocks.push_back(&MBB);
+    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
+      for (auto &Group : AllCmovGroups) {
+        // Skip any group that doesn't do at least one memory operand cmov.
+        if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
+          continue;
+
+        // For CMOV groups which we can rewrite and which contain a memory load,
+        // always rewrite them. On x86, a CMOV will dramatically amplify any
+        // memory latency by blocking speculative execution.
+        Changed = true;
+        convertCmovInstsToBranches(Group);
+      }
+    }
+  }
+
   //===--------------------------------------------------------------------===//
-  // Algorithm
+  // Register-operand Conversion Algorithm
   // ---------
   // For each inner most loop
   //   collectCmovCandidates() {
@@ -191,11 +224,13 @@
     for (auto &Group : CmovInstGroups)
       convertCmovInstsToBranches(Group);
   }
+
   return Changed;
 }
 
 bool X86CmovConverterPass::collectCmovCandidates(
-    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
+    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
+    bool IncludeLoads) {
   //===--------------------------------------------------------------------===//
   // Collect all CMOV-group-candidates and add them into CmovInstGroups.
   //
@@ -221,7 +256,7 @@
     Group.clear();
     // Condition code of first CMOV instruction current processed range and its
     // opposite condition code.
-    X86::CondCode FirstCC, FirstOppCC;
+    X86::CondCode FirstCC, FirstOppCC, MemOpCC;
     // Indicator of a non CMOVrr instruction in the current processed range.
     bool FoundNonCMOVInst = false;
     // Indicator for current processed CMOV-group if it should be skipped.
@@ -230,11 +265,13 @@
     for (auto &I : *MBB) {
       X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
       // Check if we found a X86::CMOVrr instruction.
-      if (CC != X86::COND_INVALID && !I.mayLoad()) {
+      if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
         if (Group.empty()) {
           // We found first CMOV in the range, reset flags.
           FirstCC = CC;
           FirstOppCC = X86::GetOppositeBranchCondition(CC);
+          // Clear out the prior group's memory operand CC.
+          MemOpCC = X86::COND_INVALID;
           FoundNonCMOVInst = false;
           SkipGroup = false;
         }
@@ -244,6 +281,14 @@
         if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
           // Mark the SKipGroup indicator to skip current processed CMOV-Group.
           SkipGroup = true;
+        if (I.mayLoad()) {
+          if (MemOpCC == X86::COND_INVALID)
+            // The first memory operand CMOV.
+            MemOpCC = CC;
+          else if (CC != MemOpCC)
+            // Can't handle mixed conditions with memory operands.
+            SkipGroup = true;
+        }
         continue;
       }
       // If Group is empty, keep looking for first CMOV in the range.
@@ -540,8 +585,18 @@
   MachineInstr &MI = *Group.front();
   MachineInstr *LastCMOV = Group.back();
   DebugLoc DL = MI.getDebugLoc();
+
   X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+  // Potentially swap the condition codes so that any memory operand to a CMOV
+  // is in the *false* position instead of the *true* position. We can invert
+  // any non-memory operand CMOV instructions to cope with this and we ensure
+  // memory operand CMOVs are only included with a single condition code.
+  if (llvm::any_of(Group, [&](MachineInstr *I) {
+        return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC;
+      }))
+    std::swap(CC, OppCC);
+
   MachineBasicBlock *MBB = MI.getParent();
   MachineFunction::iterator It = ++MBB->getIterator();
   MachineFunction *F = MBB->getParent();
@@ -578,7 +633,103 @@
   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
   MachineBasicBlock::iterator MIItEnd =
       std::next(MachineBasicBlock::iterator(LastCMOV));
+  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
   MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+
+  // First we need to insert an explicit load on the false path for any memory
+  // operand. We also need to potentially do register rewriting here, but it is
+  // simpler as the memory operands are always on the false path so we can
+  // simply take that input, whatever it is.
+  DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
+  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
+    auto &MI = *MIIt++;
+    // Skip any CMOVs in this group which don't load from memory.
+    if (!MI.mayLoad()) {
+      // Remember the false-side register input.
+      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] =
+          MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2)
+              .getReg();
+      continue;
+    }
+
+    // The condition must be the *opposite* of the one we've decided to branch
+    // on as the branch will go *around* the load and the load should happen
+    // when the CMOV condition is false.
+    assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC &&
+           "Can only handle memory-operand cmov instructions with a condition "
+           "opposite to the selected branch direction.");
+
+    // The goal is to rewrite the cmov from:
+    //
+    //   MBB:
+    //     %A = CMOVcc %B (tied), (mem)
+    //
+    // to
+    //
+    //   MBB:
+    //     %A = CMOVcc %B (tied), %C
+    //   FalseMBB:
+    //     %C = MOV (mem)
+    //
+    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
+    //
+    //   MBB:
+    //     JMP!cc SinkMBB
+    //   FalseMBB:
+    //     %C = MOV (mem)
+    //   SinkMBB:
+    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]
+
+    // Get a fresh register to use as the destination of the MOV.
+    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
+    unsigned TmpReg = MRI->createVirtualRegister(RC);
+
+    SmallVector<MachineInstr *, 4> NewMIs;
+    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
+                                             /*UnfoldLoad*/ true,
+                                             /*UnfoldStore*/ false, NewMIs);
+    (void)Unfolded;
+    assert(Unfolded && "Should never fail to unfold a loading cmov!");
+
+    // Move the new CMOV to just before the old one and reset any impacted
+    // iterator.
+    auto *NewCMOV = NewMIs.pop_back_val();
+    assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
+           "Last new instruction isn't the expected CMOV!");
+    DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
+    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
+    if (&*MIItBegin == &MI)
+      MIItBegin = MachineBasicBlock::iterator(NewCMOV);
+
+    // Sink whatever instructions were needed to produce the unfolded operand
+    // into the false block.
+    for (auto *NewMI : NewMIs) {
+      DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
+      FalseMBB->insert(FalseInsertionPoint, NewMI);
+      // Re-map any operands that are from other cmovs to the inputs for this block.
+      for (auto &MOp : NewMI->uses()) {
+        if (!MOp.isReg())
+          continue;
+        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
+        if (It == FalseBBRegRewriteTable.end())
+          continue;
+
+        MOp.setReg(It->second);
+        // This might have been a kill when it referenced the cmov result, but
+        // it won't necessarily be once rewritten.
+        // FIXME: We could potentially improve this by tracking whether the
+        // operand to the cmov was also a kill, and then skipping the PHI node
+        // construction below.
+        MOp.setIsKill(false);
+      }
+    }
+    MBB->erase(MachineBasicBlock::iterator(MI),
+               std::next(MachineBasicBlock::iterator(MI)));
+
+    // Add this PHI to the rewrite table.
+    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
+  }
+
   // As we are creating the PHIs, we have to be careful if there is more than
   // one. Later CMOVs may reference the results of earlier CMOVs, but later
   // PHIs have to reference the individual true/false inputs from earlier PHIs.
Index: llvm/trunk/test/CodeGen/X86/cmov.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/cmov.ll
+++ llvm/trunk/test/CodeGen/X86/cmov.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch -x86-cmov-converter=false | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
Index: llvm/trunk/test/CodeGen/X86/pr15981.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr15981.ll
+++ llvm/trunk/test/CodeGen/X86/pr15981.ll
@@ -44,7 +44,10 @@
 ; X64:       # BB#0:
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    decl {{.*}}(%rip)
-; X64-NEXT:    cmovnel {{.*}}(%rip), %eax
+; X64-NEXT:    je .LBB1_2
+; X64-NEXT:  # BB#1:
+; X64-NEXT:    movl {{.*}}(%rip), %eax
+; X64-NEXT:  .LBB1_2:
 ; X64-NEXT:    movl %eax, {{.*}}(%rip)
 ; X64-NEXT:    retq
   %1 = load volatile i32, i32* @b, align 4
Index: llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
+++ llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
@@ -330,4 +330,141 @@
   ret void
 }
 
+; Test that we always will convert a cmov with a memory operand into a branch,
+; even outside of a loop.
+define i32 @test_cmov_memoperand(i32 %a, i32 %b, i32 %x, i32* %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %load = load i32, i32* %y
+  %z = select i1 %cond, i32 %x, i32 %load
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movl (%r{{..}}), %[[R:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R]], %
+  ret i32 %z
+}
+
+; Test that we can convert a group of cmovs where only one has a memory
+; operand.
+define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %y = load i32, i32* %y.ptr
+  %z1 = select i1 %cond, i32 %x, i32 %a
+  %z2 = select i1 %cond, i32 %x, i32 %y
+  %z3 = select i1 %cond, i32 %x, i32 %b
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
+; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
+; CHECK-DAG: movl %{{.*}} %[[R3:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: addl
+; CHECK-DAG: %[[R1]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK-DAG: addl
+; CHECK-DAG: %[[R2]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK: movl %[[R3]], %eax
+; CHECK: retq
+  %s1 = add i32 %z1, %z2
+  %s2 = add i32 %s1, %z3
+  ret i32 %s2
+}
+
+; Same as before but with operands reversed in the select with a load.
+define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group2:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %y = load i32, i32* %y.ptr
+  %z2 = select i1 %cond, i32 %a, i32 %x
+  %z1 = select i1 %cond, i32 %y, i32 %x
+  %z3 = select i1 %cond, i32 %b, i32 %x
+; CHECK-NOT: cmov
+; CHECK: jbe [[FALSE_BB:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
+; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
+; CHECK-DAG: movl %{{.*}} %[[R3:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: addl
+; CHECK-DAG: %[[R1]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK-DAG: addl
+; CHECK-DAG: %[[R2]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK: movl %[[R3]], %eax
+; CHECK: retq
+  %s1 = add i32 %z1, %z2
+  %s2 = add i32 %s1, %z3
+  ret i32 %s2
+}
+
+; Test that we don't convert a group of cmovs with conflicting directions of
+; loads.
+define i32 @test_cmov_memoperand_conflicting_dir(i32 %a, i32 %b, i32 %x, i32* %y1.ptr, i32* %y2.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_conflicting_dir:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %y1 = load i32, i32* %y1.ptr
+  %y2 = load i32, i32* %y2.ptr
+  %z1 = select i1 %cond, i32 %x, i32 %y1
+  %z2 = select i1 %cond, i32 %y2, i32 %x
+; CHECK: cmoval
+; CHECK: cmoval
+  %s1 = add i32 %z1, %z2
+  ret i32 %s1
+}
+
+; Test that we can convert a group of cmovs where only one has a memory
+; operand and where that memory operand's registers come from a prior cmov in the group.
+define i32 @test_cmov_memoperand_in_group_reuse_for_addr(i32 %a, i32 %b, i32* %x, i32* %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %p = select i1 %cond, i32* %x, i32* %y
+  %load = load i32, i32* %p
+  %z = select i1 %cond, i32 %a, i32 %load
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movl (%r{{..}}), %[[R:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R]], %eax
+; CHECK: retq
+  ret i32 %z
+}
+
+; Test that we can convert a group of two cmovs with memory operands where one
+; uses the result of the other as part of the address.
+define i32 @test_cmov_memoperand_in_group_reuse_for_addr2(i32 %a, i32 %b, i32* %x, i32** %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr2:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %load1 = load i32*, i32** %y
+  %p = select i1 %cond, i32* %x, i32* %load1
+  %load2 = load i32, i32* %p
+  %z = select i1 %cond, i32 %a, i32 %load2
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movq (%r{{..}}), %[[R1:.*]]
+; CHECK: movl (%[[R1]]), %[[R2:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R2]], %eax
+; CHECK: retq
+  ret i32 %z
+}
+
 attributes #0 = {"target-cpu"="x86-64"}
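
Usage note (editor's addition, not part of the patch): the memory-operand conversion is gated by the new x86-cmov-converter-force-mem-operand option and defaults to on (cl::init(true)). A sketch of how to turn off just this behavior when comparing codegen, where "example.ll" is only a placeholder input file:

    llc -mtriple=x86_64-unknown-unknown -x86-cmov-converter-force-mem-operand=false example.ll -o -

Passing -x86-cmov-converter=false instead, as the updated cmov.ll RUN line above does, disables the whole cmov-to-branch pass, including this new memory-operand handling.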