Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -24505,6 +24505,11 @@
 MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
   switch (MI.getOpcode()) {
   default: llvm_unreachable("Unexpected instr type to insert");
   case X86::TAILJMPd64:
@@ -24558,8 +24563,6 @@
 
   case X86::RDFLAGS32:
   case X86::RDFLAGS64: {
-    DebugLoc DL = MI.getDebugLoc();
-    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
     unsigned PushF =
         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
@@ -24577,8 +24580,6 @@
 
   case X86::WRFLAGS32:
   case X86::WRFLAGS64: {
-    DebugLoc DL = MI.getDebugLoc();
-    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
     unsigned Push =
         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
     unsigned PopF =
@@ -24603,19 +24604,15 @@
   case X86::FP80_TO_INT16_IN_MEM:
   case X86::FP80_TO_INT32_IN_MEM:
   case X86::FP80_TO_INT64_IN_MEM: {
-    MachineFunction *F = BB->getParent();
-    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-    DebugLoc DL = MI.getDebugLoc();
-
     // Change the floating point control register to use "round towards zero"
     // mode when truncating to an integer value.
-    int CWFrameIdx = F->getFrameInfo().CreateStackObject(2, 2, false);
+    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
     addFrameReference(BuildMI(*BB, MI, DL,
                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
 
     // Load the old value of the high byte of the control word...
     unsigned OldCW =
-      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+      MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                       CWFrameIdx);
 
@@ -24723,6 +24720,45 @@
   case TargetOpcode::PATCHPOINT:
     return emitPatchPoint(MI, BB);
 
+  case X86::LCMPXCHG8B: {
+    // Besides the four E[ABCD] registers implied by its encoding, CMPXCHG8B
+    // requires a memory operand. On i686, when the function needs a base
+    // pointer (ESI), an address of the form X(%reg, %reg, Y) can never be
+    // allocated: even without a base pointer the only option would be
+    // X(%edi, %esi, Y). Help the register allocator by precomputing the
+    // address into a new vreg with an LEA.
+    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+    if (!TRI->hasBasePointer(*MF) || TRI->getBaseRegister() != X86::ESI)
+      return BB;
+
+    // Regalloc does not need any help when the memory operand does not use an
+    // index register; return before creating a vreg that would stay unused.
+    X86AddressMode AM = getAddressFromInstr(&MI, 0);
+    if (AM.IndexReg == X86::NoRegister)
+      return BB;
+
+    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to the
+    // definitions of its E[ABCD] operands. Step back over that run, never past
+    // the block start, and insert the LEA right before its first instruction.
+    MachineBasicBlock::iterator InsertPt(MI);
+    while (InsertPt != BB->begin()) {
+      MachineBasicBlock::iterator Prev = InsertPt;
+      --Prev;
+      if (!Prev->definesRegister(X86::EAX) &&
+          !Prev->definesRegister(X86::EBX) &&
+          !Prev->definesRegister(X86::ECX) && !Prev->definesRegister(X86::EDX))
+        break;
+      InsertPt = Prev;
+    }
+    unsigned AddrVReg = MRI.createVirtualRegister(
+        getRegClassFor(getPointerTy(MF->getDataLayout())));
+    addFullAddress(
+        BuildMI(*BB, InsertPt, DL, TII->get(X86::LEA32r), AddrVReg), AM);
+    setDirectAddressInInstr(&MI, 0, AddrVReg);
+    return BB;
+  }
+  case X86::LCMPXCHG16B:
+    return BB;
   case X86::LCMPXCHG8B_SAVE_EBX:
   case X86::LCMPXCHG16B_SAVE_RBX: {
     unsigned BasePtr =
Index: lib/Target/X86/X86InstrBuilder.h
===================================================================
--- lib/Target/X86/X86InstrBuilder.h
+++ lib/Target/X86/X86InstrBuilder.h
@@ -119,6 +119,16 @@
   return AM;
 }
 
+// Force the address at Operand to [Reg], even if base/disp were not reg/imm.
+static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand,
+                                           unsigned Reg) {
+  MI->getOperand(Operand).ChangeToRegister(Reg, /*isDef=*/false); // base
+  MI->getOperand(Operand + 1).setImm(1);                          // scale = 1
+  MI->getOperand(Operand + 2).setReg(0);                          // no index
+  MI->getOperand(Operand + 3).ChangeToImmediate(0);               // no disp
+  MI->getOperand(Operand + 4).setReg(0);                          // no segment
+}
+
 /// addDirectMem - This function is used to add a direct memory reference to the
 /// current instruction -- that is, a dereference of an address in a register,
 /// with no scale, index or displacement. An example is: DWORD PTR [EAX].
Index: lib/Target/X86/X86InstrCompiler.td
===================================================================
--- lib/Target/X86/X86InstrCompiler.td
+++ lib/Target/X86/X86InstrCompiler.td
@@ -723,7 +723,7 @@
 multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
                          SDPatternOperator frag, X86MemOperand x86memop,
                          InstrItinClass itin> {
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, usesCustomInserter = 1 in {
   def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
                !strconcat(mnemonic, "\t$ptr"),
                [(frag addr:$ptr)], itin>, TB, LOCK;
Index: test/CodeGen/X86/pr28755.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr28755.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86 -stackrealign
+
+define void @foo(i64* %a, i32 %off, i32 %n) {
+  %dummy = alloca i32, i32 %n ; variable-sized alloca; presumably combined with -stackrealign to require a base pointer (confirm)
+  %addr = getelementptr inbounds i64, i64* %a, i32 %off ; address formed from a pointer plus a variable offset
+
+  %res = cmpxchg i64* %addr, i64 0, i64 1 monotonic monotonic ; 64-bit cmpxchg; expected to select CMPXCHG8B on 32-bit x86 (confirm)
+  ret void
+}