Index: lib/AsmParser/LLParser.cpp
===================================================================
--- lib/AsmParser/LLParser.cpp
+++ lib/AsmParser/LLParser.cpp
@@ -6722,12 +6722,20 @@
   if (cast<PointerType>(Ptr->getType())->getElementType() != Val->getType())
     return Error(ValLoc, "atomicrmw value and pointer type do not match");
 
-  if (!Val->getType()->isIntegerTy()) {
+  if (Operation != AtomicRMWInst::Xchg && !Val->getType()->isIntegerTy()) {
     return Error(ValLoc, "atomicrmw " +
                  AtomicRMWInst::getOperationName(Operation) +
                  " operand must be an integer");
   }
 
+  if (Operation == AtomicRMWInst::Xchg &&
+      !Val->getType()->isIntegerTy() &&
+      !Val->getType()->isFloatingPointTy()) {
+    return Error(ValLoc, "atomicrmw " +
+                 AtomicRMWInst::getOperationName(Operation) +
+                 " operand must be an integer or floating point type");
+  }
+
   unsigned Size = Val->getType()->getPrimitiveSizeInBits();
   if (Size < 8 || (Size & (Size - 1)))
     return Error(ValLoc, "atomicrmw operand must be power-of-two byte-sized"
Index: lib/CodeGen/AtomicExpandPass.cpp
===================================================================
--- lib/CodeGen/AtomicExpandPass.cpp
+++ lib/CodeGen/AtomicExpandPass.cpp
@@ -499,11 +499,25 @@
                                          Value *Loaded, Value *NewVal,
                                          AtomicOrdering MemOpOrder,
                                          Value *&Success, Value *&NewLoaded) {
+  Type *OrigTy = NewVal->getType();
+
+  bool NeedBitcast = OrigTy->isFloatingPointTy();
+  if (NeedBitcast) {
+    IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
+    unsigned AS = Addr->getType()->getPointerAddressSpace();
+    Addr = Builder.CreateBitCast(Addr, IntTy->getPointerTo(AS));
+    NewVal = Builder.CreateBitCast(NewVal, IntTy);
+    Loaded = Builder.CreateBitCast(Loaded, IntTy);
+  }
+
   Value* Pair = Builder.CreateAtomicCmpXchg(
       Addr, Loaded, NewVal, MemOpOrder,
       AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
   Success = Builder.CreateExtractValue(Pair, 1, "success");
   NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+
+  if (NeedBitcast)
+    NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy);
 }
 
 /// Emit IR to implement the given atomicrmw operation on values in registers,
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4737,6 +4737,24 @@
     Results.push_back(CvtVec);
     break;
   }
+  case ISD::ATOMIC_SWAP: {
+    AtomicSDNode *AM = cast<AtomicSDNode>(Node);
+    SDLoc SL(Node);
+    SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NVT, AM->getVal());
+    assert(NVT.getSizeInBits() == OVT.getSizeInBits() &&
+           "unexpected promotion type");
+    assert(AM->getMemoryVT().getSizeInBits() == NVT.getSizeInBits() &&
+           "unexpected atomic_swap with illegal type");
+
+    SDValue NewAtomic
+      = DAG.getAtomic(ISD::ATOMIC_SWAP, SL, NVT,
+                      DAG.getVTList(NVT, MVT::Other),
+                      { AM->getChain(), AM->getBasePtr(), CastVal },
+                      AM->getMemOperand());
+    Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewAtomic));
+    Results.push_back(NewAtomic.getValue(1));
+    break;
+  }
   }
 
   // Replace the original node with the legalized result.
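For targets whose atomicrmw expansion goes through a compare-exchange loop, the AtomicExpandPass change above makes a floating-point xchg work by operating on the same-width integer type. A minimal IR sketch of the resulting shape, assuming a 32-bit float and illustrative value/block names rather than actual pass output:

define float @xchg_f32(float* %ptr, float %val) {
entry:
  ; reinterpret the pointer and the new value as i32 bits
  %iptr = bitcast float* %ptr to i32*
  %ival = bitcast float %val to i32
  %init = load i32, i32* %iptr, align 4
  br label %atomicrmw.start

atomicrmw.start:
  %loaded = phi i32 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
  ; retry the integer compare-exchange until it succeeds
  %pair = cmpxchg i32* %iptr, i32 %loaded, i32 %ival seq_cst seq_cst
  %newloaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ; hand the previous memory contents back as a float
  %res = bitcast i32 %newloaded to float
  ret float %res
}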
Index: lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1932,7 +1932,7 @@
     case ISD::SINT_TO_FP:
     case ISD::UINT_TO_FP:  R = PromoteFloatRes_XINT_TO_FP(N); break;
     case ISD::UNDEF:       R = PromoteFloatRes_UNDEF(N); break;
-
+    case ISD::ATOMIC_SWAP: R = PromoteFloatRes_ATOMIC_SWAP(N); break;
   }
 
   if (R.getNode())
@@ -2166,3 +2166,31 @@
                                              N->getValueType(0)));
 }
 
+SDValue DAGTypeLegalizer::PromoteFloatRes_ATOMIC_SWAP(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  assert(VT == MVT::f16 && "unhandled case");
+
+  EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+
+  AtomicSDNode *AM = cast<AtomicSDNode>(N);
+  SDLoc SL(N);
+
+  SDValue Promoted = GetPromotedFloat(AM->getVal());
+  SDValue CastVal = DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), MVT::i16, Promoted);
+
+  SDValue NewAtomic
+    = DAG.getAtomic(ISD::ATOMIC_SWAP, SL, MVT::i16,
+                    DAG.getVTList(MVT::i16, MVT::Other),
+                    { AM->getChain(), AM->getBasePtr(), CastVal },
+                    AM->getMemOperand());
+
+  SDValue ResultCast = DAG.getNode(GetPromotionOpcode(VT, NFPVT), SL, NFPVT,
+                                   NewAtomic);
+
+  // Legalize the chain result by replacing uses of the old value chain with the
+  // new one
+  ReplaceValueWith(SDValue(N, 1), NewAtomic.getValue(1));
+
+  return ResultCast;
+}
+
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -618,6 +618,7 @@
   SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
   SDValue PromoteFloatRes_UnaryOp(SDNode *N);
   SDValue PromoteFloatRes_UNDEF(SDNode *N);
+  SDValue PromoteFloatRes_ATOMIC_SWAP(SDNode *N);
   SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
 
   bool PromoteFloatOperand(SDNode *N, unsigned OpNo);
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -582,6 +582,14 @@
   std::fill(std::begin(TargetDAGCombineArray),
             std::end(TargetDAGCombineArray), 0);
 
+  for (MVT VT : MVT::fp_valuetypes()) {
+    MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
+    if (IntVT.isValid()) {
+      setOperationAction(ISD::ATOMIC_SWAP, VT, Promote);
+      AddPromotedToType(ISD::ATOMIC_SWAP, VT, IntVT);
+    }
+  }
+
   // Set default actions for various operations.
   for (MVT VT : MVT::all_valuetypes()) {
     // Default all indexed load / store to expand.
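At the SelectionDAG level, the TargetLoweringBase change above makes the integer round-trip the default: ATOMIC_SWAP on each floating-point type is marked Promote to the same-width integer type, and an f16 swap whose result is soft-promoted goes through the new PromoteFloatRes_ATOMIC_SWAP (FP_TO_FP16 on the value, an i16 swap, then a conversion of the loaded result back to the promoted float type). Two hypothetical IR inputs that would exercise these paths; sketches only, not taken from the patch's test files:

define half @swap_f16(half* %ptr, half %val) {
  ; expected to legalize to an i16 ATOMIC_SWAP plus fp16 conversions
  %old = atomicrmw xchg half* %ptr, half %val seq_cst
  ret half %old
}

define double @swap_f64(double* %ptr, double %val) {
  ; expected to promote to an i64 ATOMIC_SWAP with bitcasts where no native FP swap exists
  %old = atomicrmw xchg double* %ptr, double %val monotonic
  ret double %old
}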
Index: lib/IR/Verifier.cpp
===================================================================
--- lib/IR/Verifier.cpp
+++ lib/IR/Verifier.cpp
@@ -3352,10 +3352,17 @@
   PointerType *PTy = dyn_cast<PointerType>(RMWI.getOperand(0)->getType());
   Assert(PTy, "First atomicrmw operand must be a pointer.", &RMWI);
   Type *ElTy = PTy->getElementType();
-  Assert(ElTy->isIntegerTy(), "atomicrmw " +
-         AtomicRMWInst::getOperationName(Op) +
-         " operand must have integer type!",
-         &RMWI, ElTy);
+  if (Op == AtomicRMWInst::Xchg) {
+    Assert(ElTy->isIntegerTy() || ElTy->isFloatingPointTy(), "atomicrmw " +
+           AtomicRMWInst::getOperationName(Op) +
+           " operand must have integer or floating point type!",
+           &RMWI, ElTy);
+  } else {
+    Assert(ElTy->isIntegerTy(), "atomicrmw " +
+           AtomicRMWInst::getOperationName(Op) +
+           " operand must have integer type!",
+           &RMWI, ElTy);
+  }
   checkAtomicMemAccessSize(ElTy, &RMWI);
   Assert(ElTy == RMWI.getOperand(1)->getType(),
          "Argument value type does not match pointer operand type!", &RMWI,
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11419,9 +11419,13 @@
       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
   Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
 
-  return Builder.CreateTruncOrBitCast(
-      Builder.CreateCall(Ldxr, Addr),
-      cast<PointerType>(Addr->getType())->getElementType());
+  Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
+
+  const DataLayout &DL = M->getDataLayout();
+  IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
+  Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
+
+  return Builder.CreateBitCast(Trunc, EltTy);
 }
 
 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
@@ -11456,6 +11460,10 @@
   Type *Tys[] = { Addr->getType() };
   Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
 
+  const DataLayout &DL = M->getDataLayout();
+  IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
+  Val = Builder.CreateBitCast(Val, IntValTy);
+
   return Builder.CreateCall(Stxr,
                             {Builder.CreateZExtOrBitCast(
                                  Val, Stxr->getFunctionType()->getParamType(0)),
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -287,7 +287,6 @@
   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
 
-  setOperationAction(ISD::Constant, MVT::i32, Legal);
   setOperationAction(ISD::Constant, MVT::i64, Legal);
   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
Index: test/Assembler/invalid-atomicrmw-xchg-must-be-integer-or-fp-type.ll
===================================================================
--- /dev/null
+++ test/Assembler/invalid-atomicrmw-xchg-must-be-integer-or-fp-type.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: error: atomicrmw xchg operand must be an integer or floating point type
+define void @f(i32** %ptr) {
+  atomicrmw xchg i32** %ptr, i32* null seq_cst
+  ret void
+}
Index: test/Bitcode/compatibility.ll
===================================================================
--- test/Bitcode/compatibility.ll
+++ test/Bitcode/compatibility.ll
@@ -761,6 +761,12 @@
   ret void
 }
 
+define void @fp_atomics(float* %word) {
+; CHECK: %atomicrmw.xchg = atomicrmw xchg float* %word, float 1.000000e+00 monotonic
+  %atomicrmw.xchg = atomicrmw xchg float* %word, float 1.0 monotonic
+  ret void
+}
+
 ;; Fast Math Flags
 define void @fastmathflags(float %op1, float %op2) {
   %f.nnan = fadd nnan float %op1, %op2
Index: test/CodeGen/AMDGPU/flat_atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/flat_atomics.ll
+++ test/CodeGen/AMDGPU/flat_atomics.ll
@@ -703,6 +703,16 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
+; CIVI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_xchg_f32_offset(float* %out, float %in) {
+entry:
+  %gep = getelementptr float, float* %out, i32 4
+  %val = atomicrmw volatile xchg float* %gep, float %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; CIVI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GFX9: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
Index: test/CodeGen/AMDGPU/flat_atomics_i64.ll
===================================================================
--- test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -650,6 +650,15 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_xchg_f64_offset(double* %out, double %in) {
+entry:
+  %gep = getelementptr double, double* %out, i64 4
+  %tmp0 = atomicrmw volatile xchg double* %gep, double %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
Index: test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics.ll
+++ test/CodeGen/AMDGPU/global_atomics.ll
@@ -839,6 +839,17 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
+; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+
+; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+define amdgpu_kernel void @atomic_xchg_f32_offset(float addrspace(1)* %out, float %in) {
+entry:
+  %gep = getelementptr float, float addrspace(1)* %out, i64 4
+  %val = atomicrmw volatile xchg float addrspace(1)* %gep, float %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
Index: test/CodeGen/AMDGPU/global_atomics_i64.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -783,6 +783,17 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
+; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+
+; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+define amdgpu_kernel void @atomic_xchg_f64_offset(double addrspace(1)* %out, double %in) {
+entry:
+  %gep = getelementptr double, double addrspace(1)* %out, i64 4
+  %tmp0 = atomicrmw volatile xchg double addrspace(1)* %gep, double %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
Index: test/CodeGen/AMDGPU/local-atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/local-atomics.ll
+++ test/CodeGen/AMDGPU/local-atomics.ll
@@ -36,6 +36,20 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_f32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; EG: LDS_WRXCHG_RET *
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_xchg_ret_f32_offset(float addrspace(1)* %out, float addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr float, float addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg float addrspace(3)* %gep, float 4.0 seq_cst
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
 ; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
 ; EG: LDS_ADD_RET *
Index: test/CodeGen/AMDGPU/local-atomics64.ll
===================================================================
--- test/CodeGen/AMDGPU/local-atomics64.ll
+++ test/CodeGen/AMDGPU/local-atomics64.ll
@@ -27,6 +27,19 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}lds_atomic_xchg_ret_f64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_xchg_ret_f64_offset(double addrspace(1)* %out, double addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr double, double addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg double addrspace(3)* %gep, double 4.0 seq_cst
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}
+
 ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
Index: test/CodeGen/X86/atomic16.ll
===================================================================
--- test/CodeGen/X86/atomic16.ll
+++ test/CodeGen/X86/atomic16.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -O0 -mtriple=i386-unknown-unknown -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X32
 
 @sc16 = external global i16
+@fsc16 = external global half
 
 define void @atomic_fetch_add16() nounwind {
 ; X64-LABEL: atomic_fetch_add16
@@ -273,3 +274,14 @@
 ; X64: ret
 ; X32: ret
 }
+
+define void @atomic_fetch_swapf16(half %x) nounwind {
+  %t1 = atomicrmw xchg half* @fsc16, half %x acquire
+; X64-NOT: lock
+; X64: xchgw
+; X32-NOT: lock
+; X32: xchgw
+  ret void
+; X64: ret
+; X32: ret
+}
Index: test/CodeGen/X86/atomic32.ll
===================================================================
--- test/CodeGen/X86/atomic32.ll
+++ test/CodeGen/X86/atomic32.ll
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -O0 -mtriple=i686-unknown-unknown -mcpu=corei7 -mattr=-cmov,-sse -verify-machineinstrs | FileCheck %s --check-prefixes=X86,X86-NOCMOV
 
 @sc32 = external global i32
+@fsc32 = external global float
 
 define void @atomic_fetch_add32() nounwind {
 ; X64-LABEL: atomic_fetch_add32:
@@ -61,22 +62,22 @@
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock andl $3, {{.*}}(%rip)
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB2_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $5, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:
movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB2_2 ; X64-NEXT: jmp .LBB2_1 ; X64-NEXT: .LBB2_2: # %atomicrmw.end -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: lock andl %eax, {{.*}}(%rip) ; X64-NEXT: retq ; @@ -85,10 +86,10 @@ ; X86-NEXT: subl $8, %esp ; X86-NEXT: lock andl $3, sc32 ; X86-NEXT: movl sc32, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: .LBB2_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 @@ -96,7 +97,7 @@ ; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: jne .LBB2_2 ; X86-NEXT: jmp .LBB2_1 ; X86-NEXT: .LBB2_2: # %atomicrmw.end @@ -115,22 +116,22 @@ ; X64: # %bb.0: ; X64-NEXT: lock orl $3, {{.*}}(%rip) ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB3_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: orl $5, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) ; X64-NEXT: sete %dl ; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB3_2 ; X64-NEXT: jmp .LBB3_1 ; X64-NEXT: .LBB3_2: # %atomicrmw.end -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: lock orl %eax, {{.*}}(%rip) ; X64-NEXT: retq ; @@ -139,10 +140,10 @@ ; X86-NEXT: subl $8, %esp ; X86-NEXT: lock orl $3, sc32 ; X86-NEXT: movl sc32, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: .LBB3_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: orl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 @@ -150,7 +151,7 @@ ; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: jne .LBB3_2 ; X86-NEXT: jmp .LBB3_1 ; X86-NEXT: .LBB3_2: # %atomicrmw.end @@ -169,22 +170,22 @@ ; X64: # %bb.0: ; X64-NEXT: lock xorl $3, {{.*}}(%rip) ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: 
.LBB4_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: xorl $5, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) ; X64-NEXT: sete %dl ; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB4_2 ; X64-NEXT: jmp .LBB4_1 ; X64-NEXT: .LBB4_2: # %atomicrmw.end -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: lock xorl %eax, {{.*}}(%rip) ; X64-NEXT: retq ; @@ -193,10 +194,10 @@ ; X86-NEXT: subl $8, %esp ; X86-NEXT: lock xorl $3, sc32 ; X86-NEXT: movl sc32, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: .LBB4_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 @@ -204,7 +205,7 @@ ; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: jne .LBB4_2 ; X86-NEXT: jmp .LBB4_1 ; X86-NEXT: .LBB4_2: # %atomicrmw.end @@ -222,19 +223,19 @@ ; X64-LABEL: atomic_fetch_nand32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB5_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: andl %edx, %ecx ; X64-NEXT: notl %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; X64-NEXT: testb $1, %sil -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB5_2 ; X64-NEXT: jmp .LBB5_1 ; X64-NEXT: .LBB5_2: # %atomicrmw.end @@ -246,13 +247,13 @@ ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl sc32, %ecx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: .LBB5_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: andl %edx, %ecx ; X86-NEXT: notl %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 @@ -273,20 +274,20 @@ ; X64-LABEL: atomic_fetch_max32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; 
X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB6_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovgel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; X64-NEXT: testb $1, %sil -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB6_2 ; X64-NEXT: jmp .LBB6_1 ; X64-NEXT: .LBB6_2: # %atomicrmw.end @@ -298,20 +299,20 @@ ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: .LBB6_1: # %atomicrmw.start ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-CMOV-NEXT: movl %eax, %ecx -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovgel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-CMOV-NEXT: sete %bl ; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB6_2 ; X86-CMOV-NEXT: jmp .LBB6_1 ; X86-CMOV-NEXT: .LBB6_2: # %atomicrmw.end @@ -326,34 +327,34 @@ ; X86-NOCMOV-NEXT: subl $24, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: movl sc32, %ecx -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB6_1: # %atomicrmw.start ; X86-NOCMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, %ecx -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: subl %edx, %ecx ; X86-NOCMOV-NEXT: movl %eax, %esi -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: 
movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jge .LBB6_4 ; X86-NOCMOV-NEXT: # %bb.3: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB6_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB6_4: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB6_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-NOCMOV-NEXT: sete %bl ; X86-NOCMOV-NEXT: testb $1, %bl -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB6_2 ; X86-NOCMOV-NEXT: jmp .LBB6_1 ; X86-NOCMOV-NEXT: .LBB6_2: # %atomicrmw.end @@ -369,20 +370,20 @@ ; X64-LABEL: atomic_fetch_min32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB7_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovlel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; X64-NEXT: testb $1, %sil -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB7_2 ; X64-NEXT: jmp .LBB7_1 ; X64-NEXT: .LBB7_2: # %atomicrmw.end @@ -394,20 +395,20 @@ ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: .LBB7_1: # %atomicrmw.start ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-CMOV-NEXT: movl %eax, %ecx -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovlel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-CMOV-NEXT: sete %bl ; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte 
Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB7_2 ; X86-CMOV-NEXT: jmp .LBB7_1 ; X86-CMOV-NEXT: .LBB7_2: # %atomicrmw.end @@ -422,34 +423,34 @@ ; X86-NOCMOV-NEXT: subl $24, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: movl sc32, %ecx -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB7_1: # %atomicrmw.start ; X86-NOCMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, %ecx -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: subl %edx, %ecx ; X86-NOCMOV-NEXT: movl %eax, %esi -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jle .LBB7_4 ; X86-NOCMOV-NEXT: # %bb.3: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB7_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB7_4: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB7_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-NOCMOV-NEXT: sete %bl ; X86-NOCMOV-NEXT: testb $1, %bl -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB7_2 ; X86-NOCMOV-NEXT: jmp .LBB7_1 ; X86-NOCMOV-NEXT: .LBB7_2: # %atomicrmw.end @@ -465,20 +466,20 @@ ; X64-LABEL: atomic_fetch_umax32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB8_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmoval %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; 
X64-NEXT: testb $1, %sil -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB8_2 ; X64-NEXT: jmp .LBB8_1 ; X64-NEXT: .LBB8_2: # %atomicrmw.end @@ -490,20 +491,20 @@ ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: .LBB8_1: # %atomicrmw.start ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-CMOV-NEXT: movl %eax, %ecx -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmoval %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-CMOV-NEXT: sete %bl ; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB8_2 ; X86-CMOV-NEXT: jmp .LBB8_1 ; X86-CMOV-NEXT: .LBB8_2: # %atomicrmw.end @@ -518,34 +519,34 @@ ; X86-NOCMOV-NEXT: subl $24, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: movl sc32, %ecx -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB8_1: # %atomicrmw.start ; X86-NOCMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, %ecx -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: subl %edx, %ecx ; X86-NOCMOV-NEXT: movl %eax, %esi -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: ja .LBB8_4 ; X86-NOCMOV-NEXT: # %bb.3: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB8_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB8_4: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB8_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax 
# 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-NOCMOV-NEXT: sete %bl ; X86-NOCMOV-NEXT: testb $1, %bl -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB8_2 ; X86-NOCMOV-NEXT: jmp .LBB8_1 ; X86-NOCMOV-NEXT: .LBB8_2: # %atomicrmw.end @@ -561,20 +562,20 @@ ; X64-LABEL: atomic_fetch_umin32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB9_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovbel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; X64-NEXT: testb $1, %sil -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB9_2 ; X64-NEXT: jmp .LBB9_1 ; X64-NEXT: .LBB9_2: # %atomicrmw.end @@ -586,20 +587,20 @@ ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: .LBB9_1: # %atomicrmw.start ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-CMOV-NEXT: movl %eax, %ecx -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovbel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-CMOV-NEXT: sete %bl ; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB9_2 ; X86-CMOV-NEXT: jmp .LBB9_1 ; X86-CMOV-NEXT: .LBB9_2: # %atomicrmw.end @@ -614,34 +615,34 @@ ; X86-NOCMOV-NEXT: subl $24, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: movl sc32, %ecx -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB9_1: # %atomicrmw.start ; X86-NOCMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 
4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, %ecx -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: subl %edx, %ecx ; X86-NOCMOV-NEXT: movl %eax, %esi -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jbe .LBB9_4 ; X86-NOCMOV-NEXT: # %bb.3: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB9_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB9_4: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB9_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-NOCMOV-NEXT: sete %bl ; X86-NOCMOV-NEXT: testb $1, %bl -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB9_2 ; X86-NOCMOV-NEXT: jmp .LBB9_1 ; X86-NOCMOV-NEXT: .LBB9_2: # %atomicrmw.end @@ -659,7 +660,7 @@ ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: movl $1, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: retq ; ; X86-LABEL: atomic_fetch_cmpxchg32: @@ -694,7 +695,7 @@ ; X64-LABEL: atomic_fetch_swap32: ; X64: # %bb.0: ; X64-NEXT: xchgl %edi, {{.*}}(%rip) -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: retq ; ; X86-LABEL: atomic_fetch_swap32: @@ -708,3 +709,35 @@ %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire ret void } + +define void @atomic_fetch_swapf32(float %x) nounwind { +; X64-LABEL: atomic_fetch_swapf32: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: xchgl %eax, {{.*}}(%rip) +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: retq +; +; X86-CMOV-LABEL: atomic_fetch_swapf32: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %eax +; X86-CMOV-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-CMOV-NEXT: movd %xmm0, %eax +; X86-CMOV-NEXT: xchgl %eax, fsc32 +; X86-CMOV-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-CMOV-NEXT: popl %eax +; X86-CMOV-NEXT: retl +; +; X86-NOCMOV-LABEL: atomic_fetch_swapf32: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: subl $8, %esp +; X86-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X86-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: xchgl %eax, fsc32 +; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte 
Spill
+; X86-NOCMOV-NEXT:    addl $8, %esp
+; X86-NOCMOV-NEXT:    retl
+  %t1 = atomicrmw xchg float* @fsc32, float %x acquire
+  ret void
+}
Index: test/CodeGen/X86/atomic64.ll
===================================================================
--- test/CodeGen/X86/atomic64.ll
+++ test/CodeGen/X86/atomic64.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -O0 -mtriple=x86_64-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X64
 
 @sc64 = external global i64
+@fsc64 = external global double
 
 define void @atomic_fetch_add64() nounwind {
 ; X64-LABEL: atomic_fetch_add64:
@@ -233,3 +234,17 @@
 ; X64: ret
 ; X32: ret
 }
+
+
+define void @atomic_fetch_swapf64(double %x) nounwind {
+; X64-LABEL: atomic_fetch_swapf64:
+; X32-LABEL: atomic_fetch_swapf64:
+  %t1 = atomicrmw xchg double* @fsc64, double %x acquire
+; X64-NOT: lock
+; X64: xchgq
+; X32: lock
+; X32: xchg8b
+  ret void
+; X64: ret
+; X32: ret
+}