Index: lib/AsmParser/LLParser.cpp
===================================================================
--- lib/AsmParser/LLParser.cpp
+++ lib/AsmParser/LLParser.cpp
@@ -6722,12 +6722,20 @@
   if (cast<PointerType>(Ptr->getType())->getElementType() != Val->getType())
     return Error(ValLoc, "atomicrmw value and pointer type do not match");
 
-  if (!Val->getType()->isIntegerTy()) {
+  if (Operation != AtomicRMWInst::Xchg && !Val->getType()->isIntegerTy()) {
     return Error(ValLoc, "atomicrmw " +
                  AtomicRMWInst::getOperationName(Operation) +
                  " operand must be an integer");
   }
 
+  if (Operation == AtomicRMWInst::Xchg &&
+      !Val->getType()->isIntegerTy() &&
+      !Val->getType()->isFloatingPointTy()) {
+    return Error(ValLoc, "atomicrmw " +
+                 AtomicRMWInst::getOperationName(Operation) +
+                 " operand must be an integer or floating point type");
+  }
+
   unsigned Size = Val->getType()->getPrimitiveSizeInBits();
   if (Size < 8 || (Size & (Size - 1)))
     return Error(ValLoc, "atomicrmw operand must be power-of-two byte-sized"
Index: lib/CodeGen/AtomicExpandPass.cpp
===================================================================
--- lib/CodeGen/AtomicExpandPass.cpp
+++ lib/CodeGen/AtomicExpandPass.cpp
@@ -499,11 +499,25 @@
                                          Value *Loaded, Value *NewVal,
                                          AtomicOrdering MemOpOrder,
                                          Value *&Success, Value *&NewLoaded) {
+  Type *OrigTy = NewVal->getType();
+
+  bool NeedBitcast = OrigTy->isFloatingPointTy();
+  if (NeedBitcast) {
+    IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
+    unsigned AS = Addr->getType()->getPointerAddressSpace();
+    Addr = Builder.CreateBitCast(Addr, IntTy->getPointerTo(AS));
+    NewVal = Builder.CreateBitCast(NewVal, IntTy);
+    Loaded = Builder.CreateBitCast(Loaded, IntTy);
+  }
+
   Value* Pair = Builder.CreateAtomicCmpXchg(
       Addr, Loaded, NewVal, MemOpOrder,
       AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
   Success = Builder.CreateExtractValue(Pair, 1, "success");
   NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+
+  if (NeedBitcast)
+    NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy);
 }
 
 /// Emit IR to implement the given atomicrmw operation on values in registers,
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4737,6 +4737,24 @@
     Results.push_back(CvtVec);
     break;
   }
+  case ISD::ATOMIC_SWAP: {
+    AtomicSDNode *AM = cast<AtomicSDNode>(Node);
+    SDLoc SL(Node);
+    SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NVT, AM->getVal());
+    assert(NVT.getSizeInBits() == OVT.getSizeInBits() &&
+           "unexpected promotion type");
+    assert(AM->getMemoryVT().getSizeInBits() == NVT.getSizeInBits() &&
+           "unexpected atomic_swap with illegal type");
+
+    SDValue NewAtomic
+      = DAG.getAtomic(ISD::ATOMIC_SWAP, SL, NVT,
+                      DAG.getVTList(NVT, MVT::Other),
+                      { AM->getChain(), AM->getBasePtr(), CastVal },
+                      AM->getMemOperand());
+    Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewAtomic));
+    Results.push_back(NewAtomic.getValue(1));
+    break;
+  }
   }
 
   // Replace the original node with the legalized result.
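For targets whose atomicrmw expansion goes through a compare-exchange loop, the AtomicExpandPass change above makes a floating-point xchg work by operating on the same-width integer type. A minimal IR sketch of the resulting shape, assuming a 32-bit float and illustrative value/block names rather than actual pass output:

define float @xchg_f32(float* %ptr, float %val) {
entry:
  ; reinterpret the pointer and the new value as i32 bits
  %iptr = bitcast float* %ptr to i32*
  %ival = bitcast float %val to i32
  %init = load i32, i32* %iptr, align 4
  br label %atomicrmw.start

atomicrmw.start:
  %loaded = phi i32 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
  ; retry the integer compare-exchange until it succeeds
  %pair = cmpxchg i32* %iptr, i32 %loaded, i32 %ival seq_cst seq_cst
  %newloaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ; hand the previous memory contents back as a float
  %res = bitcast i32 %newloaded to float
  ret float %res
}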
Index: lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1932,7 +1932,7 @@
     case ISD::SINT_TO_FP:
     case ISD::UINT_TO_FP:  R = PromoteFloatRes_XINT_TO_FP(N); break;
     case ISD::UNDEF:       R = PromoteFloatRes_UNDEF(N); break;
-
+    case ISD::ATOMIC_SWAP: R = PromoteFloatRes_ATOMIC_SWAP(N); break;
   }
 
   if (R.getNode())
@@ -2166,3 +2166,31 @@
                                              N->getValueType(0)));
 }
 
+SDValue DAGTypeLegalizer::PromoteFloatRes_ATOMIC_SWAP(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  assert(VT == MVT::f16 && "unhandled case");
+
+  EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+
+  AtomicSDNode *AM = cast<AtomicSDNode>(N);
+  SDLoc SL(N);
+
+  SDValue Promoted = GetPromotedFloat(AM->getVal());
+  SDValue CastVal = DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), MVT::i16, Promoted);
+
+  SDValue NewAtomic
+    = DAG.getAtomic(ISD::ATOMIC_SWAP, SL, MVT::i16,
+                    DAG.getVTList(MVT::i16, MVT::Other),
+                    { AM->getChain(), AM->getBasePtr(), CastVal },
+                    AM->getMemOperand());
+
+  SDValue ResultCast = DAG.getNode(GetPromotionOpcode(VT, NFPVT), SL, NFPVT,
+                                   NewAtomic);
+
+  // Legalize the chain result by replacing uses of the old value chain with the
+  // new one
+  ReplaceValueWith(SDValue(N, 1), NewAtomic.getValue(1));
+
+  return ResultCast;
+}
+
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -618,6 +618,7 @@
   SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
   SDValue PromoteFloatRes_UnaryOp(SDNode *N);
   SDValue PromoteFloatRes_UNDEF(SDNode *N);
+  SDValue PromoteFloatRes_ATOMIC_SWAP(SDNode *N);
   SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
 
   bool PromoteFloatOperand(SDNode *N, unsigned OpNo);
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -582,6 +582,14 @@
   std::fill(std::begin(TargetDAGCombineArray),
             std::end(TargetDAGCombineArray), 0);
 
+  for (MVT VT : MVT::fp_valuetypes()) {
+    MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
+    if (IntVT.isValid()) {
+      setOperationAction(ISD::ATOMIC_SWAP, VT, Promote);
+      AddPromotedToType(ISD::ATOMIC_SWAP, VT, IntVT);
+    }
+  }
+
   // Set default actions for various operations.
   for (MVT VT : MVT::all_valuetypes()) {
     // Default all indexed load / store to expand.
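At the SelectionDAG level, the TargetLoweringBase change above makes the integer round-trip the default: ATOMIC_SWAP on each floating-point type is marked Promote to the same-width integer type, and an f16 swap whose result is soft-promoted goes through the new PromoteFloatRes_ATOMIC_SWAP (FP_TO_FP16 on the value, an i16 swap, then a conversion of the loaded result back to the promoted float type). Two hypothetical IR inputs that would exercise these paths; sketches only, not taken from the patch's test files:

define half @swap_f16(half* %ptr, half %val) {
  ; expected to legalize to an i16 ATOMIC_SWAP plus fp16 conversions
  %old = atomicrmw xchg half* %ptr, half %val seq_cst
  ret half %old
}

define double @swap_f64(double* %ptr, double %val) {
  ; expected to promote to an i64 ATOMIC_SWAP with bitcasts where no native FP swap exists
  %old = atomicrmw xchg double* %ptr, double %val monotonic
  ret double %old
}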
Index: lib/IR/Verifier.cpp
===================================================================
--- lib/IR/Verifier.cpp
+++ lib/IR/Verifier.cpp
@@ -3352,10 +3352,17 @@
   PointerType *PTy = dyn_cast<PointerType>(RMWI.getOperand(0)->getType());
   Assert(PTy, "First atomicrmw operand must be a pointer.", &RMWI);
   Type *ElTy = PTy->getElementType();
-  Assert(ElTy->isIntegerTy(), "atomicrmw " +
-         AtomicRMWInst::getOperationName(Op) +
-         " operand must have integer type!",
-         &RMWI, ElTy);
+  if (Op == AtomicRMWInst::Xchg) {
+    Assert(ElTy->isIntegerTy() || ElTy->isFloatingPointTy(), "atomicrmw " +
+           AtomicRMWInst::getOperationName(Op) +
+           " operand must have integer or floating point type!",
+           &RMWI, ElTy);
+  } else {
+    Assert(ElTy->isIntegerTy(), "atomicrmw " +
+           AtomicRMWInst::getOperationName(Op) +
+           " operand must have integer type!",
+           &RMWI, ElTy);
+  }
   checkAtomicMemAccessSize(ElTy, &RMWI);
   Assert(ElTy == RMWI.getOperand(1)->getType(),
          "Argument value type does not match pointer operand type!", &RMWI,
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11419,9 +11419,13 @@
       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
   Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
 
-  return Builder.CreateTruncOrBitCast(
-      Builder.CreateCall(Ldxr, Addr),
-      cast<PointerType>(Addr->getType())->getElementType());
+  Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
+
+  const DataLayout &DL = M->getDataLayout();
+  IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
+  Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
+
+  return Builder.CreateBitCast(Trunc, EltTy);
 }
 
 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
@@ -11456,6 +11460,10 @@
   Type *Tys[] = { Addr->getType() };
   Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
 
+  const DataLayout &DL = M->getDataLayout();
+  IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
+  Val = Builder.CreateBitCast(Val, IntValTy);
+
   return Builder.CreateCall(Stxr,
                             {Builder.CreateZExtOrBitCast(
                                  Val, Stxr->getFunctionType()->getParamType(0)),
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -287,7 +287,6 @@
   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
 
-  setOperationAction(ISD::Constant, MVT::i32, Legal);
   setOperationAction(ISD::Constant, MVT::i64, Legal);
   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
Index: test/Assembler/invalid-atomicrmw-xchg-must-be-integer-or-fp-type.ll
===================================================================
--- /dev/null
+++ test/Assembler/invalid-atomicrmw-xchg-must-be-integer-or-fp-type.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: error: atomicrmw xchg operand must be an integer or floating point type
+define void @f(i32** %ptr) {
+  atomicrmw xchg i32** %ptr, i32* null seq_cst
+  ret void
+}
Index: test/Bitcode/compatibility.ll
===================================================================
--- test/Bitcode/compatibility.ll
+++ test/Bitcode/compatibility.ll
@@ -761,6 +761,12 @@
   ret void
 }
 
+define void @fp_atomics(float* %word) {
+; CHECK: %atomicrmw.xchg = atomicrmw xchg float* %word, float 1.000000e+00 monotonic
+  %atomicrmw.xchg = atomicrmw xchg float* %word, float 1.0 monotonic
+  ret void
+}
+
 ;; Fast Math Flags
 define void @fastmathflags(float %op1, float %op2) {
   %f.nnan = fadd nnan float %op1, %op2
Index: test/CodeGen/AMDGPU/flat_atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/flat_atomics.ll
+++ test/CodeGen/AMDGPU/flat_atomics.ll
@@ -703,6 +703,16 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
+; CIVI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_xchg_f32_offset(float* %out, float %in) {
+entry:
+  %gep = getelementptr float, float* %out, i32 4
+  %val = atomicrmw volatile xchg float* %gep, float %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; CIVI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GFX9: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
Index: test/CodeGen/AMDGPU/flat_atomics_i64.ll
===================================================================
--- test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -650,6 +650,15 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_xchg_f64_offset(double* %out, double %in) {
+entry:
+  %gep = getelementptr double, double* %out, i64 4
+  %tmp0 = atomicrmw volatile xchg double* %gep, double %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
Index: test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics.ll
+++ test/CodeGen/AMDGPU/global_atomics.ll
@@ -839,6 +839,17 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
+; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+
+; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+define amdgpu_kernel void @atomic_xchg_f32_offset(float addrspace(1)* %out, float %in) {
+entry:
+  %gep = getelementptr float, float addrspace(1)* %out, i64 4
+  %val = atomicrmw volatile xchg float addrspace(1)* %gep, float %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
Index: test/CodeGen/AMDGPU/global_atomics_i64.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -783,6 +783,17 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
+; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+
+; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+define amdgpu_kernel void @atomic_xchg_f64_offset(double addrspace(1)* %out, double %in) {
+entry:
+  %gep = getelementptr double, double addrspace(1)* %out, i64 4
+  %tmp0 = atomicrmw volatile xchg double addrspace(1)* %gep, double %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
Index: test/CodeGen/AMDGPU/local-atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/local-atomics.ll
+++ test/CodeGen/AMDGPU/local-atomics.ll
@@ -36,6 +36,20 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_f32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; EG: LDS_WRXCHG_RET *
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_xchg_ret_f32_offset(float addrspace(1)* %out, float addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr float, float addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg float addrspace(3)* %gep, float 4.0 seq_cst
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
 ; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
 ; EG: LDS_ADD_RET *
Index: test/CodeGen/AMDGPU/local-atomics64.ll
===================================================================
--- test/CodeGen/AMDGPU/local-atomics64.ll
+++ test/CodeGen/AMDGPU/local-atomics64.ll
@@ -27,6 +27,19 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}lds_atomic_xchg_ret_f64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_xchg_ret_f64_offset(double addrspace(1)* %out, double addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr double, double addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg double addrspace(3)* %gep, double 4.0 seq_cst
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}
+
 ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
Index: test/CodeGen/X86/atomic16.ll
===================================================================
--- test/CodeGen/X86/atomic16.ll
+++ test/CodeGen/X86/atomic16.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -O0 -mtriple=i386-unknown-unknown -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X32
 
 @sc16 = external global i16
+@fsc16 = external global half
 
 define void @atomic_fetch_add16() nounwind {
 ; X64-LABEL: atomic_fetch_add16
@@ -273,3 +274,14 @@
 ; X64: ret
 ; X32: ret
 }
+
+define void @atomic_fetch_swapf16(half %x) nounwind {
+  %t1 = atomicrmw xchg half* @fsc16, half %x acquire
+; X64-NOT: lock
+; X64: xchgw
+; X32-NOT: lock
+; X32: xchgw
+  ret void
+; X64: ret
+; X32: ret
+}
Index: test/CodeGen/X86/atomic32.ll
===================================================================
--- test/CodeGen/X86/atomic32.ll
+++ test/CodeGen/X86/atomic32.ll
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -O0 -mtriple=i686-unknown-unknown -mcpu=corei7 -mattr=-cmov,-sse -verify-machineinstrs | FileCheck %s --check-prefixes=X86,X86-NOCMOV
 
 @sc32 = external global i32
+@fsc32 = external global float
 
 define void @atomic_fetch_add32() nounwind {
 ; X64-LABEL: atomic_fetch_add32:
@@ -61,22 +62,22 @@
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock andl $3, {{.*}}(%rip)
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB2_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $5, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:
movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB2_2 ; X64-NEXT: jmp .LBB2_1 ; X64-NEXT: .LBB2_2: # %atomicrmw.end -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: lock andl %eax, {{.*}}(%rip) ; X64-NEXT: retq ; @@ -85,10 +86,10 @@ ; X86-NEXT: subl $8, %esp ; X86-NEXT: lock andl $3, sc32 ; X86-NEXT: movl sc32, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: .LBB2_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 @@ -96,7 +97,7 @@ ; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: jne .LBB2_2 ; X86-NEXT: jmp .LBB2_1 ; X86-NEXT: .LBB2_2: # %atomicrmw.end @@ -115,22 +116,22 @@ ; X64: # %bb.0: ; X64-NEXT: lock orl $3, {{.*}}(%rip) ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB3_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: orl $5, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) ; X64-NEXT: sete %dl ; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB3_2 ; X64-NEXT: jmp .LBB3_1 ; X64-NEXT: .LBB3_2: # %atomicrmw.end -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: lock orl %eax, {{.*}}(%rip) ; X64-NEXT: retq ; @@ -139,10 +140,10 @@ ; X86-NEXT: subl $8, %esp ; X86-NEXT: lock orl $3, sc32 ; X86-NEXT: movl sc32, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: .LBB3_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: orl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 @@ -150,7 +151,7 @@ ; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: jne .LBB3_2 ; X86-NEXT: jmp .LBB3_1 ; X86-NEXT: .LBB3_2: # %atomicrmw.end @@ -169,22 +170,22 @@ ; X64: # %bb.0: ; X64-NEXT: lock xorl $3, {{.*}}(%rip) ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: 
.LBB4_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: xorl $5, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) ; X64-NEXT: sete %dl ; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB4_2 ; X64-NEXT: jmp .LBB4_1 ; X64-NEXT: .LBB4_2: # %atomicrmw.end -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: lock xorl %eax, {{.*}}(%rip) ; X64-NEXT: retq ; @@ -193,10 +194,10 @@ ; X86-NEXT: subl $8, %esp ; X86-NEXT: lock xorl $3, sc32 ; X86-NEXT: movl sc32, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: .LBB4_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 @@ -204,7 +205,7 @@ ; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: jne .LBB4_2 ; X86-NEXT: jmp .LBB4_1 ; X86-NEXT: .LBB4_2: # %atomicrmw.end @@ -222,19 +223,19 @@ ; X64-LABEL: atomic_fetch_nand32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB5_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: andl %edx, %ecx ; X64-NEXT: notl %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; X64-NEXT: testb $1, %sil -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB5_2 ; X64-NEXT: jmp .LBB5_1 ; X64-NEXT: .LBB5_2: # %atomicrmw.end @@ -246,13 +247,13 @@ ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl sc32, %ecx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: .LBB5_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: andl %edx, %ecx ; X86-NEXT: notl %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 @@ -273,20 +274,20 @@ ; X64-LABEL: atomic_fetch_max32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; 
X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB6_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovgel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; X64-NEXT: testb $1, %sil -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB6_2 ; X64-NEXT: jmp .LBB6_1 ; X64-NEXT: .LBB6_2: # %atomicrmw.end @@ -298,20 +299,20 @@ ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: .LBB6_1: # %atomicrmw.start ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-CMOV-NEXT: movl %eax, %ecx -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovgel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-CMOV-NEXT: sete %bl ; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB6_2 ; X86-CMOV-NEXT: jmp .LBB6_1 ; X86-CMOV-NEXT: .LBB6_2: # %atomicrmw.end @@ -326,34 +327,34 @@ ; X86-NOCMOV-NEXT: subl $24, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: movl sc32, %ecx -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB6_1: # %atomicrmw.start ; X86-NOCMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, %ecx -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: subl %edx, %ecx ; X86-NOCMOV-NEXT: movl %eax, %esi -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: 
movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jge .LBB6_4 ; X86-NOCMOV-NEXT: # %bb.3: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB6_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB6_4: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB6_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-NOCMOV-NEXT: sete %bl ; X86-NOCMOV-NEXT: testb $1, %bl -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB6_2 ; X86-NOCMOV-NEXT: jmp .LBB6_1 ; X86-NOCMOV-NEXT: .LBB6_2: # %atomicrmw.end @@ -369,20 +370,20 @@ ; X64-LABEL: atomic_fetch_min32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB7_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovlel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; X64-NEXT: testb $1, %sil -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB7_2 ; X64-NEXT: jmp .LBB7_1 ; X64-NEXT: .LBB7_2: # %atomicrmw.end @@ -394,20 +395,20 @@ ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: .LBB7_1: # %atomicrmw.start ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-CMOV-NEXT: movl %eax, %ecx -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovlel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-CMOV-NEXT: sete %bl ; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte 
Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB7_2 ; X86-CMOV-NEXT: jmp .LBB7_1 ; X86-CMOV-NEXT: .LBB7_2: # %atomicrmw.end @@ -422,34 +423,34 @@ ; X86-NOCMOV-NEXT: subl $24, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: movl sc32, %ecx -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB7_1: # %atomicrmw.start ; X86-NOCMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, %ecx -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: subl %edx, %ecx ; X86-NOCMOV-NEXT: movl %eax, %esi -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jle .LBB7_4 ; X86-NOCMOV-NEXT: # %bb.3: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB7_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB7_4: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB7_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-NOCMOV-NEXT: sete %bl ; X86-NOCMOV-NEXT: testb $1, %bl -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB7_2 ; X86-NOCMOV-NEXT: jmp .LBB7_1 ; X86-NOCMOV-NEXT: .LBB7_2: # %atomicrmw.end @@ -465,20 +466,20 @@ ; X64-LABEL: atomic_fetch_umax32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB8_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmoval %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; 
X64-NEXT: testb $1, %sil -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB8_2 ; X64-NEXT: jmp .LBB8_1 ; X64-NEXT: .LBB8_2: # %atomicrmw.end @@ -490,20 +491,20 @@ ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: .LBB8_1: # %atomicrmw.start ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-CMOV-NEXT: movl %eax, %ecx -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmoval %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-CMOV-NEXT: sete %bl ; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB8_2 ; X86-CMOV-NEXT: jmp .LBB8_1 ; X86-CMOV-NEXT: .LBB8_2: # %atomicrmw.end @@ -518,34 +519,34 @@ ; X86-NOCMOV-NEXT: subl $24, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: movl sc32, %ecx -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB8_1: # %atomicrmw.start ; X86-NOCMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, %ecx -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: subl %edx, %ecx ; X86-NOCMOV-NEXT: movl %eax, %esi -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: ja .LBB8_4 ; X86-NOCMOV-NEXT: # %bb.3: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB8_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB8_4: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB8_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax 
# 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-NOCMOV-NEXT: sete %bl ; X86-NOCMOV-NEXT: testb $1, %bl -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB8_2 ; X86-NOCMOV-NEXT: jmp .LBB8_1 ; X86-NOCMOV-NEXT: .LBB8_2: # %atomicrmw.end @@ -561,20 +562,20 @@ ; X64-LABEL: atomic_fetch_umin32: ; X64: # %bb.0: ; X64-NEXT: movl sc32, %eax -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: .LBB9_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovbel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) ; X64-NEXT: sete %sil ; X64-NEXT: testb $1, %sil -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB9_2 ; X64-NEXT: jmp .LBB9_1 ; X64-NEXT: .LBB9_2: # %atomicrmw.end @@ -586,20 +587,20 @@ ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: .LBB9_1: # %atomicrmw.start ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-CMOV-NEXT: movl %eax, %ecx -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovbel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-CMOV-NEXT: sete %bl ; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB9_2 ; X86-CMOV-NEXT: jmp .LBB9_1 ; X86-CMOV-NEXT: .LBB9_2: # %atomicrmw.end @@ -614,34 +615,34 @@ ; X86-NOCMOV-NEXT: subl $24, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: movl sc32, %ecx -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB9_1: # %atomicrmw.start ; X86-NOCMOV-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 
4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, %ecx -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: subl %edx, %ecx ; X86-NOCMOV-NEXT: movl %eax, %esi -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X86-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jbe .LBB9_4 ; X86-NOCMOV-NEXT: # %bb.3: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB9_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: .LBB9_4: # %atomicrmw.start ; X86-NOCMOV-NEXT: # in Loop: Header=BB9_1 Depth=1 -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload -; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 ; X86-NOCMOV-NEXT: sete %bl ; X86-NOCMOV-NEXT: testb $1, %bl -; X86-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB9_2 ; X86-NOCMOV-NEXT: jmp .LBB9_1 ; X86-NOCMOV-NEXT: .LBB9_2: # %atomicrmw.end @@ -659,7 +660,7 @@ ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: movl $1, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: retq ; ; X86-LABEL: atomic_fetch_cmpxchg32: @@ -694,7 +695,7 @@ ; X64-LABEL: atomic_fetch_swap32: ; X64: # %bb.0: ; X64-NEXT: xchgl %edi, {{.*}}(%rip) -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: retq ; ; X86-LABEL: atomic_fetch_swap32: @@ -708,3 +709,35 @@ %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire ret void } + +define void @atomic_fetch_swapf32(float %x) nounwind { +; X64-LABEL: atomic_fetch_swapf32: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: xchgl %eax, {{.*}}(%rip) +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: retq +; +; X86-CMOV-LABEL: atomic_fetch_swapf32: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %eax +; X86-CMOV-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-CMOV-NEXT: movd %xmm0, %eax +; X86-CMOV-NEXT: xchgl %eax, fsc32 +; X86-CMOV-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-CMOV-NEXT: popl %eax +; X86-CMOV-NEXT: retl +; +; X86-NOCMOV-LABEL: atomic_fetch_swapf32: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: subl $8, %esp +; X86-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X86-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: xchgl %eax, fsc32 +; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte 
Spill
+; X86-NOCMOV-NEXT:    addl $8, %esp
+; X86-NOCMOV-NEXT:    retl
+  %t1 = atomicrmw xchg float* @fsc32, float %x acquire
+  ret void
+}
Index: test/CodeGen/X86/atomic64.ll
===================================================================
--- test/CodeGen/X86/atomic64.ll
+++ test/CodeGen/X86/atomic64.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -O0 -mtriple=x86_64-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X64
 
 @sc64 = external global i64
+@fsc64 = external global double
 
 define void @atomic_fetch_add64() nounwind {
 ; X64-LABEL: atomic_fetch_add64:
@@ -233,3 +234,17 @@
 ; X64: ret
 ; X32: ret
 }
+
+
+define void @atomic_fetch_swapf64(double %x) nounwind {
+; X64-LABEL: atomic_fetch_swapf64:
+; X32-LABEL: atomic_fetch_swapf64:
+  %t1 = atomicrmw xchg double* @fsc64, double %x acquire
+; X64-NOT: lock
+; X64: xchgq
+; X32: lock
+; X32: xchg8b
+  ret void
+; X64: ret
+; X32: ret
+}