Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -590,6 +590,9 @@ // Load, scalar_to_vector, and zero extend. VZEXT_LOAD, + // extract_vector_elt, store. + VEXTRACT_STORE, + // Store FP control world into i16 memory. FNSTCW16m, Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -25625,8 +25625,18 @@ return false; } +// TODO: In 32-bit mode, use MOVLPS when SSE1 is available? +// TODO: In 32-bit mode, use FISTP when X87 is available? bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { - return needsCmpXchgNb(SI->getValueOperand()->getType()); + Type *MemType = SI->getValueOperand()->getType(); + + bool NoImplicitFloatOps = + SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); + if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && + !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2()) + return false; + + return needsCmpXchgNb(MemType); } // Note: this turns large loads into lock cmpxchg8b/16b. @@ -26262,28 +26272,54 @@ DAG.getUNDEF(VT), LockOp.getValue(1)); } -static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { - SDNode *Node = Op.getNode(); +static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + auto *Node = cast<AtomicSDNode>(Op.getNode()); SDLoc dl(Node); - EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + EVT VT = Node->getMemoryVT(); + + bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent; + bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT); + + // If this store is not sequentially consistent and the type is legal + // we can just keep it. + if (!IsSeqCst && IsTypeLegal) + return Op; + + if (VT == MVT::i64 && !IsTypeLegal) { + // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled. + // FIXME: Use movlps with SSE1. + // FIXME: Use fist with X87. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && + Subtarget.hasSSE2()) { + SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Node->getOperand(2)); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() }; + SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, + Ops, MVT::i64, + Node->getMemOperand()); + + // If this is a sequentially consistent store, also emit an mfence. + if (IsSeqCst) + Chain = DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Chain); + + return Chain; + } + } // Convert seq_cst store -> xchg // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) - // FIXME: On 32-bit, store -> fist or movq would be more efficient - // (The only way to get a 16-byte store is cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 
- if (cast<AtomicSDNode>(Node)->getOrdering() == - AtomicOrdering::SequentiallyConsistent || - !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, - cast<AtomicSDNode>(Node)->getMemoryVT(), - Node->getOperand(0), - Node->getOperand(1), Node->getOperand(2), - cast<AtomicSDNode>(Node)->getMemOperand()); - return Swap.getValue(1); - } - // Other atomic stores have a simple pattern. - return Op; + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, + Node->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2), + Node->getMemOperand()); + return Swap.getValue(1); } static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { @@ -26704,7 +26740,7 @@ case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); - case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); + case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); @@ -27812,6 +27848,7 @@ case X86ISD::LAND: return "X86ISD::LAND"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -3893,6 +3893,11 @@ def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", (VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>; +let Predicates = [HasAVX512] in { + def : Pat<(X86vextractstore (v2i64 VR128X:$src), addr:$dst), + (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>; +} + // Move Scalar Single to Double Int // let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -101,6 +101,8 @@ def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86vextractstore : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -4406,12 +4406,18 @@ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; def : Pat<(v4i64 (X86vzload addr:$src)), (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; + + def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst), + (VMOVPQI2QImr addr:$dst, VR128:$src)>; } let Predicates = [UseSSE2] in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (MOVQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>; + + def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst), + (MOVPQI2QImr addr:$dst, VR128:$src)>; } //===---------------------------------------------------------------------===// Index: 
llvm/trunk/test/CodeGen/X86/atomic-fp.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/atomic-fp.ll +++ llvm/trunk/test/CodeGen/X86/atomic-fp.ll @@ -148,27 +148,15 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: andl $-8, %esp ; X86-SSE2-NEXT: subl $8, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %esi +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd 12(%ebp), %xmm0 ; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movl (%esp), %ebx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl (%esi), %eax -; X86-SSE2-NEXT: movl 4(%esi), %edx -; X86-SSE2-NEXT: .p2align 4, 0x90 -; X86-SSE2-NEXT: .LBB1_1: # %atomicrmw.start -; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE2-NEXT: lock cmpxchg8b (%esi) -; X86-SSE2-NEXT: jne .LBB1_1 -; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE2-NEXT: leal -8(%ebp), %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; @@ -176,27 +164,15 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebp ; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: pushl %ebx -; X86-AVX-NEXT: pushl %esi ; X86-AVX-NEXT: andl $-8, %esp ; X86-AVX-NEXT: subl $8, %esp -; X86-AVX-NEXT: movl 8(%ebp), %esi +; X86-AVX-NEXT: movl 8(%ebp), %eax ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: movl (%esp), %ebx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl (%esi), %eax -; X86-AVX-NEXT: movl 4(%esi), %edx -; X86-AVX-NEXT: .p2align 4, 0x90 -; X86-AVX-NEXT: .LBB1_1: # %atomicrmw.start -; X86-AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-AVX-NEXT: lock cmpxchg8b (%esi) -; X86-AVX-NEXT: jne .LBB1_1 -; X86-AVX-NEXT: # %bb.2: # %atomicrmw.end -; X86-AVX-NEXT: leal -8(%ebp), %esp -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: popl %ebx +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl ; @@ -353,24 +329,14 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movl (%esp), %ebx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl glob64+4, %edx -; X86-SSE2-NEXT: movl glob64, %eax -; X86-SSE2-NEXT: .p2align 4, 0x90 -; X86-SSE2-NEXT: .LBB3_1: # %atomicrmw.start -; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE2-NEXT: lock cmpxchg8b glob64 -; X86-SSE2-NEXT: jne .LBB3_1 -; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE2-NEXT: leal -4(%ebp), %esp -; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movlps %xmm0, glob64 +; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; @@ -378,24 +344,14 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebp ; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: andl $-8, %esp -; 
X86-AVX-NEXT: subl $16, %esp +; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: movl (%esp), %ebx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl glob64+4, %edx -; X86-AVX-NEXT: movl glob64, %eax -; X86-AVX-NEXT: .p2align 4, 0x90 -; X86-AVX-NEXT: .LBB3_1: # %atomicrmw.start -; X86-AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-AVX-NEXT: lock cmpxchg8b glob64 -; X86-AVX-NEXT: jne .LBB3_1 -; X86-AVX-NEXT: # %bb.2: # %atomicrmw.end -; X86-AVX-NEXT: leal -4(%ebp), %esp -; X86-AVX-NEXT: popl %ebx +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovlps %xmm0, glob64 +; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl ; @@ -552,24 +508,14 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movl (%esp), %ebx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl -559038737, %eax -; X86-SSE2-NEXT: movl -559038733, %edx -; X86-SSE2-NEXT: .p2align 4, 0x90 -; X86-SSE2-NEXT: .LBB5_1: # %atomicrmw.start -; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE2-NEXT: lock cmpxchg8b -559038737 -; X86-SSE2-NEXT: jne .LBB5_1 -; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE2-NEXT: leal -4(%ebp), %esp -; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movlps %xmm0, -559038737 +; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; @@ -577,24 +523,14 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebp ; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $16, %esp +; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: movl (%esp), %ebx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl -559038737, %eax -; X86-AVX-NEXT: movl -559038733, %edx -; X86-AVX-NEXT: .p2align 4, 0x90 -; X86-AVX-NEXT: .LBB5_1: # %atomicrmw.start -; X86-AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-AVX-NEXT: lock cmpxchg8b -559038737 -; X86-AVX-NEXT: jne .LBB5_1 -; X86-AVX-NEXT: # %bb.2: # %atomicrmw.end -; X86-AVX-NEXT: leal -4(%ebp), %esp -; X86-AVX-NEXT: popl %ebx +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl ; @@ -757,24 +693,14 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $24, %esp +; X86-SSE2-NEXT: subl $16, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl (%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: .p2align 4, 0x90 -; X86-SSE2-NEXT: .LBB7_1: # %atomicrmw.start -; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; 
X86-SSE2-NEXT: lock cmpxchg8b (%esp) -; X86-SSE2-NEXT: jne .LBB7_1 -; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE2-NEXT: leal -4(%ebp), %esp -; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: movsd %xmm0, (%esp) +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; @@ -782,24 +708,14 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebp ; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $24, %esp +; X86-AVX-NEXT: subl $16, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl (%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: .p2align 4, 0x90 -; X86-AVX-NEXT: .LBB7_1: # %atomicrmw.start -; X86-AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-AVX-NEXT: lock cmpxchg8b (%esp) -; X86-AVX-NEXT: jne .LBB7_1 -; X86-AVX-NEXT: # %bb.2: # %atomicrmw.end -; X86-AVX-NEXT: leal -4(%ebp), %esp -; X86-AVX-NEXT: popl %ebx +; X86-AVX-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl ; @@ -905,30 +821,16 @@ ; X86-SSE2: # %bb.0: # %bb ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %edi -; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movl 20(%ebp), %esi -; X86-SSE2-NEXT: movl 8(%ebp), %edi +; X86-SSE2-NEXT: subl $8, %esp +; X86-SSE2-NEXT: movl 20(%ebp), %eax +; X86-SSE2-NEXT: movl 8(%ebp), %ecx ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd 12(%ebp), %xmm0 ; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movl (%esp), %ebx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl (%edi,%esi,8), %eax -; X86-SSE2-NEXT: movl 4(%edi,%esi,8), %edx -; X86-SSE2-NEXT: .p2align 4, 0x90 -; X86-SSE2-NEXT: .LBB8_1: # %atomicrmw.start -; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE2-NEXT: lock cmpxchg8b (%edi,%esi,8) -; X86-SSE2-NEXT: jne .LBB8_1 -; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE2-NEXT: leal -12(%ebp), %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi -; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; @@ -936,30 +838,16 @@ ; X86-AVX: # %bb.0: # %bb ; X86-AVX-NEXT: pushl %ebp ; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: pushl %ebx -; X86-AVX-NEXT: pushl %edi -; X86-AVX-NEXT: pushl %esi ; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $16, %esp -; X86-AVX-NEXT: movl 20(%ebp), %esi -; X86-AVX-NEXT: movl 8(%ebp), %edi +; X86-AVX-NEXT: subl $8, %esp +; X86-AVX-NEXT: movl 20(%ebp), %eax +; X86-AVX-NEXT: movl 8(%ebp), %ecx ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: movl (%esp), %ebx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl (%edi,%esi,8), %eax -; X86-AVX-NEXT: movl 4(%edi,%esi,8), %edx -; X86-AVX-NEXT: .p2align 4, 0x90 -; X86-AVX-NEXT: 
.LBB8_1: # %atomicrmw.start -; X86-AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-AVX-NEXT: lock cmpxchg8b (%edi,%esi,8) -; X86-AVX-NEXT: jne .LBB8_1 -; X86-AVX-NEXT: # %bb.2: # %atomicrmw.end -; X86-AVX-NEXT: leal -12(%ebp), %esp -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: popl %edi -; X86-AVX-NEXT: popl %ebx +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl ; Index: llvm/trunk/test/CodeGen/X86/atomic-load-store-wide.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/atomic-load-store-wide.ll +++ llvm/trunk/test/CodeGen/X86/atomic-load-store-wide.ll @@ -6,30 +6,38 @@ ; FIXME: The generated code can be substantially improved. define void @test1(i64* %ptr, i64 %val1) { -; CHECK-LABEL: test1: -; CHECK: # %bb.0: -; CHECK-NEXT: pushl %ebx -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: .cfi_def_cfa_offset 12 -; CHECK-NEXT: .cfi_offset %esi, -12 -; CHECK-NEXT: .cfi_offset %ebx, -8 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl (%esi), %eax -; CHECK-NEXT: movl 4(%esi), %edx -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lock cmpxchg8b (%esi) -; CHECK-NEXT: jne .LBB0_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: popl %esi -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: popl %ebx -; CHECK-NEXT: .cfi_def_cfa_offset 4 -; CHECK-NEXT: retl +; SSE42-LABEL: test1: +; SSE42: # %bb.0: +; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE42-NEXT: movlps %xmm0, (%eax) +; SSE42-NEXT: mfence +; SSE42-NEXT: retl +; +; NOSSE-LABEL: test1: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %ebx +; NOSSE-NEXT: .cfi_def_cfa_offset 8 +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: .cfi_def_cfa_offset 12 +; NOSSE-NEXT: .cfi_offset %esi, -12 +; NOSSE-NEXT: .cfi_offset %ebx, -8 +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; NOSSE-NEXT: movl (%esi), %eax +; NOSSE-NEXT: movl 4(%esi), %edx +; NOSSE-NEXT: .p2align 4, 0x90 +; NOSSE-NEXT: .LBB0_1: # %atomicrmw.start +; NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 +; NOSSE-NEXT: lock cmpxchg8b (%esi) +; NOSSE-NEXT: jne .LBB0_1 +; NOSSE-NEXT: # %bb.2: # %atomicrmw.end +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: .cfi_def_cfa_offset 8 +; NOSSE-NEXT: popl %ebx +; NOSSE-NEXT: .cfi_def_cfa_offset 4 +; NOSSE-NEXT: retl store atomic i64 %val1, i64* %ptr seq_cst, align 8 ret void } Index: llvm/trunk/test/CodeGen/X86/atomic-non-integer.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/atomic-non-integer.ll +++ llvm/trunk/test/CodeGen/X86/atomic-non-integer.ll @@ -135,30 +135,69 @@ } define void @store_double(double* %fptr, double %v) { -; X86-LABEL: store_double: -; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: .cfi_offset %esi, -12 -; X86-NEXT: .cfi_offset %ebx, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: movl 4(%esi), %edx -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB2_1: # 
%atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: lock cmpxchg8b (%esi) -; X86-NEXT: jne .LBB2_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: popl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: popl %ebx -; X86-NEXT: .cfi_def_cfa_offset 4 -; X86-NEXT: retl +; X86-SSE1-LABEL: store_double: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: .cfi_offset %esi, -12 +; X86-SSE1-NEXT: .cfi_offset %ebx, -8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl (%esi), %eax +; X86-SSE1-NEXT: movl 4(%esi), %edx +; X86-SSE1-NEXT: .p2align 4, 0x90 +; X86-SSE1-NEXT: .LBB2_1: # %atomicrmw.start +; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE1-NEXT: lock cmpxchg8b (%esi) +; X86-SSE1-NEXT: jne .LBB2_1 +; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: store_double: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: retl +; +; X86-AVX-LABEL: store_double: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X86-NOSSE-LABEL: store_double: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 +; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 12 +; X86-NOSSE-NEXT: .cfi_offset %esi, -12 +; X86-NOSSE-NEXT: .cfi_offset %ebx, -8 +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl (%esi), %eax +; X86-NOSSE-NEXT: movl 4(%esi), %edx +; X86-NOSSE-NEXT: .p2align 4, 0x90 +; X86-NOSSE-NEXT: .LBB2_1: # %atomicrmw.start +; X86-NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NOSSE-NEXT: lock cmpxchg8b (%esi) +; X86-NOSSE-NEXT: jne .LBB2_1 +; X86-NOSSE-NEXT: # %bb.2: # %atomicrmw.end +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 +; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4 +; X86-NOSSE-NEXT: retl ; ; X64-SSE-LABEL: store_double: ; X64-SSE: # %bb.0: @@ -641,30 +680,71 @@ } define void @store_double_seq_cst(double* %fptr, double %v) { -; X86-LABEL: store_double_seq_cst: -; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: .cfi_offset %esi, -12 -; X86-NEXT: .cfi_offset %ebx, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: movl 4(%esi), %edx -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB9_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: lock cmpxchg8b (%esi) -; X86-NEXT: jne .LBB9_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: popl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: popl %ebx -; X86-NEXT: .cfi_def_cfa_offset 4 -; X86-NEXT: retl +; X86-SSE1-LABEL: store_double_seq_cst: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebx 
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: .cfi_offset %esi, -12 +; X86-SSE1-NEXT: .cfi_offset %ebx, -8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl (%esi), %eax +; X86-SSE1-NEXT: movl 4(%esi), %edx +; X86-SSE1-NEXT: .p2align 4, 0x90 +; X86-SSE1-NEXT: .LBB9_1: # %atomicrmw.start +; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE1-NEXT: lock cmpxchg8b (%esi) +; X86-SSE1-NEXT: jne .LBB9_1 +; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: store_double_seq_cst: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: mfence +; X86-SSE2-NEXT: retl +; +; X86-AVX-LABEL: store_double_seq_cst: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: mfence +; X86-AVX-NEXT: retl +; +; X86-NOSSE-LABEL: store_double_seq_cst: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 +; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 12 +; X86-NOSSE-NEXT: .cfi_offset %esi, -12 +; X86-NOSSE-NEXT: .cfi_offset %ebx, -8 +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl (%esi), %eax +; X86-NOSSE-NEXT: movl 4(%esi), %edx +; X86-NOSSE-NEXT: .p2align 4, 0x90 +; X86-NOSSE-NEXT: .LBB9_1: # %atomicrmw.start +; X86-NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NOSSE-NEXT: lock cmpxchg8b (%esi) +; X86-NOSSE-NEXT: jne .LBB9_1 +; X86-NOSSE-NEXT: # %bb.2: # %atomicrmw.end +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 +; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4 +; X86-NOSSE-NEXT: retl ; ; X64-SSE-LABEL: store_double_seq_cst: ; X64-SSE: # %bb.0: Index: llvm/trunk/test/CodeGen/X86/atomic6432.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/atomic6432.ll +++ llvm/trunk/test/CodeGen/X86/atomic6432.ll @@ -835,34 +835,11 @@ define void @atomic_fetch_store64(i64 %x) nounwind { ; X32-LABEL: atomic_fetch_store64: ; X32: # %bb.0: -; X32-NEXT: pushl %ebx -; X32-NEXT: subl $20, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl sc64+4, %eax -; X32-NEXT: movl sc64, %edx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: jmp .LBB11_1 -; X32-NEXT: .LBB11_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl (%esp), %edx # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: jne .LBB11_1 -; X32-NEXT: jmp .LBB11_2 -; X32-NEXT: .LBB11_2: # %atomicrmw.end -; X32-NEXT: addl $20, %esp -; X32-NEXT: popl %ebx +; X32-NEXT: movd %ecx, %xmm0 +; X32-NEXT: pinsrd $1, %eax, %xmm0 +; X32-NEXT: movq %xmm0, sc64 ; X32-NEXT: retl store atomic i64 %x, i64* @sc64 release, align 8 ret void
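
For reference, a minimal reproducer in the spirit of the tests above. The llc invocation and function name are assumptions (the tests' RUN lines fall outside the hunks shown); something like `llc -mtriple=i686-- -mattr=+sse2` would exercise the new path.

; Hypothetical standalone example, not part of the patch.
define void @store_i64_seq_cst(i64* %p, i64 %v) {
  ; With this change, shouldExpandAtomicStoreInIR no longer requests a
  ; cmpxchg8b expansion for a 64-bit atomic store on 32-bit SSE2 targets;
  ; LowerATOMIC_STORE instead emits a single 8-byte SSE store
  ; (movsd of the argument + movlps, per the CHECK lines above),
  ; followed by mfence because the ordering is seq_cst.
  store atomic i64 %v, i64* %p seq_cst, align 8
  ret void
}
; A release or monotonic i64 store takes the same store path but without
; the trailing mfence, as in atomic_fetch_store64 above.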