Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29042,53 +29042,6 @@
 }
 
 MachineBasicBlock *
-X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
-                                       MachineBasicBlock *BB) const {
-  // Combine the following atomic floating-point modification pattern:
-  //   a.store(reg OP a.load(acquire), release)
-  // Transform them into:
-  //   OPss (%gpr), %xmm
-  //   movss %xmm, (%gpr)
-  // Or sd equivalent for 64-bit operations.
-  unsigned MOp, FOp;
-  switch (MI.getOpcode()) {
-  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
-  case X86::RELEASE_FADD32mr:
-    FOp = X86::ADDSSrm;
-    MOp = X86::MOVSSmr;
-    break;
-  case X86::RELEASE_FADD64mr:
-    FOp = X86::ADDSDrm;
-    MOp = X86::MOVSDmr;
-    break;
-  }
-  const X86InstrInfo *TII = Subtarget.getInstrInfo();
-  DebugLoc DL = MI.getDebugLoc();
-  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-  unsigned ValOpIdx = X86::AddrNumOperands;
-  unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
-  MachineInstrBuilder MIB =
-      BuildMI(*BB, MI, DL, TII->get(FOp),
-              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
-          .addReg(VSrc);
-  for (int i = 0; i < X86::AddrNumOperands; ++i) {
-    MachineOperand &Operand = MI.getOperand(i);
-    // Clear any kill flags on register operands as we'll create a second
-    // instruction using the same address operands.
-    if (Operand.isReg())
-      Operand.setIsKill(false);
-    MIB.add(Operand);
-  }
-  MachineInstr *FOpMI = MIB;
-  MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
-  for (int i = 0; i < X86::AddrNumOperands; ++i)
-    MIB.add(MI.getOperand(i));
-  MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
-  MI.eraseFromParent(); // The pseudo instruction is gone now.
-  return BB;
-}
-
-MachineBasicBlock *
 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
@@ -30323,10 +30276,6 @@
     return BB;
   }
 
-  case X86::RELEASE_FADD32mr:
-  case X86::RELEASE_FADD64mr:
-    return EmitLoweredAtomicFP(MI, BB);
-
   case X86::FP32_TO_INT16_IN_MEM:
   case X86::FP32_TO_INT32_IN_MEM:
   case X86::FP32_TO_INT64_IN_MEM:
Index: llvm/lib/Target/X86/X86InstrCompiler.td
===================================================================
--- llvm/lib/Target/X86/X86InstrCompiler.td
+++ llvm/lib/Target/X86/X86InstrCompiler.td
@@ -996,28 +996,31 @@
 defm : RELEASE_BINOP_MI<"XOR", xor>;
 defm : RELEASE_BINOP_MI<"SUB", sub>;
 
-// Same as above, but for floating-point.
-// FIXME: imm version.
-// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
+// Atomic load + floating point patterns.
 // FIXME: This could also handle SIMD operations with *ps and *pd instructions.
-let usesCustomInserter = 1, SchedRW = [WriteMicrocoded] in {
-multiclass RELEASE_FP_BINOP_MI<SDNode op> {
-  def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
-    "#BINOP "#NAME#"32mr PSEUDO!",
-    [(atomic_store_32 addr:$dst,
-       (i32 (bitconvert (op
-          (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
-          FR32:$src))))]>, Requires<[HasSSE1]>;
-  def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
-    "#BINOP "#NAME#"64mr PSEUDO!",
-    [(atomic_store_64 addr:$dst,
-       (i64 (bitconvert (op
-          (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
-          FR64:$src))))]>, Requires<[HasSSE2]>;
+multiclass ATOMIC_LOAD_FP_BINOP_MI<string Name, SDNode op> {
+  def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+            (!cast<Instruction>(Name#"SSrm") FR32:$src1, addr:$src2)>,
+            Requires<[UseSSE1]>;
+  def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+            (!cast<Instruction>("V"#Name#"SSrm") FR32:$src1, addr:$src2)>,
+            Requires<[UseAVX]>;
+  def : Pat<(op FR32X:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+            (!cast<Instruction>("V"#Name#"SSZrm") FR32X:$src1, addr:$src2)>,
+            Requires<[HasAVX512]>;
+
+  def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+            (!cast<Instruction>(Name#"SDrm") FR64:$src1, addr:$src2)>,
+            Requires<[UseSSE2]>;
+  def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+            (!cast<Instruction>("V"#Name#"SDrm") FR64:$src1, addr:$src2)>,
+            Requires<[UseAVX]>;
+  def : Pat<(op FR64X:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+            (!cast<Instruction>("V"#Name#"SDZrm") FR64X:$src1, addr:$src2)>,
+            Requires<[HasAVX512]>;
 }
-defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
+defm : ATOMIC_LOAD_FP_BINOP_MI<"ADD", fadd>;
 // FIXME: Add fsub, fmul, fdiv, ...
-}
 
 multiclass RELEASE_UNOP {
@@ -1078,6 +1081,35 @@
 def : Pat<(i32 (atomic_load_32 addr:$src)), (MOV32rm addr:$src)>;
 def : Pat<(i64 (atomic_load_64 addr:$src)), (MOV64rm addr:$src)>;
 
+// Floating point loads/stores.
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+          (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+          (VMOVSSmr addr:$dst, FR32:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+          (VMOVSSZmr addr:$dst, FR32:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+          (MOVSDmr addr:$dst, FR64:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+          (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+          (VMOVSDZmr addr:$dst, FR64:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+          (MOVSSrm addr:$src)>, Requires<[UseSSE1]>;
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+          (VMOVSSrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+          (VMOVSSZrm addr:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+          (MOVSDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+          (VMOVSDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+          (VMOVSDZrm addr:$src)>, Requires<[HasAVX512]>;
+
 //===----------------------------------------------------------------------===//
 // DAG Pattern Matching Rules
 //===----------------------------------------------------------------------===//
Index: llvm/test/CodeGen/X86/atomic-fp.ll
===================================================================
--- llvm/test/CodeGen/X86/atomic-fp.ll
+++ llvm/test/CodeGen/X86/atomic-fp.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X86 --check-prefix X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse2 -verify-machineinstrs | FileCheck %s --check-prefix X86 --check-prefix X86-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse -verify-machineinstrs | FileCheck %s --check-prefix X86 --check-prefix X86-SSE --check-prefix X86-SSE1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse2 -verify-machineinstrs | FileCheck %s --check-prefix X86 --check-prefix X86-SSE --check-prefix X86-SSE2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx -verify-machineinstrs | FileCheck %s --check-prefix X86 --check-prefix X86-AVX --check-prefix X86-AVX1
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f -verify-machineinstrs | FileCheck %s --check-prefix X86 --check-prefix X86-AVX --check-prefix X86-AVX512
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X64 --check-prefix X64-SSE
@@ -24,27 +25,47 @@
 ; X86-NOSSE-NEXT:    addl $8, %esp
 ; X86-NOSSE-NEXT:    retl
 ;
-; X86-SSE-LABEL: fadd_32r:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    addss (%eax), %xmm0
-; X86-SSE-NEXT:    movss %xmm0, (%eax)
-; X86-SSE-NEXT:    retl
+; X86-SSE1-LABEL: fadd_32r:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    subl $8, %esp
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl (%eax), %ecx
+; X86-SSE1-NEXT:    movl %ecx, (%esp)
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 =
mem[0],zero,zero,zero +; X86-SSE1-NEXT: addss {{[0-9]+}}(%esp), %xmm0 +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl %ecx, (%eax) +; X86-SSE1-NEXT: addl $8, %esp +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: fadd_32r: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: addss (%eax), %xmm0 +; X86-SSE2-NEXT: movss %xmm0, (%eax) +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_32r: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: addss (%eax), %xmm0 -; X86-AVX-NEXT: movss %xmm0, (%eax) +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vaddss (%eax), %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, (%eax) ; X86-AVX-NEXT: retl ; -; X64-LABEL: fadd_32r: -; X64: # %bb.0: -; X64-NEXT: addss (%rdi), %xmm0 -; X64-NEXT: movss %xmm0, (%rdi) -; X64-NEXT: retq +; X64-SSE-LABEL: fadd_32r: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addss (%rdi), %xmm0 +; X64-SSE-NEXT: movss %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: fadd_32r: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovss %xmm0, (%rdi) +; X64-AVX-NEXT: retq %floc = bitcast float* %loc to i32* %1 = load atomic i32, i32* %floc seq_cst, align 4 %2 = bitcast i32 %1 to float @@ -90,33 +111,68 @@ ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; -; X86-SSE-LABEL: fadd_64r: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: pushl %ebx -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %esi -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: addsd 12(%ebp), %xmm0 -; X86-SSE-NEXT: movsd %xmm0, (%esp) -; X86-SSE-NEXT: movl (%esp), %ebx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl (%esi), %eax -; X86-SSE-NEXT: movl 4(%esi), %edx -; X86-SSE-NEXT: .p2align 4, 0x90 -; X86-SSE-NEXT: .LBB1_1: # %atomicrmw.start -; X86-SSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE-NEXT: lock cmpxchg8b (%esi) -; X86-SSE-NEXT: jne .LBB1_1 -; X86-SSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE-NEXT: leal -8(%ebp), %esp -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: popl %ebx -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: fadd_64r: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebp +; X86-SSE1-NEXT: movl %esp, %ebp +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: andl $-8, %esp +; X86-SSE1-NEXT: subl $16, %esp +; X86-SSE1-NEXT: movl 8(%ebp), %esi +; X86-SSE1-NEXT: xorl %eax, %eax +; X86-SSE1-NEXT: xorl %edx, %edx +; X86-SSE1-NEXT: xorl %ecx, %ecx +; X86-SSE1-NEXT: xorl %ebx, %ebx +; X86-SSE1-NEXT: lock cmpxchg8b (%esi) +; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: faddl 12(%ebp) +; X86-SSE1-NEXT: fstpl (%esp) +; X86-SSE1-NEXT: movl (%esp), %ebx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl (%esi), %eax +; X86-SSE1-NEXT: movl 4(%esi), %edx +; X86-SSE1-NEXT: .p2align 4, 0x90 +; X86-SSE1-NEXT: .LBB1_1: # %atomicrmw.start +; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE1-NEXT: lock cmpxchg8b (%esi) +; X86-SSE1-NEXT: jne .LBB1_1 +; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE1-NEXT: leal -8(%ebp), %esp +; 
X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: popl %ebp +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: fadd_64r: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: pushl %ebx +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: andl $-8, %esp +; X86-SSE2-NEXT: subl $8, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %esi +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: addsd 12(%ebp), %xmm0 +; X86-SSE2-NEXT: movsd %xmm0, (%esp) +; X86-SSE2-NEXT: movl (%esp), %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl (%esi), %eax +; X86-SSE2-NEXT: movl 4(%esi), %edx +; X86-SSE2-NEXT: .p2align 4, 0x90 +; X86-SSE2-NEXT: .LBB1_1: # %atomicrmw.start +; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE2-NEXT: lock cmpxchg8b (%esi) +; X86-SSE2-NEXT: jne .LBB1_1 +; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE2-NEXT: leal -8(%ebp), %esp +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_64r: ; X86-AVX: # %bb.0: @@ -146,11 +202,17 @@ ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl ; -; X64-LABEL: fadd_64r: -; X64: # %bb.0: -; X64-NEXT: addsd (%rdi), %xmm0 -; X64-NEXT: movsd %xmm0, (%rdi) -; X64-NEXT: retq +; X64-SSE-LABEL: fadd_64r: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addsd (%rdi), %xmm0 +; X64-SSE-NEXT: movsd %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: fadd_64r: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovsd %xmm0, (%rdi) +; X64-AVX-NEXT: retq %floc = bitcast double* %loc to i64* %1 = load atomic i64, i64* %floc seq_cst, align 8 %2 = bitcast i64 %1 to double @@ -178,18 +240,31 @@ ; X86-NOSSE-NEXT: addl $8, %esp ; X86-NOSSE-NEXT: retl ; -; X86-SSE-LABEL: fadd_32g: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: addss glob32, %xmm0 -; X86-SSE-NEXT: movss %xmm0, glob32 -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: fadd_32g: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: subl $8, %esp +; X86-SSE1-NEXT: movl glob32, %eax +; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE1-NEXT: addss {{\.LCPI.*}}, %xmm0 +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl %eax, glob32 +; X86-SSE1-NEXT: addl $8, %esp +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: fadd_32g: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: addss glob32, %xmm0 +; X86-SSE2-NEXT: movss %xmm0, glob32 +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_32g: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: addss glob32, %xmm0 -; X86-AVX-NEXT: movss %xmm0, glob32 +; X86-AVX-NEXT: vaddss glob32, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, glob32 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: fadd_32g: @@ -202,8 +277,8 @@ ; X64-AVX-LABEL: fadd_32g: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: addss {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: movss %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovss %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: retq %i = load atomic i32, i32* bitcast (float* @glob32 to i32*) monotonic, align 4 %f = bitcast i32 %i to float @@ -246,30 +321,62 @@ ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; -; X86-SSE-LABEL: fadd_64g: -; X86-SSE: # %bb.0: -; 
X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: pushl %ebx -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: addsd {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: movsd %xmm0, (%esp) -; X86-SSE-NEXT: movl (%esp), %ebx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl glob64+4, %edx -; X86-SSE-NEXT: movl glob64, %eax -; X86-SSE-NEXT: .p2align 4, 0x90 -; X86-SSE-NEXT: .LBB3_1: # %atomicrmw.start -; X86-SSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE-NEXT: lock cmpxchg8b glob64 -; X86-SSE-NEXT: jne .LBB3_1 -; X86-SSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE-NEXT: leal -4(%ebp), %esp -; X86-SSE-NEXT: popl %ebx -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: fadd_64g: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebp +; X86-SSE1-NEXT: movl %esp, %ebp +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: andl $-8, %esp +; X86-SSE1-NEXT: subl $24, %esp +; X86-SSE1-NEXT: xorl %eax, %eax +; X86-SSE1-NEXT: xorl %edx, %edx +; X86-SSE1-NEXT: xorl %ecx, %ecx +; X86-SSE1-NEXT: xorl %ebx, %ebx +; X86-SSE1-NEXT: lock cmpxchg8b glob64 +; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fld1 +; X86-SSE1-NEXT: faddl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fstpl (%esp) +; X86-SSE1-NEXT: movl (%esp), %ebx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl glob64+4, %edx +; X86-SSE1-NEXT: movl glob64, %eax +; X86-SSE1-NEXT: .p2align 4, 0x90 +; X86-SSE1-NEXT: .LBB3_1: # %atomicrmw.start +; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE1-NEXT: lock cmpxchg8b glob64 +; X86-SSE1-NEXT: jne .LBB3_1 +; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE1-NEXT: leal -4(%ebp), %esp +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: popl %ebp +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: fadd_64g: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: pushl %ebx +; X86-SSE2-NEXT: andl $-8, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: addsd {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: movsd %xmm0, (%esp) +; X86-SSE2-NEXT: movl (%esp), %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl glob64+4, %edx +; X86-SSE2-NEXT: movl glob64, %eax +; X86-SSE2-NEXT: .p2align 4, 0x90 +; X86-SSE2-NEXT: .LBB3_1: # %atomicrmw.start +; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE2-NEXT: lock cmpxchg8b glob64 +; X86-SSE2-NEXT: jne .LBB3_1 +; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE2-NEXT: leal -4(%ebp), %esp +; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_64g: ; X86-AVX: # %bb.0: @@ -306,8 +413,8 @@ ; X64-AVX-LABEL: fadd_64g: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: addsd {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: movsd %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovsd %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: retq %i = load atomic i64, i64* bitcast (double* @glob64 to i64*) monotonic, align 8 %f = bitcast i64 %i to double @@ -332,34 +439,47 @@ ; X86-NOSSE-NEXT: addl $8, %esp ; X86-NOSSE-NEXT: retl ; -; X86-SSE-LABEL: fadd_32imm: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: addss -559038737, %xmm0 -; X86-SSE-NEXT: movss %xmm0, -559038737 -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: 
fadd_32imm: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: subl $8, %esp +; X86-SSE1-NEXT: movl -559038737, %eax +; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE1-NEXT: addss {{\.LCPI.*}}, %xmm0 +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl %eax, -559038737 +; X86-SSE1-NEXT: addl $8, %esp +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: fadd_32imm: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: addss -559038737, %xmm0 +; X86-SSE2-NEXT: movss %xmm0, -559038737 +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_32imm: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: addss -559038737, %xmm0 -; X86-AVX-NEXT: movss %xmm0, -559038737 +; X86-AVX-NEXT: vaddss -559038737, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, -559038737 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: fadd_32imm: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: addss (%rax), %xmm0 ; X64-SSE-NEXT: movss %xmm0, (%rax) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: fadd_32imm: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF -; X64-AVX-NEXT: addss (%rax), %xmm0 -; X64-AVX-NEXT: movss %xmm0, (%rax) +; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vaddss (%rax), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovss %xmm0, (%rax) ; X64-AVX-NEXT: retq %i = load atomic i32, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4 %f = bitcast i32 %i to float @@ -402,30 +522,62 @@ ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; -; X86-SSE-LABEL: fadd_64imm: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: pushl %ebx -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: addsd {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: movsd %xmm0, (%esp) -; X86-SSE-NEXT: movl (%esp), %ebx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl -559038737, %eax -; X86-SSE-NEXT: movl -559038733, %edx -; X86-SSE-NEXT: .p2align 4, 0x90 -; X86-SSE-NEXT: .LBB5_1: # %atomicrmw.start -; X86-SSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE-NEXT: lock cmpxchg8b -559038737 -; X86-SSE-NEXT: jne .LBB5_1 -; X86-SSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE-NEXT: leal -4(%ebp), %esp -; X86-SSE-NEXT: popl %ebx -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: fadd_64imm: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebp +; X86-SSE1-NEXT: movl %esp, %ebp +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: andl $-8, %esp +; X86-SSE1-NEXT: subl $24, %esp +; X86-SSE1-NEXT: xorl %eax, %eax +; X86-SSE1-NEXT: xorl %edx, %edx +; X86-SSE1-NEXT: xorl %ecx, %ecx +; X86-SSE1-NEXT: xorl %ebx, %ebx +; X86-SSE1-NEXT: lock cmpxchg8b -559038737 +; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fld1 +; X86-SSE1-NEXT: faddl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fstpl (%esp) +; X86-SSE1-NEXT: movl (%esp), %ebx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl -559038737, %eax +; X86-SSE1-NEXT: movl -559038733, %edx +; X86-SSE1-NEXT: .p2align 4, 0x90 +; X86-SSE1-NEXT: .LBB5_1: # 
%atomicrmw.start +; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE1-NEXT: lock cmpxchg8b -559038737 +; X86-SSE1-NEXT: jne .LBB5_1 +; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE1-NEXT: leal -4(%ebp), %esp +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: popl %ebp +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: fadd_64imm: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: pushl %ebx +; X86-SSE2-NEXT: andl $-8, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: addsd {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: movsd %xmm0, (%esp) +; X86-SSE2-NEXT: movl (%esp), %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl -559038737, %eax +; X86-SSE2-NEXT: movl -559038733, %edx +; X86-SSE2-NEXT: .p2align 4, 0x90 +; X86-SSE2-NEXT: .LBB5_1: # %atomicrmw.start +; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE2-NEXT: lock cmpxchg8b -559038737 +; X86-SSE2-NEXT: jne .LBB5_1 +; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE2-NEXT: leal -4(%ebp), %esp +; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_64imm: ; X86-AVX: # %bb.0: @@ -454,18 +606,18 @@ ; ; X64-SSE-LABEL: fadd_64imm: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-SSE-NEXT: addsd (%rax), %xmm0 ; X64-SSE-NEXT: movsd %xmm0, (%rax) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: fadd_64imm: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF -; X64-AVX-NEXT: addsd (%rax), %xmm0 -; X64-AVX-NEXT: movsd %xmm0, (%rax) +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vaddsd (%rax), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovsd %xmm0, (%rax) ; X64-AVX-NEXT: retq %i = load atomic i64, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8 %f = bitcast i64 %i to double @@ -490,21 +642,34 @@ ; X86-NOSSE-NEXT: addl $12, %esp ; X86-NOSSE-NEXT: retl ; -; X86-SSE-LABEL: fadd_32stack: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %eax -; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: addss (%esp), %xmm0 -; X86-SSE-NEXT: movss %xmm0, (%esp) -; X86-SSE-NEXT: popl %eax -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: fadd_32stack: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: subl $12, %esp +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE1-NEXT: addss {{\.LCPI.*}}, %xmm0 +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: fadd_32stack: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %eax +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: addss (%esp), %xmm0 +; X86-SSE2-NEXT: movss %xmm0, (%esp) +; X86-SSE2-NEXT: popl %eax +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_32stack: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %eax ; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: addss (%esp), %xmm0 -; X86-AVX-NEXT: movss %xmm0, (%esp) +; X86-AVX-NEXT: vaddss (%esp), %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, (%esp) ; X86-AVX-NEXT: popl %eax ; X86-AVX-NEXT: retl ; @@ -518,8 +683,8 @@ ; X64-AVX-LABEL: fadd_32stack: 
; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: addss -{{[0-9]+}}(%rsp), %xmm0 -; X64-AVX-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vaddss -{{[0-9]+}}(%rsp), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: retq %ptr = alloca i32, align 4 %bc3 = bitcast i32* %ptr to float* @@ -564,30 +729,62 @@ ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; -; X86-SSE-LABEL: fadd_64stack: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: pushl %ebx -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $24, %esp -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: addsd {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl (%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: .p2align 4, 0x90 -; X86-SSE-NEXT: .LBB7_1: # %atomicrmw.start -; X86-SSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE-NEXT: lock cmpxchg8b (%esp) -; X86-SSE-NEXT: jne .LBB7_1 -; X86-SSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE-NEXT: leal -4(%ebp), %esp -; X86-SSE-NEXT: popl %ebx -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: fadd_64stack: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebp +; X86-SSE1-NEXT: movl %esp, %ebp +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: andl $-8, %esp +; X86-SSE1-NEXT: subl $32, %esp +; X86-SSE1-NEXT: xorl %eax, %eax +; X86-SSE1-NEXT: xorl %edx, %edx +; X86-SSE1-NEXT: xorl %ecx, %ecx +; X86-SSE1-NEXT: xorl %ebx, %ebx +; X86-SSE1-NEXT: lock cmpxchg8b (%esp) +; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fld1 +; X86-SSE1-NEXT: faddl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl (%esp), %eax +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE1-NEXT: .p2align 4, 0x90 +; X86-SSE1-NEXT: .LBB7_1: # %atomicrmw.start +; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE1-NEXT: lock cmpxchg8b (%esp) +; X86-SSE1-NEXT: jne .LBB7_1 +; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE1-NEXT: leal -4(%ebp), %esp +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: popl %ebp +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: fadd_64stack: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: pushl %ebx +; X86-SSE2-NEXT: andl $-8, %esp +; X86-SSE2-NEXT: subl $24, %esp +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: addsd {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl (%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: .p2align 4, 0x90 +; X86-SSE2-NEXT: .LBB7_1: # %atomicrmw.start +; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE2-NEXT: lock cmpxchg8b (%esp) +; X86-SSE2-NEXT: jne .LBB7_1 +; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE2-NEXT: leal -4(%ebp), %esp +; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_64stack: ; X86-AVX: # %bb.0: @@ -624,8 +821,8 @@ ; X64-AVX-LABEL: fadd_64stack: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: addsd 
-{{[0-9]+}}(%rsp), %xmm0 -; X64-AVX-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vaddsd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: retq %ptr = alloca i64, align 8 %bc3 = bitcast i64* %ptr to double* @@ -676,36 +873,74 @@ ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; -; X86-SSE-LABEL: fadd_array: -; X86-SSE: # %bb.0: # %bb -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: pushl %ebx -; X86-SSE-NEXT: pushl %edi -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movl 20(%ebp), %esi -; X86-SSE-NEXT: movl 8(%ebp), %edi -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: addsd 12(%ebp), %xmm0 -; X86-SSE-NEXT: movsd %xmm0, (%esp) -; X86-SSE-NEXT: movl (%esp), %ebx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl (%edi,%esi,8), %eax -; X86-SSE-NEXT: movl 4(%edi,%esi,8), %edx -; X86-SSE-NEXT: .p2align 4, 0x90 -; X86-SSE-NEXT: .LBB8_1: # %atomicrmw.start -; X86-SSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE-NEXT: lock cmpxchg8b (%edi,%esi,8) -; X86-SSE-NEXT: jne .LBB8_1 -; X86-SSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE-NEXT: leal -12(%ebp), %esp -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: popl %edi -; X86-SSE-NEXT: popl %ebx -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: fadd_array: +; X86-SSE1: # %bb.0: # %bb +; X86-SSE1-NEXT: pushl %ebp +; X86-SSE1-NEXT: movl %esp, %ebp +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: pushl %edi +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: andl $-8, %esp +; X86-SSE1-NEXT: subl $24, %esp +; X86-SSE1-NEXT: movl 20(%ebp), %esi +; X86-SSE1-NEXT: movl 8(%ebp), %edi +; X86-SSE1-NEXT: xorl %eax, %eax +; X86-SSE1-NEXT: xorl %edx, %edx +; X86-SSE1-NEXT: xorl %ecx, %ecx +; X86-SSE1-NEXT: xorl %ebx, %ebx +; X86-SSE1-NEXT: lock cmpxchg8b (%edi,%esi,8) +; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: faddl 12(%ebp) +; X86-SSE1-NEXT: fstpl (%esp) +; X86-SSE1-NEXT: movl (%esp), %ebx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl (%edi,%esi,8), %eax +; X86-SSE1-NEXT: movl 4(%edi,%esi,8), %edx +; X86-SSE1-NEXT: .p2align 4, 0x90 +; X86-SSE1-NEXT: .LBB8_1: # %atomicrmw.start +; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE1-NEXT: lock cmpxchg8b (%edi,%esi,8) +; X86-SSE1-NEXT: jne .LBB8_1 +; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE1-NEXT: leal -12(%ebp), %esp +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: popl %edi +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: popl %ebp +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: fadd_array: +; X86-SSE2: # %bb.0: # %bb +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: pushl %ebx +; X86-SSE2-NEXT: pushl %edi +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: andl $-8, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 20(%ebp), %esi +; X86-SSE2-NEXT: movl 8(%ebp), %edi +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: addsd 12(%ebp), %xmm0 +; X86-SSE2-NEXT: movsd %xmm0, (%esp) +; X86-SSE2-NEXT: movl (%esp), %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl (%edi,%esi,8), %eax +; X86-SSE2-NEXT: movl 4(%edi,%esi,8), %edx +; X86-SSE2-NEXT: .p2align 4, 0x90 +; X86-SSE2-NEXT: .LBB8_1: # %atomicrmw.start +; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-SSE2-NEXT: lock cmpxchg8b (%edi,%esi,8) +; 
X86-SSE2-NEXT: jne .LBB8_1 +; X86-SSE2-NEXT: # %bb.2: # %atomicrmw.end +; X86-SSE2-NEXT: leal -12(%ebp), %esp +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: popl %edi +; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_array: ; X86-AVX: # %bb.0: # %bb @@ -738,11 +973,17 @@ ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl ; -; X64-LABEL: fadd_array: -; X64: # %bb.0: # %bb -; X64-NEXT: addsd (%rdi,%rsi,8), %xmm0 -; X64-NEXT: movsd %xmm0, (%rdi,%rsi,8) -; X64-NEXT: retq +; X64-SSE-LABEL: fadd_array: +; X64-SSE: # %bb.0: # %bb +; X64-SSE-NEXT: addsd (%rdi,%rsi,8), %xmm0 +; X64-SSE-NEXT: movsd %xmm0, (%rdi,%rsi,8) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: fadd_array: +; X64-AVX: # %bb.0: # %bb +; X64-AVX-NEXT: vaddsd (%rdi,%rsi,8), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovsd %xmm0, (%rdi,%rsi,8) +; X64-AVX-NEXT: retq bb: %tmp4 = getelementptr inbounds i64, i64* %arg, i64 %arg2 %tmp6 = load atomic i64, i64* %tmp4 monotonic, align 8 Index: llvm/test/CodeGen/X86/atomic-non-integer.ll =================================================================== --- llvm/test/CodeGen/X86/atomic-non-integer.ll +++ llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=sse | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE1 +; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2 ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX512 ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE @@ -122,14 +123,12 @@ ; ; X64-SSE-LABEL: store_float: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movd %xmm0, %eax -; X64-SSE-NEXT: movl %eax, (%rdi) +; X64-SSE-NEXT: movss %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: store_float: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: movl %eax, (%rdi) +; X64-AVX-NEXT: vmovss %xmm0, (%rdi) ; X64-AVX-NEXT: retq store atomic float %v, float* %fptr unordered, align 4 ret void @@ -163,14 +162,12 @@ ; ; X64-SSE-LABEL: store_double: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movq %xmm0, %rax -; X64-SSE-NEXT: movq %rax, (%rdi) +; X64-SSE-NEXT: movsd %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: store_double: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovq %xmm0, %rax -; X64-AVX-NEXT: movq %rax, (%rdi) +; X64-AVX-NEXT: vmovsd %xmm0, (%rdi) ; X64-AVX-NEXT: retq store atomic double %v, double* %fptr unordered, align 8 ret void @@ -332,25 +329,37 @@ } define float @load_float(float* %fptr) { -; X86-SSE-LABEL: load_float: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %eax -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movd (%eax), %xmm0 -; X86-SSE-NEXT: movd %xmm0, (%esp) -; X86-SSE-NEXT: flds (%esp) -; X86-SSE-NEXT: popl %eax -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: load_float: +; X86-SSE1: # 
%bb.0: +; X86-SSE1-NEXT: pushl %eax +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl (%eax), %eax +; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: flds (%esp) +; X86-SSE1-NEXT: popl %eax +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: load_float: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %eax +; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: movss %xmm0, (%esp) +; X86-SSE2-NEXT: flds (%esp) +; X86-SSE2-NEXT: popl %eax +; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: load_float: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %eax ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovd (%eax), %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, (%esp) +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss %xmm0, (%esp) ; X86-AVX-NEXT: flds (%esp) ; X86-AVX-NEXT: popl %eax ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 @@ -370,29 +379,56 @@ ; ; X64-SSE-LABEL: load_float: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movd (%rdi), %xmm0 +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: load_float: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovd (%rdi), %xmm0 +; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: retq %v = load atomic float, float* %fptr unordered, align 4 ret float %v } define double @load_double(double* %fptr) { -; X86-SSE-LABEL: load_double: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 16 -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movlps %xmm0, (%esp) -; X86-SSE-NEXT: fldl (%esp) -; X86-SSE-NEXT: addl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: load_double: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: subl $12, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE1-NEXT: .cfi_offset %esi, -12 +; X86-SSE1-NEXT: .cfi_offset %ebx, -8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE1-NEXT: xorl %eax, %eax +; X86-SSE1-NEXT: xorl %edx, %edx +; X86-SSE1-NEXT: xorl %ecx, %ecx +; X86-SSE1-NEXT: xorl %ebx, %ebx +; X86-SSE1-NEXT: lock cmpxchg8b (%esi) +; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: fldl (%esp) +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: load_double: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: subl $12, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movlps %xmm0, (%esp) +; X86-SSE2-NEXT: fldl (%esp) +; X86-SSE2-NEXT: addl $12, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: load_double: ; X86-AVX: # %bb.0: @@ -435,12 +471,12 @@ ; ; X64-SSE-LABEL: load_double: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movq (%rdi), %xmm0 +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: 
load_double: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovq (%rdi), %xmm0 +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: retq %v = load atomic double, double* %fptr unordered, align 8 ret double %v @@ -668,27 +704,37 @@ } define float @load_float_seq_cst(float* %fptr) { -; X86-SSE-LABEL: load_float_seq_cst: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %eax -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl (%eax), %eax -; X86-SSE-NEXT: movd %eax, %xmm0 -; X86-SSE-NEXT: movd %xmm0, (%esp) -; X86-SSE-NEXT: flds (%esp) -; X86-SSE-NEXT: popl %eax -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: load_float_seq_cst: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %eax +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl (%eax), %eax +; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: flds (%esp) +; X86-SSE1-NEXT: popl %eax +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: load_float_seq_cst: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %eax +; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: movss %xmm0, (%esp) +; X86-SSE2-NEXT: flds (%esp) +; X86-SSE2-NEXT: popl %eax +; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: load_float_seq_cst: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %eax ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl (%eax), %eax -; X86-AVX-NEXT: vmovd %eax, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, (%esp) +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss %xmm0, (%esp) ; X86-AVX-NEXT: flds (%esp) ; X86-AVX-NEXT: popl %eax ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 @@ -708,31 +754,56 @@ ; ; X64-SSE-LABEL: load_float_seq_cst: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movl (%rdi), %eax -; X64-SSE-NEXT: movd %eax, %xmm0 +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: load_float_seq_cst: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: movl (%rdi), %eax -; X64-AVX-NEXT: vmovd %eax, %xmm0 +; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: retq %v = load atomic float, float* %fptr seq_cst, align 4 ret float %v } define double @load_double_seq_cst(double* %fptr) { -; X86-SSE-LABEL: load_double_seq_cst: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 16 -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movlps %xmm0, (%esp) -; X86-SSE-NEXT: fldl (%esp) -; X86-SSE-NEXT: addl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl +; X86-SSE1-LABEL: load_double_seq_cst: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: subl $12, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE1-NEXT: .cfi_offset %esi, -12 +; X86-SSE1-NEXT: .cfi_offset %ebx, -8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE1-NEXT: xorl %eax, %eax +; X86-SSE1-NEXT: xorl %edx, %edx +; X86-SSE1-NEXT: xorl %ecx, %ecx +; X86-SSE1-NEXT: xorl %ebx, %ebx +; X86-SSE1-NEXT: lock cmpxchg8b (%esi) +; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: fldl 
(%esp) +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: load_double_seq_cst: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: subl $12, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movlps %xmm0, (%esp) +; X86-SSE2-NEXT: fldl (%esp) +; X86-SSE2-NEXT: addl $12, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: load_double_seq_cst: ; X86-AVX: # %bb.0: @@ -775,14 +846,12 @@ ; ; X64-SSE-LABEL: load_double_seq_cst: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movq (%rdi), %rax -; X64-SSE-NEXT: movq %rax, %xmm0 +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: load_double_seq_cst: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: movq (%rdi), %rax -; X64-AVX-NEXT: vmovq %rax, %xmm0 +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: retq %v = load atomic double, double* %fptr seq_cst, align 8 ret double %v
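Note (illustration only, not part of the patch): the removed EmitLoweredAtomicFP comment describes the source-level idiom `a.store(reg OP a.load(acquire), release)`. A minimal C++ sketch of that idiom follows; the function and parameter names are placeholders, and the exact IR depends on the front end.

    #include <atomic>

    // Hypothetical example: the atomic load, the fadd, and the atomic store
    // produced for this function are now matched directly by the MOVSS/ADDSS
    // isel patterns above (or their VADDSS/VMOVSS and AVX-512 Z forms),
    // instead of going through the RELEASE_FADD pseudos and a custom inserter.
    void fadd_release(std::atomic<float> &a, float x) {
      a.store(a.load(std::memory_order_acquire) + x, std::memory_order_release);
    }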