diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5422,8 +5422,11 @@
   EVT VT = Op.getValueType();
   const SDNodeFlags Flags = Op->getFlags();
   const TargetOptions &Options = DAG.getTarget().Options;
-  if (!Op.hasOneUse() && !(Op.getOpcode() == ISD::FP_EXTEND &&
-                           isFPExtFree(VT, Op.getOperand(0).getValueType())))
+  if (!Op.hasOneUse() &&
+      !(Op.getOpcode() == ISD::FP_EXTEND &&
+        isFPExtFree(VT, Op.getOperand(0).getValueType())) &&
+      !(Op.getOpcode() == ISD::ConstantFP)
+      )
     return 0;

   // Don't recurse exponentially.
diff --git a/llvm/test/CodeGen/AMDGPU/const-multiuse-tl.ll b/llvm/test/CodeGen/AMDGPU/const-multiuse-tl.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/const-multiuse-tl.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s
+
+; GCN: v_fma_f32 v0, s0, 0, 0.5
+
+define amdgpu_ps void @main(float addrspace(6)* %arg1) #0 {
+main_body:
+  %tmp2 = load float, float addrspace(6)* %arg1
+  %tmp3 = call nsz float @llvm.floor.f32(float undef) #2
+  %tmp4 = fptosi float %tmp3 to i32
+  %tmp5 = add i32 0, %tmp4
+  %tmp6 = sub i32 0, %tmp5
+  %tmp7 = sitofp i32 %tmp4 to float
+  %tmp8 = fmul nsz float %tmp7, 0x3FEBB67AE0000000
+  %tmp9 = sitofp i32 %tmp6 to float
+  %tmp10 = call nsz float @llvm.fmuladd.f32(float %tmp9, float 0xBFEBB67AE0000000, float %tmp8) #2
+  %tmp11 = fsub nsz float 0.000000e+00, %tmp10
+  %tmp12 = call nsz float @llvm.fmuladd.f32(float %tmp11, float 5.000000e-01, float 5.000000e-01) #2
+  %tmp13 = call nsz float @llvm.fmuladd.f32(float 0.000000e+00, float %tmp2, float 5.000000e-01) #2
+  %tmp14 = call nsz float @llvm.floor.f32(float %tmp13) #2
+  %tmp15 = fptosi float %tmp14 to i32
+  %tmp16 = icmp eq i32 %tmp15, 0
+  br i1 %tmp16, label %endif06, label %if04
+
+if04:                                             ; preds = %main_body
+  %tmp17 = fadd nsz float %tmp12, -5.000000e-01
+  %tmp18 = fneg nsz float %tmp17
+  %tmp19 = fmul nsz float 0.000000e+00, %tmp18
+  %tmp20 = call nsz float @llvm.fmuladd.f32(float 0.000000e+00, float 0.000000e+00, float %tmp19) #2
+  %tmp21 = fadd nsz float %tmp20, 5.000000e-01
+  %tmp22 = fadd nsz float %tmp21, 0.000000e+00
+  %tmp23 = fmul nsz float %tmp22, 0.000000e+00
+  %tmp24 = call nsz <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp23, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #4
+  %tmp25 = extractelement <4 x float> %tmp24, i32 3
+  %tmp26 = fsub nsz float 1.000000e+00, %tmp25
+  %tmp27 = call nsz float @llvm.fmuladd.f32(float undef, float %tmp26, float undef) #2
+  unreachable
+
+endif06:                                          ; preds = %main_body
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare float @llvm.floor.f32(float) #3
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare float @llvm.fmuladd.f32(float, float, float) #3
+
+attributes #0 = { "no-signed-zeros-fp-math"="true" }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind readnone speculatable willreturn }
+attributes #4 = { convergent nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
--- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -276,8 +276,8 @@
 ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
+; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
+; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}

 ; GCN-DENORM-DAG: v_rcp_f32_e32
 ; GCN-DENORM-DAG: v_rcp_f32_e32
@@ -299,7 +299,7 @@
 ; GCN-DENORM-DAG: v_div_fmas_f32
 ; GCN-DENORM-DAG: v_div_fmas_f32
 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
+; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}

 ; GCN-FLUSH-DAG: v_rcp_f32_e32
 ; GCN-FLUSH-DAG: v_rcp_f32_e64
diff --git a/llvm/test/CodeGen/X86/fp-cvt.ll b/llvm/test/CodeGen/X86/fp-cvt.ll
--- a/llvm/test/CodeGen/X86/fp-cvt.ll
+++ b/llvm/test/CodeGen/X86/fp-cvt.ll
@@ -444,20 +444,20 @@
 ; X86-NEXT: subl $16, %esp
 ; X86-NEXT: fldt 8(%ebp)
 ; X86-NEXT: flds {{\.LCPI.*}}
-; X86-NEXT: fld %st(1)
-; X86-NEXT: fsub %st(1), %st
-; X86-NEXT: fxch %st(1)
-; X86-NEXT: fucomp %st(2)
+; X86-NEXT: fucomp %st(1)
 ; X86-NEXT: fnstsw %ax
+; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: # kill: def $ah killed $ah killed $ax
 ; X86-NEXT: sahf
+; X86-NEXT: setbe %al
+; X86-NEXT: fld %st(0)
+; X86-NEXT: fadds {{\.LCPI.*}}
 ; X86-NEXT: ja .LBB10_2
 ; X86-NEXT: # %bb.1:
 ; X86-NEXT: fstp %st(1)
 ; X86-NEXT: fldz
 ; X86-NEXT: .LBB10_2:
 ; X86-NEXT: fstp %st(0)
-; X86-NEXT: setbe %al
 ; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: orl $3072, %ecx # imm = 0xC00
@@ -465,7 +465,7 @@
 ; X86-NEXT: fldcw {{[0-9]+}}(%esp)
 ; X86-NEXT: fistpll {{[0-9]+}}(%esp)
 ; X86-NEXT: fldcw {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movb %al, %dl
 ; X86-NEXT: shll $31, %edx
 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -526,20 +526,20 @@
 ; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: fldt (%eax)
 ; X86-NEXT: flds {{\.LCPI.*}}
-; X86-NEXT: fld %st(1)
-; X86-NEXT: fsub %st(1), %st
-; X86-NEXT: fxch %st(1)
-; X86-NEXT: fucomp %st(2)
+; X86-NEXT: fucomp %st(1)
 ; X86-NEXT: fnstsw %ax
+; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: # kill: def $ah killed $ah killed $ax
 ; X86-NEXT: sahf
+; X86-NEXT: setbe %al
+; X86-NEXT: fld %st(0)
+; X86-NEXT: fadds {{\.LCPI.*}}
 ; X86-NEXT: ja .LBB11_2
 ; X86-NEXT: # %bb.1:
 ; X86-NEXT: fstp %st(1)
 ; X86-NEXT: fldz
 ; X86-NEXT: .LBB11_2:
 ; X86-NEXT: fstp %st(0)
-; X86-NEXT: setbe %al
 ; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: orl $3072, %ecx # imm = 0xC00
@@ -547,7 +547,7 @@
 ; X86-NEXT: fldcw {{[0-9]+}}(%esp)
 ; X86-NEXT: fistpll {{[0-9]+}}(%esp)
 ; X86-NEXT: fldcw {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movb %al, %dl
 ; X86-NEXT: shll $31, %edx
 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -245,20 +245,20 @@
 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
-; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm3 = (xmm0 * xmm3) + xmm2
+; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
+; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm3 * xmm0) + xmm2
+; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
 ; FMA-RECIP-NEXT: retq
 ;
 ; BDVER2-LABEL: f32_two_step:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3
-; BDVER2-NEXT: vfmaddss %xmm1, %xmm3, %xmm1, %xmm1
-; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm3
+; BDVER2-NEXT: vfnmaddss %xmm1, %xmm3, %xmm1, %xmm1
+; BDVER2-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT: vfnmaddss %xmm1, %xmm0, %xmm1, %xmm0
 ; BDVER2-NEXT: retq
 ;
 ; BTVER2-LABEL: f32_two_step:
@@ -294,10 +294,10 @@
 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
-; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm3 = (xmm0 * xmm3) + xmm2
+; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
+; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm3 * xmm0) + xmm2
+; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step:
@@ -319,10 +319,10 @@
 ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX512-NEXT: vmovaps %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
-; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm3 = (xmm0 * xmm3) + xmm2
+; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
+; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm3 * xmm0) + xmm2
+; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
 ; AVX512-NEXT: retq
   %div = fdiv fast float 1.0, %x
   ret float %div
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -56,19 +56,17 @@
 ; FMA-RECIP-LABEL: f32_one_step_2:
 ; FMA-RECIP: # %bb.0:
 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; FMA-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3
+; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
+; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + mem
+; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
 ; FMA-RECIP-NEXT: retq
 ;
 ; BDVER2-LABEL: f32_one_step_2:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BDVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
-; BDVER2-NEXT: vfnmaddss %xmm2, %xmm3, %xmm0, %xmm0
-; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
+; BDVER2-NEXT: vfmaddss {{.*}}(%rip), %xmm2, %xmm0, %xmm0
+; BDVER2-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0
 ; BDVER2-NEXT: retq
 ;
 ; BTVER2-LABEL: f32_one_step_2:
@@ -96,10 +94,9 @@
 ; HASWELL-LABEL: f32_one_step_2:
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NEXT: vmulss %xmm2, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
+; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + mem
+; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2:
@@ -116,10 +113,9 @@
 ; AVX512-LABEL: f32_one_step_2:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
+; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + mem
+; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
 ; AVX512-NEXT: retq
   %div = fdiv fast float 3456.0, %x
   ret float %div
@@ -269,21 +265,19 @@
 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; FMA-RECIP-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
 ; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
-; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; FMA-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm3
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1
-; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3
+; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm1
+; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm1
 ; FMA-RECIP-NEXT: retq
 ;
 ; BDVER2-LABEL: f32_two_step_2:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; BDVER2-NEXT: vfmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm2
-; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; BDVER2-NEXT: vfnmaddss %xmm1, %xmm2, %xmm1, %xmm1
-; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3
-; BDVER2-NEXT: vfnmaddss %xmm4, %xmm3, %xmm0, %xmm0
-; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
+; BDVER2-NEXT: vfmaddss {{.*}}(%rip), %xmm2, %xmm0, %xmm0
+; BDVER2-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0
 ; BDVER2-NEXT: retq
 ;
 ; BTVER2-LABEL: f32_two_step_2:
@@ -324,10 +318,9 @@
 ; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; HASWELL-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
 ; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HASWELL-NEXT: vmulss %xmm1, %xmm2, %xmm3
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1
-; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm1
+; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm1
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step_2:
@@ -352,10 +345,9 @@
 ; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX512-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
 ; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm3
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1
-; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm1
+; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm1
 ; AVX512-NEXT: retq
   %div = fdiv fast float 6789.0, %x
   ret float %div
diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -89,7 +89,7 @@
 ; AVX512F_32_WIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F_32_WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX512F_32_WIN-NEXT: vcmpltss %xmm1, %xmm0, %k1
-; AVX512F_32_WIN-NEXT: vsubss %xmm1, %xmm0, %xmm2
+; AVX512F_32_WIN-NEXT: vaddss __real@df000000, %xmm0, %xmm2
 ; AVX512F_32_WIN-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
 ; AVX512F_32_WIN-NEXT: vmovss %xmm2, (%esp)
 ; AVX512F_32_WIN-NEXT: flds (%esp)
@@ -110,7 +110,7 @@
 ; AVX512F_32_LIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F_32_LIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX512F_32_LIN-NEXT: vcmpltss %xmm1, %xmm0, %k1
-; AVX512F_32_LIN-NEXT: vsubss %xmm1, %xmm0, %xmm2
+; AVX512F_32_LIN-NEXT: vaddss {{\.LCPI.*}}, %xmm0, %xmm2
 ; AVX512F_32_LIN-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
 ; AVX512F_32_LIN-NEXT: vmovss %xmm2, (%esp)
 ; AVX512F_32_LIN-NEXT: flds (%esp)
@@ -135,15 +135,16 @@
 ; SSE3_32_WIN-NEXT: movaps %xmm0, %xmm2
 ; SSE3_32_WIN-NEXT: cmpltss %xmm1, %xmm2
 ; SSE3_32_WIN-NEXT: movaps %xmm2, %xmm3
-; SSE3_32_WIN-NEXT: andps %xmm0, %xmm2
-; SSE3_32_WIN-NEXT: xorl %edx, %edx
-; SSE3_32_WIN-NEXT: ucomiss %xmm0, %xmm1
-; SSE3_32_WIN-NEXT: subss %xmm1, %xmm0
-; SSE3_32_WIN-NEXT: andnps %xmm0, %xmm3
+; SSE3_32_WIN-NEXT: andps %xmm0, %xmm3
+; SSE3_32_WIN-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE3_32_WIN-NEXT: addss %xmm0, %xmm4
+; SSE3_32_WIN-NEXT: andnps %xmm4, %xmm2
 ; SSE3_32_WIN-NEXT: orps %xmm3, %xmm2
 ; SSE3_32_WIN-NEXT: movss %xmm2, (%esp)
 ; SSE3_32_WIN-NEXT: flds (%esp)
 ; SSE3_32_WIN-NEXT: fisttpll (%esp)
+; SSE3_32_WIN-NEXT: xorl %edx, %edx
+; SSE3_32_WIN-NEXT: ucomiss %xmm0, %xmm1
 ; SSE3_32_WIN-NEXT: setbe %dl
 ; SSE3_32_WIN-NEXT: shll $31, %edx
 ; SSE3_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
@@ -160,15 +161,16 @@
 ; SSE3_32_LIN-NEXT: movaps %xmm0, %xmm2
 ; SSE3_32_LIN-NEXT: cmpltss %xmm1, %xmm2
 ; SSE3_32_LIN-NEXT: movaps %xmm2, %xmm3
-; SSE3_32_LIN-NEXT: andps %xmm0, %xmm2
-; SSE3_32_LIN-NEXT: xorl %edx, %edx
-; SSE3_32_LIN-NEXT: ucomiss %xmm0, %xmm1
-; SSE3_32_LIN-NEXT: subss %xmm1, %xmm0
-; SSE3_32_LIN-NEXT: andnps %xmm0, %xmm3
+; SSE3_32_LIN-NEXT: andps %xmm0, %xmm3
+; SSE3_32_LIN-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE3_32_LIN-NEXT: addss %xmm0, %xmm4
+; SSE3_32_LIN-NEXT: andnps %xmm4, %xmm2
 ; SSE3_32_LIN-NEXT: orps %xmm3, %xmm2
 ; SSE3_32_LIN-NEXT: movss %xmm2, (%esp)
 ; SSE3_32_LIN-NEXT: flds (%esp)
 ; SSE3_32_LIN-NEXT: fisttpll (%esp)
+; SSE3_32_LIN-NEXT: xorl %edx, %edx
+; SSE3_32_LIN-NEXT: ucomiss %xmm0, %xmm1
 ; SSE3_32_LIN-NEXT: setbe %dl
 ; SSE3_32_LIN-NEXT: shll $31, %edx
 ; SSE3_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
@@ -198,14 +200,14 @@
 ; SSE2_32_WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2_32_WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2_32_WIN-NEXT: movaps %xmm0, %xmm2
-; SSE2_32_WIN-NEXT: subss %xmm1, %xmm2
-; SSE2_32_WIN-NEXT: movaps %xmm0, %xmm3
-; SSE2_32_WIN-NEXT: cmpltss %xmm1, %xmm3
-; SSE2_32_WIN-NEXT: movaps %xmm3, %xmm4
-; SSE2_32_WIN-NEXT: andnps %xmm2, %xmm4
+; SSE2_32_WIN-NEXT: cmpltss %xmm1, %xmm2
+; SSE2_32_WIN-NEXT: movaps %xmm2, %xmm3
 ; SSE2_32_WIN-NEXT: andps %xmm0, %xmm3
-; SSE2_32_WIN-NEXT: orps %xmm4, %xmm3
-; SSE2_32_WIN-NEXT: movss %xmm3, {{[0-9]+}}(%esp)
+; SSE2_32_WIN-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2_32_WIN-NEXT: addss %xmm0, %xmm4
+; SSE2_32_WIN-NEXT: andnps %xmm4, %xmm2
+; SSE2_32_WIN-NEXT: orps %xmm3, %xmm2
+; SSE2_32_WIN-NEXT: movss %xmm2, {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT: flds {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
@@ -230,14 +232,14 @@
 ; SSE2_32_LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2_32_LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2_32_LIN-NEXT: movaps %xmm0, %xmm2
-; SSE2_32_LIN-NEXT: subss %xmm1, %xmm2
-; SSE2_32_LIN-NEXT: movaps %xmm0, %xmm3
-; SSE2_32_LIN-NEXT: cmpltss %xmm1, %xmm3
-; SSE2_32_LIN-NEXT: movaps %xmm3, %xmm4
-; SSE2_32_LIN-NEXT: andnps %xmm2, %xmm4
+; SSE2_32_LIN-NEXT: cmpltss %xmm1, %xmm2
+; SSE2_32_LIN-NEXT: movaps %xmm2, %xmm3
 ; SSE2_32_LIN-NEXT: andps %xmm0, %xmm3
-; SSE2_32_LIN-NEXT: orps %xmm4, %xmm3
-; SSE2_32_LIN-NEXT: movss %xmm3, {{[0-9]+}}(%esp)
+; SSE2_32_LIN-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2_32_LIN-NEXT: addss %xmm0, %xmm4
+; SSE2_32_LIN-NEXT: andnps %xmm4, %xmm2
+; SSE2_32_LIN-NEXT: orps %xmm3, %xmm2
+; SSE2_32_LIN-NEXT: movss %xmm2, {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT: flds {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
@@ -276,20 +278,20 @@
 ; X87_WIN-NEXT: subl $16, %esp
 ; X87_WIN-NEXT: flds 8(%ebp)
 ; X87_WIN-NEXT: flds __real@5f000000
-; X87_WIN-NEXT: fld %st(1)
-; X87_WIN-NEXT: fsub %st(1), %st
-; X87_WIN-NEXT: fxch %st(1)
-; X87_WIN-NEXT: fucomp %st(2)
+; X87_WIN-NEXT: fucomp %st(1)
 ; X87_WIN-NEXT: fnstsw %ax
+; X87_WIN-NEXT: xorl %edx, %edx
 ; X87_WIN-NEXT: # kill: def $ah killed $ah killed $ax
 ; X87_WIN-NEXT: sahf
+; X87_WIN-NEXT: setbe %al
+; X87_WIN-NEXT: fld %st(0)
+; X87_WIN-NEXT: fadds __real@df000000
 ; X87_WIN-NEXT: ja LBB0_2
 ; X87_WIN-NEXT: # %bb.1:
 ; X87_WIN-NEXT: fstp %st(1)
 ; X87_WIN-NEXT: fldz
 ; X87_WIN-NEXT: LBB0_2:
 ; X87_WIN-NEXT: fstp %st(0)
-; X87_WIN-NEXT: setbe %al
 ; X87_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; X87_WIN-NEXT: orl $3072, %ecx # imm = 0xC00
@@ -297,7 +299,7 @@
 ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
 ; X87_WIN-NEXT: fistpll {{[0-9]+}}(%esp)
 ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_WIN-NEXT: movzbl %al, %edx
+; X87_WIN-NEXT: movb %al, %dl
 ; X87_WIN-NEXT: shll $31, %edx
 ; X87_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
 ; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -310,20 +312,20 @@
 ; X87_LIN-NEXT: subl $20, %esp
 ; X87_LIN-NEXT: flds {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: flds {{\.LCPI.*}}
-; X87_LIN-NEXT: fld %st(1)
-; X87_LIN-NEXT: fsub %st(1), %st
-; X87_LIN-NEXT: fxch %st(1)
-; X87_LIN-NEXT: fucomp %st(2)
+; X87_LIN-NEXT: fucomp %st(1)
 ; X87_LIN-NEXT: fnstsw %ax
+; X87_LIN-NEXT: xorl %edx, %edx
 ; X87_LIN-NEXT: # kill: def $ah killed $ah killed $ax
 ; X87_LIN-NEXT: sahf
+; X87_LIN-NEXT: setbe %al
+; X87_LIN-NEXT: fld %st(0)
+; X87_LIN-NEXT: fadds {{\.LCPI.*}}
 ; X87_LIN-NEXT: ja .LBB0_2
 ; X87_LIN-NEXT: # %bb.1:
 ; X87_LIN-NEXT: fstp %st(1)
 ; X87_LIN-NEXT: fldz
 ; X87_LIN-NEXT: .LBB0_2:
 ; X87_LIN-NEXT: fstp %st(0)
-; X87_LIN-NEXT: setbe %al
 ; X87_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; X87_LIN-NEXT: orl $3072, %ecx # imm = 0xC00
@@ -331,7 +333,7 @@
 ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: fistpll {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_LIN-NEXT: movzbl %al, %edx
+; X87_LIN-NEXT: movb %al, %dl
 ; X87_LIN-NEXT: shll $31, %edx
 ; X87_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
 ; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -579,7 +581,7 @@
 ; AVX512F_32_WIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F_32_WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX512F_32_WIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1
-; AVX512F_32_WIN-NEXT: vsubsd %xmm1, %xmm0, %xmm2
+; AVX512F_32_WIN-NEXT: vaddsd __real@c3e0000000000000, %xmm0, %xmm2
 ; AVX512F_32_WIN-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1}
 ; AVX512F_32_WIN-NEXT: vmovsd %xmm2, (%esp)
 ; AVX512F_32_WIN-NEXT: fldl (%esp)
@@ -600,7 +602,7 @@
 ; AVX512F_32_LIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F_32_LIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX512F_32_LIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1
-; AVX512F_32_LIN-NEXT: vsubsd %xmm1, %xmm0, %xmm2
+; AVX512F_32_LIN-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm2
 ; AVX512F_32_LIN-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1}
 ; AVX512F_32_LIN-NEXT: vmovsd %xmm2, (%esp)
 ; AVX512F_32_LIN-NEXT: fldl (%esp)
@@ -625,15 +627,16 @@
 ; SSE3_32_WIN-NEXT: movapd %xmm0, %xmm2
 ; SSE3_32_WIN-NEXT: cmpltsd %xmm1, %xmm2
 ; SSE3_32_WIN-NEXT: movapd %xmm2, %xmm3
-; SSE3_32_WIN-NEXT: andpd %xmm0, %xmm2
-; SSE3_32_WIN-NEXT: xorl %edx, %edx
-; SSE3_32_WIN-NEXT: ucomisd %xmm0, %xmm1
-; SSE3_32_WIN-NEXT: subsd %xmm1, %xmm0
-; SSE3_32_WIN-NEXT: andnpd %xmm0, %xmm3
+; SSE3_32_WIN-NEXT: andpd %xmm0, %xmm3
+; SSE3_32_WIN-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
+; SSE3_32_WIN-NEXT: addsd %xmm0, %xmm4
+; SSE3_32_WIN-NEXT: andnpd %xmm4, %xmm2
 ; SSE3_32_WIN-NEXT: orpd %xmm3, %xmm2
 ; SSE3_32_WIN-NEXT: movlpd %xmm2, (%esp)
 ; SSE3_32_WIN-NEXT: fldl (%esp)
 ; SSE3_32_WIN-NEXT: fisttpll (%esp)
+; SSE3_32_WIN-NEXT: xorl %edx, %edx
+; SSE3_32_WIN-NEXT: ucomisd %xmm0, %xmm1
 ; SSE3_32_WIN-NEXT: setbe %dl
 ; SSE3_32_WIN-NEXT: shll $31, %edx
 ; SSE3_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
@@ -650,15 +653,16 @@
 ; SSE3_32_LIN-NEXT: movapd %xmm0, %xmm2
 ; SSE3_32_LIN-NEXT: cmpltsd %xmm1, %xmm2
 ; SSE3_32_LIN-NEXT: movapd %xmm2, %xmm3
-; SSE3_32_LIN-NEXT: andpd %xmm0, %xmm2
-; SSE3_32_LIN-NEXT: xorl %edx, %edx
-; SSE3_32_LIN-NEXT: ucomisd %xmm0, %xmm1
-; SSE3_32_LIN-NEXT: subsd %xmm1, %xmm0
-; SSE3_32_LIN-NEXT: andnpd %xmm0, %xmm3
+; SSE3_32_LIN-NEXT: andpd %xmm0, %xmm3
+; SSE3_32_LIN-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
+; SSE3_32_LIN-NEXT: addsd %xmm0, %xmm4
+; SSE3_32_LIN-NEXT: andnpd %xmm4, %xmm2
 ; SSE3_32_LIN-NEXT: orpd %xmm3, %xmm2
 ; SSE3_32_LIN-NEXT: movlpd %xmm2, (%esp)
 ; SSE3_32_LIN-NEXT: fldl (%esp)
 ; SSE3_32_LIN-NEXT: fisttpll (%esp)
+; SSE3_32_LIN-NEXT: xorl %edx, %edx
+; SSE3_32_LIN-NEXT: ucomisd %xmm0, %xmm1
 ; SSE3_32_LIN-NEXT: setbe %dl
 ; SSE3_32_LIN-NEXT: shll $31, %edx
 ; SSE3_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
@@ -688,14 +692,14 @@
 ; SSE2_32_WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2_32_WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
 ; SSE2_32_WIN-NEXT: movapd %xmm0, %xmm2
-; SSE2_32_WIN-NEXT: subsd %xmm1, %xmm2
-; SSE2_32_WIN-NEXT: movapd %xmm0, %xmm3
-; SSE2_32_WIN-NEXT: cmpltsd %xmm1, %xmm3
-; SSE2_32_WIN-NEXT: movapd %xmm3, %xmm4
-; SSE2_32_WIN-NEXT: andnpd %xmm2, %xmm4
+; SSE2_32_WIN-NEXT: cmpltsd %xmm1, %xmm2
+; SSE2_32_WIN-NEXT: movapd %xmm2, %xmm3
 ; SSE2_32_WIN-NEXT: andpd %xmm0, %xmm3
-; SSE2_32_WIN-NEXT: orpd %xmm4, %xmm3
-; SSE2_32_WIN-NEXT: movlpd %xmm3, {{[0-9]+}}(%esp)
+; SSE2_32_WIN-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
+; SSE2_32_WIN-NEXT: addsd %xmm0, %xmm4
+; SSE2_32_WIN-NEXT: andnpd %xmm4, %xmm2
+; SSE2_32_WIN-NEXT: orpd %xmm3, %xmm2
+; SSE2_32_WIN-NEXT: movlpd %xmm2, {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT: fldl {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
@@ -720,14 +724,14 @@
 ; SSE2_32_LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2_32_LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
 ; SSE2_32_LIN-NEXT: movapd %xmm0, %xmm2
-; SSE2_32_LIN-NEXT: subsd %xmm1, %xmm2
-; SSE2_32_LIN-NEXT: movapd %xmm0, %xmm3
-; SSE2_32_LIN-NEXT: cmpltsd %xmm1, %xmm3
-; SSE2_32_LIN-NEXT: movapd %xmm3, %xmm4
-; SSE2_32_LIN-NEXT: andnpd %xmm2, %xmm4
+; SSE2_32_LIN-NEXT: cmpltsd %xmm1, %xmm2
+; SSE2_32_LIN-NEXT: movapd %xmm2, %xmm3
 ; SSE2_32_LIN-NEXT: andpd %xmm0, %xmm3
-; SSE2_32_LIN-NEXT: orpd %xmm4, %xmm3
-; SSE2_32_LIN-NEXT: movlpd %xmm3, {{[0-9]+}}(%esp)
+; SSE2_32_LIN-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
+; SSE2_32_LIN-NEXT: addsd %xmm0, %xmm4
+; SSE2_32_LIN-NEXT: andnpd %xmm4, %xmm2
+; SSE2_32_LIN-NEXT: orpd %xmm3, %xmm2
+; SSE2_32_LIN-NEXT: movlpd %xmm2, {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT: fldl {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
@@ -766,20 +770,20 @@
 ; X87_WIN-NEXT: subl $16, %esp
 ; X87_WIN-NEXT: fldl 8(%ebp)
 ; X87_WIN-NEXT: flds __real@5f000000
-; X87_WIN-NEXT: fld %st(1)
-; X87_WIN-NEXT: fsub %st(1), %st
-; X87_WIN-NEXT: fxch %st(1)
-; X87_WIN-NEXT: fucomp %st(2)
+; X87_WIN-NEXT: fucomp %st(1)
 ; X87_WIN-NEXT: fnstsw %ax
+; X87_WIN-NEXT: xorl %edx, %edx
 ; X87_WIN-NEXT: # kill: def $ah killed $ah killed $ax
 ; X87_WIN-NEXT: sahf
+; X87_WIN-NEXT: setbe %al
+; X87_WIN-NEXT: fld %st(0)
+; X87_WIN-NEXT: fadds __real@df000000
 ; X87_WIN-NEXT: ja LBB2_2
 ; X87_WIN-NEXT: # %bb.1:
 ; X87_WIN-NEXT: fstp %st(1)
 ; X87_WIN-NEXT: fldz
 ; X87_WIN-NEXT: LBB2_2:
 ; X87_WIN-NEXT: fstp %st(0)
-; X87_WIN-NEXT: setbe %al
 ; X87_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; X87_WIN-NEXT: orl $3072, %ecx # imm = 0xC00
@@ -787,7 +791,7 @@
 ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
 ; X87_WIN-NEXT: fistpll {{[0-9]+}}(%esp)
 ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_WIN-NEXT: movzbl %al, %edx
+; X87_WIN-NEXT: movb %al, %dl
 ; X87_WIN-NEXT: shll $31, %edx
 ; X87_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
 ; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -800,20 +804,20 @@
 ; X87_LIN-NEXT: subl $20, %esp
 ; X87_LIN-NEXT: fldl {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: flds {{\.LCPI.*}}
-; X87_LIN-NEXT: fld %st(1)
-; X87_LIN-NEXT: fsub %st(1), %st
-; X87_LIN-NEXT: fxch %st(1)
-; X87_LIN-NEXT: fucomp %st(2)
+; X87_LIN-NEXT: fucomp %st(1)
 ; X87_LIN-NEXT: fnstsw %ax
+; X87_LIN-NEXT: xorl %edx, %edx
 ; X87_LIN-NEXT: # kill: def $ah killed $ah killed $ax
 ; X87_LIN-NEXT: sahf
+; X87_LIN-NEXT: setbe %al
+; X87_LIN-NEXT: fld %st(0)
+; X87_LIN-NEXT: fadds {{\.LCPI.*}}
 ; X87_LIN-NEXT: ja .LBB2_2
 ; X87_LIN-NEXT: # %bb.1:
 ; X87_LIN-NEXT: fstp %st(1)
 ; X87_LIN-NEXT: fldz
 ; X87_LIN-NEXT: .LBB2_2:
 ; X87_LIN-NEXT: fstp %st(0)
-; X87_LIN-NEXT: setbe %al
 ; X87_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; X87_LIN-NEXT: orl $3072, %ecx # imm = 0xC00
@@ -821,7 +825,7 @@
 ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: fistpll {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_LIN-NEXT: movzbl %al, %edx
+; X87_LIN-NEXT: movb %al, %dl
 ; X87_LIN-NEXT: shll $31, %edx
 ; X87_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
 ; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1026,11 +1030,10 @@
 ; AVX512_32_WIN-NEXT: andl $-8, %esp
 ; AVX512_32_WIN-NEXT: subl $8, %esp
 ; AVX512_32_WIN-NEXT: fldt 8(%ebp)
+; AVX512_32_WIN-NEXT: fld %st(0)
+; AVX512_32_WIN-NEXT: fadds __real@df000000
 ; AVX512_32_WIN-NEXT: flds __real@5f000000
-; AVX512_32_WIN-NEXT: fld %st(1)
-; AVX512_32_WIN-NEXT: fsub %st(1), %st
 ; AVX512_32_WIN-NEXT: xorl %edx, %edx
-; AVX512_32_WIN-NEXT: fxch %st(1)
 ; AVX512_32_WIN-NEXT: fucompi %st(2), %st
 ; AVX512_32_WIN-NEXT: fcmovnbe %st(1), %st
 ; AVX512_32_WIN-NEXT: fstp %st(1)
@@ -1047,11 +1050,10 @@
 ; AVX512_32_LIN: # %bb.0:
 ; AVX512_32_LIN-NEXT: subl $12, %esp
 ; AVX512_32_LIN-NEXT: fldt {{[0-9]+}}(%esp)
+; AVX512_32_LIN-NEXT: fld %st(0)
+; AVX512_32_LIN-NEXT: fadds {{\.LCPI.*}}
 ; AVX512_32_LIN-NEXT: flds {{\.LCPI.*}}
-; AVX512_32_LIN-NEXT: fld %st(1)
-; AVX512_32_LIN-NEXT: fsub %st(1), %st
 ; AVX512_32_LIN-NEXT: xorl %edx, %edx
-; AVX512_32_LIN-NEXT: fxch %st(1)
 ; AVX512_32_LIN-NEXT: fucompi %st(2), %st
 ; AVX512_32_LIN-NEXT: fcmovnbe %st(1), %st
 ; AVX512_32_LIN-NEXT: fstp %st(1)
@@ -1106,11 +1108,10 @@
 ; SSE3_32_WIN-NEXT: andl $-8, %esp
 ; SSE3_32_WIN-NEXT: subl $8, %esp
 ; SSE3_32_WIN-NEXT: fldt 8(%ebp)
+; SSE3_32_WIN-NEXT: fld %st(0)
+; SSE3_32_WIN-NEXT: fadds __real@df000000
 ; SSE3_32_WIN-NEXT: flds __real@5f000000
-; SSE3_32_WIN-NEXT: fld %st(1)
-; SSE3_32_WIN-NEXT: fsub %st(1), %st
 ; SSE3_32_WIN-NEXT: xorl %edx, %edx
-; SSE3_32_WIN-NEXT: fxch %st(1)
 ; SSE3_32_WIN-NEXT: fucompi %st(2), %st
 ; SSE3_32_WIN-NEXT: fcmovnbe %st(1), %st
 ; SSE3_32_WIN-NEXT: fstp %st(1)
@@ -1127,11 +1128,10 @@
 ; SSE3_32_LIN: # %bb.0:
 ; SSE3_32_LIN-NEXT: subl $12, %esp
 ; SSE3_32_LIN-NEXT: fldt {{[0-9]+}}(%esp)
+; SSE3_32_LIN-NEXT: fld %st(0)
+; SSE3_32_LIN-NEXT: fadds {{\.LCPI.*}}
 ; SSE3_32_LIN-NEXT: flds {{\.LCPI.*}}
-; SSE3_32_LIN-NEXT: fld %st(1)
-; SSE3_32_LIN-NEXT: fsub %st(1), %st
 ; SSE3_32_LIN-NEXT: xorl %edx, %edx
-; SSE3_32_LIN-NEXT: fxch %st(1)
 ; SSE3_32_LIN-NEXT: fucompi %st(2), %st
 ; SSE3_32_LIN-NEXT: fcmovnbe %st(1), %st
 ; SSE3_32_LIN-NEXT: fstp %st(1)
@@ -1187,14 +1187,13 @@
 ; SSE2_32_WIN-NEXT: subl $16, %esp
 ; SSE2_32_WIN-NEXT: fldt 8(%ebp)
 ; SSE2_32_WIN-NEXT: flds __real@5f000000
-; SSE2_32_WIN-NEXT: fld %st(1)
-; SSE2_32_WIN-NEXT: fsub %st(1), %st
 ; SSE2_32_WIN-NEXT: xorl %edx, %edx
-; SSE2_32_WIN-NEXT: fxch %st(1)
-; SSE2_32_WIN-NEXT: fucompi %st(2), %st
+; SSE2_32_WIN-NEXT: fucompi %st(1), %st
+; SSE2_32_WIN-NEXT: setbe %dl
+; SSE2_32_WIN-NEXT: fld %st(0)
+; SSE2_32_WIN-NEXT: fadds __real@df000000
 ; SSE2_32_WIN-NEXT: fcmovnbe %st(1), %st
 ; SSE2_32_WIN-NEXT: fstp %st(1)
-; SSE2_32_WIN-NEXT: setbe %dl
 ; SSE2_32_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; SSE2_32_WIN-NEXT: orl $3072, %eax # imm = 0xC00
@@ -1214,14 +1213,13 @@
 ; SSE2_32_LIN-NEXT: subl $20, %esp
 ; SSE2_32_LIN-NEXT: fldt {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT: flds {{\.LCPI.*}}
-; SSE2_32_LIN-NEXT: fld %st(1)
-; SSE2_32_LIN-NEXT: fsub %st(1), %st
 ; SSE2_32_LIN-NEXT: xorl %edx, %edx
-; SSE2_32_LIN-NEXT: fxch %st(1)
-; SSE2_32_LIN-NEXT: fucompi %st(2), %st
+; SSE2_32_LIN-NEXT: fucompi %st(1), %st
+; SSE2_32_LIN-NEXT: setbe %dl
+; SSE2_32_LIN-NEXT: fld %st(0)
+; SSE2_32_LIN-NEXT: fadds {{\.LCPI.*}}
 ; SSE2_32_LIN-NEXT: fcmovnbe %st(1), %st
 ; SSE2_32_LIN-NEXT: fstp %st(1)
-; SSE2_32_LIN-NEXT: setbe %dl
 ; SSE2_32_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; SSE2_32_LIN-NEXT: orl $3072, %eax # imm = 0xC00
@@ -1291,20 +1289,20 @@
 ; X87_WIN-NEXT: subl $16, %esp
 ; X87_WIN-NEXT: fldt 8(%ebp)
 ; X87_WIN-NEXT: flds __real@5f000000
-; X87_WIN-NEXT: fld %st(1)
-; X87_WIN-NEXT: fsub %st(1), %st
-; X87_WIN-NEXT: fxch %st(1)
-; X87_WIN-NEXT: fucomp %st(2)
+; X87_WIN-NEXT: fucomp %st(1)
 ; X87_WIN-NEXT: fnstsw %ax
+; X87_WIN-NEXT: xorl %edx, %edx
 ; X87_WIN-NEXT: # kill: def $ah killed $ah killed $ax
 ; X87_WIN-NEXT: sahf
+; X87_WIN-NEXT: setbe %al
+; X87_WIN-NEXT: fld %st(0)
+; X87_WIN-NEXT: fadds __real@df000000
 ; X87_WIN-NEXT: ja LBB4_2
 ; X87_WIN-NEXT: # %bb.1:
 ; X87_WIN-NEXT: fstp %st(1)
 ; X87_WIN-NEXT: fldz
 ; X87_WIN-NEXT: LBB4_2:
 ; X87_WIN-NEXT: fstp %st(0)
-; X87_WIN-NEXT: setbe %al
 ; X87_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; X87_WIN-NEXT: orl $3072, %ecx # imm = 0xC00
@@ -1312,7 +1310,7 @@
 ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
 ; X87_WIN-NEXT: fistpll {{[0-9]+}}(%esp)
 ; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_WIN-NEXT: movzbl %al, %edx
+; X87_WIN-NEXT: movb %al, %dl
 ; X87_WIN-NEXT: shll $31, %edx
 ; X87_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
 ; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1325,20 +1323,20 @@
 ; X87_LIN-NEXT: subl $20, %esp
 ; X87_LIN-NEXT: fldt {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: flds {{\.LCPI.*}}
-; X87_LIN-NEXT: fld %st(1)
-; X87_LIN-NEXT: fsub %st(1), %st
-; X87_LIN-NEXT: fxch %st(1)
-; X87_LIN-NEXT: fucomp %st(2)
+; X87_LIN-NEXT: fucomp %st(1)
 ; X87_LIN-NEXT: fnstsw %ax
+; X87_LIN-NEXT: xorl %edx, %edx
 ; X87_LIN-NEXT: # kill: def $ah killed $ah killed $ax
 ; X87_LIN-NEXT: sahf
+; X87_LIN-NEXT: setbe %al
+; X87_LIN-NEXT: fld %st(0)
+; X87_LIN-NEXT: fadds {{\.LCPI.*}}
 ; X87_LIN-NEXT: ja .LBB4_2
 ; X87_LIN-NEXT: # %bb.1:
 ; X87_LIN-NEXT: fstp %st(1)
 ; X87_LIN-NEXT: fldz
 ; X87_LIN-NEXT: .LBB4_2:
 ; X87_LIN-NEXT: fstp %st(0)
-; X87_LIN-NEXT: setbe %al
 ; X87_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; X87_LIN-NEXT: orl $3072, %ecx # imm = 0xC00
@@ -1346,7 +1344,7 @@
 ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: fistpll {{[0-9]+}}(%esp)
 ; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_LIN-NEXT: movzbl %al, %edx
+; X87_LIN-NEXT: movb %al, %dl
 ; X87_LIN-NEXT: shll $31, %edx
 ; X87_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
 ; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -440,36 +440,34 @@
 define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
 ; SSE-LABEL: test16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: subss %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm4
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSE-NEXT: movaps %xmm1, %xmm5
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; SSE-NEXT: subss %xmm5, %xmm4
-; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE-NEXT: addss %xmm3, %xmm5
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: addss %xmm0, %xmm2
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSE-NEXT: subss %xmm4, %xmm3
+; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT: addss {{.*}}(%rip), %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; SSE-NEXT: movaps %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-NEXT: vsubss %xmm5, %xmm4, %xmm4
-; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddss {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0