Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -33,7 +33,6 @@ InstrItinClass ri = arg_ri; } - // scalar let Sched = WriteFAdd in { def SSE_ALU_F32S : OpndItins< @@ -1923,6 +1922,77 @@ } } // isCodeGenOnly = 1 +// Patterns used for matching cvtsi2ss, cvtsi2sd, cvtsd2ss and +// cvtss2sd intrinsic sequences from clang which produce unnecessary +// vmovs{s,d} instructions +let Predicates = [UseSSE1] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>; +} +let Predicates = [UseSSE2] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>; +} +let Predicates = [HasAVX] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v4f32 
(X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>; +} + // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", Index: test/CodeGen/X86/avx-cvt.ll =================================================================== --- test/CodeGen/X86/avx-cvt.ll +++ test/CodeGen/X86/avx-cvt.ll @@ -62,6 +62,17 @@ ret <8 x float> %a } +define <4 x float> @fptrunc01(<2 x double> %a0, <4 x float> %a1) nounwind { +; CHECK-LABEL: fptrunc01: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %ext = extractelement <2 x double> %a0, i32 0 + %cvt = fptrunc double %ext to float + %res = insertelement <4 x float> %a1, float %cvt, i32 0 + ret <4 x float> %res +} + define <4 x double> @fpext00(<4 x float> %b) nounwind { ; CHECK-LABEL: fpext00: ; CHECK: # BB#0: @@ -71,6 +82,17 @@ ret <4 x double> %a } +define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind { +; CHECK-LABEL: fpext01: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %ext = extractelement <4 x float> %a1, i32 0 + %cvt = fpext float %ext to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcA: ; CHECK: # BB#0: Index: 
test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1257,15 +1257,12 @@ define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind { ; X32-LABEL: test_mm_cvtsi32_sd: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: cvtsi2sdl %eax, %xmm1 -; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtsi32_sd: ; X64: # BB#0: -; X64-NEXT: cvtsi2sdl %edi, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-NEXT: cvtsi2sdl %edi, %xmm0 ; X64-NEXT: retq %cvt = sitofp i32 %a1 to double %res = insertelement <2 x double> %a0, double %cvt, i32 0 @@ -1293,14 +1290,12 @@ define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind { ; X32-LABEL: test_mm_cvtss_sd: ; X32: # BB#0: -; X32-NEXT: cvtss2sd %xmm1, %xmm1 -; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-NEXT: cvtss2sd %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtss_sd: ; X64: # BB#0: -; X64-NEXT: cvtss2sd %xmm1, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-NEXT: cvtss2sd %xmm1, %xmm0 ; X64-NEXT: retq %ext = extractelement <4 x float> %a1, i32 0 %cvt = fpext float %ext to double Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -4818,3 +4818,63 @@ store <8 x float> %4, <8 x float>* %3, align 32 ret void } + +define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_2f64: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2sdl %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_2f64: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to double + %res = 
insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_4f32: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2ssl %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_4f32: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} + +define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_2f64: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2sdq %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_2f64: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_4f32: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2ssq %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_4f32: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +}