Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -5932,6 +5932,30 @@ (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>; } // Predicates = [HasAVX512] +// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang +// which produce unnecessary vmovs{s,d} instructions +let Predicates = [HasAVX512] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>; +} // Predicates = [HasAVX512] + // Convert float/double to signed/unsigned int 32/64 with truncation multiclass avx512_cvt_s_all opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, @@ -6111,6 +6135,21 @@ (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to float/double // and from float/double to signed/unsigned integer Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -33,7 +33,6 @@ InstrItinClass ri = arg_ri; } - // scalar let Sched = WriteFAdd in { def SSE_ALU_F32S : OpndItins< @@ -1923,6 +1922,79 @@ } } // isCodeGenOnly = 1 +// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and +// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary +// vmovs{s,d} instructions +let Predicates = [UseAVX] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseAVX] + +let Predicates = [UseSSE2] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE2] + +let Predicates = [UseSSE1] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE1] + // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", Index: test/CodeGen/X86/avx-cvt.ll =================================================================== --- test/CodeGen/X86/avx-cvt.ll +++ test/CodeGen/X86/avx-cvt.ll @@ -62,6 +62,17 @@ ret <8 x float> %a } +define <4 x float> @fptrunc01(<2 x double> %a0, <4 x float> %a1) nounwind { +; CHECK-LABEL: fptrunc01: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %ext = extractelement <2 x double> %a0, i32 0 + %cvt = fptrunc double %ext to float + %res = insertelement <4 x float> %a1, float %cvt, i32 0 + ret <4 x float> %res +} + define <4 x double> @fpext00(<4 x float> %b) nounwind { ; CHECK-LABEL: fpext00: ; CHECK: # BB#0: @@ -71,6 +82,17 @@ ret <4 x double> %a } +define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind { +; CHECK-LABEL: fpext01: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %ext = extractelement <4 x float> %a1, i32 0 + %cvt = fpext float %ext to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcA: ; CHECK: # BB#0: Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -448,6 +448,17 @@ ret <4 x float> %c } +define <4 x float> @fptrunc03(<2 x double> %a0, <4 x float> %a1) nounwind { +; ALL-LABEL: fptrunc03: +; ALL: ## BB#0: +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 +; ALL-NEXT: retq + %ext = extractelement <2 x double> %a0, i32 0 + %cvt = fptrunc double %ext to float + %res = insertelement <4 x float> %a1, float %cvt, i32 0 + ret <4 x float> %res +} + define <8 x double> @fpext00(<8 x float> %b) nounwind { ; ALL-LABEL: fpext00: ; ALL: ## BB#0: @@ -476,6 +487,17 @@ ret <4 x double> %c } +define <2 x double> @fpext02(<2 x double> %a0, <4 x float> %a1) nounwind { +; ALL-LABEL: fpext02: +; ALL: ## BB#0: +; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: retq + %ext = extractelement <4 x float> %a1, i32 0 + %cvt = fpext float %ext to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + define double @funcA(i64* nocapture %e) { ; ALL-LABEL: funcA: ; ALL: ## BB#0: ## %entry Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1257,15 +1257,12 @@ define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind { ; X32-LABEL: test_mm_cvtsi32_sd: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: cvtsi2sdl %eax, %xmm1 -; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtsi32_sd: ; X64: # BB#0: -; X64-NEXT: cvtsi2sdl %edi, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-NEXT: cvtsi2sdl %edi, %xmm0 ; X64-NEXT: retq %cvt = sitofp i32 %a1 to double %res = insertelement <2 x double> %a0, double %cvt, i32 0 @@ -1293,14 +1290,12 @@ define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind { ; X32-LABEL: test_mm_cvtss_sd: ; X32: # BB#0: -; X32-NEXT: cvtss2sd %xmm1, %xmm1 -; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-NEXT: cvtss2sd %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtss_sd: ; X64: # BB#0: -; X64-NEXT: cvtss2sd %xmm1, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-NEXT: cvtss2sd %xmm1, %xmm0 ; X64-NEXT: retq %ext = extractelement <4 x float> %a1, i32 0 %cvt = fpext float %ext to double Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -4818,3 +4818,63 @@ store <8 x float> %4, <8 x float>* %3, align 32 ret void } + +define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_2f64: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2sdl %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_2f64: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_4f32: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2ssl %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_4f32: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} + +define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_2f64: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2sdq %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_2f64: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_4f32: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2ssq %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_4f32: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +}