Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -1594,6 +1594,17 @@
                           Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
 }
 
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (X86cvttp2si (v4f32 VR128:$src))),
+            (VCVTTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (X86cvttp2si (loadv4f32 addr:$src))),
+            (VCVTTPS2DQrm addr:$src)>;
+  def : Pat<(v8i32 (X86cvttp2si (v8f32 VR256:$src))),
+            (VCVTTPS2DQYrr VR256:$src)>;
+  def : Pat<(v8i32 (X86cvttp2si (loadv8f32 addr:$src))),
+            (VCVTTPS2DQYrm addr:$src)>;
+}
+
 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
@@ -1605,6 +1616,13 @@
                          (v4i32 (fp_to_sint (memopv4f32 addr:$src))))]>,
                        Sched<[WriteCvtPS2ILd]>;
 
+let Predicates = [UseSSE2] in {
+  def : Pat<(v4i32 (X86cvttp2si (v4f32 VR128:$src))),
+            (CVTTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (X86cvttp2si (memopv4f32 addr:$src))),
+            (CVTTPS2DQrm addr:$src)>;
+}
+
 let Predicates = [HasAVX, NoVLX] in
 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
@@ -1619,6 +1637,7 @@
 // XMM only
 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
+
 let Predicates = [HasAVX, NoVLX] in
 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
@@ -1646,6 +1665,17 @@
 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                 (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
 
+let Predicates = [HasAVX] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 (X86cvttp2si (v2f64 VR128:$src))))),
+            (VCVTTPD2DQrr VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))),
+            (VCVTTPD2DQrm addr:$src)>;
+  def : Pat<(v4i32 (X86cvttp2si (v4f64 VR256:$src))),
+            (VCVTTPD2DQYrr VR256:$src)>;
+  def : Pat<(v4i32 (X86cvttp2si (loadv4f64 addr:$src))),
+            (VCVTTPD2DQYrm addr:$src)>;
+}
+
 let Predicates = [HasAVX, NoVLX] in {
   let AddedComplexity = 15 in {
     def : Pat<(X86vzmovl (v2i64 (bitconvert
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -375,8 +375,8 @@
   X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
   X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
   X86_INTRINSIC_DATA(avx_cvt_ps2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
-  X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
-  X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+  X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
   X86_INTRINSIC_DATA(avx_hadd_pd_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(avx_hadd_ps_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(avx_hsub_pd_256,   INTR_TYPE_2OP, X86ISD::FHSUB, 0),
@@ -1383,7 +1383,7 @@
   X86_INTRINSIC_DATA(sse2_cvtpd2ps,     INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
   X86_INTRINSIC_DATA(sse2_cvtps2dq,     INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
   X86_INTRINSIC_DATA(sse2_cvttpd2dq,    INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
-  X86_INTRINSIC_DATA(sse2_cvttps2dq,    INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(sse2_cvttps2dq,    INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
   X86_INTRINSIC_DATA(sse2_max_pd,       INTR_TYPE_2OP, X86ISD::FMAX, 0),
   X86_INTRINSIC_DATA(sse2_max_sd,       INTR_TYPE_2OP, X86ISD::FMAXS, 0),
   X86_INTRINSIC_DATA(sse2_min_pd,       INTR_TYPE_2OP, X86ISD::FMIN, 0),
Index: test/CodeGen/X86/avx-cvttp2si.ll
===================================================================
--- test/CodeGen/X86/avx-cvttp2si.ll
+++ test/CodeGen/X86/avx-cvttp2si.ll
@@ -9,16 +9,11 @@
 declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>)
 
 define <8 x float> @float_to_int_to_float_mem_v8f32(<8 x float>* %p) {
-; AVX1-LABEL: float_to_int_to_float_mem_v8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vroundps $11, (%rdi), %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX512-LABEL: float_to_int_to_float_mem_v8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovups (%rdi), %ymm0
-; AVX512-NEXT:    vroundps $11, %ymm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: float_to_int_to_float_mem_v8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttps2dq (%rdi), %ymm0
+; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT:    retq
   %x = load <8 x float>, <8 x float>* %p, align 16
   %fptosi = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %x)
   %sitofp = sitofp <8 x i32> %fptosi to <8 x float>
@@ -28,7 +23,8 @@
 define <8 x float> @float_to_int_to_float_reg_v8f32(<8 x float> %x) {
 ; AVX-LABEL: float_to_int_to_float_reg_v8f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vroundps $11, %ymm0, %ymm0
+; AVX-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %fptosi = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %x)
   %sitofp = sitofp <8 x i32> %fptosi to <8 x float>
@@ -36,16 +32,11 @@
 }
 
 define <4 x double> @float_to_int_to_float_mem_v4f64(<4 x double>* %p) {
-; AVX1-LABEL: float_to_int_to_float_mem_v4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vroundpd $11, (%rdi), %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX512-LABEL: float_to_int_to_float_mem_v4f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovupd (%rdi), %ymm0
-; AVX512-NEXT:    vroundpd $11, %ymm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: float_to_int_to_float_mem_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttpd2dqy (%rdi), %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
   %x = load <4 x double>, <4 x double>* %p, align 16
   %fptosi = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %x)
   %sitofp = sitofp <4 x i32> %fptosi to <4 x double>
@@ -55,7 +46,8 @@
 define <4 x double> @float_to_int_to_float_reg_v4f64(<4 x double> %x) {
 ; AVX-LABEL: float_to_int_to_float_reg_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vroundpd $11, %ymm0, %ymm0
+; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX-NEXT:    retq
   %fptosi = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %x)
   %sitofp = sitofp <4 x i32> %fptosi to <4 x double>
Index: test/CodeGen/X86/avx-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/avx-intrinsics-x86.ll
+++ test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -194,17 +194,11 @@
 
 
 define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
-; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0 # encoding: [0xc5,0xfd,0xe6,0xc0]
-; AVX-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttpd2dq %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0]
-; AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0 # encoding: [0xc5,0xfd,0xe6,0xc0]
+; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -212,15 +206,10 @@
 
 
 define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttps2dq %ymm0, %ymm0 # encoding: [0xc5,0xfe,0x5b,0xc0]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0]
-; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0 # encoding: [0xc5,0xfe,0x5b,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
Index: test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -683,15 +683,17 @@
 ; X86-LABEL: test_mm256_mask_cvttpd_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    vcvttpd2dq %ymm1, %xmm1
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
+; X86-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_cvttpd_epi32:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcvttpd2dq %ymm1, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
+; X64-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
@@ -708,15 +710,17 @@
 ; X86-LABEL: test_mm256_maskz_cvttpd_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    vcvttpd2dq %ymm0, %xmm0
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
+; X86-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_cvttpd_epi32:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcvttpd2dq %ymm0, %xmm0
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
+; X64-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
@@ -837,14 +841,16 @@
 ; X86-LABEL: test_mm_mask_cvttps_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    vcvttps2dq %xmm1, %xmm1
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
+; X86-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_cvttps_epi32:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcvttps2dq %xmm1, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
+; X64-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
   %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
@@ -860,14 +866,16 @@
 ; X86-LABEL: test_mm_maskz_cvttps_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
+; X86-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_maskz_cvttps_epi32:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
   %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
@@ -882,14 +890,16 @@
 ; X86-LABEL: test_mm256_mask_cvttps_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    vcvttps2dq %ymm1, %ymm1
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
+; X86-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_cvttps_epi32:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcvttps2dq %ymm1, %ymm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
+; X64-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
 ; X64-NEXT:    retq
 entry:
   %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
@@ -904,14 +914,16 @@
 ; X86-LABEL: test_mm256_maskz_cvttps_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    vcvttps2dq %ymm0, %ymm0
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
+; X86-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_cvttps_epi32:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcvttps2dq %ymm0, %ymm0
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
+; X64-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
   %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
Index: test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -10405,20 +10405,20 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vcvttpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xd0]
+; X86-NEXT:    vcvttpd2dq %ymm0, %xmm0 # encoding: [0xc5,0xfd,0xe6,0xc0]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8]
-; X86-NEXT:    vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2]
+; X86-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8]
+; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvttpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xd0]
+; X64-NEXT:    vcvttpd2dq %ymm0, %xmm0 # encoding: [0xc5,0xfd,0xe6,0xc0]
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT:    vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8]
-; X64-NEXT:    vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2]
+; X64-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8]
+; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
@@ -10432,19 +10432,19 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vcvttps2dq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xd0]
+; X86-NEXT:    vcvttps2dq %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5b,0xc0]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8]
-; X86-NEXT:    vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2]
+; X86-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8]
+; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvttps2dq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xd0]
+; X64-NEXT:    vcvttps2dq %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5b,0xc0]
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT:    vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8]
-; X64-NEXT:    vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2]
+; X64-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8]
+; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
@@ -10457,19 +10457,19 @@
 define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vcvttps2dq %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xd0]
+; X86-NEXT:    vcvttps2dq %ymm0, %ymm0 # encoding: [0xc5,0xfe,0x5b,0xc0]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vcvttps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8]
-; X86-NEXT:    vpaddd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc2]
+; X86-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc8]
+; X86-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvttps2dq %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xd0]
+; X64-NEXT:    vcvttps2dq %ymm0, %ymm0 # encoding: [0xc5,0xfe,0x5b,0xc0]
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT:    vcvttps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8]
-; X64-NEXT:    vpaddd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc2]
+; X64-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc8]
+; X64-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
Index: test/CodeGen/X86/mmx-cvt.ll
===================================================================
--- test/CodeGen/X86/mmx-cvt.ll
+++ test/CodeGen/X86/mmx-cvt.ll
@@ -155,7 +155,8 @@
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    cvttps2pi %xmm0, %mm0
+; X86-NEXT:    cvttps2dq %xmm0, %xmm0
+; X86-NEXT:    movdq2q %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %ecx
@@ -168,7 +169,8 @@
 ;
 ; X64-LABEL: cvtt_v2f32_v2i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    cvttps2pi %xmm0, %mm0
+; X64-NEXT:    cvttps2dq %xmm0, %xmm0
+; X64-NEXT:    movdq2q %xmm0, %mm0
 ; X64-NEXT:    paddd %mm0, %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    retq
Index: test/CodeGen/X86/sse-cvttp2si.ll
===================================================================
--- test/CodeGen/X86/sse-cvttp2si.ll
+++ test/CodeGen/X86/sse-cvttp2si.ll
@@ -12,19 +12,15 @@
 define <4 x float> @float_to_int_to_float_mem_v4f32(<4 x float>* %p) {
 ; SSE-LABEL: float_to_int_to_float_mem_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    roundps $11, (%rdi), %xmm0
+; SSE-NEXT:    cvttps2dq (%rdi), %xmm0
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: float_to_int_to_float_mem_v4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vroundps $11, (%rdi), %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX512-LABEL: float_to_int_to_float_mem_v4f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vroundps $11, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: float_to_int_to_float_mem_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttps2dq (%rdi), %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
   %x = load <4 x float>, <4 x float>* %p, align 16
   %fptosi = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %x)
   %sitofp = sitofp <4 x i32> %fptosi to <4 x float>
@@ -34,12 +30,14 @@
 define <4 x float> @float_to_int_to_float_reg_v4f32(<4 x float> %x) {
 ; SSE-LABEL: float_to_int_to_float_reg_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    roundps $11, %xmm0, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: float_to_int_to_float_reg_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vroundps $11, %xmm0, %xmm0
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %fptosi = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %x)
   %sitofp = sitofp <4 x i32> %fptosi to <4 x float>
Index: test/CodeGen/X86/sse2-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -579,15 +579,10 @@
 ; SSE-NEXT:    cvttps2dq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x5b,0xc0]
 ; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
 ;
-; AVX1-LABEL: test_x86_sse2_cvttps2dq:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5b,0xc0]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512-LABEL: test_x86_sse2_cvttps2dq:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vcvttps2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; AVX-LABEL: test_x86_sse2_cvttps2dq:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5b,0xc0]
+; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
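Note (not part of the patch): the test updates above illustrate the point of switching these intrinsics from ISD::FP_TO_SINT to X86ISD::CVTTP2SI. While they were modeled as the generic fp_to_sint, the optimizer could fold the fptosi/sitofp round trip into a single vroundps/vroundpd, which is only sound when the input is in range; the hardware cvtt* instructions have defined behavior for out-of-range inputs, returning the "integer indefinite" value 0x80000000. Below is a minimal, hypothetical standalone C sketch of that distinction, using only the standard Intel intrinsics _mm_cvttps_epi32 and _mm_cvtepi32_ps (the program itself and its values are illustrative, not taken from the patch):

#include <stdio.h>
#include <immintrin.h>

int main(void) {
  /* 3.0e9f exceeds INT32_MAX, so cvttps2dq produces the x86
     "integer indefinite" value 0x80000000 for that lane rather
     than a rounded result. */
  __m128 x = _mm_set_ps(1.5f, -2.5f, 3.0e9f, 0.5f);
  __m128i t = _mm_cvttps_epi32(x);  /* cvttps2dq: truncating convert */
  __m128 r = _mm_cvtepi32_ps(t);    /* cvtdq2ps: convert back */
  float out[4];
  _mm_storeu_ps(out, r);
  /* Prints 0.0, -2147483648.0, -2.0, 1.0: the 3.0e9f lane does not
     survive the round trip, whereas roundps $11 (round toward zero)
     on the original vector would have kept 3.0e9f. */
  for (int i = 0; i < 4; ++i)
    printf("%f\n", out[i]);
  return 0;
}

This is why the updated CHECK lines expect the explicit cvtt*/cvtdq2* pair instead of a single round instruction: the two sequences agree only for in-range inputs, so the fold is no longer performed once the intrinsics carry their instruction-accurate semantics.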